# ETL Simple Pipeline

## Let us start by importing necessary packages

In [50]:
import pandas as pd
import numpy as np
import os

## Extract Data

In [51]:
# countries = pd.read_csv('Countries.csv')
countries

Unnamed: 0.1,Unnamed: 0,Countries,Population_per_k
0,0,Austria,9006.398
1,1,Bolivia,11673.021
2,2,China,1439323.776
3,3,Denmark,5792.202
4,4,Egypt,102334.404
5,5,Ethiopia,114963.588
6,6,Finland,5540.72
7,7,France,65273.511
8,8,Germany,83783.942
9,9,Greece,10423.054


In [52]:
countries_metadata = pd.read_csv('Countries_metadata.csv')
countries_metadata

Unnamed: 0,country_names,Land_Area,Region
0,Greece,128900,Europe
1,China,9388211,Asia
2,Denmark,42430,Europe
3,Ethiopia,1000000,Africa
4,Egypt,995450,Africa
5,Bolivia,1083300,South America
6,Austria,82409,Europe
7,France,547557,Europe
8,Germany,348560,Europe
9,Finland,303890,Europe


## Transform

In [53]:
# countries['Population'] = countries['Population']/1000
# countries = countries.rename(columns={'Population':'Population_per_k'})
countries

Unnamed: 0.1,Unnamed: 0,Countries,Population_per_k
0,0,Austria,9006.398
1,1,Bolivia,11673.021
2,2,China,1439323.776
3,3,Denmark,5792.202
4,4,Egypt,102334.404
5,5,Ethiopia,114963.588
6,6,Finland,5540.72
7,7,France,65273.511
8,8,Germany,83783.942
9,9,Greece,10423.054


In [54]:
# countries_metadata['Land_Area'] = countries_metadata['Land_Area']/1000
# countries_metadata = countries_metadata.rename(columns={'Land_Area':'Land_Area_per_k'})
countries_metadata

Unnamed: 0,country_names,Land_Area,Region
0,Greece,128900,Europe
1,China,9388211,Asia
2,Denmark,42430,Europe
3,Ethiopia,1000000,Africa
4,Egypt,995450,Africa
5,Bolivia,1083300,South America
6,Austria,82409,Europe
7,France,547557,Europe
8,Germany,348560,Europe
9,Finland,303890,Europe


## Load Data

In [55]:
countries.to_csv('Countries.csv')
countries_metadata.to_csv('Countries_metadata.csv')

# Create `etl_pipeline.py`

In [56]:
class DataPreprocessor:
    def __init__(self, path_folder = os.getcwd()):  

        self.path_folder = path_folder
        
        # Path to input
        self.path_input_folder = "{}/".format(path_folder)
        self.path_input_countries = self.path_input_folder + 'Countries.csv'
        self.path_input_countries_metadata = self.path_input_folder + 'Countries_metadata.csv'
        print(self.path_input_folder)

        # Path on which output tables are saved
        self.path_output_folder = "{}/".format(path_folder)
        self.path_output_countries = self.path_output_folder + 'countries1.csv'
        self.path_output_countries_metadata = self.path_output_folder + 'countries_metadata1.csv'

        # create dictionaries for read dtypes
        self.read_dtypes_countries = {'Countries':'category'}
        self.read_dtypes_countries_metadata = {'country_names':'category'}

        # create folders for output if not existent yet
        if not os.path.exists(self.path_output_folder):
            os.makedirs(self.path_output_folder) 


    def read_data_from_raw_input(self):

        print("Start:\tRead in countries Dataset")
        self.countries = pd.read_csv(self.path_input_countries, dtype=self.read_dtypes_countries)
        print(self.countries)
        print("Finish:\tRead in countries Dataset")
        print()

        print("Start:\tRead in countries_metadata Dataset")       
        self.countries_metadata = pd.read_csv(self.path_input_countries_metadata, dtype=self.read_dtypes_countries_metadata)
        print(self.countries_metadata)
        print("Finish:\tRead in countries_metadata Dataset")
        print()

    def preprocess_data(self, save_preprocess_countries=True, save_preprocess_countries_metadata=True):

        print("Start:\tPreprocessing countries Dataset")
        self.preprocess_countries()
        print("Finish:\tPreprocessing countries Dataset")
        print()

        print("Start:\tPreprocessing countries_metadata Dataset")
        self.preprocess_countries_metadata()
        print("Finish:\tPreprocessing countries_metadata Dataset")
        print()

        if save_preprocess_countries:
            print("Start:\tSave countries Dataset to disc")
            self.countries.to_csv(self.path_output_countries, index=False)
            print("Finish:\tSave countries Dataset to disc")
            print()

        if save_preprocess_countries_metadata:
            print("Start:\tSave countries_metadata Dataset to disc")
            self.countries_metadata.to_csv(self.path_output_countries_metadata, index=False)
            print("Finish:\tSave countries_metadata Dataset to disc")
            print()

        return self.countries, self.countries_metadata


    def preprocess_countries(self):
        
        self.countries['Population'] = self.countries['Population']/1000
        self.countries = self.countries.rename(columns={'Population':'Population_per_k'})


    def preprocess_countries_metadata(self):
        
        self.countries_metadata['Land_Area'] = self.countries_metadata['Land_Area']/1000
        self.countries_metadata = self.countries_metadata.rename(columns={'Land_Area':'Land_Area_per_k'})


    def read_preprocessed_tables(self):
        
        print("Start:\tRead in modified countries Dataset")
        self.countries = pd.read_csv(self.path_output_countries, dtype=self.read_dtypes_countries)
        print("Finish:\tRead in modified countries Dataset")
        print()

        print("Start:\tRead in modified countries_metadata Dataset")       
        self.countries_metadata = pd.read_csv(self.path_output_countries_metadata, dtype=self.read_dtypes_countries_metadata)
        print("Finish:\tRead in modified countries_metadata Dataset")
        print()

        return self.countries, self.countries_metadata


def main():

    datapreprocesssor = DataPreprocessor()
    datapreprocesssor.read_data_from_raw_input()
    datapreprocesssor.preprocess_data()
    print('ETL has been successfully completed !!')

In [57]:
main()

C:\Users\User\Downloads/
Start:	Read in countries Dataset
   Unnamed: 0.1  Unnamed: 0 Countries  Population_per_k
0             0           0   Austria          9006.398
1             1           1   Bolivia         11673.021
2             2           2     China       1439323.776
3             3           3   Denmark          5792.202
4             4           4     Egypt        102334.404
5             5           5  Ethiopia        114963.588
6             6           6   Finland          5540.720
7             7           7    France         65273.511
8             8           8   Germany         83783.942
9             9           9    Greece         10423.054
Finish:	Read in countries Dataset

Start:	Read in countries_metadata Dataset
   Unnamed: 0 country_names  Land_Area         Region
0           0        Greece     128900         Europe
1           1         China    9388211           Asia
2           2       Denmark      42430         Europe
3           3      Ethiopia    10

KeyError: 'Population'