## Imports

In [74]:
import configparser
import pandas as pd
import numpy as np
import boto3
import s3fs
import os
from tqdm import tqdm
from time import time
from sqlalchemy import create_engine

In [75]:
def stage_global_temperatures():
    '''
    Staging of temperature data into temperature_staging table
    '''
    table = "temperature_staging"
    
    config = configparser.ConfigParser()
    config.read('../config.cfg')
    
    #s3 = boto3.client(
    #    's3',
    #    region_name='eu-central-1',
    #    aws_access_key_id=config['AWS']['KEY'],
    #    aws_secret_access_key=config['AWS']['SECRET']
    #)
    # 
    #s3_path = s3.get_object(Bucket='world-development', Key='input_data/GlobalLandTemperaturesByCountry.csv')['Body']
    
    s3_path = '../data/GlobalLandTemperaturesByCountry.csv'
    
    db_prop = config['POSTGRESQL']
    user = db_prop['username']
    password = db_prop['password']
    dbname = db_prop['dbname']

    engine_connection_string = f'postgresql://{user}:{password}@127.0.0.1:5432/{dbname}'
    engine = create_engine(engine_connection_string)


    df = pd.read_csv(s3_path)
    
    print(f"Raw row count: {len(df)}.")

    check_cols_for_nan = ['dt', 'AverageTemperature', 'Country']
    print(f"Drop rows with nans in columns: {check_cols_for_nan}.")

    df_no_nans = df.dropna(subset=check_cols_for_nan)

    print(f"Count of non-nan rows: {len(df)}")

    # remove duplicate countries, remove regions that have been colonies in the past

    df_no_dup = df_no_nans[~df_no_nans['Country'].isin(
        ['Denmark', 'Antarctica', 'France', 'Europe', 'Netherlands',
         'United Kingdom', 'Africa', 'South America'])]

    df_no_dup = df_no_dup.replace(
        ['Denmark (Europe)', 'France (Europe)', 'Netherlands (Europe)', 'United Kingdom (Europe)'],
        ['Denmark', 'France', 'Netherlands', 'United Kingdom'])

    # add year column and change its type to int

    df_no_dup['year'] = df_no_dup['dt'].str[:4]
    df_no_dup_with_year = df_no_dup.astype({'year': 'int32'})

    # aggregate (mean) temp and uncertainty by year and country

    df = df_no_dup_with_year \
                                            .groupby(['year', 'Country'],as_index=False) \
                                            .agg({ 'AverageTemperature': 'mean', \
                                                   'AverageTemperatureUncertainty': 'mean' \
                                                 })

    df = df.rename(
        columns={"Country": "country_or_area", "AverageTemperature": "temperature",
                 "AverageTemperatureUncertainty": "uncertainty"})

    df['rank'] = np.nan

    # add temperature rank for every year by temperature: 1: hottest -> coldest.

    years = df['year'].unique()

    for year in years:
        year_data = df[df['year'] == year]
        sorted_year_data = year_data.sort_values('temperature')
        sorted_year_data['rank'] = sorted_year_data['temperature'].rank(ascending=False)

        for i in sorted_year_data.index:
            rank = sorted_year_data.at[i, 'rank']
            df.at[i, 'rank'] = rank
    
    return df

df = stage_global_temperatures()

Raw row count: 577462.
Drop rows with nans in columns: ['dt', 'AverageTemperature', 'Country'].
Count of non-nan rows: 577462


In [76]:
df[df['year'] == 1991].head(100)

Unnamed: 0,year,country_or_area,temperature,uncertainty,rank
38929,1991,Afghanistan,14.370750,0.411000,162.0
38930,1991,Albania,12.338833,0.322500,170.0
38931,1991,Algeria,23.011083,0.361083,122.0
38932,1991,American Samoa,27.171167,0.312000,30.0
38933,1991,Andorra,11.565417,0.243250,174.0
...,...,...,...,...,...
39024,1991,Hong Kong,23.334750,0.242000,117.0
39025,1991,Hungary,9.680167,0.236583,190.0
39026,1991,Iceland,2.540083,0.316000,227.0
39027,1991,India,24.311750,0.185750,104.0
