## Imports

In [16]:
import configparser
import pandas as pd
import numpy as np
import boto3
import s3fs
import os
from tqdm import tqdm
from time import time
from sqlalchemy import create_engine

In [17]:
def stage_global_temperatures():
    '''
    Staging of temperature data into temperature_staging table
    '''
    table = "temperature_staging"
    
    config = configparser.ConfigParser()
    config.read('../config.cfg')
    
    s3 = boto3.client(
        's3',
        region_name='eu-central-1',
        aws_access_key_id=config['AWS']['KEY'],
        aws_secret_access_key=config['AWS']['SECRET']
    )
    
    s3_path = s3.get_object(Bucket='world-development', Key='input_data/GlobalLandTemperaturesByCountry.csv')['Body']
    
    db_prop = config['POSTGRESQL']
    user = db_prop['username']
    password = db_prop['password']
    dbname = db_prop['dbname']

    engine_connection_string = f'postgresql://{user}:{password}@127.0.0.1:5432/{dbname}'
    engine = create_engine(engine_connection_string)


    df = pd.read_csv(s3_path)
    
    print(f"Raw row count: {len(df)}.")

    check_cols_for_nan = ['dt', 'AverageTemperature', 'Country']
    print(f"Drop rows with nans in columns: {check_cols_for_nan}.")

    df_no_nans = df.dropna(subset=check_cols_for_nan)

    print(f"Count of non-nan rows: {len(df)}")

    # remove duplicate countries, remove regions that have been colonies in the past

    df_no_dup = df_no_nans[~df_no_nans['Country'].isin(
        ['Denmark', 'Antarctica', 'France', 'Europe', 'Netherlands',
         'United Kingdom', 'Africa', 'South America'])]

    df_no_dup = df_no_dup.replace(
        ['Denmark (Europe)', 'France (Europe)', 'Netherlands (Europe)', 'United Kingdom (Europe)'],
        ['Denmark', 'France', 'Netherlands', 'United Kingdom'])

    # add year column and change its type to int

    df_no_dup['year'] = df_no_dup['dt'].str[:4]
    df_no_dup_with_year = df_no_dup.astype({'year': 'int32'})

    # aggregate (mean) temp and uncertainty by year and country

    surface_temp_by_year_and_country = df_no_dup_with_year \
                                            .groupby(['year', 'Country'],as_index=False) \
                                            .agg({ 'AverageTemperature': 'mean', \
                                                   'AverageTemperatureUncertainty': 'mean' \
                                                 })

    surface_temp_by_year_and_country = surface_temp_by_year_and_country.rename(
        columns={"Country": "country_or_area", "AverageTemperature": "temperature",
                 "AverageTemperatureUncertainty": "uncertainty"})

    surface_temp_by_year_and_country.sample(5)
    #surface_temp_by_year_and_country.to_sql(table, engine, index=False, if_exists="replace")

In [18]:
stage_global_temperatures()

Raw row count: 577462.
Drop rows with nans in columns: ['dt', 'AverageTemperature', 'Country'].
Count of non-nan rows: 577462
