## Notebook Magic

In [None]:
%matplotlib inline
%load_ext autoreload

## Imports

In [None]:
import os
import yaml
import pandas as pd
from sqlalchemy import *
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

from iefp.processing.cleaning import *

# Disabled helper: everything between the triple quotes is a plain string
# literal, so none of it executes and `preview_values` is never defined.
# The commented-out `#preview_values(...)` calls further down depend on it.
# As written, the helper would print up to five non-null values per column;
# the bare `.isnull().values.any()` expression inside the inner loop discards
# its result.
'''
from iefp import data
from iefp import utils
def preview_values(df):
    for j in range(len(df.columns)):
        values = []
        for k in range(len(df)):
            df.iloc[k:k+1, j:j+1].isnull().values.any()
            if not df.iloc[k:k+1, j:j+1].isnull().values.any():
                values.append(
                    str(df.iloc[k:k+1, j:j+1].values[0][0]))
        print(
            str(df.columns[j]) + ': '
            + ', '.join(values[0:5]) + '\n')
'''

## SQL Connection

In [None]:
# Load the database credentials from the local credentials file.
# The context manager closes the file handle, which a bare open() call leaks.
with open("../conf/local/credentials.yml") as cred_file:
    pg_cred = yaml.load(cred_file, Loader=yaml.FullLoader)

In [None]:
# Build the PostgreSQL URL from the loaded credentials; the port (5432) and
# the database name ("iefp") are fixed here.
db_cred = pg_cred['db']
url = 'postgresql://{}:{}@{}:{}/{}'
url = url.format(db_cred["pg_user"], db_cred["pg_pass"], db_cred["pg_host"], 5432, "iefp")
con = create_engine(url, client_encoding='utf8')

# Reflect the schema through MetaData.reflect(): the `reflect=True` and
# `bind=` constructor arguments are deprecated and rejected by current
# SQLAlchemy releases. Note that `meta` is no longer bound to the engine.
meta = MetaData()
meta.reflect(bind=con)

## Load pedidos column configuration

In [None]:
# Read the "pedidos" entry of the SIGAE column config; it is joined into the
# select list of the extraction query.
# The context manager closes the file handle, which a bare open() call leaks.
with open("../conf/base/sigae_columns.yml") as columns_file:
    ped_cols = yaml.load(columns_file, Loader=yaml.FullLoader)["pedidos"]
table = 'pedidos'

## Extract slim pedidos table

In [None]:
%%time
sql = """
select {}
from {}
order by "ano_mes" desc
limit 10000
""".format(', '.join(ped_cols), "pedidos")

df = pd.read_sql(sql, con)
df.info()

## Cleaning

In [None]:
def _to_numeric_or_keep(column):
    """Return `column` parsed as numeric (integers downcast), or unchanged if parsing fails."""
    try:
        return pd.to_numeric(column, downcast='integer')
    except (ValueError, TypeError):
        # Same outcome as the deprecated errors='ignore': keep the input as-is.
        return column


def clean(dataframe, bool_list, null_list, cat_list, date_list):
    """Return a cleaned copy of `dataframe`; the input frame is left untouched.

    Steps, in order: replace null markers with NaN, convert parseable
    non-datetime columns to numeric, cast categorical columns, apply the
    project's boolean and date helpers, and drop duplicate rows.

    Args:
        dataframe: Frame to clean.
        bool_list: Column names handed to `processing.bool_convert`.
        null_list: Values (or lists of values) to replace with NaN.
        cat_list: Column names to cast to the 'category' dtype.
        date_list: Column names handed to `processing.strip_time`.

    Returns:
        A new DataFrame with the steps above applied.
    """
    # Work on a copy so re-running the calling cell starts from the same input.
    cleaned = dataframe.copy()

    # Replace every configured null marker with NaN.
    for null_val in null_list:
        cleaned = cleaned.replace(to_replace=null_val, value=np.nan)

    # Convert every non-datetime column that parses as numeric; others stay as they are.
    non_datetime = cleaned.select_dtypes(exclude=['datetime'])
    converted = non_datetime.apply(_to_numeric_or_keep)
    cleaned[converted.columns] = converted

    # Category convert (skipped when no categorical columns are requested).
    if cat_list:
        cleaned[cat_list] = cleaned[cat_list].astype('category')

    # NOTE(review): `processing` is not bound by any import visible in this
    # notebook (`from iefp.processing.cleaning import *` does not bind the
    # package name) - confirm where it comes from, or call the helpers directly.
    cleaned = processing.bool_convert(cleaned, bool_list)

    # Strip time from datetime columns (delegated to the project helper).
    cleaned = processing.strip_time(cleaned, date_list)

    # Remove duplicates
    return cleaned.drop_duplicates()

In [None]:
#preview_values(df)

In [None]:
# Values replaced with NaN by clean(): None (wrapped in a list) and a two-space string.
null_list = [[None], '  ']

# Columns handed to the boolean conversion step.
bool_list = [
    "rinsc",
    "formacao_profissional",
    "carteira_profissional",
    "rsi",
]

# Columns cast to the 'category' dtype.
cat_list = [
    'sexo',
    'estado_civil',
    'origem_registo_utente',
    'nacionalidade',
    'pais_emigracao',
    'segmento',
    'local_trabalho',
    'a_tempo',
    'natureza_emprego',
    'habilitacao',
]

# Columns handed to the time-stripping step.
date_list = [
    'data_movimento',
    'data_nascimento',
]

In [None]:
%%time
cleaned_df = clean(df, bool_list, null_list, cat_list, date_list)
cleaned_df.head()

In [None]:
# Show the per-column dtypes of the frame returned by clean().
cleaned_df.dtypes

In [None]:
# Print the DataFrame.info() summary of the cleaned frame.
cleaned_df.info()

In [None]:
#preview_values(cleaned_df)

In [None]:
# Load the parquet file stored under the "intermediate/clean" prefix of the
# iefp-unemployment bucket.
# NOTE(review): presumably the persisted output of the cleaning step - confirm.
df_clean = pd.read_parquet("s3://iefp-unemployment/intermediate/clean/pedidos.parquet")

In [None]:
# Load the parquet file stored under the "intermediate/filter" prefix of the
# same bucket. `df_inter` is not referenced again in the cells visible here.
df_inter = pd.read_parquet("s3://iefp-unemployment/intermediate/filter/pedidos.parquet")

In [None]:
# Print the info() summary of the frame loaded from S3, then display its
# first 30 rows as the cell output.
df_clean.info()
df_clean.head(30)