## Notebook Magic

In [1]:
%matplotlib inline
%load_ext autoreload

## Imports

In [2]:
import os
import yaml
import pandas as pd
from sqlalchemy import *
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

from iefp import data
from iefp import utils
from iefp import processing

def preview_values(df, max_values=5):
    """Print a short sample of non-null values for every column of ``df``.

    One line is printed per column in the form ``<column>: v1, v2, ...``,
    followed by a blank line. Null entries are skipped, so the sample shows
    the first ``max_values`` values that are actually present.

    Args:
        df: DataFrame to inspect. It is not modified.
        max_values: Maximum number of non-null values shown per column
            (defaults to 5).

    Returns:
        None. The preview is written to stdout.
    """
    for position, name in enumerate(df.columns):
        # Slice by position and keep a one-column frame: this works even when
        # column labels are duplicated, and `.values` then yields the same
        # scalar types (hence the same str() rendering) as cell-by-cell access.
        column = df.iloc[:, position:position + 1]
        sample = column.dropna().head(max_values).values[:, 0]
        print(
            str(name) + ': '
            + ', '.join(str(value) for value in sample) + '\n')

## SQL Connection

In [3]:
# Read the local Postgres credentials. The context manager closes the file
# deterministically instead of leaving the handle to the garbage collector.
with open("../conf/local/credentials.yml") as cred_file:
    pg_cred = yaml.load(cred_file, Loader=yaml.FullLoader)

In [4]:
# Build the connection URL from the loaded credentials; port 5432 and the
# "iefp" database name are fixed here.
# NOTE(review): user/password are interpolated verbatim, so characters such as
# '@' or '/' in the password would corrupt the URL; confirm they cannot occur.
url = 'postgresql://{}:{}@{}:{}/{}'
url = url.format(pg_cred["pg_user"], pg_cred["pg_pass"], pg_cred["pg_host"], 5432, "iefp")
con = create_engine(url, client_encoding='utf8')

# MetaData(..., reflect=True) is deprecated and emits a warning on every run;
# an explicit reflect() call loads the same table definitions.
meta = MetaData(bind=con)
meta.reflect()

  after removing the cwd from sys.path.


## Get Pedidos Table

In [5]:
# Load the list stored under the "pedidos" key of the column configuration.
# The context manager closes the file deterministically instead of leaving
# the handle to the garbage collector.
with open("../conf/base/sigae_columns.yml") as columns_file:
    ped_cols = yaml.load(columns_file, Loader=yaml.FullLoader)["pedidos"]
table = 'pedidos'

## Extract slim pedidos table

In [6]:
%%time
sql = """
select {}
from {}
order by "ano_mes" desc
limit 10000
""".format(', '.join(ped_cols), "pedidos")

df = pd.read_sql(sql, con)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 32 columns):
ute_id                   10000 non-null float64
tipo_movimento           10000 non-null float64
data_movimento           10000 non-null datetime64[ns]
motivo_inscricao         10000 non-null float64
motivo_anulacao          2224 non-null float64
motivo_invalidacao       27 non-null object
categoria                10000 non-null float64
rinsc                    10000 non-null object
data_nascimento          10000 non-null datetime64[ns]
sexo                     10000 non-null object
deficiencia              10000 non-null float64
nr_pessoas_cargo         7073 non-null float64
formacao_profissional    3054 non-null object
carteira_profissional    2749 non-null object
habilitacao              10000 non-null object
areas_curso              1314 non-null object
estado_civil             10000 non-null object
origem_registo_utente    10000 non-null object
nacionalidade            10000 

## Cleaning

In [7]:
def _to_numeric_or_keep(column):
    """Downcast ``column`` to the smallest integer dtype, or return it unchanged if it is not numeric."""
    try:
        return pd.to_numeric(column, downcast='integer')
    except (ValueError, TypeError):
        # Same outcome the deprecated errors='ignore' option used to give.
        return column


def clean(dataframe, bool_list, null_list, cat_list, date_list):
    """Return a cleaned copy of ``dataframe``; the input frame is not modified.

    Steps, in order:
      1. Replace every sentinel in ``null_list`` with NaN.
      2. Convert each non-datetime column to numeric where possible,
         downcasting to the smallest integer dtype that fits.
      3. Cast the ``cat_list`` columns to ``category``.
      4. Pass ``bool_list`` columns to ``processing.bool_convert``.
      5. Pass ``date_list`` columns to ``processing.strip_time``.
      6. Drop fully duplicated rows (the original index labels are kept).

    Args:
        dataframe: Raw DataFrame to clean.
        bool_list: Column names handed to ``processing.bool_convert``.
        null_list: Values (or lists of values) to be treated as missing.
        cat_list: Column names to cast to ``category``.
        date_list: Column names handed to ``processing.strip_time``.

    Returns:
        A new, de-duplicated DataFrame.
    """
    # Work on a copy: mutating the caller's frame made re-running the cell
    # operate on already-cleaned data and left earlier displays stale.
    dataframe = dataframe.copy()

    # Replace all placeholder values with Pandas NaNs
    for null_val in null_list:
        dataframe = dataframe.replace(to_replace=null_val, value=np.nan)

    # Convert all appropriate column datatypes to int. Assigning one column at
    # a time keeps each converted dtype intact.
    candidates = dataframe.select_dtypes(exclude=['datetime'])
    for name in candidates.columns:
        dataframe[name] = _to_numeric_or_keep(dataframe[name])

    # Category convert
    dataframe = dataframe.astype({name: 'category' for name in cat_list})

    # Boolean convert (project helper; its exact rules are defined in iefp.processing)
    dataframe = processing.bool_convert(dataframe, bool_list)

    # Strip time from datetime columns (project helper)
    dataframe = processing.strip_time(dataframe, date_list)

    # Remove duplicates
    return dataframe.drop_duplicates()

In [8]:
#preview_values(df)

In [9]:
# Placeholders to be turned into NaN: a list holding None, and a two-space string.
null_list = [
    [None],
    '  ',
]

# Columns passed to clean() as its bool_list argument.
bool_list = [
    "rinsc",
    "formacao_profissional",
    "carteira_profissional",
    "rsi",
]

# Columns passed to clean() as its cat_list argument (cast to category).
cat_list = [
    'sexo',
    'estado_civil',
    'origem_registo_utente',
    'nacionalidade',
    'pais_emigracao',
    'segmento',
    'local_trabalho',
    'a_tempo',
    'natureza_emprego',
    'habilitacao',
]

# Columns passed to clean() as its date_list argument.
date_list = [
    'data_movimento',
    'data_nascimento',
]

In [10]:
%%time
cleaned_df = clean(df, bool_list, null_list, cat_list, date_list)
cleaned_df.head()

CPU times: user 284 ms, sys: 3.79 ms, total: 288 ms
Wall time: 287 ms


Unnamed: 0,ute_id,tipo_movimento,data_movimento,motivo_inscricao,motivo_anulacao,motivo_invalidacao,categoria,rinsc,data_nascimento,sexo,...,pais_emigracao,subsidio,rsi,ucpp,tempo_pratica,segmento,cpp_pretendida,local_trabalho,a_tempo,natureza_emprego
0,6783103,21,2019-05-23,8,,,2,False,1996-07-27,M,...,,24.0,False,93290.0,24.0,RB,93290.0,,C,I
1,672809,94,2019-05-23,7,,,2,True,1972-02-15,F,...,,8.0,True,96213.0,300.0,RE,96213.0,C,C,P
2,6477453,43,2019-05-23,12,,,2,True,1993-07-18,F,...,,,False,,0.0,RB,26340.0,,C,P
3,2616480,11,2019-05-23,12,,,2,True,1959-06-25,M,...,,26.0,True,52230.0,60.0,RM,52230.0,C,C,P
4,6474193,21,2019-05-23,8,,,2,True,1994-03-04,M,...,,24.0,False,93290.0,30.0,RB,93290.0,,C,I


In [11]:
# Inspect the per-column dtypes of the cleaned frame.
cleaned_df.dtypes

ute_id                            int32
tipo_movimento                     int8
data_movimento           datetime64[ns]
motivo_inscricao                   int8
motivo_anulacao                 float64
motivo_invalidacao              float64
categoria                          int8
rinsc                              bool
data_nascimento          datetime64[ns]
sexo                           category
deficiencia                        int8
nr_pessoas_cargo                float64
formacao_profissional              bool
carteira_profissional              bool
habilitacao                    category
areas_curso                      object
estado_civil                   category
origem_registo_utente          category
nacionalidade                  category
freguesia                        object
centro                            int16
centroa                           int16
pais_emigracao                 category
subsidio                        float64
rsi                                bool


In [12]:
# Summarise row count, per-column non-null counts and dtypes of the cleaned frame.
cleaned_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9966 entries, 0 to 9999
Data columns (total 32 columns):
ute_id                   9966 non-null int32
tipo_movimento           9966 non-null int8
data_movimento           9966 non-null datetime64[ns]
motivo_inscricao         9966 non-null int8
motivo_anulacao          2224 non-null float64
motivo_invalidacao       27 non-null float64
categoria                9966 non-null int8
rinsc                    9966 non-null bool
data_nascimento          9966 non-null datetime64[ns]
sexo                     9966 non-null category
deficiencia              9966 non-null int8
nr_pessoas_cargo         7042 non-null float64
formacao_profissional    9966 non-null bool
carteira_profissional    9966 non-null bool
habilitacao              9966 non-null category
areas_curso              1312 non-null object
estado_civil             9966 non-null category
origem_registo_utente    9966 non-null category
nacionalidade            9966 non-null category
fregues

In [13]:
#preview_values(cleaned_df)