In [None]:
import yaml
import pandas as pd
from sqlalchemy import *
import numpy as np
import datetime
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
sns.set()

%matplotlib inline
%load_ext autoreload

In [None]:
# Load the database credentials. The context manager closes the file handle
# as soon as parsing is done instead of leaving it open for the kernel's lifetime.
with open("../conf/local/credentials.yml") as cred_file:
    pg_cred = yaml.load(cred_file, Loader=yaml.FullLoader)


In [None]:
# Load the exit mappings. The context manager closes the file handle
# as soon as parsing is done instead of leaving it open for the kernel's lifetime.
with open("../conf/base/exit_mappings.yml") as mapping_file:
    exit_map = yaml.load(mapping_file, Loader=yaml.FullLoader)

In [None]:
# Assemble the PostgreSQL URL from the loaded credentials
# (port 5432 and database "iefp" are fixed here).
url = 'postgresql://{}:{}@{}:{}/{}'.format(
    pg_cred["pg_user"],
    pg_cred["pg_pass"],
    pg_cred["pg_host"],
    5432,
    "iefp",
)

# create_engine() hands back the object used as the connection in later cells.
con = create_engine(url, client_encoding='utf8')

# Bind that connection to a MetaData instance, reflecting the schema on creation.
meta = MetaData(bind=con, reflect=True)

In [None]:
%%time
target_table = Table('pedidos', meta, autoload=True, autoload_with=con)

# stmt = sqlalchemy.select(col_lst)

stmt = select([target_table.c.ano_mes, target_table.c.motivo_anulacao, func.count(target_table.c.motivo_anulacao)])

stmt = stmt.where(target_table.columns.tipo_movimento == 31)

stmt = stmt.group_by(target_table.columns.ano_mes, target_table.columns.motivo_anulacao)

stmt = stmt.order_by(target_table.columns.ano_mes, desc(func.count(target_table.c.motivo_anulacao)))

results = con.execute(stmt).fetchall()

df = pd.read_sql(stmt, con)
df.info()

In [None]:
months = df['ano_mes'].unique()

In [None]:
df.head(10)

# Exit reasons by Category

In [None]:
exit_col_names = ["MONTH", "EMIGRATION", "EMPLOY_PROG", "FAILURE", "INACTIVITY", "LAB_MARK_INT", "OTHER", "REFUSALS", "VOCA_TRAIN_INT"]

# Keep the cell runnable top-to-bottom: 'exit_cat' is otherwise only added to
# df by a cell further down the notebook.
if 'exit_cat' not in df.columns:
    df['exit_cat'] = df['motivo_anulacao'].apply(lambda code: exit_map[code])

# Sum the counts per (month, category) and spread the categories into columns.
# unstack() gives every month the same, sorted set of category columns and
# fills missing month/category combinations with 0. Building rows from
# per-month lists shifted values into the wrong column whenever a month
# lacked one of the categories.
exit_cat_df = (
    df.groupby(['ano_mes', 'exit_cat'])['count_1']
    .sum()
    .unstack(fill_value=0)
)

# exit_col_names[1:] is applied positionally, so it has to list one name per
# category in the sorted order of the exit_cat values.
if len(exit_cat_df.columns) != len(exit_col_names) - 1:
    raise ValueError(
        "expected {} exit categories, found {}: {}".format(
            len(exit_col_names) - 1, len(exit_cat_df.columns), list(exit_cat_df.columns)
        )
    )
exit_cat_df.columns = exit_col_names[1:]
exit_cat_df = exit_cat_df.reset_index().rename(columns={'ano_mes': exit_col_names[0]})

exit_cat_df.head(10)

In [None]:
# Turn the numeric YYYYMM codes into timestamps, index by month and smooth
# every category with a 3-row rolling mean before plotting.
month_codes = exit_cat_df['MONTH'].astype('int').astype("str")
exit_cat_df['MONTH'] = pd.to_datetime(month_codes, format='%Y%m')
exit_cat_df = exit_cat_df.set_index('MONTH').rolling(3).mean()

exit_cat_df.plot(figsize=(16,8))
plt.xlabel("Month", labelpad=14)
plt.ylabel("Number of people exiting", labelpad=14)
plt.title("IEFP Exit Reasons by category", y=1.02)

# Top 5 exit reasons

In [None]:
def map_exit_cat(code):
    """Return the entry stored under ``code`` in the notebook-level ``exit_map``.

    The lookup is a plain subscript, so a code that is missing from
    ``exit_map`` propagates the lookup error.
    """
    return exit_map[code]

df['exit_cat'] = df['motivo_anulacao'].apply(map_exit_cat)

# Spot-check one month (201003): total count per exit category.
# Columns are picked by name rather than by position, and the intermediate
# `.tolist()` expression was dropped because its value was never used or shown.
df_month = df[df['ano_mes'] == 201003.0]
df_month.groupby('exit_cat')[['count_1']].sum()

In [None]:
col_names = ["month", "self_place", "no_notice", "lack_control", "emigration", "center_trans", "employ_prog"]
# motivo_anulacao codes, listed in the same order as col_names[1:].
cols =  [20.0, 11.0, 12.0, 44.0, 80.0, 62.0]

rows = []
for month in months:
    df_month = df[df['ano_mes'] == month]
    freq = [month]
    for col in cols:
        matches = df_month.loc[df_month['motivo_anulacao'] == col, 'count_1']
        # A code with no row in this month counts as 0; indexing .values[0]
        # on the empty selection raised IndexError.
        freq.append(matches.iloc[0] if len(matches) else 0)
    rows.append(freq)

reason_df = pd.DataFrame(rows, columns=col_names)
reason_df.head(10)

In [None]:
reason_df['month'] = pd.to_datetime(reason_df['month'].astype('int').astype("str"), format='%Y%m')
reason_df = reason_df.set_index("month")

In [None]:
reason_df = reason_df.rolling(6).mean()

In [None]:
reason_df.plot(figsize=(16,8))

In [None]:
# An earlier cell moves 'month' into the index, after which reason_df['month']
# raises KeyError. Read the month values from wherever they currently live.
month_values = reason_df['month'] if 'month' in reason_df.columns else reason_df.index.to_series()
# The first four characters of the string form are the year.
reason_df['year'] = month_values.apply(lambda x: str(x)[0:4])

# Aggregate statistics

In [None]:
target_table = Table('convocados', meta, autoload=True, autoload_with=con)

# Select every column of 'convocados' for the rows with ute_id == 961010.
stmt = select([target_table])

# Column subset kept for reference (currently unused):
# target_table.c.motivo_indisponibilidade, target_table.c.sexo, target_table.c.data_nascimento, target_table.c.chabilitacao_escolar, target_table.c.formacao_profissional, target_table.c.qualificacao, target_table.c.cnacionalidade, target_table.c.cdeficiencia, target_table.c.data_nascimento, target_table.c.chabilitacao_escolar, target_table.c.cfreguesia, target_table.c.area_curso, target_table.c.tipo_ocupacao, target_table.c.transporte_proprio, target_table.c.formacao_profissional, target_table.c.qualificacao, target_table.c.estado_civil
stmt = stmt.where(target_table.columns.ute_id == 961010)

# read_sql executes the statement itself. The former con.execute(stmt).fetchall()
# pulled the full result set a first time and discarded it.
df = pd.read_sql(stmt, con)

In [None]:
for col in df.columns: print(col)

In [None]:
df.head(20)

In [None]:
df['ute_type'].value_counts()

In [None]:
df['cnacionalidade'].value_counts()

In [None]:
def is_portuguese(code):
    """Return 1 when ``code`` equals 'PT' and 0 for anything else."""
    return 1 if code == 'PT' else 0

In [None]:
df['portuguese'] = df['cnacionalidade'].apply(is_portuguese)

In [None]:
df['portuguese'].value_counts()

In [None]:
# value_counts() sorts by frequency, so the bar order depends on the data.
# Pin it to (1 = Portuguese, 0 = other) so the fixed tick labels below always
# sit under the right bar; a flag that never occurs is drawn as 0.
nationality_counts = df['portuguese'].value_counts().reindex([1, 0], fill_value=0)
nationality_counts.plot(kind='bar')
plt.xlabel("Portuguese or not", labelpad=14)
plt.ylabel("Number of people", labelpad=14)
plt.xticks([0, 1], ["Portuguese", "Other"])
plt.title("Nationality", y=1.02)

In [None]:
df['cdeficiencia'].value_counts()

In [None]:
def has_dis_code(code):
    """Return 0 when ``code`` equals 0 and 1 for every other value.

    Values that do not compare equal to 0 (including NaN and None) yield 1.
    """
    return 0 if code == 0 else 1

In [None]:
df['his_disability_code'] = df['cdeficiencia'].apply(has_dis_code)

In [None]:
# value_counts() sorts by frequency, so the bar order depends on the data.
# Pin it to (0 = no code, 1 = code) so the fixed tick labels below always sit
# under the right bar; a flag that never occurs is drawn as 0.
disability_counts = df['his_disability_code'].value_counts().reindex([0, 1], fill_value=0)
disability_counts.plot(kind='bar')
plt.xlabel("Recorded disability", labelpad=14)
plt.ylabel("Number of people", labelpad=14)
plt.xticks([0, 1], ["No disability code", "disability code"])
plt.title("Disability codes", y=1.02)

In [None]:
df['descendentes_a_cargo'].value_counts()

In [None]:
# Histogram of the column's values. Calling .hist() on value_counts() plotted
# the distribution of the frequencies instead.
df['descendentes_a_cargo'].hist(bins=8)

In [None]:
def dependents(count):
    """Tidy a dependants count.

    Values above 8 are capped at 8.0, values strictly between 0 and 1 become 0,
    and everything else is returned unchanged.
    """
    if count > 8.0:
        return 8.0
    if 0 < count < 1:
        return 0
    return count

In [None]:
df['descendentes_a_cargo'].value_counts()

In [None]:
df['num_deps'] = df['descendentes_a_cargo'].apply(dependents)

In [None]:
df['num_deps'].value_counts()

In [None]:
df['num_deps'].value_counts().plot.bar()
plt.xlabel("Num of dependants per person", labelpad=14)
plt.ylabel("Number of people", labelpad=14)
plt.title("Number of dependants", y=1.02)

In [None]:
df.groupby(['sexo'])['num_deps'].value_counts().plot.bar()
plt.xlabel("Num of dependants per person", labelpad=14)
plt.ylabel("Number of people", labelpad=14)
plt.title("Number of dependants", y=1.02)

In [None]:
# Histogram of the column's values. Calling .hist() on value_counts() plotted
# the distribution of the frequencies instead.
df['salario_pretendido'].hist(bins=10)

In [None]:
df['salario_pretendido'].describe()

In [None]:
def salary_outliers(sal):
    """Clamp ``sal`` to the range 0..800.

    Values below 0 become 0, values above 800 become 800, and anything in
    between (as well as NaN, which compares false both ways) is returned as is.
    """
    return min(max(sal, 0), 800)

In [None]:
df['sal_no_out'] = df['salario_pretendido'].apply(salary_outliers)

In [None]:
df['sal_no_out'].describe()

In [None]:
# Histogram of the column's values. Calling .hist() on value_counts() plotted
# the distribution of the frequencies instead.
df['sal_no_out'].hist(bins=10)

In [None]:
df.iloc[0:1,5:6].values[0][0]

In [None]:
df['chabilitacao_escolar'].value_counts()

In [None]:
def edu_level(code):
    """Map an education code to an ordinal level.

    Returns 0-6 for the known codes listed below and 10 for any other value.
    """
    # (codes, level) pairs, checked in order.
    level_table = (
        (('NS', 'SL'), 0),
        (('01', '02', '03', '04', '05', '06'), 1),
        (('07', '08', '09'), 2),
        (('10', '11', '12'), 3),
        (('PS',), 4),
        (('BM', 'LC', 'MT'), 5),
        (('DT',), 6),
    )
    for codes, level in level_table:
        if code in codes:
            return level
    # Unknown or missing codes fall through to the sentinel level.
    return 10

In [None]:
df['edu_lev'] = df['chabilitacao_escolar'].apply(edu_level)

In [None]:
df['edu_lev'].value_counts()

In [None]:
# One unit-wide bin per level 0-6, centred on the integer tick positions.
# bins=7 spread seven bins over the observed min..max, which moves bars off
# their labels, badly so when the fallback level 10 from edu_level is present.
# Level 10 lies outside these bins and is not drawn.
df['edu_lev'].hist(bins=np.arange(-0.5, 7.5))
plt.xlabel("Educational attainment", labelpad=14)
plt.ylabel("Number of people", labelpad=14)
plt.xticks([0, 1, 2, 3, 4, 5, 6], ["1st Cycle Primary", "2nd Cycle Primary", "3rd Cycle Middle", "Secondary", "Post Secondary Training", "Bachelors or Masters", "Doctorate"], rotation='vertical')
plt.title("Education levels", y=1.02)

In [None]:
# One unit-wide bin per level 0-6, centred on the integer tick positions.
# bins=7 spread seven bins over the observed min..max, which moves bars off
# their labels. The fallback level 10 lies outside these bins and is not drawn.
df[df['sexo'] == 'M']['edu_lev'].hist(bins=np.arange(-0.5, 7.5))
plt.xlabel("Educational attainment", labelpad=14)
plt.ylabel("Number of people", labelpad=14)
plt.xticks([0, 1, 2, 3, 4, 5, 6], ["1st Cycle Primary", "2nd Cycle Primary", "3rd Cycle Middle", "Secondary", "Post Secondary Training", "Bachelors or Masters", "Doctorate"], rotation='vertical')
plt.title("Education levels for men", y=1.02)

In [None]:
# One unit-wide bin per level 0-6, centred on the integer tick positions.
# bins=7 spread seven bins over the observed min..max, which moves bars off
# their labels. The fallback level 10 lies outside these bins and is not drawn.
df[df['sexo'] == 'F']['edu_lev'].hist(bins=np.arange(-0.5, 7.5))
plt.xlabel("Educational attainment", labelpad=14)
plt.ylabel("Number of people", labelpad=14)
plt.xticks([0, 1, 2, 3, 4, 5, 6], ["1st Cycle Primary", "2nd Cycle Primary", "3rd Cycle Middle", "Secondary", "Post Secondary Training", "Bachelors or Masters", "Doctorate"], rotation='vertical')
plt.title("Education levels for women", y=1.02)

In [None]:
# One unit-wide bin per level 0-6, centred on the integer tick positions.
# bins=7 spread seven bins over the observed min..max, which moves bars off
# their labels. The fallback level 10 lies outside these bins and is not drawn.
df[df['age'] > 30]['edu_lev'].hist(bins=np.arange(-0.5, 7.5))
plt.xlabel("Educational attainment", labelpad=14)
plt.ylabel("Number of people", labelpad=14)
plt.xticks([0, 1, 2, 3, 4, 5, 6], ["1st Cycle Primary", "2nd Cycle Primary", "3rd Cycle Middle", "Secondary", "Post Secondary Training", "Bachelors or Masters", "Doctorate"], rotation='vertical')
plt.title("Education levels for Over 30s", y=1.02)

In [None]:
# One unit-wide bin per level 0-6, centred on the integer tick positions.
# bins=7 spread seven bins over the observed min..max, which moves bars off
# their labels. The fallback level 10 lies outside these bins and is not drawn.
df[df['age'] < 30]['edu_lev'].hist(bins=np.arange(-0.5, 7.5))
plt.xlabel("Educational attainment", labelpad=14)
plt.ylabel("Number of people", labelpad=14)
plt.xticks([0, 1, 2, 3, 4, 5, 6], ["1st Cycle Primary", "2nd Cycle Primary", "3rd Cycle Middle", "Secondary", "Post Secondary Training", "Bachelors or Masters", "Doctorate"], rotation='vertical')
plt.title("Education levels for Under 30s", y=1.02)

In [None]:
df.head(20)

In [None]:
df['edu_lev'].hist(bins=7)

In [None]:
# Derive each person's age in years from the date-of-birth column.
# NOTE(review): cal_age is defined in a later cell of this notebook, so on a
# fresh kernel that cell has to be executed before this one. Earlier cells that
# filter on df['age'] in turn depend on this assignment having run.
df['age'] =  df['data_nascimento'].apply(cal_age)

In [None]:
df.boxplot(column=['age'])

In [None]:
def real_age(age):
    """Flag whether an age in years is plausible.

    Returns 1 when ``0 <= age <= 115`` and 0 otherwise. A missing age (NaN)
    fails the range check and is flagged 0; the earlier "outside the range"
    test let NaN through as realistic because every comparison with NaN is
    false.
    """
    return 1 if 0 <= age <= 115 else 0
    
def cal_age(dob, today=None):
    """Return the age in years, as a float, for a date of birth.

    Parameters
    ----------
    dob : datetime.datetime, datetime.date or None
        Date of birth. Plain dates are treated as midnight of that day.
    today : datetime.datetime, optional
        Reference moment the age is measured against. Defaults to
        ``datetime.datetime.now()``; pass a fixed value for reproducible results.

    Returns
    -------
    float
        Whole elapsed days divided by 365.25, or NaN when ``dob`` is None.
    """
    if dob is None:
        return float('nan')
    if today is None:
        today = datetime.datetime.now()
    # A plain date cannot be subtracted from a datetime; promote it to midnight.
    if isinstance(dob, datetime.date) and not isinstance(dob, datetime.datetime):
        dob = datetime.datetime.combine(dob, datetime.time())
    # 365.25 days per year accounts for leap years; dividing by 365 overstated
    # every age slightly.
    years = (today - dob).days / 365.25

    return years

In [None]:
df['realistic'] = df['age'].apply(real_age)

In [None]:
df['realistic'].value_counts()

In [None]:
df[df['realistic'] == 1]['age'].hist(bins=10)
plt.xlabel("Age", labelpad=14)
plt.ylabel("Number of people", labelpad=14)
plt.title("Age distribution in database", y=1.02)

In [None]:
# Bar chart of how often each formacao_profissional value occurs.
# NOTE(review): value_counts() orders the bars by descending frequency, while
# the tick labels below are assigned by position. "No"/"Yes" are therefore only
# correct if the value meaning "no" is the more frequent one - confirm against
# the data, or pin the bar order explicitly.
df['formacao_profissional'].value_counts().plot.bar()
plt.xlabel("Have Professional Training", labelpad=14)
plt.ylabel("Number of people", labelpad=14)
plt.xticks([0, 1], ["No", "Yes"])
plt.title("Professional Training", y=1.02)