Job-center-level objectives:
- Average number of counsellors per center (and standard deviation)
- Average number of different interventions per center (and standard deviation)
- Top n (n = 5) interventions per center
- Average number of job seekers per center


In [None]:
import os
import yaml
import pandas as pd
import seaborn as sns
from pyathena import connect
from pyathena.util import as_pandas
from sqlalchemy import *

%matplotlib inline
%load_ext autoreload

In [None]:
# Load the PostgreSQL credentials from the local config file.
# The context manager closes the file handle, which a bare open() call leaks.
# NOTE(review): later cells only read plain string keys from this mapping, so
# yaml.safe_load would presumably be sufficient; FullLoader is kept so parsing
# stays unchanged.
with open("../conf/local/credentials.yml") as cred_file:
    pg_cred = yaml.load(cred_file, Loader=yaml.FullLoader)

In [None]:
# Build the PostgreSQL connection URL from the loaded credentials
# (port 5432, database "iefp").
url = 'postgresql://{}:{}@{}:{}/{}'
url = url.format(pg_cred["pg_user"], pg_cred["pg_pass"], pg_cred["pg_host"], 5432, "iefp")

# create_engine() returns an Engine, not an open connection; it is passed to
# pd.read_sql() as the `con` argument in the cells below.
con = create_engine(url, client_encoding='utf8')

# Reflect the existing database schema into MetaData.
# MetaData(bind=..., reflect=True) relies on constructor arguments that newer
# SQLAlchemy releases no longer accept; calling MetaData.reflect(bind=...) is
# the supported way to load the table definitions.
meta = MetaData()
meta.reflect(bind=con)

## Count No. of counsellors per center

In [None]:
# Count the distinct `utilizador` values per `centro` in the `pedidos` table
# (used below as the number of counsellors per centre).
# NOTE(review): the aggregate is not aliased; later cells read it as the
# 'count' column, which is presumably the name PostgreSQL assigns — confirm.
sql = """ 
select centro, count(distinct(utilizador))
from pedidos
group by centro 
;"""

# Run the query through the engine and load the result into a DataFrame.
counsellors_center = pd.read_sql(sql, con)

In [None]:
counsellors_center.head()

In [None]:
counsellors_center['count'].mean()

In [None]:
counsellors_center['count'].min()

In [None]:
counsellors_center['count'].max()

In [None]:
counsellors_center.shape 

In [None]:
# plot distribution of counsellors by center

# order by count

order = counsellors_center.sort_values(by= 'count', ascending=False, inplace=False)
order.head()


In [None]:
g = sns.barplot(order['centro'].astype('category'),order['count'])

## Average number of different interventions per centre (and SD)

In [None]:
# one for where the movement code happened (occurred)
# and one for where the person is registered (recommended)

In [None]:
# CENTRO_MOVIMENTO: count rows of `intervencoes` grouped by `centro_movimento`.
# The WHERE clause keeps rows with tipo_movimento > 30 and excludes the
# intervention codes '0101' and '0102'.
# NOTE(review): count(codigo_intervencao) counts every matching row, not the
# number of distinct intervention codes — confirm this matches the
# "different interventions per centre" objective.
sql = """ 
select centro_movimento, count(codigo_intervencao)

from intervencoes

where ((intervencoes.tipo_movimento > 30) 
and intervencoes.codigo_intervencao != '0101' 
and intervencoes.codigo_intervencao != '0102')

group by centro_movimento
;"""

# Load the per-centre counts into a DataFrame.
interv_center = pd.read_sql(sql, con)

In [None]:
interv_center.sort_values(by= 'count', ascending=False, inplace=True)

In [None]:
interv_center.head()

In [None]:
interv_center.dtypes

In [None]:
interv_center.shape

In [None]:
interv_center.columns = ['centro','mov_count']

In [None]:
interv_center['centro']=interv_center['centro'].astype('category')

In [None]:
# CENTRO: same count as the CENTRO_MOVIMENTO query, but grouped by `centro`
# (per the note above, the centre where the person is registered).
# The WHERE clause keeps rows with tipo_movimento > 30 and excludes the
# intervention codes '0101' and '0102'.
sql = """ 
select centro, count(codigo_intervencao)

from intervencoes

where ((intervencoes.tipo_movimento > 30) 
and intervencoes.codigo_intervencao != '0101' 
and intervencoes.codigo_intervencao != '0102')

group by centro
;"""

# Load the per-centre counts into a DataFrame.
interv_center_r = pd.read_sql(sql, con)

In [None]:
# Sort in place so the centres with the most interventions come first.
interv_center_r.sort_values(by= 'count', ascending=False, inplace=True)

In [None]:
interv_center_r.centro = interv_center_r['centro'].astype('int')

In [None]:
interv_center.centro = interv_center['centro'].astype('int')

In [None]:
interv_center.centro = interv_center.centro.astype('int')

In [None]:
interv_center.drop("center", axis=1, inplace=True)

In [None]:
#merge together
df_merge = interv_center_r.merge(interv_center,on='centro')
df_merge.head()

In [None]:
interv_center["centro"] = interv_center.center.astype(int)

In [None]:
interv_center.dtypes

In [None]:
df_merge = interv_center_r.merge(interv_center, left_on="centro", right_on="centro", how="inner")

In [None]:
df_merge.head()

In [None]:
df_merge['centro'] = df_merge['centro'].astype('category')

In [None]:
df_merge = df_merge.set_index("centro")

In [None]:
df_merge.columns = ['Registered','Movement']

In [None]:
# NOTE: `plt` here is the top-level matplotlib package (not pyplot); later
# cells reach the ticker module through this same alias.
import matplotlib as plt

# Line plot of both intervention counts, indexed by centre.
ax = df_merge.plot(figsize=(12, 8))
ax.set_xlabel("Center Number")
ax.set_ylabel("Number of interventions")
# Render the y-axis ticks as integers with thousands separators.
ax.yaxis.set_major_formatter(
    plt.ticker.FuncFormatter(lambda value, _pos: '{:,}'.format(int(value)))
)

In [None]:
interv_center.reset_index(level=0, inplace=True)


In [None]:
interv_center.head()

In [None]:
interv_center['centro'] = interv_center['centro'].astype('category')

In [None]:
interv_center2 = interv_center.set_index("centro")

In [None]:
interv_center2.columns = ['Movement']

In [None]:
import matplotlib as plt
ax = interv_center2.plot(figsize=(12,8))
ax.set(xlabel="Center Number", ylabel="Number of interventions")
ax.get_yaxis().set_major_formatter(
    plt.ticker.FuncFormatter(lambda x, p: format(int(x), ',')))

In [None]:
# interv_center_r
interv_center_r.head()

In [None]:
interv_center_r['centro'] = interv_center_r['centro'].astype('category')

In [None]:
interv_center_r2 = interv_center_r.set_index("centro")

In [None]:
interv_center_r2.columns = ['Registration']

In [None]:
import matplotlib as plt
ax = interv_center_r2.plot(figsize=(12,8))
ax.set(xlabel="Center Number", ylabel="Number of interventions")
ax.get_yaxis().set_major_formatter(
    plt.ticker.FuncFormatter(lambda x, p: format(int(x), ',')))

In [None]:
# centro but different codes per centre

In [None]:
# Count rows of `intervencoes` per (centro, tipo_movimento) pair.
# Unlike the earlier queries there is no tipo_movimento > 30 filter here; only
# the intervention codes '0101' and '0102' are excluded.
sql = """ 
select centro, tipo_movimento, count(codigo_intervencao)

from intervencoes

where (
intervencoes.codigo_intervencao != '0101' 
and intervencoes.codigo_intervencao != '0102')

group by centro, tipo_movimento
;"""

# Load the result: one row per centre and movement type.
interv_types = pd.read_sql(sql, con)

In [None]:
interv_types.head()

In [None]:
interv_types['centro'] = interv_types['centro'].astype('category')

In [None]:
interv_types['tipo_movimento'] = interv_types['tipo_movimento'].astype('category')

In [None]:
interv_types = interv_types.set_index("centro")

In [None]:
interv_types.head()

In [None]:
interv_types['centro'] = interv_types['centro'].astype('int64')

In [None]:
interv_center.reset_index(level=0, inplace=True)

In [None]:
interv_types.tipo_movimento = interv_types.tipo_movimento.astype('int64')

In [None]:
interv_types2 = interv_types # copy

In [None]:
interv_types.tipo_movimento[interv_types.tipo_movimento == 25] = 'INTERVENTIONS'

In [None]:
interv_types.tipo_movimento[interv_types.tipo_movimento == 26] = 'REFERRALS'

In [None]:
interv_types.tipo_movimento[interv_types.tipo_movimento == 27] = 'REFUSALS'

In [None]:
interv_types.tipo_movimento[interv_types.tipo_movimento == 29] = 'IOP'

In [None]:
interv_types.tipo_movimento[interv_types.tipo_movimento == 35] = 'OUTCOME'

In [None]:
interv_types.tipo_movimento[interv_types.tipo_movimento == 36] = 'RESULT_REFERRALS'

In [None]:
interv_types.tipo_movimento[interv_types.tipo_movimento == 39] = 'RESULT_IOP'

In [None]:
interv_types.tipo_movimento.value_counts()

In [None]:
pivot_df = interv_types.pivot(index='centro', columns='tipo_movimento', values='count')
pivot_df.head()

In [None]:
# Stacked bars: one bar per centre, split by movement type.
# `plt` is the matplotlib alias imported by the earlier plotting cells.
ax = pivot_df.plot(kind="bar", stacked=True, figsize=(16, 8))
ax.set_xlabel("Center Number")
ax.set_ylabel("Number of interventions")
# Render the y-axis ticks as integers with thousands separators.
ax.yaxis.set_major_formatter(
    plt.ticker.FuncFormatter(lambda value, _pos: '{:,}'.format(int(value)))
)

In [None]:
interv_types.head()

In [None]:
# plot referals against interventions
types_comp = interv_types[(interv_types.tipo_movimento == 'REFERRALS') | (interv_types.tipo_movimento =='INTERVENTIONS')]
types_comp.head()

In [None]:
pivot_df2 = types_comp.pivot(index='centro', columns='tipo_movimento', values='count')
pivot_df2.head()

In [None]:
ax = pivot_df2.plot.bar(stacked=True, figsize=(16,8))
ax.set(xlabel="Center Number", ylabel="Number of interventions")
ax.get_yaxis().set_major_formatter(
    plt.ticker.FuncFormatter(lambda x, p: format(int(x), ',')))

In [None]:
#covert to df to do % /diff plot
flattened = pd.DataFrame(pivot_df2.to_records())

In [None]:
flattened.head()

In [None]:
flattened['total'] = flattened['INTERVENTIONS'] + flattened['REFERRALS']

In [None]:
flattened['int_perc'] = (flattened['INTERVENTIONS'] / flattened['total']) *100

In [None]:
flattened['ref_perc'] = (flattened['REFERRALS'] / flattened['total']) *100

In [None]:
import numpy as np
f = flattened.iloc[:,np.r_[0,4,5]]
f.head()

In [None]:
f = f.set_index("centro")

In [None]:
f.sort_values(by= 'int_perc',ascending=False, inplace=True)

In [None]:
f.columns = ['Interventions','Referral']

In [None]:
ax = f.plot.bar(stacked=True, figsize=(16,8))
ax.set(xlabel="Center Number", ylabel="% of intervention type")
ax.get_yaxis().set_major_formatter(
    plt.ticker.FuncFormatter(lambda x, p: format(int(x), ',')))

In [None]:
# Count rows of `intervencoes` grouped by `centro_intervencao`, keeping rows
# with tipo_movimento > 30 and excluding intervention codes '0101' and '0102'.
# NOTE(review): this cell was originally headed "no. of people per center", but
# the query counts codigo_intervencao rows, not people — confirm which one is
# wanted.

sql = """ 
select centro_intervencao, count(codigo_intervencao)

from intervencoes

where ((intervencoes.tipo_movimento > 30) 
and intervencoes.codigo_intervencao != '0101' 
and intervencoes.codigo_intervencao != '0102')

group by centro_intervencao
;"""

interv_center_i = pd.read_sql(sql, con)

In [None]:
# Other centre code columns noted by the author: ccentro_ins, ccentro

In [None]:
# CODIGO_INTERVENCAO
# NOTE(review): this SQL text is identical to the query in the cell two above,
# so interv_center_i is simply loaded a second time.
sql = """ 
select centro_intervencao, count(codigo_intervencao)

from intervencoes

where ((intervencoes.tipo_movimento > 30) 
and intervencoes.codigo_intervencao != '0101' 
and intervencoes.codigo_intervencao != '0102')

group by centro_intervencao
;"""

interv_center_i = pd.read_sql(sql, con)

In [None]:
# Sort in place so the centres with the most interventions come first.
interv_center_i.sort_values(by= 'count', ascending=False, inplace=True)

In [None]:
interv_center_i.head()

In [None]:
# Would a WITH clause make the counsellor count faster?
#
# General shape of a WITH (common table expression) query, kept for reference.
# It is SQL rather than Python, so it has to live in a comment for the cell to
# parse:
#
#     WITH temporaryTable (averageValue) AS
#         (SELECT avg(Attr1)
#         FROM Table)
#     SELECT Attr1
#     FROM Table, temporaryTable
#     WHERE Table.Attr1 > temporaryTable.averageValue;

# Counsellors per centre, computed from a de-duplicated CTE over `pedidos`.
# The draft query could not run: a stray comma followed the CTE, GROUP BY came
# before FROM, it counted a `counsellors` column that the CTE does not select,
# and it ended with a WHERE / GROUP BY / ORDER BY tail that referenced
# `intervencoes` and `ano_mes`, neither of which is part of this query.
# TODO(review): the `interv_time` name and the removed `ano_mes` grouping suggest
# a per-month breakdown was intended; add it back once the source table for it
# is decided.
sql = """
WITH distinct_pedidos AS (
    select distinct ute_id, centro, utilizador
    from pedidos
)

select centro, count(distinct utilizador)
from distinct_pedidos
group by centro
;"""

interv_time = pd.read_sql(sql, con)