In [1]:
from sqlalchemy import create_engine, event
import urllib.parse

import pandas as pd
import datetime as dt
import numpy as np
import time
import matplotlib.pylab as plt

# Input & Output

In [2]:
# SQL database
server = 'CSKMA0400\RDB_Data'
db = 'JLDJobPath'
odbc_connection_string = 'DRIVER={SQL Server Native Client 11.0};SERVER='+server+';DATABASE='+db+';Trusted_Connection=yes'

sql_table = 'linkedclaims_casuals_2018m04_v2_flat_20161001_with_income_36Vars__7BGM_full_clusters_jp_summary_with_outcomes_edu_selected_jld'

cluster_var = 'cluster'
cluster_targets = [0,1,2,3,4,5,6]

eligible_var ='JP Eligible'
eligible_target_var = 'duration_days_0'
eligible_target_var_min_val = 11*30

age1_var = 'Over 50'
age1_target_var = 'age'
age1_target_var_min_val = 50
age1_target_var_max_val = 99

jp_category_var = 'jobpath_category_in_aw'

outcome_var = 'detailed_outcome_32m'
outcome_targets = ['On Live Register (excluding casual workers) - JA', 'On Live Register (excluding casual workers) - JB']

path = '//cskma0294/F/Evaluations/JobPath/Python/Analysis/JPOutcomes/'

# Procedure: Data Prep

In [3]:
def read_data_from_sql(sql_table):
    # Connect to SQL
    params = urllib.parse.quote_plus(odbc_connection_string)
    engine = create_engine('mssql+pyodbc:///?odbc_connect=%s' % params)
    conn = engine.connect().connection

    @event.listens_for(engine, 'before_cursor_execute')
    def receive_before_cursor_execute(conn, cursor, statement, params, context, executemany):
        if executemany:
            cursor.fast_executemany = True

    df = pd.read_sql_table(sql_table, engine)
    conn.close()
    return df


data = read_data_from_sql(sql_table)

data [eligible_var] = 0
data.loc[data[eligible_target_var] >= eligible_target_var_min_val, eligible_var] = 1

data [age1_var] = 0
data.loc[(data[age1_target_var] > age1_target_var_min_val) &
         (data[age1_target_var] < age1_target_var_max_val), 
         age1_var] = 1

data_elegible = data.loc[(data[eligible_var] == 1)]

data_elegible_over50 = data.loc[(data[eligible_var] == 1) & data[age1_var] == 1]

# Crosstabs: General Overview

In [None]:
res_outcome = pd.crosstab(index=data[outcome_var],
                          columns=data[cluster_var],
                         margins=True)
res_outcome.to_csv(path+sql_table+'_outcomes_vs_cluster.csv')

res_clusters_elegible = pd.crosstab(index=data[eligible_var],
                                    columns=data[cluster_var],
                                    margins=True)
res_clusters_elegible.to_csv(path+sql_table+'_jp_elegible_vs_cluster.csv')

res_cluster_age1 = pd.crosstab(index=data[age1_var],
                               columns=data[cluster_var],
                               margins=True)
res_cluster_age1.to_csv(path+sql_table+'_'+age1_var+'_vs_cluster.csv')

res_jp_category = pd.crosstab(index=data[jp_category_var],
                              columns=data[cluster_var],
                              margins=True)
res_jp_category.to_csv(path+sql_table+'_'+'jp_category_vs_cluster.csv')

res_jp_category_vs_outcomes = pd.crosstab(index=data[jp_category_var],
                              columns=data[outcome_var],
                              margins=True)
res_jp_category_vs_outcomes.to_csv(path+sql_table+'_'+'jp_category_vs_outcomes.csv')

# Crosstabs: JP Elegible

In [None]:
res_outcome = pd.crosstab(index=data_elegible[outcome_var],
                          columns=data_elegible[cluster_var],
                          margins=True)
res_outcome.to_csv(path+sql_table+'_jp_elegible-outcomes_vs_cluster.csv')

res_cluster_age1 = pd.crosstab(index=data_elegible[age1_var],
                               columns=data_elegible[cluster_var],
                               margins=True)
res_cluster_age1.to_csv(path+sql_table+'_jp_elegible-'+age1_var+'_vs_cluster.csv')

res_jp_category = pd.crosstab(index=data_elegible[jp_category_var],
                              columns=data_elegible[cluster_var],
                              margins=True)
res_jp_category.to_csv(path+sql_table+'_jp_elegible-jp_category_vs_cluster.csv')

res_jp_category_vs_outcomes = pd.crosstab(index=data_elegible[jp_category_var],
                                          columns=data_elegible[outcome_var],
                                          margins=True)
res_jp_category_vs_outcomes.to_csv(path+sql_table+'_jp_elegible-jp_category_vs_outcomes.csv')

# Crosstabs: JP Elegible & Over 50 (age1_var)

In [None]:
res_outcome = pd.crosstab(index=data_elegible_over50[outcome_var],
                          columns=data_elegible_over50[cluster_var],
                          margins=True)
res_outcome.to_csv(path+sql_table+'_jp_elegible_Over 50-outcomes_vs_cluster.csv')

res_jp_category = pd.crosstab(index=data_elegible_over50[jp_category_var],
                              columns=data_elegible_over50[cluster_var],
                              margins=True)
res_jp_category.to_csv(path+sql_table+'_jp_elegible_Over 50-jp_category_vs_cluster.csv')

res_jp_category_vs_outcomes = pd.crosstab(index=data_elegible_over50[jp_category_var],
                                          columns=data_elegible_over50[outcome_var],
                                          margins=True)
res_jp_category_vs_outcomes.to_csv(path+sql_table+'_jp_elegible_Over 50-jp_category_vs_outcomes.csv')

In [4]:
res_jp_category = pd.crosstab(index=data_elegible[jp_category_var],
                              columns=data_elegible[cluster_var],
                              margins=True)
res_jp_category

cluster,0,1,2,3,4,5,6,All
jobpath_category_in_aw,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
,2690,23847,23994,4608,3822,4250,12326,75537
Q1 complete,355,2904,7141,1063,515,187,805,12970
Q2 complete,353,3680,4196,975,445,166,635,10450
Q3 complete,275,2925,3068,897,319,137,437,8058
Q4 complete,212,2508,1988,678,260,118,335,6099
cancelled before start in aw,149,1296,2366,340,164,91,248,4654
cancelled in aw started during aw,95,835,1069,249,121,45,137,2551
completed jp before aw,2,15,68,3,2,0,3,93
no jp before or after aw,88,830,1465,222,98,40,173,2916
started before aw completed during aw,572,4722,16487,1679,918,362,1614,26354


In [5]:
elegible_no_jp = data_elegible.loc[data_elegible['jobpath_category_in_aw']  == 'None'] 
elegible_no_jp[['ppsn','jp_flag_after_aw','jp_flag_before_aw','jp_flag_after_aw']]

Unnamed: 0,ppsn,jp_flag_after_aw,jp_flag_before_aw,jp_flag_after_aw.1
0,6502683I,,,
1,6505789Q,,,
2,5983447A,,,
3,5984988Q,,,
10,6029992K,,,
13,5944087A,,,
17,5939050T,,,
18,5996555B,,,
19,5996618W,,,
20,5956667L,,,
