# TODO
* in association of students with ACM, filter for active students. They are already naturally filtered for students who have received >0 minutes of tutoring. Alternatively, define 'active' as a student who has received >200 mins tutoring, or create some composite score that scales student assessment performance by tutoring time received
* teacher surveys
    * associate ACMs with teachers
* Conditions for success
* more survey items that were interesting from Q1
* incorporate Q2 and Q3 survey

# Next Steps/Cycle
1. decide attributes
    * dimensionality reduction (PCA): represent the dataset with less data, but with less transparency
    * normalize responses within each individual's responses (z-score seems standard here)
    * use all attributes if I can normalize/encode them programmatically, then research feature selection scoring methods
    * use intuition/attributes I know are important
* Decide what scoring method to best select attributes
* Create decision tree
* Test other targets (measures of ACM effectiveness)

# Analysis Questions
* visualize tutoring time against growth
* visualize O&C scores against growth
* visualize growth against test date (whether baseline came from prior year or from fall)
* do SYACMs have greater impact? do they have greater impact when working with the same students?

In [None]:
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
# import pymc3 as pm
import seaborn as sns; sns.set_context('notebook')
os.chdir(r'Z:\ChiPrivate\Chicago Reports and Evaluation\SY18\Eval Management\ACM_analysis')

# Load shaped tables

In [None]:
# All paths below are relative; they resolve against the working directory
# set by os.chdir() in the setup cell.
# Student-level: tutoring time sum by student-program and associated to staff id via sections
tut_time_df = pd.read_csv('time_on_task_2017-12-11.csv')
# Student-level: Assessment performance vs target, no student-staff associations
assmt_df = pd.read_csv('assessment_growth.csv')
# ACM-level: each coaching instance YTD
heatmaps_df = pd.read_csv('OC_clean.csv')
# ACM-level: coaching data aggregated and normed, up to December
heatmaps_df_agg = pd.read_csv('OC_clean_agg.csv')
# ACM-level: Surveys
survey_df = pd.read_csv('ACM_surveys.csv')
# ACM-level: Projected Commute Time
commutes_df = pd.read_csv('commutes_cleaned.csv')

In [None]:
# Merge assessment results onto tutoring time (which carries the student-staff
# association) using a composite "student + program" string key.
# Assessment types are renamed to the tutoring program names so that both
# tables build identical keys.
assmt_df['Assessment Type'] = assmt_df['Assessment Type'].str.replace('NWEA - ELA', 'Tutoring: Literacy')
assmt_df['Assessment Type'] = assmt_df['Assessment Type'].str.replace('NWEA - MATH', 'Tutoring: Math')
assmt_df['Key'] = assmt_df['Student__c'] + assmt_df['Assessment Type']
tut_time_df['Key'] = tut_time_df['Student__c'] + tut_time_df['Program__c_Name']
# Both tables carry Student__c; drop it from the assessment side for the merge
# only. Dropping on a temporary frame (instead of `del assmt_df[...]`) leaves
# assmt_df intact, so this cell can be re-run without a KeyError.
impact_df = assmt_df.drop('Student__c', axis=1).merge(tut_time_df, on='Key')
# Keep rows with ToT_sum >= 90 (presumably minutes of tutoring — confirm).
# .copy() makes impact_df an independent frame so that later column
# assignments do not trigger SettingWithCopyWarning.
impact_df = impact_df.loc[impact_df['ToT_sum'] >= 90].copy()

In [None]:
impact_df.head()

In [None]:
# # experiment with scaling assessment growth by amount of time served by ACM
# impact_df.loc[impact_df['Hit_Target?']==1, 'Score_scaled_tot'] = impact_df['Amount_of_Time__c_YTD']
# impact_df.loc[impact_df['Hit_Target?']==0, 'Score_scaled_tot'] = -1*impact_df['Amount_of_Time__c_YTD']

$$\text{response}_{i,c} = \alpha + \beta \cdot \text{predictor}_{i,c} + \epsilon$$

In [None]:
# Modelling frame: one row per student record with the ACM id (Staff__c),
# mean tutoring time and growth vs. target, ordered by ACM id.
data = (
    impact_df[['Staff__c', 'ToT_mean', 'Growth_v_Target']]
    .sort_values('Staff__c')
    .reset_index(drop=True)
)
# Integer code per ACM (0, 1, 2, ...) in order of first appearance in the
# sorted frame; used to index the per-ACM model parameters.
data.loc[:, 'Staff__c_code'] = data.Staff__c.map(
    {staff: code for code, staff in enumerate(data.Staff__c.unique())}
)
acms_idx = data.Staff__c_code

In [None]:
# Hierarchical linear model of growth vs. tutoring time: every ACM (Staff__c)
# gets its own intercept and slope, each drawn from a shared group-level
# distribution whose mean and spread are themselves estimated.
# NOTE(review): `pm` is only in scope if the commented-out
# `import pymc3 as pm` in the setup cell is restored.
with pm.Model() as hierarchical_model:
    # Hyperpriors: group-level mean and spread for intercepts (alpha) and slopes (beta)
    mu_a = pm.Normal('mu_alpha', mu=0., sd=1)
    sigma_a = pm.HalfCauchy('sigma_alpha', beta=1)
    mu_b = pm.Normal('mu_beta', mu=0., sd=1)
    sigma_b = pm.HalfCauchy('sigma_beta', beta=1)
    
    # Intercept for each ACM, distributed around group mean mu_a
    a = pm.Normal('alpha', mu=mu_a, sd=sigma_a, shape=len(data.Staff__c.unique()))
    # Slope for each ACM, distributed around group mean mu_b
    b = pm.Normal('beta', mu=mu_b, sd=sigma_b, shape=len(data.Staff__c.unique()))
    
    # Model error
    eps = pm.HalfCauchy('eps', beta=1)
    
    # Expected value: acms_idx selects each row's own ACM intercept and slope
    growth_est = a[acms_idx] + b[acms_idx] * data.ToT_mean.values
    
    # Data likelihood
    y_like = pm.Normal('y_like', mu=growth_est, sd=eps, observed=data.Growth_v_Target)

In [None]:
with hierarchical_model:
    hierarchical_trace = pm.sample(njobs=2)

In [None]:
pm.traceplot(hierarchical_trace);

In [None]:
impact_df['ToT_sum'].hist()

In [None]:
impact_df['ToT_count'].hist()

In [None]:
impact_df['ToT_mean'].hist()

In [None]:
impact_df['Growth_v_Target'].describe()

## Does Tutoring Time Relate to Assessment Growth?

In [None]:
sns.lmplot(data=impact_df, x='ToT_mean', y='Growth_v_Target', hue='Staff__c', legend=False, size=6, ci=False)

In [None]:
sns.lmplot(data=impact_df.loc[impact_df['Staff__c']=='a1L1a0000035cbTEAQ'], x='ToT_count', y='Growth_v_Target', legend=False, hue='Staff__c', size=6, ci=False)

In [None]:
# Growth vs. total tutoring time, with ToT_sum grouped into equal-width bins
# and a cubic (order=3) regression through the binned values.
bin_size = 20  # number of equal-width bins (a count, not a width)
tot_min = impact_df['ToT_sum'].min()
bin_range = (impact_df['ToT_sum'].max() - tot_min) / bin_size  # width of one bin
bin_index = pd.cut(impact_df['ToT_sum'], bin_size, labels=False)  # 0-based bin number
# Place every row at the upper edge of its bin, in the same units as ToT_sum.
# The tot_min offset is needed because the bins start at the smallest observed
# value rather than at zero; without it the x axis is shifted left by tot_min.
# .assign() returns a new frame, which avoids chained-assignment warnings when
# impact_df is itself a filtered slice.
impact_df = impact_df.assign(ToT_sum_binned=tot_min + (bin_index + 1) * bin_range)
sns.lmplot(data=impact_df, x='ToT_sum_binned', y='Growth_v_Target', order=3)

In [None]:
sns.lmplot(data=impact_df, x='ToT_sum', y='Hit_Target?', hue='Staff__c', size=10, legend=False, ci=False)

In [None]:
# logistic=True failed
sns.lmplot(data=impact_df, x='ToT_sum', y='Hit_Target?', hue='Staff__c', size=10, legend=False, ci=False)

In [None]:
# # group to ACM level
# group1_df = impact_df.groupby('Staff__c').agg(['sum', 'mean', 'std', 'count']).reset_index()
# group1_df.columns = [' '.join(col).strip() for col in group1_df.columns.values]
# group2_df = impact_df.groupby('Staff__c')['School__c'].first().reset_index()
# impact_df = group1_df.merge(group2_df, on='Staff__c')

In [None]:
# impact_df['Score_scaled_tot'].hist()

### Experimenting with Different Scoring methods to aggregate student performance by ACM
Place greater value on N students who met target, penalize for N students who missed. [Graph](https://academo.org/demos/3d-surface-plotter/?expression=y%5E1.5-x%5E1.3&xRange=0%2C%2B12&yRange=0%2C%2B12&resolution=12)

In [None]:
# Score that rewards students who met target (exponent 1.5) and penalises
# students who missed it (exponent 1.3).
# NOTE(review): the 'Hit_Target? mean' / 'Hit_Target? count' columns are only
# produced by the commented-out "group to ACM level" cell — confirm impact_df
# has been aggregated that way before running this cell.
n_students = impact_df['Hit_Target? count']
n_hit = impact_df['Hit_Target? mean'] * n_students
n_missed = n_students - n_hit
impact_df['N Hit Target'] = n_hit
impact_df['N Not Hit Target'] = n_missed
raw_score = n_hit**1.5 - n_missed**1.3
# z-score normalization
impact_df['Score'] = (raw_score - raw_score.mean()) / raw_score.std()

In [None]:
impact_df['Score'].hist()

In [None]:
sns.lmplot(data=impact_df, x='School__c', y='Amount_of_Time__c_YTD mean', hue='Score', palette="RdBu", fit_reg=False)

In [None]:
sns.lmplot(x="School__c", y="Score", data=impact_df)

In [None]:
plt.xticks(rotation=90)
sns.stripplot(x="School__c", y="Score", data=impact_df)

In [None]:
# Number of tutoring-time rows with Amount_of_Time__c_YTD below 200
int((tut_time_df['Amount_of_Time__c_YTD'] < 200).sum())

In [None]:
# mean_df.reset_index(inplace=True)
# df = mean_df.merge(survey_df, on='Staff__c')

In [None]:
# from sklearn_pandas import DataFrameMapper
# import numpy as np
# import sklearn.preprocessing, sklearn.decomposition, sklearn.linear_model, sklearn.pipeline, sklearn.metrics
# from sklearn.feature_extraction.text import CountVectorizer
# from sklearn.feature_selection import SelectKBest, chi2

# mapper = DataFrameMapper([('NPS', sklearn.preprocessing.LabelBinarizer()),
#                           ('Growth_v_Target mean', None)], df_out=True)

# mapper

# mapper_fs = DataFrameMapper([(['children','salary'], SelectKBest(chi2, k=1))])
# mapper_fs.fit_transform(data[['children','salary']], data['Growth_v_Target mean'])

# from sklearn import tree

# X = [[0, 0], [1, 1]]
# Y = [0, 1]
# clf = tree.DecisionTreeClassifier()
# clf = clf.fit(X, Y)

In [None]:
sns.jointplot(x='Score', y="Amount_of_Time__c_YTD mean", data=impact_df, kind="kde")

In [None]:
sns.jointplot(x="Growth_v_Target mean", y="Amount_of_Time__c_YTD mean", data=impact_df, kind="kde", xlim=(-20, 20), ylim=(0, 1200))

In [None]:
# Histogram of the ("Growth_v_Target", "mean") column of mean_df.
# NOTE(review): mean_df is not defined in the visible notebook (its only other
# mention is in a commented-out cell) — confirm where it is built.
mean_df[("Growth_v_Target", "mean")].hist()

In [None]:
# Histogram of the ("Hit_Target?", "mean") column of mean_df.
# NOTE(review): mean_df is not defined in the visible notebook (its only other
# mention is in a commented-out cell) — confirm where it is built.
mean_df[("Hit_Target?", "mean")].hist()

# Compare O&C to Target

In [None]:
oc_df = heatmaps_df_agg.merge(impact_df[['Staff__c', 'Score', 'Hit_Target? mean']], on='Staff__c')

In [None]:
sns.pairplot(oc_df, x_vars=['Plan Rating', 'ET Rating','ESE Rating','SPM Rating','Learn Rating'], 
             y_vars=['Score'], kind="reg", hue='Coach', size=6)

In [None]:
sns.pairplot(oc_df, x_vars=[col for col in oc_df.columns if '_norm' in col], 
             y_vars=['Score'], kind="reg", hue='Coach', size=6)

In [None]:
sns.pairplot(oc_df, x_vars=[col for col in oc_df.columns if '_norm' in col], 
             y_vars=['Score'], kind="reg", size=6)

# Compare Survey to Commutes

In [None]:
# Attach projected commute data to every survey response. The left join keeps
# survey rows that have no commute record; merge() already returns a new
# frame, so survey_df itself is not modified.
surv_commute_df = survey_df.merge(commutes_df, on='Staff__c', how='left')

# Replace whole columns with df[col] = ... rather than df.loc[:, col] = ...:
# on recent pandas the .loc form writes into the existing object-dtype column
# in place, so the converted values would stay object instead of numeric.
# 'Checked'/'Unchecked' -> 1/0 (any other value becomes NaN)
surv_commute_df['Q2_var51O96'] = surv_commute_df['Q2_var51O96'].map({'Checked': 1, 'Unchecked': 0})
# Non-numeric self-reported commute answers become NaN
surv_commute_df['Q3_var31'] = pd.to_numeric(surv_commute_df['Q3_var31'], errors='coerce')
# Treat values above 900 as invalid and blank them out; mask() upcasts to
# float when needed instead of forcing NaN into an integer column.
surv_commute_df['Commute.Time'] = surv_commute_df['Commute.Time'].mask(surv_commute_df['Commute.Time'] > 900)
surv_commute_df['Q3_var31'] = surv_commute_df['Q3_var31'].mask(surv_commute_df['Q3_var31'] > 900)

In [None]:
pryr_commute = pd.read_excel('FY17 Corps Housing Survey.xlsx')

In [None]:
# Distribution of self-reported one-way commute: prior-year housing survey
# vs. this year's survey (Q3_var31), as the share of respondents per bin.
binwidth = 15
minx = 0
maxx = 150
# range() excludes its stop value, so extend it by one bin width to keep
# maxx as the final bin edge.
bin_edges = range(minx, maxx + binwidth, binwidth)
plt.xticks(bin_edges)
# Descriptive names are used instead of `data` so this cell does not clobber
# the modelling frame of the same name.
prior_year = pryr_commute['How long is your commute to your school (one way)?']
# density= replaces the removed normed= keyword.
prior_year.hist(bins=bin_edges, density=True)
# Only respondents with both a self-reported and a predicted commute
has_both = surv_commute_df['Q3_var31'].notnull() & surv_commute_df['Commute.Time'].notnull()
current_year = surv_commute_df.loc[has_both, 'Q3_var31']
ax = current_year.hist(bins=bin_edges, alpha=.6, density=True)

# With density=True the bar height is share-per-unit-of-x, so the share of
# respondents in a bin is height * bin width; multiply by 100 for percent.
# A formatter is used (rather than set_yticklabels) so labels always match
# the tick positions. binwidth is bound as a default because later cells
# reassign it.
ax.yaxis.set_major_formatter(
    mpl.ticker.FuncFormatter(lambda y, _pos, width=binwidth: '{:3.0f}%'.format(y * width * 100))
)

In [None]:
# Likelihood of Listing Commute as a Challenge (Y axis) vs. Predicted Commute (X axis)
sns.lmplot(x='Commute.Time', y='Q2_var51O96', data=surv_commute_df, logistic=True)

In [None]:
# Likelihood of Listing Commute as a Challenge (Y axis) vs. Self-Reported Commute (X axis)
sns.lmplot(x='Q3_var31', y='Q2_var51O96', data=surv_commute_df, logistic=True)

In [None]:
surv_commute_df.loc[:, 'Actual_vs_Predicted'] = surv_commute_df['Q3_var31'] - surv_commute_df['Commute.Time']

In [None]:
surv_commute_df['Q3_var32'].unique()

In [None]:
# Self-Reported Commute minus Predicted Commute, overlaid for respondents
# whose travel mode (Q3_var32) mentions 'Car' vs. 'Public transit'.
binwidth = 10
plt.xticks(range(-100, 100, 20))
# Only respondents with both a self-reported and a predicted commute
has_both = surv_commute_df['Q3_var31'].notnull() & surv_commute_df['Commute.Time'].notnull()
# na=False: a missing travel mode matches neither group, and the result is a
# clean boolean mask instead of an object column containing NaN.
by_car = surv_commute_df['Q3_var32'].str.contains('Car', na=False)
by_transit = surv_commute_df['Q3_var32'].str.contains('Public transit', na=False)
car_diff = surv_commute_df.loc[has_both & by_car, 'Actual_vs_Predicted']
car_diff.hist(bins=range(-100, 100, binwidth))
transit_diff = surv_commute_df.loc[has_both & by_transit, 'Actual_vs_Predicted']
transit_diff.hist(bins=range(-100, 100, binwidth), alpha=.6)

In [None]:
# Overlaid histograms of predicted commute (Commute.Time) and self-reported
# commute (Q3_var31), restricted to respondents who have both values.
binwidth = 10
plt.xticks(range(0, 200, 20))
both_known = ~(surv_commute_df['Q3_var31'].isnull() | surv_commute_df['Commute.Time'].isnull())
data = surv_commute_df.loc[both_known, 'Commute.Time']
data.hist(bins=range(0, 175, binwidth))
data = surv_commute_df.loc[both_known, 'Q3_var31']
data.hist(alpha=0.6, bins=range(0, 175, binwidth))

# Compare Surveys to Target

In [None]:
survey_df = survey_df.merge(impact_df[['Staff__c', 'Score', 'Hit_Target? mean']], on='Staff__c')

In [None]:
plt.xticks(rotation=90)
sns.swarmplot(x="Educational.Attainment", y="Score", data=survey_df)

In [None]:
# var5	var31	var76	var77	var79	var80	var85	var86	var87	var88
sns.jointplot(x="var88", y="Score", data=survey_df, kind='reg')

In [None]:
sns.swarmplot(x="var88", y="Score", data=survey_df)

In [None]:
# Missing tutoring experience is treated as zero months
survey_df['Tutoring.Experience.Months'] = survey_df['Tutoring.Experience.Months'].fillna(0)

In [None]:
sns.lmplot(x="Tutoring.Experience.Months", y="Score", data=survey_df, order=2)

In [None]:
plt.xticks(rotation=90)
sns.swarmplot(x="var31", y="Score", data=survey_df)

In [None]:
sns.regplot(x="Age", y="Score", data=survey_df)