In [None]:
%pylab inline
import pandas as pd
import seaborn as sns
import os 
import hivanalysis as hiv
from sqlalchemy import create_engine
import hivanalysis.performance_figures

In [None]:
dburl = os.environ['DBURL']
engine = create_engine(dburl)

# Overall Performance 

In [None]:
#grab the best random forest model
best_random_forest = 20602

In [None]:
# Pull one evaluation series per model group for the comparison figures below.
# Every call except the '5.0_pct' one relies on get_model_performance's default
# `parameter`; these frames are later drawn on a 'precision@10%' axis, so the
# default is presumably the 10% cut-off -- TODO confirm in hivanalysis.
df_rf_20602_10pct = hiv.get_model_performance(best_random_forest,engine)
# NOTE(review): the 5% series is loaded but not referenced by any later cell visible here.
df_rf_20602_5pct = hiv.get_model_performance(best_random_forest,engine,parameter='5.0_pct')
# Group 19054 is plotted below under the labels 'prior' / 'baserate' / 'random'.
df_prior = hiv.get_model_performance(19054,engine)
# Group 19513 is plotted below under the labels 'baseline' / 'expert rule'.
df_baseline_slr = hiv.get_model_performance(19513,engine)
# Group 19004 is only referenced from commented-out plotting code below.
df_19004 = hiv.get_model_performance(19004,engine)
# Group 20615 is plotted below as 'dt_20615_10pct'.
df_dt_20615 = hiv.get_model_performance(20615,engine)

In [None]:
# Configure seaborn *before* the figure exists. set_style/set_context only
# update matplotlib rcParams, and axes read several of those settings when they
# are created; setting the context after plt.subplots() left this figure (the
# first one drawn on a fresh kernel) partly styled by whatever context happened
# to be active beforehand.
sns.set_style("whitegrid")
sns.set_context("poster", font_scale=2, rc={"lines.linewidth": 1.25,"lines.markersize":18})
fig, ax = plt.subplots(1,figsize=(48, 24))
ax.set_ylim(0,.4)
ax.set_ylabel('precision@10%')

# One line per model group, all drawn on the same explicit axes.
ax.plot(df_rf_20602_10pct.evaluation_start_time,df_rf_20602_10pct.value,
        color='green', marker='o', linestyle='-',linewidth=6, label='rf_20602_10pct')

ax.plot(df_prior.evaluation_start_time,df_prior.value,
        color='grey', marker='o', linestyle='-',linewidth=6, label='prior')

ax.plot(df_baseline_slr.evaluation_start_time,df_baseline_slr.value,
        color='black', marker='o', linestyle='-',linewidth=6, label='baseline')

ax.plot(df_dt_20615.evaluation_start_time,df_dt_20615.value,
        color='red', marker='o', linestyle='-',linewidth=6, label='dt_20615_10pct')

# Model group 19004 is intentionally left out of this figure.
#ax.plot(df_19004.evaluation_start_time,df_19004.value,
#        color='red', marker='o', linestyle='-',linewidth=6, label='19004_model_group')

# Legend sits in a strip just above the axes (bbox is in axes coordinates).
ax.legend(bbox_to_anchor=(0., 1.005, 1., .102), loc=7,ncol=4, borderaxespad=0.)
sns.despine()


In [None]:
sns.set_style("white")
fig, ax = plt.subplots(1,figsize=(48, 24))
sns.set_context("poster", font_scale=2, rc={"lines.linewidth": 1.25,"lines.markersize":18})
plt.ylim(0,.4)
plt.ylabel('precision@10%')

plt.plot(df_rf_20602_10pct.evaluation_start_time,df_rf_20602_10pct.value, 
         color='blue', marker='o', linestyle='-',linewidth=6, label='model')

plt.plot(df_prior.evaluation_start_time,df_prior.value, 
         color='grey', marker='o', linestyle='-',linewidth=10, label='baserate')

plt.plot(df_baseline_slr.evaluation_start_time,df_baseline_slr.value, 
         color='black', marker='o', linestyle='-',linewidth=6, label='expert rule')

#plt.plot(df_dt_20615.evaluation_start_time,df_dt_20615.value, 
#         color='red', marker='o', linestyle='-',linewidth=6, label='dt_20615_10pct')


#plt.plot(df_19004.evaluation_start_time,df_19004.value, 
#         color='red', marker='o', linestyle='-',linewidth=6, label='19004_model_group')


plt.legend(bbox_to_anchor=(0., 1.005, 1., .102), loc=7,ncol=4, borderaxespad=0.)
sns.despine()


In [None]:
# Configure seaborn *before* the figure exists. The context here (font_scale
# 2.75) differs from the one used by the preceding cells, and axes read several
# rcParams at creation time, so setting it after plt.subplots() made the saved
# figure depend on which cell had been run previously.
sns.set_style("white")
sns.set_context("poster", font_scale=2.75, rc={"lines.linewidth": 2.25,"lines.markersize":20})
fig, ax = plt.subplots(1,figsize=(48, 24))
ax.set_ylim(0,.4)
ax.set_ylabel('Fraction of Appts Correctly Identified')
ax.set_xlabel('year')

# Drawing order also fixes the legend order: model, expert rule, random.
ax.plot(df_rf_20602_10pct.evaluation_start_time,df_rf_20602_10pct.value,
        color='blue', marker='o', linestyle='-',linewidth=6, label='model')

ax.plot(df_baseline_slr.evaluation_start_time,df_baseline_slr.value,
        color='black', marker='o', linestyle='-',linewidth=6, label='expert rule')

ax.plot(df_prior.evaluation_start_time,df_prior.value,
        color='grey', marker='o', linestyle='-',linewidth=10, label='random')

# Legend sits in a strip just above the axes (bbox is in axes coordinates).
ax.legend(bbox_to_anchor=(0., 1.005, 1., .102), loc=7,ncol=4, borderaxespad=0.)
sns.despine()

# Save through the figure handle so the file is always this figure, not
# whichever figure pyplot currently considers active.
fig.savefig('efficiency_ucm.png')

In [None]:
model_group_id = 20602
q=f"""
select
    evaluation_start_time,
    value
from
    test_results.evaluations
where
    model_id in (select distinct model_id from model_metadata.models where model_group_id = {model_group_id})
    and metric = 'recall@'
    and parameter = '10.0_pct'
    and evaluation_start_time < '2016-01-01'::date
"""
df_recall_20602 = pd.read_sql(q,engine)

In [None]:
model_group_id = 19054
q=f"""
select
    evaluation_start_time,
    value
from
    test_results.evaluations
where
    model_id in (select distinct model_id from model_metadata.models where model_group_id = {model_group_id})
    and metric = 'recall@'
    and parameter = '10.0_pct'
    and evaluation_start_time < '2016-01-01'::date
"""
df_recall_19054 = pd.read_sql(q,engine)

In [None]:
model_group_id = 19513
q=f"""
select
    evaluation_start_time,
    value
from
    test_results.evaluations
where
    model_id in (select distinct model_id from model_metadata.models where model_group_id = {model_group_id})
    and metric = 'recall@'
    and parameter = '10.0_pct'
    and evaluation_start_time < '2016-01-01'::date
"""
df_recall_19513 = pd.read_sql(q,engine)

In [None]:
sns.set_style("white")
fig, ax = plt.subplots(1,figsize=(48, 24))
sns.set_context("poster", font_scale=2.75, rc={"lines.linewidth": 2.25,"lines.markersize":20})
plt.ylim(0,.4)
plt.ylabel('recall@10%')

plt.plot(df_recall_20602.evaluation_start_time,df_recall_20602.value, 
         color='red', marker='o', linestyle='-',linewidth=6, label='model')

plt.plot(df_recall_19054.evaluation_start_time,df_recall_19054.value, 
         color='grey', marker='o', linestyle='-',linewidth=10, label='baserate')

plt.plot(df_recall_19513.evaluation_start_time,df_recall_19513.value, 
         color='black', marker='o', linestyle='-',linewidth=6, label='expert rule')


plt.legend(bbox_to_anchor=(0., 1.005, 1., .102), loc=7,ncol=3, borderaxespad=0.)
sns.despine()


In [None]:
# Configure seaborn *before* the figure exists. The preceding recall figure uses
# a larger context (font_scale 2.75); axes read several rcParams at creation
# time, so setting the context after plt.subplots() let that earlier context
# leak into this figure.
sns.set_style("white")
sns.set_context("poster", font_scale=2, rc={"lines.linewidth": 1.25,"lines.markersize":18})
fig, ax = plt.subplots(1,figsize=(48, 24))
ax.set_ylim(0,.4)
ax.set_ylabel('Coverage of At-Risk Appointments')

# Drawing order also fixes the legend order: model, expert rule, random.
ax.plot(df_recall_20602.evaluation_start_time,df_recall_20602.value,
        color='red', marker='o', linestyle='--',linewidth=6, label='model')

ax.plot(df_recall_19513.evaluation_start_time,df_recall_19513.value,
        color='black', marker='o', linestyle='--',linewidth=6, label='expert rule')

ax.plot(df_recall_19054.evaluation_start_time,df_recall_19054.value,
        color='grey', marker='o', linestyle='--',linewidth=10, label='random')

# Legend sits in a strip just above the axes (bbox is in axes coordinates).
ax.legend(bbox_to_anchor=(0., 1.005, 1., .102), loc=7,ncol=3, borderaxespad=0.)
sns.despine()
ax.set_xlabel('year')

In [None]:
sns.set_style("whitegrid")
fig, ax = plt.subplots(1,figsize=(48, 24))
sns.set_context("poster", font_scale=2, rc={"lines.linewidth": 1.25,"lines.markersize":18})
plt.ylim(0,.4)
plt.ylabel('recall@10%')

plt.plot(df_recall_20602.evaluation_start_time,df_recall_20602.value, 
         color='green', marker='o', linestyle='--',linewidth=6, label='rf_20602')

plt.plot(df_recall_19054.evaluation_start_time,df_recall_19054.value, 
         color='grey', marker='o', linestyle='--',linewidth=6, label='prior')

plt.plot(df_recall_19513.evaluation_start_time,df_recall_19513.value, 
         color='black', marker='o', linestyle='--',linewidth=6, label='baseline')


plt.legend(bbox_to_anchor=(0., 1.005, 1., .102), loc=7,ncol=3, borderaxespad=0.)
sns.despine()

In [None]:
df_recall_20602

In [None]:
q='select model_id, train_end_time from model_metadata.models where model_group_id=20602;'
df_model_ids = pd.read_sql(q,engine)

In [None]:
df_model_ids

In [None]:
import hivanalysis.feature_figures as hiv_feature

In [None]:
hiv_feature.plot_top_n_features(83735,topn=20)

In [None]:
top_n_features = hiv_feature.plot_top_n_features(83734, topn=20)

In [None]:
for mdl_id in df_model_ids.model_id:
    print(mdl_id)
    hiv_feature.plot_top_n_features(mdl_id, topn=40)

In [None]:
hivanalysis.performance_figures.plot_scoredist_and_pr_at_k(83735)

In [None]:
df_train = hiv.get_matrix('73e6f09c403e0eea413e6b9049e7c749')
df_train['as_of_date']=pd.to_datetime(df_train['as_of_date'])

In [None]:
df_retention_max = df_train[['entity_id','as_of_date','retention_entity_id_1day_consecutive_retention_max','outcome']]

In [None]:
q='select entity_id, as_of_date, score, percent_rank() over (order by score desc) from test_results.predictions where model_id = 83735;'
df_score = pd.read_sql(q,engine,parse_dates=['as_of_date'])

In [None]:
df_train_score = pd.merge(df_score,df_train,on=['entity_id','as_of_date'])
df_train_score['ground'] = df_train_score.outcome.apply(lambda x:'Drop-Out' if x else 'Retained')

In [None]:
#grab all the retention features
retention_feature_names = [col for col in df_train_score.columns if col.startswith('retention')]

In [None]:
retention_feature_names

In [None]:
df_retention_max = df_train_score[['entity_id','as_of_date','score','percent_rank','outcome','retention_entity_id_1day_consecutive_retention_max']].copy()

In [None]:
df_retention_max.head()

In [None]:
df_retention_max['ground'] = df_retention_max.outcome.apply(lambda x:'Drop-Out' if x else 'Retained')

In [None]:
def bin_retention_days(days_retention):
    """Map a count of consecutive retained days onto a coarse bin label.

    Bins (the label is the bin's upper edge in days):
        0            -> 0
        1            -> 1
        (1, 365]     -> 365
        (365, 730]   -> 730
        (730, 1095]  -> 1095
        > 1095       -> 1460

    The upper edges are inclusive. The earlier version used strict
    inequalities on both sides, so exactly 365, 730 and 1095 days matched no
    branch and silently came back as None.

    Returns None for inputs that fall in no bin: negative values, values
    strictly between 0 and 1, and NaN (every comparison with NaN is False).
    """
    if days_retention == 0:
        return 0
    if days_retention == 1:
        return 1
    if 1 < days_retention <= 365:
        return 365
    if 365 < days_retention <= 730:
        return 730
    if 730 < days_retention <= 1095:
        return 1095
    if days_retention > 1095:
        return 1460
    # Negative, fractional-below-one or NaN input: no bin applies.
    return None

In [None]:
df_retention_max['bin_days']=df_retention_max['retention_entity_id_1day_consecutive_retention_max'].apply(bin_retention_days)

In [None]:
fig, ax = plt.subplots(1,figsize=(48, 24))
sns.set_context("poster", font_scale=2, rc={"lines.linewidth": 1.25,"lines.markersize":18})
df_retention_max['bin_days'].value_counts(normalize=True).plot(kind='bar')

In [None]:
fig, ax = plt.subplots(1,figsize=(48, 24))
sns.set_context("poster", font_scale=2, rc={"lines.linewidth": 1.25,"lines.markersize":18})
df_retention_max.groupby('outcome').retention_entity_id_1day_consecutive_retention_max.hist(alpha=0.4)

# Consecutive Days Retained

In [None]:
fig, ax = plt.subplots(1,2,figsize=(48, 24))
sns.set_context("poster", font_scale=2, rc={"lines.linewidth": 1.25,"lines.markersize":18})
df_retention_max[df_retention_max.outcome==1].retention_entity_id_1day_consecutive_retention_max.hist(color='red',ax=ax[0])
df_retention_max[df_retention_max.outcome==0].retention_entity_id_1day_consecutive_retention_max.hist(color='blue',ax=ax[1])

Both the retained and the unretained groups include patients who had been retained for several years. The unretained patients in this group may simply be seeking care elsewhere, or they may truly have dropped out of care; either way, a history of retention does not necessarily mean a patient will be retained.

In [None]:
mask = (df_retention_max.outcome == 1) & (df_retention_max.retention_entity_id_1day_consecutive_retention_max > 730)


In [None]:
df_errors_highly_retained_dropouts = df_retention_max[mask].copy()

In [None]:
df_errors_highly_retained_dropouts['years_retained']=df_errors_highly_retained_dropouts.retention_entity_id_1day_consecutive_retention_max.apply(lambda x: x/365.)

In [None]:
df_errors_highly_retained_dropouts

17 people that are not retained that are False Positives

In [None]:
sel_cols = ['entity_id','as_of_date','percent_rank','years_retained','ground']
df_errors_highly_retained_dropouts[sel_cols].to_csv('highly_retained_dropouts.csv',index=False)

In [None]:
df_retention_max.outcome.value_counts()

In [None]:
df_train_score[df_train_score.percent_rank < 0.10].outcome.value_counts()

In [None]:
mask = (df_train_score.percent_rank > 0.10) & (df_train_score.retention_entity_id_1day_consecutive_retention_max < 730) & (df_train_score.outcome==1) 
sel_features = ['entity_id',
                'as_of_date',
                'score',
                'percent_rank',
                'outcome',
                'retention_entity_id_1day_consecutive_retention_max',
                'idprevappts_entity_id_6months_days_bn_appts_avg']
df_missing60 = df_train_score[mask][sel_features].sort_values(by='score').copy()

In [None]:
df_missing60.head()

In [None]:
x1=df_missing60.retention_entity_id_1day_consecutive_retention_max
x2=df_missing60.idprevappts_entity_id_6months_days_bn_appts_avg

In [None]:
fig, ax = plt.subplots(1,figsize=(48, 24))
sns.set_context("poster", font_scale=2, rc={"lines.linewidth": 1.25,"lines.markersize":18})
plt.plot(x1,x2,linestyle='',marker='o',markersize=8)
plt.ylabel('6 months days between appointments')
plt.xlabel('consecutive_days_retention')

In [None]:
df_missing60.plot(kind='scatter',
                  x='retention_entity_id_1day_consecutive_retention_max',
                  y='idprevappts_entity_id_6months_days_bn_appts_avg',
                  s=100,
                  figsize=(48,24))

In [None]:
from sklearn.cluster import DBSCAN

In [None]:
dbscan_cluster = DBSCAN(eps=75,min_samples=5)

In [None]:
dbscan_cluster

In [None]:
cols_cluster = ['retention_entity_id_1day_consecutive_retention_max',
                'idprevappts_entity_id_6months_days_bn_appts_avg']
clusters = dbscan_cluster.fit(df_missing60[cols_cluster])

In [None]:
# DBSCAN's labels_ marks noise points as -1 and numbers clusters from 0, so the
# +2 shift yields 1 for noise and 2, 3, ... for real clusters -- i.e. positive
# integers that can be used as keys of the 1-based `colormap` defined below.
df_missing60['cluster'] = pd.Series(clusters.labels_+2, index=df_missing60.index)

In [None]:
len(df_missing60['cluster'].unique())

In [None]:
colormap = {1:'red',2:'blue',3:'green',4:'black',5:'purple',6:'orange',7:'brown',8:'grey'}

In [None]:
colormap[1]

In [None]:
# Translate each shifted cluster id into a colour. `colormap` only has keys 1..8
# (1 is the noise label), so a plain colormap[x] raised KeyError as soon as
# DBSCAN found more than seven clusters. Ids beyond the table now wrap around
# the cluster colours (keys 2..8), leaving the noise colour unique.
df_missing60['cluster_color']=df_missing60['cluster'].apply(
    lambda x: colormap[x] if x in colormap else colormap[2 + (x - 2) % (len(colormap) - 1)])

In [None]:
df_missing60.cluster.unique()

In [None]:
df_missing60[df_missing60.cluster == 1]

In [None]:
df_missing60.plot(kind='scatter',
                  x='retention_entity_id_1day_consecutive_retention_max',
                  y='idprevappts_entity_id_6months_days_bn_appts_avg',
                  s=300,
                  figsize=(48,24),
                  c=df_missing60['cluster_color'])

In [None]:
mask = (df_train_score.percent_rank > 0.10) & (df_train_score.retention_entity_id_1day_consecutive_retention_max < 730) 
sel_features = ['entity_id',
                'as_of_date',
                'score',
                'percent_rank',
                'outcome',
                'retention_entity_id_1day_consecutive_retention_max',
                'idprevappts_entity_id_6months_days_bn_appts_avg']
df_missing_everybody = df_train_score[mask][sel_features].sort_values(by='score').copy()

In [None]:
fig, ax = plt.subplots(1,figsize=(48, 24))
sns.set_context("poster", font_scale=2, rc={"lines.linewidth": 1.25,"lines.markersize":18})
df_missing_everybody[df_missing_everybody.outcome==0].plot(kind='scatter',
                  x='retention_entity_id_1day_consecutive_retention_max',
                  y='idprevappts_entity_id_6months_days_bn_appts_avg',
                  s=300,
                  figsize=(48,24),
                  marker='x',
                  ax=ax)
df_missing_everybody[df_missing_everybody.outcome==1].plot(kind='scatter',
                  x='retention_entity_id_1day_consecutive_retention_max',
                  y='idprevappts_entity_id_6months_days_bn_appts_avg',
                  s=300,
                  figsize=(48,24),
                  marker='o',
                  ax=ax)


In [None]:
# Cluster the low-ranked, short-retention rows on the same two features used
# for the scatter plots.
cols_cluster = ['retention_entity_id_1day_consecutive_retention_max',
                'idprevappts_entity_id_6months_days_bn_appts_avg']
dbscan_cluster = DBSCAN(eps=25,min_samples=5)
clusters = dbscan_cluster.fit(df_missing_everybody[cols_cluster])
# labels_ uses -1 for noise and 0.. for clusters; +2 maps noise -> 1 and
# clusters -> 2, 3, ... so the ids line up with the 1-based colormap keys.
df_missing_everybody['cluster'] = pd.Series(clusters.labels_+2, index=df_missing_everybody.index)
# `colormap` only has keys 1..8, so colormap[x] raised KeyError once DBSCAN
# produced more than seven clusters. Ids beyond the table wrap around the
# cluster colours (keys 2..8), leaving the noise colour (key 1) unique.
df_missing_everybody['cluster_color'] = df_missing_everybody['cluster'].apply(
    lambda x: colormap[x] if x in colormap else colormap[2 + (x - 2) % (len(colormap) - 1)])

In [None]:
df_missing_everybody['cluster_color'].unique()

In [None]:
fig, ax = plt.subplots(1,figsize=(48, 24))
sns.set_context("poster", font_scale=2, rc={"lines.linewidth": 1.25,"lines.markersize":18})
df_missing_everybody[df_missing_everybody.outcome==0].plot(kind='scatter',
                  x='retention_entity_id_1day_consecutive_retention_max',
                  y='idprevappts_entity_id_6months_days_bn_appts_avg',
                  s=300,
                  figsize=(48,24),
                  marker='x',
                  ax=ax,
                  c=df_missing_everybody[df_missing_everybody.outcome==0]['cluster_color'])
df_missing_everybody[df_missing_everybody.outcome==1].plot(kind='scatter',
                  x='retention_entity_id_1day_consecutive_retention_max',
                  y='idprevappts_entity_id_6months_days_bn_appts_avg',
                  s=300,
                  figsize=(48,24),
                  marker='o',
                  ax=ax,
                  c=df_missing_everybody[df_missing_everybody.outcome==1]['cluster_color'])

In [None]:
fig, ax = plt.subplots(1,figsize=(48, 24))
sns.set_context("poster", font_scale=2, rc={"lines.linewidth": 1.25,"lines.markersize":18})
df_missing_everybody[df_missing_everybody.outcome==0].plot(kind='scatter',
                  x='retention_entity_id_1day_consecutive_retention_max',
                  y='idprevappts_entity_id_6months_days_bn_appts_avg',
                  s=300,
                  figsize=(48,24),
                  marker='x',
                  ax=ax,
                  c='DarkBlue')
df_missing60.plot(kind='scatter',
                  x='retention_entity_id_1day_consecutive_retention_max',
                  y='idprevappts_entity_id_6months_days_bn_appts_avg',
                  s=300,
                  figsize=(48,24),
                  ax=ax,
                  c=df_missing60['cluster_color'])

In [None]:
df_missing_everybody[df_missing_everybody.cluster==2].head()

In [None]:
#make a dt over the features
from sklearn import tree
clf = tree.DecisionTreeClassifier(max_depth=2)

In [None]:
sel_features=['retention_entity_id_1day_consecutive_retention_max']
X=df_retention_max[sel_features].values
y=df_retention_max.outcome.values
mdl = clf.fit(X,y)

In [None]:
from IPython.display import Image
def visualize_tree(classifier,feature_cols,proportion=True):
    """Export a tree to Graphviz and rasterise it to a PNG.

    Writes 'tree.dot' and then 'tree.png' into the current working directory,
    overwriting any earlier files with those names.

    Parameters
    ----------
    classifier : object handed to sklearn's export_graphviz as the tree to draw.
    feature_cols : feature names forwarded as `feature_names`.
    proportion : forwarded unchanged to export_graphviz's `proportion` option.

    Returns
    -------
    str
        The literal path 'tree.png', ready to hand to IPython's Image().
    """
    from sklearn.tree import export_graphviz

    dot_path = 'tree.dot'
    png_path = 'tree.png'

    # Step 1: dump the tree structure as a Graphviz .dot file.
    export_graphviz(
        classifier,
        out_file=dot_path,
        feature_names=feature_cols,
        filled=True,
        rounded=True,
        proportion=proportion,
        precision=2,
    )

    # Step 2: load the .dot file back and render it as a PNG.
    import pydotplus
    pydotplus.graph_from_dot_file(dot_path).write_png(png_path)
    return png_path
    

In [None]:
Image(visualize_tree(mdl,sel_features))

In [None]:
sel_features=retention_feature_names.copy()
X=df_train_score[sel_features].values
y=df_train_score.outcome.values
mdl = clf.fit(X,y)
Image(visualize_tree(mdl,sel_features))

In [None]:
mask = df_retention_max.percent_rank < 0.10
df_retention_max[mask].groupby('ground')['retention_entity_id_1day_consecutive_retention_max'].value_counts()

In [None]:
mask = df_retention_max.retention_entity_id_1day_consecutive_retention_max < 2 
df=df_retention_max[mask].groupby('ground')['retention_entity_id_1day_consecutive_retention_max'].value_counts(normalize=True)

In [None]:
fig, ax = plt.subplots(1,2,figsize=(24, 8))
sns.set_context("poster", font_scale=1, rc={"lines.linewidth": 1.25,"lines.markersize":18})
df.unstack(level=0).plot(kind='bar',color=['red','green'],ax=ax[0])
sns.boxplot(x='ground',y='retention_entity_id_1day_consecutive_retention_max',data=df_train_score,ax=ax[1])

Although all the drop-outs in the top 10% have 0 or 1 consecutive days of retention, retained people show the same proportions of 0 and 1.

In [None]:
clf = tree.DecisionTreeClassifier(max_depth=3)
sel_features = ['retention_entity_id_1day_consecutive_retention_max',
               'idprevappts_entity_id_6months_days_bn_appts_avg',
               'retention_entity_id_1day_n_days_last_appt_max']
X=df_train_score[sel_features].values
y=df_train_score.outcome.values
mdl = clf.fit(X,y)
Image(visualize_tree(mdl,sel_features))

In [None]:
fig, ax = plt.subplots(1,figsize=(12, 12))
sns.set_context("poster", font_scale=1, rc={"lines.linewidth": 1.25,"lines.markersize":18})
sns.boxplot(x='ground',y='retention_entity_id_1day_consecutive_retention_max',data=df_train_score)

# N Changes in Retention 

In [None]:
fig, ax = plt.subplots(1,2,figsize=(48, 24))
sns.set_context("poster", font_scale=2, rc={"lines.linewidth": 1.25,"lines.markersize":18})
df_train_score[df_train_score.outcome==1].retention_entity_id_1day_n_changes_retention_max.hist(color='red',ax=ax[0])
df_train_score[df_train_score.outcome==0].retention_entity_id_1day_n_changes_retention_max.hist(color='blue',ax=ax[1])
ax[0].set_xlabel('drop-out n_drop_retention')
ax[1].set_xlabel('retained n_drop_retention')


In [None]:
from scipy.stats import ttest_ind

In [None]:
ttest_ind(df_train_score[df_train_score.outcome==1].retention_entity_id_1day_n_changes_retention_max.values,
         df_train_score[df_train_score.outcome==0].retention_entity_id_1day_n_changes_retention_max.values)

In [None]:
clf = tree.DecisionTreeClassifier(max_depth=1)
sel_features=['retention_entity_id_1day_n_changes_retention_max']
X=df_train_score[sel_features].values
y=df_train_score.outcome.values
mdl = clf.fit(X,y)
Image(visualize_tree(mdl,sel_features))

In [None]:
clf = tree.DecisionTreeClassifier(max_depth=2)
sel_features=['retention_entity_id_1day_n_changes_retention_max']
X=df_train_score[sel_features].values
y=df_train_score.outcome.values
mdl = clf.fit(X,y)
Image(visualize_tree(mdl,sel_features,proportion=False))

In [None]:
clf = tree.DecisionTreeClassifier(max_depth=3)
sel_features=['retention_entity_id_1day_n_changes_retention_max']
X=df_train_score[sel_features].values
y=df_train_score.outcome.values
mdl = clf.fit(X,y)
Image(visualize_tree(mdl,sel_features,proportion=True))

In [None]:
fig, ax = plt.subplots(1,figsize=(12, 12))
sns.set_context("poster", font_scale=2, rc={"lines.linewidth": 1.25,"lines.markersize":18})
df_train_score.groupby('outcome').retention_entity_id_1day_n_changes_retention_max.hist(alpha=0.3)

In [None]:
df_train_score['key'] = df_train_score['entity_id'].apply(lambda x: str(x)) + '_' + df_train_score['as_of_date'].apply(lambda x: str(x).split()[0])

In [None]:
df_train_score['ground'] = df_train_score.outcome.apply(lambda x:'Drop-Out' if x else 'Retained')

In [None]:
fig, ax = plt.subplots(1,figsize=(12, 12))
sns.set_context("poster", font_scale=.5, rc={"lines.linewidth": 1.25,"lines.markersize":18})
pd.crosstab(df_train_score.ground,df_train_score.retention_entity_id_1day_retained_max).plot(kind='barh',ax=ax)

In [None]:
df_train_score.groupby('ground')['retention_entity_id_1day_n_days_last_appt_max'].describe()

In [None]:
# This import used to be fused onto the end of the `plt.ylim(...)` line, which
# made the whole cell a SyntaxError; it now stands on its own line.
from scipy.stats import ttest_ind

# NOTE(review): `df_compare_scores` and `top5` are not defined in any cell
# visible in this notebook -- confirm where they come from, otherwise this cell
# raises NameError on a fresh kernel.

# Select the plotted columns explicitly. The cell previously sliced with the
# global `sel_features`, which (running top to bottom) was last set to a single
# n_changes_retention column, so the x/y columns below would have been missing.
plot_cols = ['retention_entity_id_1day_consecutive_retention_max',
             'idprevappts_entity_id_6months_days_bn_appts_avg']
df_top5 = df_compare_scores[plot_cols+['triage_score','outcome']].sort_values(by='triage_score',ascending=False)[:top5]

fig, ax = plt.subplots(1,figsize=(48, 24))
sns.set_context("poster", font_scale=2, rc={"lines.linewidth": 1.25,"lines.markersize":18})
# Rows with outcome == 1 are drawn as circles, outcome == 0 as crosses.
df_top5[df_top5.outcome==1].plot(kind='scatter',
                  x='retention_entity_id_1day_consecutive_retention_max',
                  y='idprevappts_entity_id_6months_days_bn_appts_avg',
                  s=300,
                  figsize=(48,24),
                  marker='o',
                  ax=ax)
df_top5[df_top5.outcome==0].plot(kind='scatter',
                  x='retention_entity_id_1day_consecutive_retention_max',
                  y='idprevappts_entity_id_6months_days_bn_appts_avg',
                  s=300,
                  figsize=(48,24),
                  marker='x',
                  ax=ax)
plt.ylim(0,4000)

In [None]:
ttest_ind(df_train_score[df_train_score.outcome==1]['retention_entity_id_1day_n_days_last_appt_max'].values,
         df_train_score[df_train_score.outcome==0]['retention_entity_id_1day_n_days_last_appt_max'].values)

In [None]:
fig, ax = plt.subplots(1,figsize=(12, 12))
sns.set_context("poster", font_scale=2, rc={"lines.linewidth": 1.25,"lines.markersize":18})
mask = df_train_score['retention_entity_id_1day_n_days_last_appt_max'] < 730
df_train_score[mask].groupby('outcome').retention_entity_id_1day_n_days_last_appt_max.hist(alpha=0.4)
plt.xlim(0,730)

In [None]:
fig, ax = plt.subplots(1,figsize=(12, 12))
sns.set_context("poster", font_scale=1, rc={"lines.linewidth": 1.25,"lines.markersize":18})
sns.boxplot(x='ground',y='retention_entity_id_1day_n_days_last_appt_max',data=df_train_score,sym='')

# ID Previous Appointments

## Questions
- What is the difference in days between appointments for retained patients and drop-outs over time?
- What is the difference in completed appointments for retained patients and drop-outs over time?

In [None]:
ls_idprevappts_features = [col for col in df_train_score.columns if col.startswith('idprevappts')]

In [None]:
ls_idprevappts_imp = [col for col in ls_idprevappts_features if col.endswith('imp')]

In [None]:
len(ls_idprevappts_imp)

In [None]:
# One horizontal bar chart per idprevappts imputation-flag column, laid out
# row-major on an 8x3 grid. The context is set before the figure is created so
# the axes are built with these rcParams.
sns.set_context("poster", font_scale=1, rc={"lines.linewidth": 1.25,"lines.markersize":18})
fig, ax = plt.subplots(8,3,figsize=(54, 54))
for i,col in enumerate(ls_idprevappts_imp):
    rowi, coli = divmod(i, 3)
    panel = ax[rowi,coli]
    pd.crosstab(df_train_score['ground'],df_train_score[col]).plot(kind='barh',legend=True,ax=panel,label=col)
    # Title the panel that was just drawn. plt.title() acts on pyplot's
    # *current* axes -- the last subplot created by plt.subplots() -- so every
    # iteration used to overwrite the title of the bottom-right panel instead.
    panel.set_title(col)

In [None]:
df_top10=df_train_score[df_train_score.percent_rank < 0.10].copy()

In [None]:
fig, ax = plt.subplots(1,2,figsize=(14, 5))
sns.set_context("poster", font_scale=.5, rc={"lines.linewidth": 1.25,"lines.markersize":18})
sns.boxplot(x='ground',y='idprevappts_entity_id_6months_days_bn_appts_avg',data=df_top10,sym='',ax=ax[0])
sns.boxplot(x='ground',y='idprevappts_entity_id_6months_days_bn_appts_avg',data=df_train_score,ax=ax[1])

In [None]:
df_top10['idprevappts_entity_id_6months_days_bn_appts_avg'].describe()

In [None]:
df_top10['idprevappts_entity_id_6months_days_bn_appts_avg_imp'].describe()

In [None]:
sns.set_context("poster", font_scale=.4, rc={"lines.linewidth": 1.25,"lines.markersize":18})
pd.crosstab(df_top10['ground'],df_top10['idprevappts_entity_id_6months_days_bn_appts_avg_imp']).plot(kind='bar')

In [None]:
#what about the distribution for people that do not have the feature imputed 
fig, ax = plt.subplots(1,2,figsize=(16, 8))
sns.set_context("poster", font_scale=.4, rc={"lines.linewidth": 1.25,"lines.markersize":18})
mask = df_train_score['idprevappts_entity_id_6months_days_bn_appts_avg_imp'] < 1
sns.boxplot(x='ground',y='idprevappts_entity_id_6months_days_bn_appts_avg',data=df_train_score[mask],ax=ax[0])
sns.boxplot(x='ground',y='idprevappts_entity_id_6months_days_bn_appts_avg',data=df_train_score[mask],sym='',ax=ax[1])

In [None]:
mask = (df_train_score['idprevappts_entity_id_6months_days_bn_appts_min'] == df_train_score['idprevappts_entity_id_6months_days_bn_appts_max'])
mask.value_counts(normalize=True)

In [None]:
mask = (df_train_score['idprevappts_entity_id_1year_days_bn_appts_min'] == df_train_score['idprevappts_entity_id_1year_days_bn_appts_max'])
mask.value_counts(normalize=True)

In [None]:
mask = (df_train_score['idprevappts_entity_id_3years_days_bn_appts_min'] == df_train_score['idprevappts_entity_id_3years_days_bn_appts_max'])
mask.value_counts(normalize=True)

In [None]:
mask = (df_train_score['idprevappts_entity_id_all_days_bn_appts_min'] == df_train_score['idprevappts_entity_id_all_days_bn_appts_max'])
mask.value_counts(normalize=True)

For the six-month days-between-appointments feature, about 40% of the values come from rows with only 0 or 1 appointments; this is largely driven by imputation. This is why min, max and avg have similar predictive value.

In [None]:
#what is the distribution of days between appointments over all time. 
order = ['Retained','Drop-Out']
fig, ax = plt.subplots(1,4,figsize=(36, 12))
sns.set_context("poster", font_scale=.75, rc={"lines.linewidth": 1.25,"lines.markersize":18})
mask = df_train_score['idprevappts_entity_id_all_days_bn_appts_avg_imp'] < 1
sns.boxplot(x='ground',y='idprevappts_entity_id_all_days_bn_appts_avg',data=df_train_score[mask],ax=ax[0],sym='',order=order)
mask = df_train_score['idprevappts_entity_id_3years_days_bn_appts_avg_imp'] < 1
sns.boxplot(x='ground',y='idprevappts_entity_id_3years_days_bn_appts_avg',data=df_train_score[mask],ax=ax[1],sym='',order=order)
mask = df_train_score['idprevappts_entity_id_1year_days_bn_appts_avg_imp'] < 1
sns.boxplot(x='ground',y='idprevappts_entity_id_1year_days_bn_appts_avg',data=df_train_score[mask],ax=ax[2],sym='',order=order)
mask = df_train_score['idprevappts_entity_id_6months_days_bn_appts_avg_imp'] < 1
sns.boxplot(x='ground',y='idprevappts_entity_id_6months_days_bn_appts_avg',data=df_train_score[mask],ax=ax[3],sym='',order=order)


In [None]:
mask = df_train_score['idprevappts_entity_id_all_days_bn_appts_avg_imp'] == 0 
df_train_score[mask].groupby('ground')['idprevappts_entity_id_all_days_bn_appts_avg'].describe()

In [None]:
mask1 = (df_train_score['idprevappts_entity_id_all_days_bn_appts_avg_imp'] == 0) & (df_train_score['outcome'] ==1.)
mask0 = (df_train_score['idprevappts_entity_id_all_days_bn_appts_avg_imp'] == 0) & (df_train_score['outcome'] ==0)

In [None]:
ttest_ind(df_train_score[mask1]['idprevappts_entity_id_all_days_bn_appts_avg'].values,
         df_train_score[mask0]['idprevappts_entity_id_all_days_bn_appts_avg'].values)

# Viral Load Features
- Are viral load features just proxies for imputation?

In [None]:
ls_vl_features = [col for col in df_train_score.columns if col.startswith('vl')]

In [None]:
ls_vl_features_imp = [col for col in ls_vl_features if col.endswith('imp')]

In [None]:
# One horizontal bar chart per viral-load imputation-flag column, laid out
# row-major on a 14x3 grid. The context is set before the figure is created so
# the axes are built with these rcParams.
sns.set_context("poster", font_scale=1, rc={"lines.linewidth": 1.25,"lines.markersize":18})
fig, ax = plt.subplots(14,3,figsize=(54, 72))
for i,col in enumerate(ls_vl_features_imp):
    rowi, coli = divmod(i, 3)
    panel = ax[rowi,coli]
    pd.crosstab(df_train_score['ground'],df_train_score[col]).plot(kind='barh',legend=True,ax=panel,label=col)
    # Title the panel that was just drawn. plt.title() acts on pyplot's
    # *current* axes -- the last subplot created by plt.subplots() -- so every
    # iteration used to overwrite the title of the bottom-right panel instead.
    panel.set_title(col)

In [None]:
ls_vl_features

In [None]:
df_top10.groupby('ground')['vl_entity_id_3years_virally_supressed_sum'].describe()

In [None]:
df_train_score.groupby('ground')['vl_entity_id_3years_virally_supressed_sum'].describe()

In [None]:
df_top10['vl_entity_id_3years_virally_supressed_sum_imp']
pd.crosstab(df_top10['ground'],df_top10['vl_entity_id_3years_virally_supressed_sum_imp'])

In [None]:
order = ['Retained','Drop-Out']
fig, ax = plt.subplots(2,4,figsize=(42, 24))
sns.set_context("poster", font_scale=.75, rc={"lines.linewidth": 1.25,"lines.markersize":18})
sns.boxplot(x='ground',y='vl_entity_id_all_virally_supressed_sum',data=df_top10,sym='',order=order,ax=ax[0,0])
sns.boxplot(x='ground',y='vl_entity_id_3years_virally_supressed_sum',data=df_top10,sym='',order=order,ax=ax[0,1])
sns.boxplot(x='ground',y='vl_entity_id_1years_virally_supressed_sum',data=df_top10,sym='',order=order,ax=ax[0,2])
sns.boxplot(x='ground',y='vl_entity_id_6months_virally_supressed_sum',data=df_top10,sym='',order=order,ax=ax[0,3])
sns.boxplot(x='ground',y='vl_entity_id_all_virally_supressed_sum',data=df_train_score,sym='',order=order,ax=ax[1,0])
sns.boxplot(x='ground',y='vl_entity_id_3years_virally_supressed_sum',data=df_train_score,sym='',order=order,ax=ax[1,1])
sns.boxplot(x='ground',y='vl_entity_id_1years_virally_supressed_sum',data=df_train_score,sym='',order=order,ax=ax[1,2])
sns.boxplot(x='ground',y='vl_entity_id_6months_virally_supressed_sum',data=df_train_score,sym='',order=order,ax=ax[1,3])


In [None]:
fig, ax = plt.subplots(1,2,figsize=(16, 8))
sns.set_context("poster", font_scale=1, rc={"lines.linewidth": 1.25,"lines.markersize":18})
sns.boxplot(x='ground',y='vl_entity_id_all_virally_supressed_sum',data=df_top10,sym='',order=order,ax=ax[0])
sns.boxplot(x='ground',y='vl_entity_id_all_virally_supressed_sum',data=df_train_score,sym='',order=order,ax=ax[1])


In [None]:
fig, ax = plt.subplots(1,2,figsize=(16, 8))
sns.set_context("poster", font_scale=1, rc={"lines.linewidth": 1.25,"lines.markersize":18})
sns.boxplot(x='ground',y='vl_entity_id_3years_virally_supressed_sum',data=df_top10,sym='',order=order,ax=ax[0])
sns.boxplot(x='ground',y='vl_entity_id_3years_virally_supressed_sum',data=df_train_score,sym='',order=order,ax=ax[1])


In [None]:
fig, ax = plt.subplots(1,2,figsize=(16, 8))
sns.set_context("poster", font_scale=0.5, rc={"lines.linewidth": 1.25,"lines.markersize":18})
sns.boxplot(x='ground',y='vl_entity_id_all_vl_gt_100k_sum',data=df_top10,sym='',order=order,ax=ax[0])
sns.boxplot(x='ground',y='vl_entity_id_all_vl_gt_100k_sum',data=df_train_score,sym='',order=order,ax=ax[1])


In [None]:
df_top10[['retention_entity_id_1day_consecutive_retention_max','idprevappts_entity_id_6months_days_bn_appts_avg']]

In [None]:
df_top10['retention_entity_id_1day_consecutive_retention_max'] > 1

In [None]:
df_top10['idprevappts_entity_id_6months_days_bn_appts_avg'].value_counts()

In [None]:
df_top10[df_top10.outcome==0]['retention_entity_id_1day_n_days_last_appt_max']

In [None]:
df_train_score.outcome.value_counts()

In [None]:
133*0.33

In [None]:
def grab_model_train(model_hash, train_hash):
    """Load a stored model together with the feature matrix it was trained on.

    Parameters
    ----------
    model_hash : identifier passed to hiv.get_model.
    train_hash : identifier passed to hiv.get_matrix.

    Returns
    -------
    tuple
        (model, df_train), where df_train is the matrix re-indexed by
        ('entity_id', 'as_of_date') with the 'outcome' column removed, so its
        remaining columns are the feature columns.
    """
    # Imported locally: the notebook's `import logging` cell sits *below* this
    # definition, so the function must not depend on that cell having run.
    import logging

    logging.info('model_hash: {}'.format(model_hash))
    logging.info('train_hash: {}'.format(train_hash))
    model = hiv.get_model(model_hash)
    logging.debug(model)
    # Build the frame in one chain instead of mutating it with inplace=True;
    # the unused `feature_cols` local has been dropped.
    df_train = (
        hiv.get_matrix(train_hash)
        .set_index(['entity_id','as_of_date'])
        .drop('outcome',axis=1)
    )
    return model, df_train

In [None]:
import logging

In [None]:
mdl,df_train = grab_model_train('006dedffc978bf7280e588b3a8c9bfda','ad3a3809810f2ded31640ffaca84adc3')

In [None]:
sel_features = df_train.columns 
Image(visualize_tree(mdl,sel_features,proportion=True))