In [1]:
import pandas as pd
import os
import detect_simpsons_paradox as dsp
import numpy as np

We'll first load in some data that has both regression and rate type trends. Since this file has a weird index, we'll load it in as a DataFrame first.

In [2]:
# Read the per-state hit/search rate table, using the file's unnamed first
# column as the row index, and preview the first rows.
hit_rate_csv = '../data/state_hit_rate_min_cols_COCTFLILMDMAMOMTNENCOHRISCTXVTWAWI.csv'
hit_search_rate = pd.read_csv(hit_rate_csv, index_col='Unnamed: 0')
hit_search_rate.head()

Unnamed: 0,state,year,driver_gender,driver_race,decriminalization,medical,recreational,no_reforms,search_conducted_false,search_conducted_true,search_conducted_rate,contraband_found_false,contraband_found_true,contraband_found_rate,hit_false,hit_true,hit_rate,num_stops
0,AZ,2009,F,White,0,0,0,1,30.0,1.0,0.032258,31.0,,,1.0,,,31.0
1,AZ,2009,M,Black,0,0,0,1,1.0,2.0,0.666667,2.0,1.0,0.333333,1.0,1.0,0.5,3.0
2,AZ,2009,M,Hispanic,0,0,0,1,14.0,1.0,0.066667,15.0,,,1.0,,,15.0
3,AZ,2009,M,White,0,0,0,1,32.0,4.0,0.111111,34.0,2.0,0.055556,2.0,2.0,0.5,36.0
4,AZ,2010,F,Asian,0,1,0,0,58.0,3.0,0.04918,61.0,,,3.0,,,61.0


We can now use the labeledDataFrame with that DataFrame to create the object

In [3]:
# Wrap the plain DataFrame loaded above in a labeledDataFrame.
labeled_df_setup = dsp.labeledDataFrame(hit_search_rate)

Next, we can infer the variable types

In [4]:
# Infer a variable type for every column, then preview the per-variable
# metadata table (dtype and var_type are filled in; role, isCount and
# weighting_var are still empty at this point).
labeled_df_setup.infer_var_types()
labeled_df_setup.meta_df.head()

Unnamed: 0_level_0,dtype,var_type,role,isCount,weighting_var
variable,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
state,object,categorical,,,
year,int64,ordinal,,,
driver_gender,object,binary,,,
driver_race,object,categorical,,,
decriminalization,int64,binary,,,


For this, we'll manually set these, but in the visualization tool you can also set them with drop-down menus.

In [5]:
# Every column of the stop data, in DataFrame column order.
all_vars = ['state', 'year', 'driver_gender', 'driver_race',
            'decriminalization', 'medical', 'recreational', 'no_reforms',
            'search_conducted_false', 'search_conducted_true', 'search_conducted_rate',
            'contraband_found_false', 'contraband_found_true', 'contraband_found_rate',
            'hit_false', 'hit_true', 'hit_rate', 'num_stops']

# Descriptive columns act as both explanatory and groupby variables;
# every other column (including year) is a trend variable.
groupby_vars = ('state', 'driver_gender', 'driver_race',
                'decriminalization', 'medical', 'recreational', 'no_reforms')
roles = {var: ['explanatory', 'groupby'] if var in groupby_vars else 'trend'
         for var in all_vars}

# Two equivalent ways to say which columns hold counts:
# a plain list of the count columns ...
count_list = ['search_conducted_false', 'search_conducted_true',
              'contraband_found_false', 'contraband_found_true',
              'hit_false', 'hit_true', 'num_stops']
# ... or a True/False flag for every column.
is_count = {var: var in count_list for var in all_vars}

# Rate column -> the count column used as its weighting variable.
weighting = {'hit_rate': 'search_conducted_true',
             'search_conducted_rate': 'num_stops',
             'contraband_found_rate': 'num_stops'}

We'll set those next.  Above gives examples of two ways that we can specify the count values to pass them to the set_counts function, but we'll only call it once below. 

In [6]:
# Apply the manual labels defined in the previous cell to the metadata table.
# Counts are passed in list form (count_list); the is_count dict is the
# alternative form and is not used here.
labeled_df_setup.set_counts(count_list)
# NOTE: the SettingWithCopyWarning shown in the output is reported for a line
# inside the library (`self.meta_df['role'][k] = v`), not for code in this cell.
labeled_df_setup.set_roles(roles)
labeled_df_setup.set_weighting_vars(weighting)
# Display the metadata to confirm the role and isCount columns are now filled in.
labeled_df_setup.meta_df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.meta_df['role'][k] = v


Unnamed: 0_level_0,dtype,var_type,role,isCount,weighting_var
variable,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
state,object,categorical,"[explanatory, groupby]",False,
year,int64,ordinal,trend,False,
driver_gender,object,binary,"[explanatory, groupby]",False,
driver_race,object,categorical,"[explanatory, groupby]",False,
decriminalization,int64,binary,"[explanatory, groupby]",False,
medical,int64,binary,"[explanatory, groupby]",False,
recreational,int64,binary,"[explanatory, groupby]",False,
no_reforms,int64,binary,"[explanatory, groupby]",False,
search_conducted_false,float64,continuous,trend,True,
search_conducted_true,float64,continuous,trend,True,


In [7]:
# The metadata table has one row per variable; check the name of its index.
labeled_df_setup.meta_df.index.name

'variable'

Now that we've set this up, we can also save these configurations so they can be loaded directly in the future.

In [8]:
# Save the labeled data to a directory of .csv files (the directory listing
# in the next cell shows df.csv, meta.csv and result_df.csv).
labeled_df_setup.to_csvs('../data/ldf_state_hit_rate_min_cols_COCTFLILMDMAMOMTNENCOHRISCTXVTWAWI')

We can see what this does, using a bash magic

In [9]:
%%bash
# List the files written by to_csvs. The directory is passed to ls directly:
# with a separate `cd`, a missing directory would make `cd` fail and `ls`
# would then silently list the notebook's own directory instead.
ls ../data/ldf_state_hit_rate_min_cols_COCTFLILMDMAMOMTNENCOHRISCTXVTWAWI

df.csv
meta.csv
result_df.csv


It writes each of the three DataFrames out to its own .csv file in that directory. If that directory exists, it will overwrite without warning; if not, it also creates the directory.

Now, we can also load the data back

In [10]:
# Passing the saved directory path (instead of a DataFrame) loads the labeled
# data back in. In the displayed metadata, single roles come back wrapped in a
# list (e.g. [trend]) rather than as the bare string that was set before saving.
labeled_df = dsp.labeledDataFrame('../data/ldf_state_hit_rate_min_cols_COCTFLILMDMAMOMTNENCOHRISCTXVTWAWI')
labeled_df.meta_df

Unnamed: 0_level_0,dtype,var_type,role,isCount,weighting_var
variable,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
state,object,categorical,"[explanatory, groupby]",False,
year,int64,ordinal,[trend],False,
driver_gender,object,binary,"[explanatory, groupby]",False,
driver_race,object,categorical,"[explanatory, groupby]",False,
decriminalization,int64,binary,"[explanatory, groupby]",False,
medical,int64,binary,"[explanatory, groupby]",False,
recreational,int64,binary,"[explanatory, groupby]",False,
no_reforms,int64,binary,"[explanatory, groupby]",False,
search_conducted_false,float64,continuous,[trend],True,
search_conducted_true,float64,continuous,[trend],True,


In [11]:
# Add the `*_dpgmm` columns to labeled_df.df; as the output shows, each is
# named after a pair of variables and holds an integer label per row.
# NOTE(review): presumably these labels come from a randomly initialized
# clustering, so they may differ between runs — confirm, and seed if so.
labeled_df.add_all_dpgmm()
labeled_df.df.head()

Unnamed: 0,state,year,driver_gender,driver_race,decriminalization,medical,recreational,no_reforms,search_conducted_false,search_conducted_true,...,search_conducted_false_search_conducted_true_dpgmm,search_conducted_false_search_conducted_rate_dpgmm,search_conducted_false_contraband_found_false_dpgmm,search_conducted_false_num_stops_dpgmm,search_conducted_true_search_conducted_rate_dpgmm,search_conducted_true_contraband_found_false_dpgmm,search_conducted_true_num_stops_dpgmm,search_conducted_rate_contraband_found_false_dpgmm,search_conducted_rate_num_stops_dpgmm,contraband_found_false_num_stops_dpgmm
0,AZ,2009,F,White,0,0,0,1,30.0,1.0,...,9,0,10,1,3,7,2,5,1,3
1,AZ,2009,M,Black,0,0,0,1,1.0,2.0,...,9,5,10,1,2,7,2,0,5,3
2,AZ,2009,M,Hispanic,0,0,0,1,14.0,1.0,...,9,6,10,1,3,7,2,5,1,3
3,AZ,2009,M,White,0,0,0,1,32.0,4.0,...,9,6,10,1,0,7,2,7,5,3
4,AZ,2010,F,Asian,0,1,0,0,58.0,3.0,...,9,6,10,1,3,7,2,5,1,3


In [12]:
# Look at the metadata table again now that the `*_dpgmm` columns have been
# added to the data.
labeled_df.meta_df

Unnamed: 0,dtype,var_type,role,isCount,weighting_var
state,object,categorical,"[explanatory, groupby]",False,
year,int64,ordinal,[trend],False,
driver_gender,object,binary,"[explanatory, groupby]",False,
driver_race,object,categorical,"[explanatory, groupby]",False,
decriminalization,int64,binary,"[explanatory, groupby]",False,
medical,int64,binary,"[explanatory, groupby]",False,
recreational,int64,binary,"[explanatory, groupby]",False,
no_reforms,int64,binary,"[explanatory, groupby]",False,
search_conducted_false,float64,continuous,[trend],True,
search_conducted_true,float64,continuous,[trend],True,


# Using Trends

Trend objects define their name, how to compute the trend, and how to choose which variables to compute it on.

A future extension will allow variable lists to be passed in to reduce which trends are computed.

In [13]:
# Create a Pearson-correlation trend object and let it pick its variables from
# the labeled data. The resulting regression_vars list matches the variables
# that were given the 'trend' role.
corrobj = dsp.all_pearson()
corrobj.get_trend_vars(labeled_df)
corrobj.regression_vars

['year',
 'search_conducted_false',
 'search_conducted_true',
 'search_conducted_rate',
 'contraband_found_false',
 'contraband_found_true',
 'contraband_found_rate',
 'hit_false',
 'hit_true',
 'hit_rate',
 'num_stops']

In [14]:
# Create one trend object per remaining trend type: mean rank and linear regression.
rankobj, linreg_obj = dsp.mean_rank_trend(), dsp.linear_trend()

# Computing Trends on a labeledDataFrame

There are two ways: we can use the default settings by passing the names of the trend types, or we can pass trend objects.

In [15]:
# String form: name the trend type and let the library use its default
# settings. The computed trends are stored in labeled_df.result_df, one row
# per (feat1, feat2, group_feat, subgroup) with the subgroup and aggregate trend.
labeled_df.get_subgroup_trends_1lev(['pearson_corr'])
labeled_df.result_df.head()

Unnamed: 0,feat1,feat2,subgroup_trend,group_feat,subgroup,trend_type,agg_trend
0,year,search_conducted_false,0.299408,state,AZ,pearson_corr,-0.043351
1,year,search_conducted_false,-0.165978,state,CO,pearson_corr,-0.043351
2,year,search_conducted_false,-0.052226,state,CT,pearson_corr,-0.043351
3,year,search_conducted_false,-0.084114,state,IL,pearson_corr,-0.043351
4,year,search_conducted_false,0.118854,state,MA,pearson_corr,-0.043351


Now we can use a list of objects and apply multiple trends

In [16]:
# Object form: pass trend objects directly, several at once.
# The RuntimeWarnings in the output are emitted during the regression
# computations (e.g. `slope = r_num / ssxm`) — presumably for subgroups where
# the x variable has no variance; TODO confirm.
# NOTE(review): the trend_type check further down shows only 'lin_reg' rows in
# result_df after this call — the earlier pearson_corr rows are gone and the
# mean rank trend contributes none; confirm this is intended.
labeled_df.get_subgroup_trends_1lev([rankobj,linreg_obj])
# Fix the seed so the ten sampled rows are the same on every run.
labeled_df.result_df.sample(10, random_state=0)

  slope = r_num / ssxm
  t = r * np.sqrt(df / ((1.0 - r + TINY)*(1.0 + r + TINY)))
  sterrest = np.sqrt((1 - r**2) * ssym / ssxm / df)


Unnamed: 0,feat1,feat2,group_feat,subgroup,subgroup_trend,subgroup_trendquality,trend_type,agg_trend,agg_trendquality
1042,search_conducted_false,num_stops,search_conducted_false_search_conducted_rate_d...,3,1.007178,0.999908,lin_reg,1.022399,0.999923
588,search_conducted_false,contraband_found_rate,search_conducted_true_contraband_found_false_d...,9,2.966359e-09,1.0,lin_reg,,
38,search_conducted_false,search_conducted_true,search_conducted_false_search_conducted_true_d...,10,0.0150774,0.936543,lin_reg,0.022399,0.870203
23,search_conducted_false,search_conducted_true,medical,1,0.02724943,0.868391,lin_reg,0.022399,0.870203
3742,contraband_found_false,num_stops,contraband_found_false_num_stops_dpgmm,1,1.001569,0.999973,lin_reg,1.006191,0.999995
1723,search_conducted_true,hit_false,search_conducted_true_num_stops_dpgmm,7,0.7879263,0.916541,lin_reg,,
5611,hit_rate,num_stops,search_conducted_rate_num_stops_dpgmm,1,,,lin_reg,,
4808,contraband_found_rate,num_stops,search_conducted_false_num_stops_dpgmm,0,-6785825.0,-0.593038,lin_reg,,
2172,search_conducted_rate,contraband_found_false,search_conducted_false_contraband_found_false_...,1,-6171945.0,-0.69412,lin_reg,-258844.668084,-0.066422
1276,search_conducted_true,contraband_found_false,no_reforms,0,35.23942,0.880698,lin_reg,34.547343,0.87509


These two methods work the same way: the string-based version allows simple access to the default settings, while passing a trend object allows overriding defaults and creating more custom subsets of trends.

We can see what types of trends were computed from `result_df`

In [17]:
# Distinct trend types currently present in the results table.
labeled_df.result_df['trend_type'].unique()

array(['lin_reg'], dtype=object)

The object also stores the trend objects that have been applied; they can be used for mapping to get the distance functions that are appropriate for each trend.

In [18]:
# Trend objects from the most recent get_subgroup_trends_1lev call
# (the mean rank and linear trend objects passed above).
labeled_df.trend_list

[<detect_simpsons_paradox.trends.mean_rank_trend at 0x7f4ed6621358>,
 <detect_simpsons_paradox.trends.linear_trend at 0x7f4ed6621a90>]

In [19]:
# Add a 'distance' column to result_df (visible in the next cell's output).
labeled_df.add_distance()

In [20]:
# Preview only the first rows: result_df holds thousands of rows (row labels
# above 5000 appear in the sample shown earlier), so displaying the whole
# frame bloats the notebook without adding information.
labeled_df.result_df.head(10)

Unnamed: 0,feat1,feat2,group_feat,subgroup,subgroup_trend,subgroup_trendquality,trend_type,agg_trend,agg_trendquality,distance
0,search_conducted_false,search_conducted_true,state,AZ,3.961149e-02,0.886950,lin_reg,0.022399,0.870203,0.985229
1,search_conducted_false,search_conducted_true,state,CO,3.647263e-03,0.839356,lin_reg,0.022399,0.870203,1.074185
2,search_conducted_false,search_conducted_true,state,CT,1.366987e-02,0.898090,lin_reg,0.022399,0.870203,0.499980
3,search_conducted_false,search_conducted_true,state,IL,2.865804e-02,0.817280,lin_reg,0.022399,0.870203,0.358378
4,search_conducted_false,search_conducted_true,state,MA,1.490722e-02,0.854907,lin_reg,0.022399,0.870203,0.429099
5,search_conducted_false,search_conducted_true,state,MD,1.509907e-02,0.717101,lin_reg,0.022399,0.870203,0.418109
6,search_conducted_false,search_conducted_true,state,NC,6.422877e-03,0.827750,lin_reg,0.022399,0.870203,0.915158
7,search_conducted_false,search_conducted_true,state,RI,3.297309e-02,0.889068,lin_reg,0.022399,0.870203,0.605378
8,search_conducted_false,search_conducted_true,state,SC,2.363005e-02,0.866358,lin_reg,0.022399,0.870203,0.070493
9,search_conducted_false,search_conducted_true,state,TX,2.323853e-02,0.888991,lin_reg,0.022399,0.870203,0.048074


Each trend object has a `trend_precompute` dictionary as a property that stores the intermediate values (tables of the weighted rates for ranks and correlation matrices for Pearson correlation; TODO: what do we need for linreg). These can be used in visualization.

In [21]:
# trend_list[0] is the mean rank trend object (see the trend_list output
# above); its precompute dictionary is empty in this run.
labeled_df.trend_list[0].trend_precompute

{}