In [1]:
import pandas as pd
import os
import wiggum as wg
import numpy as np

We'll first load some data that has both regression-type and rate-type trends. Because this file has an unusual index column, we'll load it as a DataFrame first.

In [2]:
# Read the stop-summary CSV; the file's first (unnamed) column is used as the
# row index rather than being kept as a data column.
hit_search_rate = pd.read_csv(
    '../data/state_hit_rate_min_cols_COCTFLILMDMAMOMTNENCOHRISCTXVTWAWI.csv',
    index_col='Unnamed: 0',
)
# Preview the first rows to check that the columns were parsed as expected.
hit_search_rate.head()

Unnamed: 0,state,year,driver_gender,driver_race,decriminalization,medical,recreational,no_reforms,search_conducted_false,search_conducted_true,search_conducted_rate,contraband_found_false,contraband_found_true,contraband_found_rate,hit_false,hit_true,hit_rate,num_stops
0,AZ,2009,F,White,0,0,0,1,30.0,1.0,0.032258,31.0,,,1.0,,,31.0
1,AZ,2009,M,Black,0,0,0,1,1.0,2.0,0.666667,2.0,1.0,0.333333,1.0,1.0,0.5,3.0
2,AZ,2009,M,Hispanic,0,0,0,1,14.0,1.0,0.066667,15.0,,,1.0,,,15.0
3,AZ,2009,M,White,0,0,0,1,32.0,4.0,0.111111,34.0,2.0,0.055556,2.0,2.0,0.5,36.0
4,AZ,2010,F,Asian,0,1,0,0,58.0,3.0,0.04918,61.0,,,3.0,,,61.0


We can now use the LabeledDataFrame with that DataFrame to create the object

In [3]:
# Wrap the raw DataFrame in a wiggum LabeledDataFrame, which carries a
# per-variable metadata table (meta_df) alongside the data.
labeled_df_setup = wg.LabeledDataFrame(hit_search_rate)

Next, we can infer the variable types

In [4]:
# Fill in the var_type column of the metadata from the data, then preview it.
# The role, isCount and weighting_var columns are still empty at this point.
labeled_df_setup.infer_var_types()
labeled_df_setup.meta_df.head()

Unnamed: 0_level_0,dtype,var_type,role,isCount,weighting_var
variable,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
state,object,categorical,,,
year,int64,ordinal,,,
driver_gender,object,binary,,,
driver_race,object,categorical,,,
decriminalization,int64,binary,,,


For this example we'll set the roles, count flags, variable types, and weighting variables manually, but in the visualization tool you can also set them with drop-down menus.

In [5]:
# Role of each variable: 'trend' variables have trends computed on them,
# 'groupby' variables are used to split the data into subgroups, and 'ignore'
# variables are excluded. A variable may hold several roles (list form).
roles = {'state':['trend','groupby'], 'year':'trend', 'driver_gender':['trend','groupby'],
         'driver_race':['trend','groupby'],
       'decriminalization':['groupby'], 'medical':['groupby'],
         'recreational':['groupby'], 'no_reforms':['groupby'],
       'search_conducted_false':'ignore', 'search_conducted_true':'ignore',
       'search_conducted_rate':'trend', 'contraband_found_false':'ignore',
       'contraband_found_true':'ignore', 'contraband_found_rate':'trend', 'hit_false':'ignore',
       'hit_true':'ignore', 'hit_rate':'trend', 'num_stops':'trend'}

# Count flags, form 1: a dictionary mapping every variable to True/False.
is_count = {'state':False, 'year':False, 'driver_gender':False, 'driver_race':False,
       'decriminalization':False, 'medical':False, 'recreational':False, 'no_reforms':False,
       'search_conducted_false':True, 'search_conducted_true':True,
       'search_conducted_rate':False, 'contraband_found_false':True,
       'contraband_found_true':True, 'contraband_found_rate':False, 'hit_false':True,
       'hit_true':True, 'hit_rate':False, 'num_stops':True}

# Count flags, form 2: a list naming only the count variables (the same
# variables that are True in is_count above).
count_list = ['search_conducted_false', 'search_conducted_true','contraband_found_false',
       'contraband_found_true', 'hit_false',
       'hit_true', 'num_stops']

# Overrides for the inferred variable types: these columns were inferred as
# 'binary' and are treated as 'categorical' instead. 'no_reforms' was
# previously set to the boolean False, which is not a variable-type label; it
# now matches its sibling policy-indicator columns.
var_types = {'driver_gender':'categorical','decriminalization':'categorical',
             'medical':'categorical', 'recreational':'categorical',
             'no_reforms':'categorical'}

# Weighting variable for each rate: maps a rate column to the count column
# used to weight it (presumably the rate's denominator -- confirm against how
# the rates were computed).
weighting = {'hit_rate':'search_conducted_true','search_conducted_rate':'num_stops',
             'contraband_found_rate':'num_stops'}

We'll set those next. The cell above shows two ways to specify the count variables for the `set_counts` function (a dictionary of booleans, `is_count`, or a list of names, `count_list`), but we'll only call it once below, using the list.

In [6]:
# Apply the manual configuration from the previous cell to the metadata table.
# The list form of the count specification is used here.
labeled_df_setup.set_counts(count_list)
# NOTE: set_roles and set_var_types currently trigger a pandas
# SettingWithCopyWarning from inside the library (chained assignment on
# meta_df), as shown in the output below.
labeled_df_setup.set_roles(roles)
labeled_df_setup.set_weighting_vars(weighting)
labeled_df_setup.set_var_types(var_types)
# Display the updated metadata.
labeled_df_setup.meta_df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.meta_df['role'][k] = v
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.meta_df['var_type'][k] = v


Unnamed: 0_level_0,dtype,var_type,role,isCount,weighting_var
variable,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
state,object,categorical,"[trend, groupby]",False,
year,int64,ordinal,trend,False,
driver_gender,object,categorical,"[trend, groupby]",False,
driver_race,object,categorical,"[trend, groupby]",False,
decriminalization,int64,binary,[groupby],False,
medical,int64,binary,[groupby],False,
recreational,int64,binary,[groupby],False,
no_reforms,int64,binary,[groupby],False,
search_conducted_false,float64,continuous,ignore,True,
search_conducted_true,float64,continuous,ignore,True,


Now that we've set this up, we can save the configuration so that it can be loaded directly in the future.

In [7]:
# Save the labeled data frame to a directory of CSV files (df.csv, meta.csv,
# result_df.csv) so the setup can be reloaded later.
labeled_df_setup.to_csvs('../data/ldf_state_hit_rate_min_cols_COCTFLILMDMAMOMTNENCOHRISCTXVTWAWI')

We can see what this does, using a bash magic

In [8]:
%%bash
# List the files that to_csvs wrote into the output directory.
cd ../data/ldf_state_hit_rate_min_cols_COCTFLILMDMAMOMTNENCOHRISCTXVTWAWI
ls

df.csv
meta.csv
result_df.csv


It writes each of the three DataFrames out to its own .csv file in that directory. If the directory already exists, its files are overwritten without warning; if it does not, the directory is created.

Now we can also load the data back.

In [9]:
# Rebuild a LabeledDataFrame from the saved directory (a path string) instead
# of from an in-memory DataFrame, then display its metadata to confirm the
# configuration was restored.
labeled_df = wg.LabeledDataFrame('../data/ldf_state_hit_rate_min_cols_COCTFLILMDMAMOMTNENCOHRISCTXVTWAWI')
labeled_df.meta_df

Unnamed: 0_level_0,dtype,var_type,role,isCount,weighting_var
variable,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
state,object,categorical,"[trend, groupby]",False,
year,int64,ordinal,[trend],False,
driver_gender,object,categorical,"[trend, groupby]",False,
driver_race,object,categorical,"[trend, groupby]",False,
decriminalization,int64,binary,[groupby],False,
medical,int64,binary,[groupby],False,
recreational,int64,binary,[groupby],False,
no_reforms,int64,binary,[groupby],False,
search_conducted_false,float64,continuous,[ignore],True,
search_conducted_true,float64,continuous,[ignore],True,


In [10]:
# Add clustering columns named <var1>_<var2>_dpgmm to the data (see the new
# columns in the output below). Presumably DPGMM is a Dirichlet-process
# Gaussian mixture model; the exact meaning of qual_thresh is defined by
# wiggum -- TODO confirm.
labeled_df.add_all_dpgmm(qual_thresh =.2)
labeled_df.df.head()



Unnamed: 0,state,year,driver_gender,driver_race,decriminalization,medical,recreational,no_reforms,search_conducted_false,search_conducted_true,...,hit_true,hit_rate,num_stops,search_conducted_false_search_conducted_true_dpgmm,search_conducted_false_search_conducted_rate_dpgmm,search_conducted_false_contraband_found_false_dpgmm,search_conducted_false_num_stops_dpgmm,search_conducted_true_contraband_found_false_dpgmm,search_conducted_true_num_stops_dpgmm,contraband_found_false_num_stops_dpgmm
0,AZ,2009,F,White,0,0,0,1,30.0,1.0,...,,,31.0,0,5,0,8,0,1,8
1,AZ,2009,M,Black,0,0,0,1,1.0,2.0,...,1.0,0.5,3.0,0,0,0,8,0,1,8
2,AZ,2009,M,Hispanic,0,0,0,1,14.0,1.0,...,,,15.0,0,0,0,8,0,1,8
3,AZ,2009,M,White,0,0,0,1,32.0,4.0,...,2.0,0.5,36.0,0,0,0,8,0,1,8
4,AZ,2010,F,Asian,0,1,0,0,58.0,3.0,...,,,61.0,0,0,0,8,0,1,8


In [11]:
# Inspect the metadata table after adding the dpgmm columns.
labeled_df.meta_df

Unnamed: 0,dtype,var_type,role,isCount,weighting_var
state,object,categorical,"[trend, groupby]",False,
year,int64,ordinal,[trend],False,
driver_gender,object,categorical,"[trend, groupby]",False,
driver_race,object,categorical,"[trend, groupby]",False,
decriminalization,int64,binary,[groupby],False,
medical,int64,binary,[groupby],False,
recreational,int64,binary,[groupby],False,
no_reforms,int64,binary,[groupby],False,
search_conducted_false,float64,continuous,[ignore],True,
search_conducted_true,float64,continuous,[ignore],True,


In [12]:
# Add quantile-label columns (<variable>quantiles, with labels such as
# low/mid/high) for the listed variables; the updated data is displayed.
labeled_df.add_quantile(['hit_rate','num_stops'])

Unnamed: 0,state,year,driver_gender,driver_race,decriminalization,medical,recreational,no_reforms,search_conducted_false,search_conducted_true,...,num_stops,search_conducted_false_search_conducted_true_dpgmm,search_conducted_false_search_conducted_rate_dpgmm,search_conducted_false_contraband_found_false_dpgmm,search_conducted_false_num_stops_dpgmm,search_conducted_true_contraband_found_false_dpgmm,search_conducted_true_num_stops_dpgmm,contraband_found_false_num_stops_dpgmm,hit_ratequantiles,num_stopsquantiles
0,AZ,2009,F,White,0,0,0,1,30.0,1.0,...,31.0,0,5,0,8,0,1,8,high,low
1,AZ,2009,M,Black,0,0,0,1,1.0,2.0,...,3.0,0,0,0,8,0,1,8,high,low
2,AZ,2009,M,Hispanic,0,0,0,1,14.0,1.0,...,15.0,0,0,0,8,0,1,8,high,low
3,AZ,2009,M,White,0,0,0,1,32.0,4.0,...,36.0,0,0,0,8,0,1,8,high,low
4,AZ,2010,F,Asian,0,1,0,0,58.0,3.0,...,61.0,0,0,0,8,0,1,8,high,low
5,AZ,2010,F,Black,0,1,0,0,224.0,8.0,...,232.0,0,5,0,8,0,1,8,high,low
6,AZ,2010,F,Hispanic,0,1,0,0,557.0,33.0,...,590.0,0,0,0,8,0,1,8,low,low
7,AZ,2010,F,Other,0,1,0,0,167.0,6.0,...,173.0,0,5,0,8,0,1,8,high,low
8,AZ,2010,F,White,0,1,0,0,3145.0,92.0,...,3237.0,0,5,0,8,0,1,8,low,mid
9,AZ,2010,M,Asian,0,1,0,0,97.0,1.0,...,98.0,0,5,0,8,0,1,8,high,low


In [13]:
# Inspect the metadata table after adding the quantile columns.
labeled_df.meta_df

Unnamed: 0,dtype,var_type,role,isCount,weighting_var
state,object,categorical,"[trend, groupby]",False,
year,int64,ordinal,[trend],False,
driver_gender,object,categorical,"[trend, groupby]",False,
driver_race,object,categorical,"[trend, groupby]",False,
decriminalization,int64,binary,[groupby],False,
medical,int64,binary,[groupby],False,
recreational,int64,binary,[groupby],False,
no_reforms,int64,binary,[groupby],False,
search_conducted_false,float64,continuous,[ignore],True,
search_conducted_true,float64,continuous,[ignore],True,


# Using Trends

Trend objects define their name, how to compute the trend, and how to choose which variables to compute it on.

A planned extension will allow variable lists to be passed in to limit which trends are computed.

In [14]:
# Create a Pearson-correlation trend object and let it select its variables
# from the labeled data frame's metadata.
corrobj = wg.All_Pearson()
corrobj.get_trend_vars(labeled_df)
# Show the variables the trend object selected.
corrobj.regression_vars

['year',
 'search_conducted_rate',
 'contraband_found_rate',
 'hit_rate',
 'num_stops']

In [15]:
# Create two more trend objects; they are applied together in a later cell.
rankobj = wg.Mean_Rank_Trend()
linreg_obj = wg.All_Linear_Trend()

# Computing Trends on a LabeledDataFrame

There are two ways to do this: we can use the default settings by passing the name of a trend type, or we can pass a trend object.

In [16]:
# Compute subgroup and aggregate trends by passing the trend type's name as a
# string; results are stored in result_df.
labeled_df.get_subgroup_trends_1lev(['pearson_corr'])
labeled_df.result_df.head()

Unnamed: 0,feat1,feat2,subgroup_trend,subgroup_trend_quality,group_feat,subgroup,trend_type,agg_trend,agg_trend_quality
0,year,search_conducted_rate,-0.247018,0.247018,state,AZ,pearson_corr,-0.03903,0.03903
1,year,search_conducted_rate,-0.414566,0.414566,state,CO,pearson_corr,-0.03903,0.03903
2,year,search_conducted_rate,0.118238,0.118238,state,CT,pearson_corr,-0.03903,0.03903
3,year,search_conducted_rate,-0.199765,0.199765,state,IL,pearson_corr,-0.03903,0.03903
4,year,search_conducted_rate,-0.603026,0.603026,state,MA,pearson_corr,-0.03903,0.03903


Now we can use a list of objects and apply multiple trends

In [17]:
# Compute trends by passing trend objects instead of names; several trend
# types can be applied in one call.
labeled_df.get_subgroup_trends_1lev([rankobj,linreg_obj])
# Show a random sample of result rows (no seed is set, so the sampled rows
# differ between runs).
labeled_df.result_df.sample(10)

  slope = r_num / ssxm
  t = r * np.sqrt(df / ((1.0 - r + TINY)*(1.0 + r + TINY)))
  sterrest = np.sqrt((1 - r**2) * ssym / ssxm / df)


Unnamed: 0,feat1,feat2,group_feat,subgroup,subgroup_trend,subgroup_trend_quality,trend_type,agg_trend,agg_trend_quality
58,search_conducted_rate,driver_gender,search_conducted_false_num_stops_dpgmm,4,"[F, M]",0.3907,rank_trend,"[F, M]",0.2519
1053,hit_rate,state,search_conducted_false_search_conducted_rate_d...,0,"[NC, WA, SC, IL, MD, AZ, CT, RI, MA, WI, VT]",0.1599,rank_trend,"[WA, NC, IL, AZ, MD, SC, TX, CT, RI, MA, WI, C...",0.221
192,search_conducted_rate,driver_race,search_conducted_true_num_stops_dpgmm,11,"[White, Black]",0.7009,rank_trend,"[Asian, White, Other, Black, Hispanic]",0.2772
39,search_conducted_rate,driver_gender,search_conducted_false_search_conducted_rate_d...,1,"[F, M]",0.233,rank_trend,"[F, M]",0.2519
660,num_stops,driver_gender,search_conducted_false_search_conducted_true_d...,7,"[M, F]",0.0769,rank_trend,"[F, M]",0.1164
1119,hit_rate,state,num_stopsquantiles,low,"[NC, WA, IL, AZ, MD, CT, SC, TX, RI, MA, CO, W...",0.2206,rank_trend,"[WA, NC, IL, AZ, MD, SC, TX, CT, RI, MA, WI, C...",0.221
1560,year,num_stops,decriminalization,0,637.751,0.025548,lin_reg,-1560.86,0.043965
1015,contraband_found_rate,state,contraband_found_false_num_stops_dpgmm,5,[TX],,rank_trend,"[NC, CO, MD, WA, CT, SC, TX, WI, IL, MA, VT, A...",0.0201
1550,year,num_stops,state,VT,271.488,0.044411,lin_reg,-1560.86,0.043965
995,contraband_found_rate,state,search_conducted_true_contraband_found_false_d...,9,"[IL, AZ, SC, MA, TX]",0.7629,rank_trend,"[NC, CO, MD, WA, CT, SC, TX, WI, IL, MA, VT, A...",0.0201


These two methods give the same kind of result: the string-based version allows simple access to the default settings, while passing a trend object allows overriding defaults and creating more customized subsets of trends.

We can see which types of trends were computed from `result_df` (note that only the trend types from the most recent call appear below).

In [18]:
# List the distinct trend types currently present in result_df.
pd.unique(labeled_df.result_df['trend_type'])

array(['rank_trend', 'lin_reg'], dtype=object)

The object also stores the trend objects that have been applied; they can be used to look up the distance function that is appropriate for each trend.

In [19]:
# Show the trend objects that have been applied to this labeled data frame.
labeled_df.trend_list

[<detect_simpsons_paradox.trends.Mean_Rank_Trend at 0x7fd077f8b940>,
 <detect_simpsons_paradox.trends.All_Linear_Trend at 0x7fd077f8bb38>]

In [20]:
# Add a `distance` column to result_df; presumably it measures how far each
# subgroup trend is from the corresponding aggregate trend -- TODO confirm.
labeled_df.add_distance()

# Show a random sample of result rows, now including the distance column.
labeled_df.result_df.sample(10)

  size * (size - 1) * (size - 2))


Unnamed: 0,feat1,feat2,group_feat,subgroup,subgroup_trend,subgroup_trend_quality,trend_type,agg_trend,agg_trend_quality,distance
1867,search_conducted_rate,num_stops,state,IL,-134391,0.066876,lin_reg,-257201,0.065594,2e-06
559,hit_rate,driver_race,search_conducted_false_search_conducted_rate_d...,0,"[Asian, Hispanic, Other, Black, White]",0.0463,rank_trend,"[Asian, Hispanic, Other, Black, White]",0.0023,0.0
1584,year,num_stops,search_conducted_false_search_conducted_rate_d...,4,1665.25,0.048855,lin_reg,-1560.86,0.043965,0.99921
371,contraband_found_rate,driver_race,search_conducted_false_num_stops_dpgmm,5,[White],,rank_trend,"[Asian, White, Other, Hispanic, Black]",0.0716,0.05
560,hit_rate,driver_race,search_conducted_false_search_conducted_rate_d...,1,"[Hispanic, White, Black]",0.0984,rank_trend,"[Asian, Hispanic, Other, Black, White]",0.0023,0.55
809,num_stops,driver_race,search_conducted_true_num_stops_dpgmm,1,"[Other, Asian, Black, Hispanic, White]",0.038,rank_trend,"[Other, Asian, Black, Hispanic, White]",0.4996,0.0
325,contraband_found_rate,driver_race,state,VT,"[Asian, White, Hispanic, Black, Other]",0.2036,rank_trend,"[Asian, White, Other, Hispanic, Black]",0.0716,0.4
512,hit_rate,driver_gender,contraband_found_false_num_stops_dpgmm,4,[M],,rank_trend,"[F, M]",0.0433,
874,search_conducted_rate,state,search_conducted_false_contraband_found_false_...,5,"[MD, WA, AZ, SC, IL]",0.7077,rank_trend,"[CO, NC, VT, WI, MD, MA, CT, TX, SC, WA, RI, I...",0.3881,0.63
245,contraband_found_rate,driver_gender,search_conducted_false_search_conducted_true_d...,10,"[F, M]",0.0347,rank_trend,"[F, M]",0.1797,0.0


Each trend object has a `trend_precompute` dictionary as a property that stores the intermediate values (tables of the weighted rates for ranks, and correlation matrices for Pearson correlation; TODO: what do we need for linreg?). These can be used in visualization.

In [21]:
# Inspect the intermediate values stored by the first applied trend object
# (the rank trend).
labeled_df.trend_list[0].trend_precompute

{'rank_trend_agg_trend_contraband_found_rate_driver_gender': driver_gender
 F    0.003184
 M    0.006874
 dtype: float64,
 'rank_trend_agg_trend_contraband_found_rate_driver_race': driver_race
 Asian       0.002592
 White       0.004982
 Other       0.005700
 Hispanic    0.006455
 Black       0.008367
 dtype: float64,
 'rank_trend_agg_trend_contraband_found_rate_state': state
 NC    0.001062
 CO    0.002303
 MD    0.003503
 WA    0.004353
 CT    0.005716
 SC    0.006382
 TX    0.006541
 WI    0.007934
 IL    0.008215
 MA    0.008255
 VT    0.009567
 AZ    0.009980
 RI    0.013719
 dtype: float64,
 'rank_trend_agg_trend_hit_rate_driver_gender': driver_gender
 F    0.243620
 M    0.256954
 dtype: float64,
 'rank_trend_agg_trend_hit_rate_driver_race': driver_race
 Asian       0.165508
 Hispanic    0.193965
 Other       0.195967
 Black       0.269228
 White       0.279382
 dtype: float64,
 'rank_trend_agg_trend_hit_rate_state': state
 WA    0.135670
 NC    0.146822
 IL    0.205232
 AZ    0

# Filtering

In [22]:
# Print the docstring of get_trend_rows to show its filtering parameters.
help(labeled_df.get_trend_rows)

Help on method get_trend_rows in module detect_simpsons_paradox.ranking_processing:

get_trend_rows(feat1=None, feat2=None, group_feat=None, subgroup=None, trend_type=None) method of detect_simpsons_paradox.labeled_dataframe.LabeledDataFrame instance
    return a row of result_df based on the specified values. returned rows
    meet provided criteria for all columns (and operator) and any one of the listed
    values for each column (or operator)
    
    Parameters
    -----------
    feat1 : str, list, or  None
        trend variable name or None to include all
    feat2 : str, list, or  None
        trend variable name or None to include all
    group_feat : str, list, or  None
        groupoby variable name or None to include all
    subgroup : str, list, or  None
        value of groupby_feat or or None to include all



So, we can use that function to filter and look at subsets of the trends based on the features, groupby, or subgroups

In [23]:
# Rows where feat1 is 'year' AND the subgroup is 'Black' OR 'Hispanic'
# (criteria combine with AND across columns, OR within a list).
labeled_df.get_trend_rows(feat1='year',subgroup=['Black','Hispanic'])

4  total rows meet the criteria


Unnamed: 0,feat1,feat2,group_feat,subgroup,subgroup_trend,subgroup_trend_quality,trend_type,agg_trend,agg_trend_quality,distance
1232,year,search_conducted_rate,driver_race,Black,-0.000555503,0.039174,lin_reg,-0.000353381,0.03903,0.000129
1233,year,search_conducted_rate,driver_race,Hispanic,-0.0014828,0.177694,lin_reg,-0.000353381,0.03903,0.000719
1556,year,num_stops,driver_race,Black,-2262.72,0.169689,lin_reg,-1560.86,0.043965,0.000127
1557,year,num_stops,driver_race,Hispanic,422.667,0.014732,lin_reg,-1560.86,0.043965,0.998086


In [24]:
# Linear-regression trend rows that are grouped by driver_race.
labeled_df.get_trend_rows(group_feat = 'driver_race',trend_type ='lin_reg' )

15  total rows meet the criteria


Unnamed: 0,feat1,feat2,group_feat,subgroup,subgroup_trend,subgroup_trend_quality,trend_type,agg_trend,agg_trend_quality,distance
1231,year,search_conducted_rate,driver_race,Asian,-0.000317509,0.120567,lin_reg,-0.000353381,0.03903,2.283636e-05
1232,year,search_conducted_rate,driver_race,Black,-0.000555503,0.039174,lin_reg,-0.000353381,0.03903,0.0001286753
1233,year,search_conducted_rate,driver_race,Hispanic,-0.0014828,0.177694,lin_reg,-0.000353381,0.03903,0.0007190097
1234,year,search_conducted_rate,driver_race,Other,0.00107096,0.134696,lin_reg,-0.000353381,0.03903,0.0009067627
1235,year,search_conducted_rate,driver_race,White,-0.000402739,0.115382,lin_reg,-0.000353381,0.03903,3.142248e-05
1555,year,num_stops,driver_race,Asian,259.628,0.104919,lin_reg,-1560.86,0.043965,0.9971401
1556,year,num_stops,driver_race,Black,-2262.72,0.169689,lin_reg,-1560.86,0.043965,0.0001265133
1557,year,num_stops,driver_race,Hispanic,422.667,0.014732,lin_reg,-1560.86,0.043965,0.9980859
1558,year,num_stops,driver_race,Other,91.5127,0.07518,lin_reg,-1560.86,0.043965,0.9926358
1559,year,num_stops,driver_race,White,-5591.73,0.097383,lin_reg,-1560.86,0.043965,0.000294014


We can also filter based on SP detections with `get_SP_rows`.

In [25]:
# Label and return rows detected with a threshold of 0.2; the detections are
# stored in a new boolean column named SP_thresh0.2.
labeled_df.get_SP_rows(thresh=.2)

Unnamed: 0,feat1,feat2,group_feat,subgroup,subgroup_trend,subgroup_trend_quality,trend_type,agg_trend,agg_trend_quality,distance,SP_thresh0.2
28,search_conducted_rate,driver_gender,search_conducted_false_search_conducted_true_d...,2,"[M, F]",0.066400,rank_trend,"[F, M]",0.251900,2.000000,True
35,search_conducted_rate,driver_gender,search_conducted_false_search_conducted_true_d...,9,"[M, F]",0.875000,rank_trend,"[F, M]",0.251900,2.000000,True
45,search_conducted_rate,driver_gender,search_conducted_false_contraband_found_false_...,1,"[M, F]",0.881900,rank_trend,"[F, M]",0.251900,2.000000,True
59,search_conducted_rate,driver_gender,search_conducted_false_num_stops_dpgmm,5,"[M, F]",0.881900,rank_trend,"[F, M]",0.251900,2.000000,True
73,search_conducted_rate,driver_gender,search_conducted_true_contraband_found_false_d...,7,"[M, F]",0.178600,rank_trend,"[F, M]",0.251900,2.000000,True
76,search_conducted_rate,driver_gender,search_conducted_true_contraband_found_false_d...,10,"[M, F]",0.881900,rank_trend,"[F, M]",0.251900,2.000000,True
81,search_conducted_rate,driver_gender,search_conducted_true_num_stops_dpgmm,3,"[M, F]",0.881900,rank_trend,"[F, M]",0.251900,2.000000,True
82,search_conducted_rate,driver_gender,search_conducted_true_num_stops_dpgmm,4,"[M, F]",0.115900,rank_trend,"[F, M]",0.251900,2.000000,True
90,search_conducted_rate,driver_gender,contraband_found_false_num_stops_dpgmm,0,"[M, F]",0.368500,rank_trend,"[F, M]",0.251900,2.000000,True
108,search_conducted_rate,driver_race,state,CT,"[Asian, Other, White, Hispanic, Black]",0.443000,rank_trend,"[Asian, White, Other, Black, Hispanic]",0.277200,0.400000,True


## Detection

Detection via `get_SP_rows` happens in two steps:
1. label the rows
2. filter by that column to return

Labeling the rows can also happen in a number of ways: the detection accepts several forms of input, so custom detections can be built in many ways.

In [26]:
# Print the docstring of label_SP_rows to show the accepted filter forms.
help(labeled_df.label_SP_rows)

Help on method label_SP_rows in module detect_simpsons_paradox.ranking_processing:

label_SP_rows(filter_thresh=None) method of detect_simpsons_paradox.labeled_dataframe.LabeledDataFrame instance
    update the result_df with an additional colulmn indicateing rows with SP
    (or SP-like) as defined by sp_type
    
    Parameters
    -----------
    
    self : LabeledDataFrame
        must have values in result_df
    filter_thresh : dict or string
        dictionary of column label, threshold pairs or string name of a
        prespecified dictionary if dict, must include 'name' field (which
        will be used as the column name for storing the detections)



When `filter_thresh` is a dictionary, the filtering takes the intersection of the threshold conditions provided, so a row is labeled only if it satisfies all of them. Some defaults are also built in and are accessible by string name.

In [27]:
# Show a built-in detection filter; its 'name' entry is the string used to
# select it and the name of the resulting detection column.
wg.trend_quality_sp

{'agg_trend_strength': 0.15,
 'distance': 0.2,
 'name': 'default_qual_sp',
 'subgroup_trend_strength': 0.15}

Which can be applied with:

In [28]:
# Apply the built-in filter by name; adds a default_qual_sp column.
labeled_df.get_SP_rows('default_qual_sp')

Unnamed: 0,feat1,feat2,group_feat,subgroup,subgroup_trend,subgroup_trend_quality,trend_type,agg_trend,agg_trend_quality,distance,SP_thresh0.2,default_qual_sp
35,search_conducted_rate,driver_gender,search_conducted_false_search_conducted_true_d...,9,"[M, F]",0.8750,rank_trend,"[F, M]",0.2519,2.00,True,True
45,search_conducted_rate,driver_gender,search_conducted_false_contraband_found_false_...,1,"[M, F]",0.8819,rank_trend,"[F, M]",0.2519,2.00,True,True
59,search_conducted_rate,driver_gender,search_conducted_false_num_stops_dpgmm,5,"[M, F]",0.8819,rank_trend,"[F, M]",0.2519,2.00,True,True
73,search_conducted_rate,driver_gender,search_conducted_true_contraband_found_false_d...,7,"[M, F]",0.1786,rank_trend,"[F, M]",0.2519,2.00,True,True
76,search_conducted_rate,driver_gender,search_conducted_true_contraband_found_false_d...,10,"[M, F]",0.8819,rank_trend,"[F, M]",0.2519,2.00,True,True
81,search_conducted_rate,driver_gender,search_conducted_true_num_stops_dpgmm,3,"[M, F]",0.8819,rank_trend,"[F, M]",0.2519,2.00,True,True
90,search_conducted_rate,driver_gender,contraband_found_false_num_stops_dpgmm,0,"[M, F]",0.3685,rank_trend,"[F, M]",0.2519,2.00,True,True
108,search_conducted_rate,driver_race,state,CT,"[Asian, Other, White, Hispanic, Black]",0.4430,rank_trend,"[Asian, White, Other, Black, Hispanic]",0.2772,0.40,True,True
110,search_conducted_rate,driver_race,state,MA,"[Other, Asian, White, Black, Hispanic]",0.3048,rank_trend,"[Asian, White, Other, Black, Hispanic]",0.2772,0.40,True,True
113,search_conducted_rate,driver_race,state,RI,"[Other, Asian, White, Hispanic, Black]",0.4776,rank_trend,"[Asian, White, Other, Black, Hispanic]",0.2772,0.60,True,True


In [29]:
# Show the default SP definition: a distance threshold of 0.0, named 'SP'.
wg.DEFAULT_SP_DEF

{'distance': 0.0, 'name': 'SP'}

Which can be applied with:

In [30]:
# Apply the default SP definition by name; adds an SP column.
labeled_df.get_SP_rows('SP')

Unnamed: 0,feat1,feat2,group_feat,subgroup,subgroup_trend,subgroup_trend_quality,trend_type,agg_trend,agg_trend_quality,distance,SP_thresh0.2,default_qual_sp,SP
28,search_conducted_rate,driver_gender,search_conducted_false_search_conducted_true_d...,2,"[M, F]",0.066400,rank_trend,"[F, M]",0.251900,2.000000e+00,True,False,True
35,search_conducted_rate,driver_gender,search_conducted_false_search_conducted_true_d...,9,"[M, F]",0.875000,rank_trend,"[F, M]",0.251900,2.000000e+00,True,True,True
45,search_conducted_rate,driver_gender,search_conducted_false_contraband_found_false_...,1,"[M, F]",0.881900,rank_trend,"[F, M]",0.251900,2.000000e+00,True,True,True
59,search_conducted_rate,driver_gender,search_conducted_false_num_stops_dpgmm,5,"[M, F]",0.881900,rank_trend,"[F, M]",0.251900,2.000000e+00,True,True,True
73,search_conducted_rate,driver_gender,search_conducted_true_contraband_found_false_d...,7,"[M, F]",0.178600,rank_trend,"[F, M]",0.251900,2.000000e+00,True,True,True
76,search_conducted_rate,driver_gender,search_conducted_true_contraband_found_false_d...,10,"[M, F]",0.881900,rank_trend,"[F, M]",0.251900,2.000000e+00,True,True,True
81,search_conducted_rate,driver_gender,search_conducted_true_num_stops_dpgmm,3,"[M, F]",0.881900,rank_trend,"[F, M]",0.251900,2.000000e+00,True,True,True
82,search_conducted_rate,driver_gender,search_conducted_true_num_stops_dpgmm,4,"[M, F]",0.115900,rank_trend,"[F, M]",0.251900,2.000000e+00,True,False,True
90,search_conducted_rate,driver_gender,contraband_found_false_num_stops_dpgmm,0,"[M, F]",0.368500,rank_trend,"[F, M]",0.251900,2.000000e+00,True,True,True
106,search_conducted_rate,driver_race,state,AZ,"[Asian, White, Other, Hispanic, Black]",0.035100,rank_trend,"[Asian, White, Other, Black, Hispanic]",0.277200,2.000000e-01,False,False,True


We can also define our own detection filters, using any available column

In [34]:
# Custom detection filter: 'name' becomes the label of the detection column;
# every other key is a result_df column paired with the threshold or value a
# row has to meet. Here only linear-regression trends are considered.
lin_only_qual = {
    'name': 'lin_only_qual_sp',
    'distance': .2,
    'agg_trend_strength': .05,
    'subgroup_trend_strength': .15,
    'trend_type': 'lin_reg',
}
# replace=True is passed so that an existing detection column with the same
# name is presumably overwritten -- confirm against the wiggum API.
labeled_df.get_SP_rows(lin_only_qual, replace=True)

Unnamed: 0,feat1,feat2,group_feat,subgroup,subgroup_trend,subgroup_trend_quality,trend_type,agg_trend,agg_trend_quality,distance,SP_thresh0.2,default_qual_sp,SP,lin_only_qual_sp
1873,search_conducted_rate,num_stops,state,TX,4257870.0,0.173388,lin_reg,-257201,0.065594,0.999997,True,False,True,True
1879,search_conducted_rate,num_stops,driver_race,Asian,242572.0,0.258151,lin_reg,-257201,0.065594,0.999995,True,False,True,True
1894,search_conducted_rate,num_stops,search_conducted_false_search_conducted_true_d...,2,1629290.0,0.245048,lin_reg,-257201,0.065594,0.999997,True,False,True,True
1895,search_conducted_rate,num_stops,search_conducted_false_search_conducted_true_d...,3,39402400.0,0.418567,lin_reg,-257201,0.065594,0.999998,True,False,True,True
1896,search_conducted_rate,num_stops,search_conducted_false_search_conducted_true_d...,4,24299100.0,0.508525,lin_reg,-257201,0.065594,0.999997,True,False,True,True
1900,search_conducted_rate,num_stops,search_conducted_false_search_conducted_true_d...,8,433785.0,0.160242,lin_reg,-257201,0.065594,0.999996,True,False,True,True
1901,search_conducted_rate,num_stops,search_conducted_false_search_conducted_true_d...,9,32733300.0,0.772383,lin_reg,-257201,0.065594,0.999998,True,False,True,True
1903,search_conducted_rate,num_stops,search_conducted_false_search_conducted_true_d...,11,336447000.0,1.0,lin_reg,-257201,0.065594,0.999998,True,False,True,True
1906,search_conducted_rate,num_stops,search_conducted_false_search_conducted_rate_d...,2,24031000.0,0.887903,lin_reg,-257201,0.065594,0.999997,True,False,True,True
1911,search_conducted_rate,num_stops,search_conducted_false_contraband_found_false_...,1,28630800.0,0.818592,lin_reg,-257201,0.065594,0.999998,True,False,True,True


# Ranking

In [35]:
# Rank the result rows by view with default settings, in descending order;
# this adds a mean_view_distance column. The Index printed above the table is
# output emitted by the library call itself.
labeled_df.rank_occurences_by_view(ascending=False).head(20)

Index(['feat1', 'feat2', 'group_feat', 'subgroup', 'subgroup_trend',
       'subgroup_trend_strength', 'trend_type', 'agg_trend',
       'agg_trend_strength', 'distance', 'SP_thresh0.2', 'default_qual_sp',
       'SP', 'lin_only_qual_sp', 'mean_view_distance'],
      dtype='object')


Unnamed: 0,feat1,feat2,group_feat,subgroup,subgroup_trend,subgroup_trend_quality,trend_type,agg_trend,agg_trend_quality,distance,SP_thresh0.2,default_qual_sp,SP,lin_only_qual_sp,mean_view_distance
834,num_stops,driver_race,num_stopsquantiles,low,"[White, Other, Black, Hispanic, Asian]",0.1556,rank_trend,"[Other, Asian, Black, Hispanic, White]",0.4996,1.2,True,True,True,False,0.49233
757,num_stops,driver_race,search_conducted_false_search_conducted_true_d...,1,"[White, Hispanic]",0.1786,rank_trend,"[Other, Asian, Black, Hispanic, White]",0.4996,1.0,True,True,True,False,0.49233
760,num_stops,driver_race,search_conducted_false_search_conducted_true_d...,4,"[White, Hispanic]",1.0,rank_trend,"[Other, Asian, Black, Hispanic, White]",0.4996,1.0,True,True,True,False,0.49233
772,num_stops,driver_race,search_conducted_false_search_conducted_rate_d...,4,"[White, Hispanic]",0.2235,rank_trend,"[Other, Asian, Black, Hispanic, White]",0.4996,1.0,True,True,True,False,0.49233
785,num_stops,driver_race,search_conducted_false_num_stops_dpgmm,1,"[White, Hispanic]",1.0,rank_trend,"[Other, Asian, Black, Hispanic, White]",0.4996,1.0,True,True,True,False,0.49233
794,num_stops,driver_race,search_conducted_false_num_stops_dpgmm,10,"[White, Hispanic]",0.0,rank_trend,"[Other, Asian, Black, Hispanic, White]",0.4996,1.0,True,False,True,False,0.49233
798,num_stops,driver_race,search_conducted_true_contraband_found_false_d...,2,"[White, Hispanic]",1.0,rank_trend,"[Other, Asian, Black, Hispanic, White]",0.4996,1.0,True,True,True,False,0.49233
803,num_stops,driver_race,search_conducted_true_contraband_found_false_d...,7,"[White, Hispanic]",0.0071,rank_trend,"[Other, Asian, Black, Hispanic, White]",0.4996,1.0,True,False,True,False,0.49233
812,num_stops,driver_race,search_conducted_true_num_stops_dpgmm,4,"[White, Hispanic]",0.0035,rank_trend,"[Other, Asian, Black, Hispanic, White]",0.4996,1.0,True,False,True,False,0.49233
816,num_stops,driver_race,search_conducted_true_num_stops_dpgmm,8,"[White, Hispanic]",1.0,rank_trend,"[Other, Asian, Black, Hispanic, White]",0.4996,1.0,True,True,True,False,0.49233


In [36]:
# Add a per-view score that aggregates the SP_thresh0.2 column with a sum;
# the new column is named sum_view_SP_thresh0.2.
labeled_df.add_view_score('SP_thresh0.2',agg_type='sum',colored=False).head(10)

Unnamed: 0,feat1,feat2,group_feat,subgroup,subgroup_trend,subgroup_trend_quality,trend_type,agg_trend,agg_trend_quality,distance,SP_thresh0.2,default_qual_sp,SP,lin_only_qual_sp,mean_view_distance,sum_view_SP_thresh0.2
0,num_stops,driver_race,num_stopsquantiles,low,"[White, Other, Black, Hispanic, Asian]",0.1556,rank_trend,"[Other, Asian, Black, Hispanic, White]",0.4996,1.2,True,True,True,False,0.49233,73.0
1,num_stops,driver_race,search_conducted_false_search_conducted_true_d...,1,"[White, Hispanic]",0.1786,rank_trend,"[Other, Asian, Black, Hispanic, White]",0.4996,1.0,True,True,True,False,0.49233,73.0
2,num_stops,driver_race,search_conducted_false_search_conducted_true_d...,4,"[White, Hispanic]",1.0,rank_trend,"[Other, Asian, Black, Hispanic, White]",0.4996,1.0,True,True,True,False,0.49233,73.0
3,num_stops,driver_race,search_conducted_false_search_conducted_rate_d...,4,"[White, Hispanic]",0.2235,rank_trend,"[Other, Asian, Black, Hispanic, White]",0.4996,1.0,True,True,True,False,0.49233,73.0
4,num_stops,driver_race,search_conducted_false_num_stops_dpgmm,1,"[White, Hispanic]",1.0,rank_trend,"[Other, Asian, Black, Hispanic, White]",0.4996,1.0,True,True,True,False,0.49233,73.0
5,num_stops,driver_race,search_conducted_false_num_stops_dpgmm,10,"[White, Hispanic]",0.0,rank_trend,"[Other, Asian, Black, Hispanic, White]",0.4996,1.0,True,False,True,False,0.49233,73.0
6,num_stops,driver_race,search_conducted_true_contraband_found_false_d...,2,"[White, Hispanic]",1.0,rank_trend,"[Other, Asian, Black, Hispanic, White]",0.4996,1.0,True,True,True,False,0.49233,73.0
7,num_stops,driver_race,search_conducted_true_contraband_found_false_d...,7,"[White, Hispanic]",0.0071,rank_trend,"[Other, Asian, Black, Hispanic, White]",0.4996,1.0,True,False,True,False,0.49233,73.0
8,num_stops,driver_race,search_conducted_true_num_stops_dpgmm,4,"[White, Hispanic]",0.0035,rank_trend,"[Other, Asian, Black, Hispanic, White]",0.4996,1.0,True,False,True,False,0.49233,73.0
9,num_stops,driver_race,search_conducted_true_num_stops_dpgmm,8,"[White, Hispanic]",1.0,rank_trend,"[Other, Asian, Black, Hispanic, White]",0.4996,1.0,True,True,True,False,0.49233,73.0


In [38]:
# Rank the rows using the sum_view_SP_thresh0.2 view score computed above
# together with the SP_thresh0.2 column, and show the first rows.
labeled_df.rank_occurences_by_view('sum_view_SP_thresh0.2','SP_thresh0.2').head()

Index(['feat1', 'feat2', 'group_feat', 'subgroup', 'subgroup_trend',
       'subgroup_trend_strength', 'trend_type', 'agg_trend',
       'agg_trend_strength', 'distance', 'SP_thresh0.2', 'default_qual_sp',
       'SP', 'lin_only_qual_sp', 'mean_view_distance',
       'sum_view_SP_thresh0.2'],
      dtype='object')


Unnamed: 0,feat1,feat2,group_feat,subgroup,subgroup_trend,subgroup_trend_quality,trend_type,agg_trend,agg_trend_quality,distance,SP_thresh0.2,default_qual_sp,SP,lin_only_qual_sp,mean_view_distance,sum_view_SP_thresh0.2
103,num_stops,state,search_conducted_false_contraband_found_false_...,9,"[AZ, IL, WA, CT, RI, SC, VT, TX, MD, MA, CO, N...",0.01,rank_trend,"[RI, VT, WI, CT, MD, MA, AZ, CO, IL, NC, WA, S...",0.3209,1.0,True,False,True,False,0.483368,88.0
104,num_stops,state,search_conducted_true_num_stops_dpgmm,9,"[CT, AZ, RI, SC, IL, WA, TX, MD, VT, MA, CO, N...",0.1541,rank_trend,"[RI, VT, WI, CT, MD, MA, AZ, CO, IL, NC, WA, S...",0.3209,0.97,True,True,True,False,0.483368,88.0
105,num_stops,state,hit_ratequantiles,high,"[AZ, IL, CT, SC, NC, TX, RI, VT, MD, WI, CO, MA]",0.3503,rank_trend,"[RI, VT, WI, CT, MD, MA, AZ, CO, IL, NC, WA, S...",0.3209,0.96,True,True,True,False,0.483368,88.0
106,num_stops,state,search_conducted_false_search_conducted_true_d...,10,"[AZ, CT, RI, IL, WA, SC, TX, MD, VT, MA, WI, C...",0.2228,rank_trend,"[RI, VT, WI, CT, MD, MA, AZ, CO, IL, NC, WA, S...",0.3209,0.9,True,True,True,False,0.483368,88.0
107,num_stops,state,num_stopsquantiles,mid,"[WI, CT, RI, SC, MA, TX, CO, AZ, NC, WA, MD, I...",0.0469,rank_trend,"[RI, VT, WI, CT, MD, MA, AZ, CO, IL, NC, WA, S...",0.3209,0.87,True,False,True,False,0.483368,88.0
