In [1]:
import pandas as pd
import os
import wiggum as wg
import numpy as np

We'll first load in some data, this has both regression and rate type trends, since this file has a weird index, we'll load it in as dataframe first

In [2]:
hit_search_rate = pd.read_csv('../data/state_hit_rate_min_cols_COCTFLILMDMAMOMTNENCOHRISCTXVTWAWI.csv',index_col='Unnamed: 0')
hit_search_rate.head()

Unnamed: 0,state,year,driver_gender,driver_race,decriminalization,medical,recreational,no_reforms,search_conducted_false,search_conducted_true,search_conducted_rate,contraband_found_false,contraband_found_true,contraband_found_rate,hit_false,hit_true,hit_rate,num_stops
0,AZ,2009,F,White,0,0,0,1,30.0,1.0,0.032258,31.0,,,1.0,,,31.0
1,AZ,2009,M,Black,0,0,0,1,1.0,2.0,0.666667,2.0,1.0,0.333333,1.0,1.0,0.5,3.0
2,AZ,2009,M,Hispanic,0,0,0,1,14.0,1.0,0.066667,15.0,,,1.0,,,15.0
3,AZ,2009,M,White,0,0,0,1,32.0,4.0,0.111111,34.0,2.0,0.055556,2.0,2.0,0.5,36.0
4,AZ,2010,F,Asian,0,1,0,0,58.0,3.0,0.04918,61.0,,,3.0,,,61.0


We can now use the LabeledDataFrame with that DataFrame to create the object

In [3]:
labeled_df_setup = wg.LabeledDataFrame(hit_search_rate)

Next, we can infer the variable types

In [4]:
labeled_df_setup.infer_var_types()
labeled_df_setup.meta_df.head()

Unnamed: 0_level_0,dtype,var_type,role,isCount,weighting_var
variable,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
state,object,categorical,,,
year,int64,ordinal,,,
driver_gender,object,binary,,,
driver_race,object,categorical,,,
decriminalization,int64,binary,,,


For this, we'll manually set these, but in the vizualization tool you can also set these with drop down menus

In [5]:
roles = {'state':['independent','groupby'], 'year':'independent', 'driver_gender':['trend','groupby'], 
         'driver_race':['independent','groupby'],
       'decriminalization':['groupby'], 'medical':['groupby'],
         'recreational':['groupby'], 'no_reforms':['groupby'],
       'search_conducted_false':'ignore', 'search_conducted_true':'ignore',
       'search_conducted_rate':'dependent', 'contraband_found_false':'ignore',
       'contraband_found_true':'ignore', 'contraband_found_rate':'dependent', 'hit_false':'ignore',
       'hit_true':'ignore', 'hit_rate':'dependent', 'num_stops':'dependent'}
is_count = {'state':False, 'year':False, 'driver_gender':False, 'driver_race':False,
       'decriminalization':False, 'medical':False, 'recreational':False, 'no_reforms':False,
       'search_conducted_false':True, 'search_conducted_true':True,
       'search_conducted_rate':False, 'contraband_found_false':True,
       'contraband_found_true':True, 'contraband_found_rate':False, 'hit_false':True,
       'hit_true':True, 'hit_rate':False, 'num_stops':True}
count_list = ['search_conducted_false', 'search_conducted_true','contraband_found_false',
       'contraband_found_true', 'hit_false',
       'hit_true', 'num_stops']
var_types = {'driver_gender':'categorical','decriminalization':'categorical', 
             'medical':'categorical', 'recreational':'categorical', 'no_reforms':False,}
weighting = {'hit_rate':'search_conducted_true','search_conducted_rate':'num_stops',
             'contraband_found_rate':'num_stops'}

We'll set those next.  Above gives examples of two ways that we can specify the count values to pass them to the set_counts function, but we'll only call it once below. 

In [6]:
labeled_df_setup.set_counts(count_list)
labeled_df_setup.set_roles(roles)
labeled_df_setup.set_weighting_vars(weighting)
labeled_df_setup.set_var_types(var_types)
labeled_df_setup.meta_df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.meta_df['role'][k] = v
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.meta_df['var_type'][k] = v


Unnamed: 0_level_0,dtype,var_type,role,isCount,weighting_var
variable,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
state,object,categorical,"[independent, groupby]",False,
year,int64,ordinal,independent,False,
driver_gender,object,categorical,"[trend, groupby]",False,
driver_race,object,categorical,"[independent, groupby]",False,
decriminalization,int64,categorical,[groupby],False,
medical,int64,categorical,[groupby],False,
recreational,int64,categorical,[groupby],False,
no_reforms,int64,False,[groupby],False,
search_conducted_false,float64,continuous,ignore,True,
search_conducted_true,float64,continuous,ignore,True,


Now, we've set this up, we can also save these configurations to load them in directly in the future

In [7]:
labeled_df_setup.to_csvs('../data/ldf_state_hit_rate_min_cols_COCTFLILMDMAMOMTNENCOHRISCTXVTWAWI')

True

We can see what this does, using a bash magic

In [8]:
%%bash
cd ../data/ldf_state_hit_rate_min_cols_COCTFLILMDMAMOMTNENCOHRISCTXVTWAWI
ls

df.csv
meta.csv
result_df.csv


it write the three DataFrames each out to their own .csv file in that directory. If that directory exists it will overwrite without warning, if not, also creates the directory. 

Now, we can can also load the data back

In [9]:
labeled_df = wg.LabeledDataFrame('../data/ldf_state_hit_rate_min_cols_COCTFLILMDMAMOMTNENCOHRISCTXVTWAWI')
labeled_df.meta_df

Unnamed: 0_level_0,dtype,var_type,role,isCount,weighting_var
variable,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
state,object,categorical,"[independent, splitby]",False,
year,int64,ordinal,[independent],False,
driver_gender,object,categorical,"[[independent, dependent], splitby]",False,
driver_race,object,categorical,"[independent, splitby]",False,
decriminalization,int64,categorical,[splitby],False,
medical,int64,categorical,[splitby],False,
recreational,int64,categorical,[splitby],False,
no_reforms,int64,False,[splitby],False,
search_conducted_false,float64,continuous,[ignore],True,
search_conducted_true,float64,continuous,[ignore],True,


In [10]:
labeled_df.add_intersectional(['driver_gender','driver_race'])
labeled_df.df.head()

Unnamed: 0,state,year,driver_gender,driver_race,decriminalization,medical,recreational,no_reforms,search_conducted_false,search_conducted_true,search_conducted_rate,contraband_found_false,contraband_found_true,contraband_found_rate,hit_false,hit_true,hit_rate,num_stops,driver_gender_driver_race
,,,,,,,,,,,,,,,,,,,
0.0,AZ,2009.0,F,White,0.0,0.0,0.0,1.0,30.0,1.0,0.032258,31.0,,,1.0,,,31.0,F_White
1.0,AZ,2009.0,M,Black,0.0,0.0,0.0,1.0,1.0,2.0,0.666667,2.0,1.0,0.333333,1.0,1.0,0.5,3.0,M_Black
2.0,AZ,2009.0,M,Hispanic,0.0,0.0,0.0,1.0,14.0,1.0,0.066667,15.0,,,1.0,,,15.0,M_Hispanic
3.0,AZ,2009.0,M,White,0.0,0.0,0.0,1.0,32.0,4.0,0.111111,34.0,2.0,0.055556,2.0,2.0,0.5,36.0,M_White
4.0,AZ,2010.0,F,Asian,0.0,1.0,0.0,0.0,58.0,3.0,0.04918,61.0,,,3.0,,,61.0,F_Asian


We can also pass more variables and lengths of the tuples that we want it to combine.  For example if we provide 3 categorical variables we can do both pairs and triples of the variables.

In [11]:
labeled_df.add_intersectional(['driver_gender','driver_race','state'],[2,3])
labeled_df.df.head()

Unnamed: 0,state,year,driver_gender,driver_race,decriminalization,medical,recreational,no_reforms,search_conducted_false,search_conducted_true,...,contraband_found_true,contraband_found_rate,hit_false,hit_true,hit_rate,num_stops,driver_gender_driver_race,driver_gender_state,driver_race_state,driver_gender_driver_race_state
,,,,,,,,,,,,,,,,,,,,,
0.0,AZ,2009.0,F,White,0.0,0.0,0.0,1.0,30.0,1.0,...,,,1.0,,,31.0,F_White,F_AZ,White_AZ,F_White_AZ
1.0,AZ,2009.0,M,Black,0.0,0.0,0.0,1.0,1.0,2.0,...,1.0,0.333333,1.0,1.0,0.5,3.0,M_Black,M_AZ,Black_AZ,M_Black_AZ
2.0,AZ,2009.0,M,Hispanic,0.0,0.0,0.0,1.0,14.0,1.0,...,,,1.0,,,15.0,M_Hispanic,M_AZ,Hispanic_AZ,M_Hispanic_AZ
3.0,AZ,2009.0,M,White,0.0,0.0,0.0,1.0,32.0,4.0,...,2.0,0.055556,2.0,2.0,0.5,36.0,M_White,M_AZ,White_AZ,M_White_AZ
4.0,AZ,2010.0,F,Asian,0.0,1.0,0.0,0.0,58.0,3.0,...,,,3.0,,,61.0,F_Asian,F_AZ,Asian_AZ,F_Asian_AZ


In [12]:
labeled_df.add_all_dpgmm(qual_thresh =.2)
labeled_df.df.head()



Unnamed: 0,state,year,driver_gender,driver_race,decriminalization,medical,recreational,no_reforms,search_conducted_false,search_conducted_true,...,contraband_found_true,contraband_found_rate,hit_false,hit_true,hit_rate,num_stops,driver_gender_driver_race,driver_gender_state,driver_race_state,driver_gender_driver_race_state
,,,,,,,,,,,,,,,,,,,,,
0.0,AZ,2009.0,F,White,0.0,0.0,0.0,1.0,30.0,1.0,...,,,1.0,,,31.0,F_White,F_AZ,White_AZ,F_White_AZ
1.0,AZ,2009.0,M,Black,0.0,0.0,0.0,1.0,1.0,2.0,...,1.0,0.333333,1.0,1.0,0.5,3.0,M_Black,M_AZ,Black_AZ,M_Black_AZ
2.0,AZ,2009.0,M,Hispanic,0.0,0.0,0.0,1.0,14.0,1.0,...,,,1.0,,,15.0,M_Hispanic,M_AZ,Hispanic_AZ,M_Hispanic_AZ
3.0,AZ,2009.0,M,White,0.0,0.0,0.0,1.0,32.0,4.0,...,2.0,0.055556,2.0,2.0,0.5,36.0,M_White,M_AZ,White_AZ,M_White_AZ
4.0,AZ,2010.0,F,Asian,0.0,1.0,0.0,0.0,58.0,3.0,...,,,3.0,,,61.0,F_Asian,F_AZ,Asian_AZ,F_Asian_AZ


In [13]:
labeled_df.add_quantile(['hit_rate','num_stops'])
labeled_df.df.head()

Unnamed: 0,state,year,driver_gender,driver_race,decriminalization,medical,recreational,no_reforms,search_conducted_false,search_conducted_true,...,hit_false,hit_true,hit_rate,num_stops,driver_gender_driver_race,driver_gender_state,driver_race_state,driver_gender_driver_race_state,hit_ratequantiles,num_stopsquantiles
,,,,,,,,,,,,,,,,,,,,,
0.0,AZ,2009.0,F,White,0.0,0.0,0.0,1.0,30.0,1.0,...,1.0,,,31.0,F_White,F_AZ,White_AZ,F_White_AZ,high,low
1.0,AZ,2009.0,M,Black,0.0,0.0,0.0,1.0,1.0,2.0,...,1.0,1.0,0.5,3.0,M_Black,M_AZ,Black_AZ,M_Black_AZ,high,low
2.0,AZ,2009.0,M,Hispanic,0.0,0.0,0.0,1.0,14.0,1.0,...,1.0,,,15.0,M_Hispanic,M_AZ,Hispanic_AZ,M_Hispanic_AZ,high,low
3.0,AZ,2009.0,M,White,0.0,0.0,0.0,1.0,32.0,4.0,...,2.0,2.0,0.5,36.0,M_White,M_AZ,White_AZ,M_White_AZ,high,low
4.0,AZ,2010.0,F,Asian,0.0,1.0,0.0,0.0,58.0,3.0,...,3.0,,,61.0,F_Asian,F_AZ,Asian_AZ,F_Asian_AZ,high,low


In [14]:
labeled_df.get_vars_per_type('categorical')

['state',
 'driver_gender',
 'driver_race',
 'decriminalization',
 'medical',
 'recreational',
 'driver_gender_driver_race',
 'driver_gender_state',
 'driver_race_state',
 'driver_gender_driver_race_state',
 'hit_ratequantiles',
 'num_stopsquantiles']

In [15]:
labeled_df.meta_df

Unnamed: 0_level_0,dtype,var_type,role,isCount,weighting_var
variable,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
state,object,categorical,"[independent, splitby]",False,
year,int64,ordinal,[independent],False,
driver_gender,object,categorical,"[[independent, dependent], splitby]",False,
driver_race,object,categorical,"[independent, splitby]",False,
decriminalization,int64,categorical,[splitby],False,
medical,int64,categorical,[splitby],False,
recreational,int64,categorical,[splitby],False,
no_reforms,int64,False,[splitby],False,
search_conducted_false,float64,continuous,[ignore],True,
search_conducted_true,float64,continuous,[ignore],True,


# Using Trends

Trend objects define their name, how to compute the trend and how to choose which variables, 

extension will allow that the var lists may be passed to reduce which ones are computed

In [16]:
corrobj = wg.All_Pearson()
corrobj.get_trend_vars(labeled_df)
corrobj.regression_vars

[('year', 'search_conducted_rate'),
 ('year', 'contraband_found_rate'),
 ('year', 'hit_rate'),
 ('year', 'num_stops')]

In [17]:
rankobj = wg.Mean_Rank_Trend()
linreg_obj = wg.All_Linear_Trend()

# Computing Trends on a LabeledDataFrame

There are two ways, we can use default setting and pass the names of the trend type or a trend object

In [18]:
labeled_df.get_subgroup_trends_1lev(['pearson_corr'])
labeled_df.result_df.head()

Unnamed: 0,dependent,group_feat,independent,subgroup,subgroup_trend,subgroup_trend_strength,trend_type,agg_trend,agg_trend_strength,comparison_type
0,search_conducted_rate,state,year,AZ,-0.247018,0.247018,pearson_corr,-0.03903,0.03903,aggregate-subgroup
1,search_conducted_rate,state,year,CO,-0.414566,0.414566,pearson_corr,-0.03903,0.03903,aggregate-subgroup
2,search_conducted_rate,state,year,CT,0.118238,0.118238,pearson_corr,-0.03903,0.03903,aggregate-subgroup
3,search_conducted_rate,state,year,IL,-0.199765,0.199765,pearson_corr,-0.03903,0.03903,aggregate-subgroup
4,search_conducted_rate,state,year,MA,-0.603026,0.603026,pearson_corr,-0.03903,0.03903,aggregate-subgroup


Now we can use a list of objects and apply multiple trends

In [19]:
labeled_df.get_subgroup_trends_1lev([rankobj,linreg_obj])
labeled_df.result_df.sample(10)

  b, slope = np.polyfit(df[i],df[d],1, w = df[dw])
  b, slope = np.polyfit(df[i],df[d],1, w = df[dw])
  b, slope = np.polyfit(df[i],df[d],1, w = df[dw])
  slope = r_num / ssxm
  b, slope = np.polyfit(df[i],df[d],1, w = df[dw])
  b, slope = np.polyfit(df[i],df[d],1, w = df[dw])
  b, slope = np.polyfit(df[i],df[d],1, w = df[dw])
  t = r * np.sqrt(df / ((1.0 - r + TINY)*(1.0 + r + TINY)))
  sterrest = np.sqrt((1 - r**2) * ssym / ssxm / df)
  b, slope = np.polyfit(df[i],df[d],1, w = df[dw])
  b, slope = np.polyfit(df[i],df[d],1, w = df[dw])
  b, slope = np.polyfit(df[i],df[d],1, w = df[dw])
  b, slope = np.polyfit(df[i],df[d],1, w = df[dw])
  b, slope = np.polyfit(df[i],df[d],1, w = df[dw])
  b, slope = np.polyfit(df[i],df[d],1, w = df[dw])
  b, slope = np.polyfit(df[i],df[d],1, w = df[dw])
  b, slope = np.polyfit(df[i],df[d],1, w = df[dw])
  b, slope = np.polyfit(df[i],df[d],1, w = df[dw])


Unnamed: 0,agg_trend,agg_trend_strength,comparison_type,dependent,group_feat,independent,subgroup,subgroup_trend,subgroup_trend_strength,trend_type
640,0.316196,0.316196,aggregate-subgroup,hit_rate,driver_race_state,year,White_CO,-0.37966,0.37966,pearson_corr
2269,1.96425,0.004473,aggregate-subgroup,search_conducted_rate,driver_gender_driver_race_state,year,M_White_CT,-5.30053,0.00083,lin_reg
2188,1.96425,0.004473,aggregate-subgroup,search_conducted_rate,driver_gender_driver_race_state,year,F_Hispanic_WA,2.95902,0.00632,lin_reg
1685,"[WA, NC, IL, AZ, SC, MD, TX, CT, RI, MA, WI, C...",0.2193,aggregate-subgroup,hit_rate,driver_gender_driver_race_state,state,F_Other_NC,[NC],,rank_trend
242,-0.0390297,0.03903,aggregate-subgroup,search_conducted_rate,driver_gender_driver_race_state,year,M_Other_WI,-0.349961,0.349961,pearson_corr
208,"[Asian, White, Other, Black, Hispanic]",0.2772,aggregate-subgroup,search_conducted_rate,driver_gender_driver_race_state,driver_race,M_Black_TX,[Black],,rank_trend
105,-0.0390297,0.03903,aggregate-subgroup,search_conducted_rate,driver_race_state,year,Other_IL,-0.292019,0.292019,pearson_corr
243,-0.0390297,0.03903,aggregate-subgroup,search_conducted_rate,driver_gender_driver_race_state,year,M_White_AZ,-0.619054,0.619054,pearson_corr
784,"[Other, Asian, Black, Hispanic, White]",0.4996,aggregate-subgroup,num_stops,driver_gender,driver_race,F,"[Other, Asian, Hispanic, Black, White]",0.421,rank_trend
2941,-1560.86,0.043965,aggregate-subgroup,num_stops,driver_gender_driver_race_state,year,F_Asian_IL,-22.1643,0.297131,lin_reg


These two methods give the same, the string based version allows for simple access to default setting but passing a trend object would allow for overriding defaults and creating more custom subests of trends.

We can see what types of trends were computed from `result_df`

In [20]:
pd.unique(labeled_df.result_df['trend_type'])

array(['pearson_corr', 'rank_trend', 'lin_reg'], dtype=object)

In [21]:
pd.unique(labeled_df.result_df['comparison_type'])

array(['aggregate-subgroup'], dtype=object)

We can also add trends that are structured for pairwise comparisons

In [22]:
labeled_df.get_pairwise_trends_1lev([rankobj,linreg_obj])
labeled_df.result_df.sample(10)

  b, slope = np.polyfit(df[i],df[d],1, w = df[dw])
  b, slope = np.polyfit(df[i],df[d],1, w = df[dw])
  b, slope = np.polyfit(df[i],df[d],1, w = df[dw])
  slope = r_num / ssxm
  b, slope = np.polyfit(df[i],df[d],1, w = df[dw])
  b, slope = np.polyfit(df[i],df[d],1, w = df[dw])
  b, slope = np.polyfit(df[i],df[d],1, w = df[dw])
  t = r * np.sqrt(df / ((1.0 - r + TINY)*(1.0 + r + TINY)))
  sterrest = np.sqrt((1 - r**2) * ssym / ssxm / df)
  b, slope = np.polyfit(df[i],df[d],1, w = df[dw])
  b, slope = np.polyfit(df[i],df[d],1, w = df[dw])
  b, slope = np.polyfit(df[i],df[d],1, w = df[dw])
  b, slope = np.polyfit(df[i],df[d],1, w = df[dw])
  b, slope = np.polyfit(df[i],df[d],1, w = df[dw])
  b, slope = np.polyfit(df[i],df[d],1, w = df[dw])
  b, slope = np.polyfit(df[i],df[d],1, w = df[dw])
  b, slope = np.polyfit(df[i],df[d],1, w = df[dw])
  b, slope = np.polyfit(df[i],df[d],1, w = df[dw])
  b, slope = np.polyfit(df[i],df[d],1, w = df[dw])
  b, slope = np.polyfit(df[i],df[d],1, w = df[dw]

Unnamed: 0,agg_trend,agg_trend_strength,comparison_type,dependent,group_feat,independent,subgroup,subgroup2,subgroup_trend,subgroup_trend2,subgroup_trend_strength,subgroup_trend_strength2,trend_type
866115,,,pairwise,search_conducted_rate,driver_gender_driver_race_state,year,F_White_MA,M_Asian_IL,2.63262,2.41858,0.009045,0.0134,lin_reg
554804,,,pairwise,hit_rate,driver_gender_driver_race_state,state,F_White_MA,M_Black_SC,[MA],[SC],,,rank_trend
287375,,,pairwise,search_conducted_rate,driver_gender_driver_race_state,state,F_Hispanic_VT,F_Other_TX,[VT],[TX],,,rank_trend
206281,,,pairwise,hit_rate,driver_race_state,driver_race,Black_AZ,Other_IL,[Black],[Other],,,rank_trend
458954,,,pairwise,contraband_found_rate,driver_gender_driver_race_state,driver_race,M_Asian_CO,M_Other_MA,[Asian],[Other],,,rank_trend
679724,,,pairwise,num_stops,driver_gender_driver_race_state,state,F_Hispanic_NC,M_Black_AZ,[NC],[AZ],,,rank_trend
266063,,,pairwise,search_conducted_rate,driver_gender_driver_race_state,state,M_Black_IL,M_Other_TX,[IL],[TX],,,rank_trend
660668,,,pairwise,num_stops,driver_gender_driver_race_state,state,M_Black_TX,M_Hispanic_IL,[TX],[IL],,,rank_trend
705235,,,pairwise,num_stops,driver_gender_driver_race_state,driver_race,F_Asian_IL,M_White_AZ,[Asian],[White],,,rank_trend
107891,,,pairwise,hit_rate,driver_gender_state,driver_race,F_CT,M_NC,"[Asian, Other, Black, Hispanic, White]","[Hispanic, Asian, Black, White, Other]",0.3745,0.0181,rank_trend


In [23]:
pd.unique(labeled_df.result_df['comparison_type'])

array(['aggregate-subgroup', 'pairwise'], dtype=object)

The object also stores the trend objects that have been applied, they can be used for mapping to get the distance functions that are appropriate for each trend

In [24]:
labeled_df.trend_list

[<wiggum.trends.All_Pearson at 0x7f3aa62c8ef0>,
 <wiggum.trends.Mean_Rank_Trend at 0x7f3aa62c8898>,
 <wiggum.trends.All_Linear_Trend at 0x7f3aa62c8e48>,
 <wiggum.trends.Mean_Rank_Trend at 0x7f3aa62c8898>,
 <wiggum.trends.All_Linear_Trend at 0x7f3aa62c8e48>]

In [25]:
labeled_df.result_df.head()

Unnamed: 0,agg_trend,agg_trend_strength,comparison_type,dependent,group_feat,independent,subgroup,subgroup2,subgroup_trend,subgroup_trend2,subgroup_trend_strength,subgroup_trend_strength2,trend_type
0,-0.0390297,0.03903,aggregate-subgroup,search_conducted_rate,state,year,AZ,,-0.247018,,0.247018,,pearson_corr
1,-0.0390297,0.03903,aggregate-subgroup,search_conducted_rate,state,year,CO,,-0.414566,,0.414566,,pearson_corr
2,-0.0390297,0.03903,aggregate-subgroup,search_conducted_rate,state,year,CT,,0.118238,,0.118238,,pearson_corr
3,-0.0390297,0.03903,aggregate-subgroup,search_conducted_rate,state,year,IL,,-0.199765,,0.199765,,pearson_corr
4,-0.0390297,0.03903,aggregate-subgroup,search_conducted_rate,state,year,MA,,-0.603026,,0.603026,,pearson_corr


In [26]:
# a = ['a','b','c','d']
# b = ['a','b','d','e']
a = ['F', 'M']
b = ['F', 'M']
for i, d in enumerate(set(a+b)):
    print (i,':',d)

0 : M
1 : F


In [27]:
# labeled_df.result_df['distance'] = labeled_df.result_df.apply(dist_helper,axis=1)
labeled_df.add_distance(row_wise=True) #('subgroup_trend','subgroup_trend2')

labeled_df.result_df.sample(10)

Unnamed: 0,agg_trend,agg_trend_strength,comparison_type,dependent,group_feat,independent,subgroup,subgroup2,subgroup_trend,subgroup_trend2,subgroup_trend_strength,subgroup_trend_strength2,trend_type,distance
745005,,,pairwise,num_stops,driver_gender_driver_race_state,driver_race,F_Hispanic_MD,M_Black_CO,[Hispanic],[Black],,,rank_trend,
350538,,,pairwise,search_conducted_rate,driver_gender_driver_race_state,driver_race,F_Hispanic_AZ,M_Other_MA,[Hispanic],[Other],,,rank_trend,
43718,,,pairwise,contraband_found_rate,driver_gender_driver_race_state,year,F_White_IL,M_Black_CT,0.299625,0.417045,0.299625,0.417045,pearson_corr,0.0
70501,,,pairwise,num_stops,driver_gender_driver_race_state,year,F_Asian_RI,M_Other_RI,-0.295944,-0.147513,0.295944,0.147513,pearson_corr,0.0
352279,,,pairwise,search_conducted_rate,driver_gender_driver_race_state,driver_race,F_Hispanic_RI,F_White_MA,[Hispanic],[White],,,rank_trend,
580624,,,pairwise,hit_rate,driver_gender_driver_race_state,driver_race,F_Hispanic_IL,F_Other_VT,[Hispanic],[Other],,,rank_trend,
891753,,,pairwise,search_conducted_rate,driver_gender_driver_race_state,year,F_Hispanic_CT,M_Other_IL,-5.82913,0.794513,0.00079,0.007625,lin_reg,0.319257
614339,,,pairwise,hit_rate,driver_gender_driver_race_state,driver_race,F_Hispanic_RI,M_Hispanic_VT,[Hispanic],[Hispanic],,,rank_trend,
574216,,,pairwise,hit_rate,driver_gender_driver_race_state,driver_race,F_Asian_IL,F_Other_CT,[Asian],[Other],,,rank_trend,
500823,,,pairwise,contraband_found_rate,driver_gender_driver_race_state,driver_race,M_Hispanic_WA,M_White_MA,[Hispanic],[White],,,rank_trend,


Each trend object has a trend_precompute dictionary as a property that stores the intermediate values (tables of the weighted rates for ranks and correlation matrices for pearson correlation, TODO: what do we need for linreg). These can be used in vizualization.

In [28]:
labeled_df.trend_list[0].trend_precompute

{'pearson_corr_agg_trend':                        contraband_found_rate  num_stops  hit_rate  \
 contraband_found_rate               1.000000  -0.073456  0.275397   
 num_stops                          -0.073456   1.000000 -0.042102   
 hit_rate                            0.275397  -0.042102  1.000000   
 search_conducted_rate               0.807915  -0.065594 -0.094273   
 year                                0.041129  -0.043965  0.316196   
 
                        search_conducted_rate      year  
 contraband_found_rate               0.807915  0.041129  
 num_stops                          -0.065594 -0.043965  
 hit_rate                           -0.094273  0.316196  
 search_conducted_rate               1.000000 -0.039030  
 year                               -0.039030  1.000000  ,
 'pearson_corr_subgroup_trend':                                           contraband_found_rate  num_stops  \
 num_stopsquantiles                                                           
 high         

# Saving with trends

In [29]:
labeled_df.save_all('../data/ldf_state_hit_rate_min_cols_COCTFLILMDMAMOMTNENCOHRISCTXVTWAWI_all')

True

In [30]:
%%bash
cd ../data/ldf_state_hit_rate_min_cols_COCTFLILMDMAMOMTNENCOHRISCTXVTWAWI_all
ls

df.csv
meta.csv
result_df.csv
trends.json


In [31]:
labeled_df = wg.LabeledDataFrame('../data/ldf_state_hit_rate_min_cols_COCTFLILMDMAMOMTNENCOHRISCTXVTWAWI_all')

  if self.run_code(code, result):
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[k1] = value[k2]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  df['trend_type'] = tt


In [32]:
labeled_df.meta_df

Unnamed: 0_level_0,dtype,var_type,role,isCount,weighting_var
variable,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
state,object,categorical,"[independent, splitby]",False,
year,int64,ordinal,[independent],False,
driver_gender,object,categorical,"[independent, dependent, splitby]",False,
driver_race,object,categorical,"[independent, splitby]",False,
decriminalization,int64,categorical,[splitby],False,
medical,int64,categorical,[splitby],False,
recreational,int64,categorical,[splitby],False,
no_reforms,int64,False,[splitby],False,
search_conducted_false,float64,continuous,[ignore],True,
search_conducted_true,float64,continuous,[ignore],True,


In [33]:
labeled_df.trend_list[0].trend_precompute

{'pearson_corr_agg_trend':    contraband_found_rate  num_stops  hit_rate  search_conducted_rate      year
 0               1.000000  -0.073456  0.275397               0.807915  0.041129
 1              -0.073456   1.000000 -0.042102              -0.065594 -0.043965
 2               0.275397  -0.042102  1.000000              -0.094273  0.316196
 3               0.807915  -0.065594 -0.094273               1.000000 -0.039030
 4               0.041129  -0.043965  0.316196              -0.039030  1.000000,
 'pearson_corr_subgroup_trend':     contraband_found_rate  num_stops  hit_rate  search_conducted_rate  \
 0                1.000000   0.107314  0.255356               0.729171   
 1                0.107314   1.000000  0.002882               0.047615   
 2                0.255356   0.002882  1.000000              -0.233576   
 3                0.729171   0.047615 -0.233576               1.000000   
 4                0.066080  -0.032069  0.258733               0.011983   
 5                

# Filtering

In [34]:
help(labeled_df.get_trend_rows)

Help on method get_trend_rows in module wiggum.ranking_processing:

get_trend_rows(independent=None, dependent=None, group_feat=None, subgroup=None, subgroup2=None, trend_type=None, comparison_type=None, inplace=False, index=False) method of wiggum.labeled_dataframe.LabeledDataFrame instance
    return a row of result_df based on the specified values. returned rows
    meet provided criteria for all columns (and operator) and any one of the listed
    values for each column (or operator)
    
    Parameters
    -----------
    indep : str, list, or  {None}
        trend variable name or None to include all
    dependent : str, list, or  {None}
        trend variable name or None to include all
    group_feat : str, list, or  {None}
        groupoby variable name or None to include all
    subgroup : str, list, or  {None}
        value of groupby_feat or  None to include all
    subgroup2 : str, list or {None}
        value of groupby_feat or  None to include all
    trend_type: str, li

So, we can use that function to filter and look at subsets of the trends based on the features, groupby, or subgroups

In [35]:
labeled_df.get_trend_rows(independent='year',subgroup=['Black','Hispanic'])

72  total rows meet the criteria


Unnamed: 0,agg_trend,agg_trend_strength,comparison_type,dependent,group_feat,independent,subgroup,subgroup2,subgroup_trend,subgroup_trend2,subgroup_trend_strength,subgroup_trend_strength2,trend_type,distance
3060,1.96425,0.004473,aggregate-subgroup,search_conducted_rate,driver_race,year,Black,,1.24021,,0.001744,,lin_reg,0.132212
3061,1.96425,0.004473,aggregate-subgroup,search_conducted_rate,driver_race,year,Hispanic,,5.34096,,0.017419,,lin_reg,0.181954
3322,-1560.86,0.043965,aggregate-subgroup,num_stops,driver_race,year,Black,,-2262.72,,0.169689,,lin_reg,0.000127
3323,-1560.86,0.043965,aggregate-subgroup,num_stops,driver_race,year,Hispanic,,422.667,,0.014732,,lin_reg,0.998086
381751,,,pairwise,search_conducted_rate,driver_race,year,Black,Hispanic,1.24021,5.34096,0.001744,0.017419,lin_reg,0.314165
381752,,,pairwise,search_conducted_rate,driver_race,year,Black,Other,1.24021,-7.23536,0.001744,0.032087,lin_reg,0.480570
381753,,,pairwise,search_conducted_rate,driver_race,year,Black,White,1.24021,1.9543,0.001744,0.003678,lin_reg,0.130903
381754,,,pairwise,search_conducted_rate,driver_race,year,Black,Hispanic,1.24021,5.34096,0.001744,0.017419,lin_reg,0.314165
381755,,,pairwise,search_conducted_rate,driver_race,year,Black,Other,1.24021,-7.23536,0.001744,0.032087,lin_reg,0.480570
381756,,,pairwise,search_conducted_rate,driver_race,year,Black,White,1.24021,1.9543,0.001744,0.003678,lin_reg,0.130903


In [36]:
labeled_df.get_trend_rows(group_feat = 'driver_race',trend_type ='lin_reg' )

90  total rows meet the criteria


Unnamed: 0,agg_trend,agg_trend_strength,comparison_type,dependent,group_feat,independent,subgroup,subgroup2,subgroup_trend,subgroup_trend2,subgroup_trend_strength,subgroup_trend_strength2,trend_type,distance
3059,1.96425,0.004473,aggregate-subgroup,search_conducted_rate,driver_race,year,Asian,,1.24187,,0.003783,,lin_reg,0.131795
3060,1.96425,0.004473,aggregate-subgroup,search_conducted_rate,driver_race,year,Black,,1.24021,,0.001744,,lin_reg,0.132212
3061,1.96425,0.004473,aggregate-subgroup,search_conducted_rate,driver_race,year,Hispanic,,5.34096,,0.017419,,lin_reg,0.181954
3062,1.96425,0.004473,aggregate-subgroup,search_conducted_rate,driver_race,year,Other,,-7.23536,,0.032087,,lin_reg,0.612781
3063,1.96425,0.004473,aggregate-subgroup,search_conducted_rate,driver_race,year,White,,1.9543,,0.003678,,lin_reg,0.001309
3321,-1560.86,0.043965,aggregate-subgroup,num_stops,driver_race,year,Asian,,259.628,,0.104919,,lin_reg,0.997140
3322,-1560.86,0.043965,aggregate-subgroup,num_stops,driver_race,year,Black,,-2262.72,,0.169689,,lin_reg,0.000127
3323,-1560.86,0.043965,aggregate-subgroup,num_stops,driver_race,year,Hispanic,,422.667,,0.014732,,lin_reg,0.998086
3324,-1560.86,0.043965,aggregate-subgroup,num_stops,driver_race,year,Other,,91.5127,,0.075180,,lin_reg,0.992636
3325,-1560.86,0.043965,aggregate-subgroup,num_stops,driver_race,year,White,,-5591.73,,0.097383,,lin_reg,0.000294


In [37]:
labeled_df.result_df.columns

Index(['agg_trend', 'agg_trend_strength', 'comparison_type', 'dependent',
       'group_feat', 'independent', 'subgroup', 'subgroup2', 'subgroup_trend',
       'subgroup_trend2', 'subgroup_trend_strength',
       'subgroup_trend_strength2', 'trend_type', 'distance'],
      dtype='object')

We can also filter based on SP detections with `

In [38]:
labeled_df.get_SP_rows(thresh=.2)

Unnamed: 0,agg_trend,agg_trend_strength,comparison_type,dependent,group_feat,independent,subgroup,subgroup2,subgroup_trend,subgroup_trend2,subgroup_trend_strength,subgroup_trend_strength2,trend_type,distance,SP_thresh0.2
3044,1.96425,0.004473,aggregate-subgroup,search_conducted_rate,state,year,AZ,,0.0220027,,0.001366,,lin_reg,0.686210,True
3046,1.96425,0.004473,aggregate-subgroup,search_conducted_rate,state,year,CT,,-5.2563,,0.000831,,lin_reg,0.580529,True
3049,1.96425,0.004473,aggregate-subgroup,search_conducted_rate,state,year,MD,,6.78788,,0.016261,,lin_reg,0.206667,True
3050,1.96425,0.004473,aggregate-subgroup,search_conducted_rate,state,year,NC,,0.652098,,0.007841,,lin_reg,0.332345,True
3052,1.96425,0.004473,aggregate-subgroup,search_conducted_rate,state,year,SC,,-0.220005,,0.000837,,lin_reg,0.838078,True
3054,1.96425,0.004473,aggregate-subgroup,search_conducted_rate,state,year,VT,,-1.55738,,0.002015,,lin_reg,0.336829,True
3056,1.96425,0.004473,aggregate-subgroup,search_conducted_rate,state,year,WI,,0.60121,,0.000842,,lin_reg,0.355607,True
3057,1.96425,0.004473,aggregate-subgroup,search_conducted_rate,driver_gender,year,F,,0.49675,,0.001322,,lin_reg,0.406705,True
3062,1.96425,0.004473,aggregate-subgroup,search_conducted_rate,driver_race,year,Other,,-7.23536,,0.032087,,lin_reg,0.612781,True
3064,1.96425,0.004473,aggregate-subgroup,search_conducted_rate,decriminalization,year,0,,0.740524,,0.005495,,lin_reg,0.294428,True


In [39]:
labeled_df.get_trend_rows(trend_type = 'lin_reg')

114959  total rows meet the criteria


Unnamed: 0,agg_trend,agg_trend_strength,comparison_type,dependent,group_feat,independent,subgroup,subgroup2,subgroup_trend,subgroup_trend2,subgroup_trend_strength,subgroup_trend_strength2,trend_type,distance,SP_thresh0.2
3044,1.96425,0.004473,aggregate-subgroup,search_conducted_rate,state,year,AZ,,0.0220027,,0.001366,,lin_reg,0.686210,True
3045,1.96425,0.004473,aggregate-subgroup,search_conducted_rate,state,year,CO,,1.61212,,0.002736,,lin_reg,0.053675,False
3046,1.96425,0.004473,aggregate-subgroup,search_conducted_rate,state,year,CT,,-5.2563,,0.000831,,lin_reg,0.580529,True
3047,1.96425,0.004473,aggregate-subgroup,search_conducted_rate,state,year,IL,,1.9328,,0.013265,,lin_reg,0.004174,False
3048,1.96425,0.004473,aggregate-subgroup,search_conducted_rate,state,year,MA,,4.37932,,0.015387,,lin_reg,0.156866,False
3049,1.96425,0.004473,aggregate-subgroup,search_conducted_rate,state,year,MD,,6.78788,,0.016261,,lin_reg,0.206667,True
3050,1.96425,0.004473,aggregate-subgroup,search_conducted_rate,state,year,NC,,0.652098,,0.007841,,lin_reg,0.332345,True
3051,1.96425,0.004473,aggregate-subgroup,search_conducted_rate,state,year,RI,,2.79571,,0.015540,,lin_reg,0.081100,False
3052,1.96425,0.004473,aggregate-subgroup,search_conducted_rate,state,year,SC,,-0.220005,,0.000837,,lin_reg,0.838078,True
3053,1.96425,0.004473,aggregate-subgroup,search_conducted_rate,state,year,TX,,3.76278,,0.014346,,lin_reg,0.134419,False


In [40]:
type(np.NaN)

float

## Detection

Detection via `get_SP_rows` happens in two steps:
1. label the rows
2. filter by that column to return

Labeling the rows can happen in a number of ways too, the detection accepts a number of forms of input, custom detections can be built in many ways

In [41]:
help(labeled_df.label_SP_rows)

Help on method label_SP_rows in module wiggum.ranking_processing:

label_SP_rows(filter_thresh=None) method of wiggum.labeled_dataframe.LabeledDataFrame instance
    update the result_df with an additional colulmn indicateing rows with SP
    (or SP-like) as defined by sp_type
    
    Parameters
    -----------
    
    self : LabeledDataFrame
        must have values in result_df
    filter_thresh : dict or string
        dictionary of column label, threshold pairs or string name of a
        prespecified dictionary if dict, must include 'name' field (which
        will be used as the column name for storing the detections)



when filter_thresh is a dictionary, the filtering happens by taking the intersection of each row by the treshold prvided.  Some defaults are also built in accessible by string.

In [42]:
wg.trend_quality_sp

{'agg_trend_strength': 0.15,
 'distance': 0.2,
 'name': 'default_qual_sp',
 'subgroup_trend_strength': 0.15}

Which can be applied with:

In [43]:
labeled_df.get_SP_rows('default_qual_sp')

Unnamed: 0,agg_trend,agg_trend_strength,comparison_type,dependent,group_feat,independent,subgroup,subgroup2,subgroup_trend,subgroup_trend2,subgroup_trend_strength,subgroup_trend_strength2,trend_type,distance,SP_thresh0.2,default_qual_sp
520,0.316196,0.316196,aggregate-subgroup,hit_rate,state,year,SC,,-0.255096,,0.255096,,pearson_corr,1.0000,True,True
523,0.316196,0.316196,aggregate-subgroup,hit_rate,state,year,WA,,-0.68403,,0.684030,,pearson_corr,1.0000,True,True
552,0.316196,0.316196,aggregate-subgroup,hit_rate,driver_gender_state,year,F_CT,,-0.262423,,0.262423,,pearson_corr,1.0000,True,True
558,0.316196,0.316196,aggregate-subgroup,hit_rate,driver_gender_state,year,F_SC,,-0.253289,,0.253289,,pearson_corr,1.0000,True,True
561,0.316196,0.316196,aggregate-subgroup,hit_rate,driver_gender_state,year,F_WA,,-0.646752,,0.646752,,pearson_corr,1.0000,True,True
563,0.316196,0.316196,aggregate-subgroup,hit_rate,driver_gender_state,year,M_AZ,,-0.15601,,0.156010,,pearson_corr,1.0000,True,True
571,0.316196,0.316196,aggregate-subgroup,hit_rate,driver_gender_state,year,M_SC,,-0.264436,,0.264436,,pearson_corr,1.0000,True,True
574,0.316196,0.316196,aggregate-subgroup,hit_rate,driver_gender_state,year,M_WA,,-0.768069,,0.768069,,pearson_corr,1.0000,True,True
586,0.316196,0.316196,aggregate-subgroup,hit_rate,driver_race_state,year,Asian_WA,,-0.723905,,0.723905,,pearson_corr,1.0000,True,True
587,0.316196,0.316196,aggregate-subgroup,hit_rate,driver_race_state,year,Asian_WI,,-0.317996,,0.317996,,pearson_corr,1.0000,True,True


In [44]:
wg.DEFAULT_SP_DEF

{'comparison_type': 'aggregate', 'distance': 0.0, 'name': 'SP'}

Which can be applied with:

In [45]:
labeled_df.get_SP_rows('SP')

Unnamed: 0,agg_trend,agg_trend_strength,comparison_type,dependent,group_feat,independent,subgroup,subgroup2,subgroup_trend,subgroup_trend2,subgroup_trend_strength,subgroup_trend_strength2,trend_type,distance,SP_thresh0.2,default_qual_sp,SP


We can also define our own detection filters, using any available column

In [46]:
lin_only_qual = {'name':'lin_only_qual_sp','distance':.2, 'agg_trend_strength':.05,
                'subgroup_trend_strength':.15,'trend_type':'lin_reg'}
labeled_df.get_SP_rows(lin_only_qual,replace=True) 

Unnamed: 0,agg_trend,agg_trend_strength,comparison_type,dependent,group_feat,independent,subgroup,subgroup2,subgroup_trend,subgroup_trend2,subgroup_trend_strength,subgroup_trend_strength2,trend_type,distance,SP_thresh0.2,default_qual_sp,SP,lin_only_qual_sp


# Ranking

In [47]:
labeled_df.rank_occurences_by_view(ascending=False).head(20)

Index(['agg_trend', 'agg_trend_strength', 'comparison_type', 'dependent',
       'group_feat', 'independent', 'subgroup', 'subgroup2', 'subgroup_trend',
       'subgroup_trend2', 'subgroup_trend_strength',
       'subgroup_trend_strength2', 'trend_type', 'distance', 'SP_thresh0.2',
       'default_qual_sp', 'SP', 'lin_only_qual_sp', 'mean_view_distance'],
      dtype='object')


Unnamed: 0,agg_trend,agg_trend_strength,comparison_type,dependent,group_feat,independent,subgroup,subgroup2,subgroup_trend,subgroup_trend2,subgroup_trend_strength,subgroup_trend_strength2,trend_type,distance,SP_thresh0.2,default_qual_sp,SP,lin_only_qual_sp,mean_view_distance
93468,-0.043965,0.043965,aggregate-subgroup,num_stops,state,year,AZ,,0.301916,,0.301916,,pearson_corr,1.0,True,False,False,False,0.499594
93472,-0.043965,0.043965,aggregate-subgroup,num_stops,state,year,MA,,0.115809,,0.115809,,pearson_corr,1.0,True,False,False,False,0.499594
93474,-0.043965,0.043965,aggregate-subgroup,num_stops,state,year,NC,,0.117245,,0.117245,,pearson_corr,1.0,True,False,False,False,0.499594
93475,-0.043965,0.043965,aggregate-subgroup,num_stops,state,year,RI,,0.0583105,,0.058311,,pearson_corr,1.0,True,False,False,False,0.499594
93478,-0.043965,0.043965,aggregate-subgroup,num_stops,state,year,VT,,0.0444106,,0.044411,,pearson_corr,1.0,True,False,False,False,0.499594
93483,-0.043965,0.043965,aggregate-subgroup,num_stops,driver_race,year,Asian,,0.104919,,0.104919,,pearson_corr,1.0,True,False,False,False,0.499594
93485,-0.043965,0.043965,aggregate-subgroup,num_stops,driver_race,year,Hispanic,,0.014732,,0.014732,,pearson_corr,1.0,True,False,False,False,0.499594
93486,-0.043965,0.043965,aggregate-subgroup,num_stops,driver_race,year,Other,,0.0751798,,0.07518,,pearson_corr,1.0,True,False,False,False,0.499594
93488,-0.043965,0.043965,aggregate-subgroup,num_stops,decriminalization,year,0,,0.0255481,,0.025548,,pearson_corr,1.0,True,False,False,False,0.499594
93490,-0.043965,0.043965,aggregate-subgroup,num_stops,medical,year,0,,0.0440008,,0.044001,,pearson_corr,1.0,True,False,False,False,0.499594


In [48]:
labeled_df.add_view_score('SP_thresh0.2',agg_type='sum',colored=False).head(10)

Unnamed: 0,agg_trend,agg_trend_strength,comparison_type,dependent,group_feat,independent,subgroup,subgroup2,subgroup_trend,subgroup_trend2,subgroup_trend_strength,subgroup_trend_strength2,trend_type,distance,SP_thresh0.2,default_qual_sp,SP,lin_only_qual_sp,mean_view_distance,sum_view_SP_thresh0.2
0,-0.043965,0.043965,aggregate-subgroup,num_stops,state,year,AZ,,0.301916,,0.301916,,pearson_corr,1.0,True,False,False,False,0.499594,25935.0
1,-0.043965,0.043965,aggregate-subgroup,num_stops,state,year,MA,,0.115809,,0.115809,,pearson_corr,1.0,True,False,False,False,0.499594,25935.0
2,-0.043965,0.043965,aggregate-subgroup,num_stops,state,year,NC,,0.117245,,0.117245,,pearson_corr,1.0,True,False,False,False,0.499594,25935.0
3,-0.043965,0.043965,aggregate-subgroup,num_stops,state,year,RI,,0.0583105,,0.058311,,pearson_corr,1.0,True,False,False,False,0.499594,25935.0
4,-0.043965,0.043965,aggregate-subgroup,num_stops,state,year,VT,,0.0444106,,0.044411,,pearson_corr,1.0,True,False,False,False,0.499594,25935.0
5,-0.043965,0.043965,aggregate-subgroup,num_stops,driver_race,year,Asian,,0.104919,,0.104919,,pearson_corr,1.0,True,False,False,False,0.499594,25935.0
6,-0.043965,0.043965,aggregate-subgroup,num_stops,driver_race,year,Hispanic,,0.014732,,0.014732,,pearson_corr,1.0,True,False,False,False,0.499594,25935.0
7,-0.043965,0.043965,aggregate-subgroup,num_stops,driver_race,year,Other,,0.0751798,,0.07518,,pearson_corr,1.0,True,False,False,False,0.499594,25935.0
8,-0.043965,0.043965,aggregate-subgroup,num_stops,decriminalization,year,0,,0.0255481,,0.025548,,pearson_corr,1.0,True,False,False,False,0.499594,25935.0
9,-0.043965,0.043965,aggregate-subgroup,num_stops,medical,year,0,,0.0440008,,0.044001,,pearson_corr,1.0,True,False,False,False,0.499594,25935.0


In [49]:
labeled_df.rank_occurences_by_view('sum_view_SP_thresh0.2','SP_thresh0.2').head()

Index(['agg_trend', 'agg_trend_strength', 'comparison_type', 'dependent',
       'group_feat', 'independent', 'subgroup', 'subgroup2', 'subgroup_trend',
       'subgroup_trend2', 'subgroup_trend_strength',
       'subgroup_trend_strength2', 'trend_type', 'distance', 'SP_thresh0.2',
       'default_qual_sp', 'SP', 'lin_only_qual_sp', 'mean_view_distance',
       'sum_view_SP_thresh0.2'],
      dtype='object')


Unnamed: 0,agg_trend,agg_trend_strength,comparison_type,dependent,group_feat,independent,subgroup,subgroup2,subgroup_trend,subgroup_trend2,subgroup_trend_strength,subgroup_trend_strength2,trend_type,distance,SP_thresh0.2,default_qual_sp,SP,lin_only_qual_sp,mean_view_distance,sum_view_SP_thresh0.2
357194,-0.0390297,0.03903,aggregate-subgroup,search_conducted_rate,state,year,CT,,0.118238,,0.118238,,pearson_corr,1.0,True,False,False,False,0.38694,31604.0
357195,-0.0390297,0.03903,aggregate-subgroup,search_conducted_rate,state,year,SC,,0.000440801,,0.000441,,pearson_corr,1.0,True,False,False,False,0.38694,31604.0
357196,-0.0390297,0.03903,aggregate-subgroup,search_conducted_rate,state,year,WI,,0.0485728,,0.048573,,pearson_corr,1.0,True,False,False,False,0.38694,31604.0
357197,-0.0390297,0.03903,aggregate-subgroup,search_conducted_rate,driver_gender,year,F,,0.0304108,,0.030411,,pearson_corr,1.0,True,False,False,False,0.38694,31604.0
357198,-0.0390297,0.03903,aggregate-subgroup,search_conducted_rate,driver_race,year,Other,,0.134696,,0.134696,,pearson_corr,1.0,True,False,False,False,0.38694,31604.0
