In [1]:
import pandas as pd
import os
import detect_simpsons_paradox as dsp
import numpy as np
import itertools

In [2]:
labeled_df = dsp.labeledDataFrame('../data/ldf_state_hit_rate_min_cols_COCTFLILMDMAMOMTNENCOHRISCTXVTWAWI')
labeled_df.meta_df

Unnamed: 0_level_0,dtype,var_type,role,isCount,weighting_var
variable,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
state,object,categorical,"[explanatory, groupby]",False,
year,int64,ordinal,[trend],False,
driver_gender,object,binary,"[explanatory, groupby]",False,
driver_race,object,categorical,"[explanatory, groupby]",False,
decriminalization,int64,binary,"[explanatory, groupby]",False,
medical,int64,binary,"[explanatory, groupby]",False,
recreational,int64,binary,"[explanatory, groupby]",False,
no_reforms,int64,binary,"[explanatory, groupby]",False,
search_conducted_false,float64,continuous,[trend],True,
search_conducted_true,float64,continuous,[trend],True,


In [3]:
rankobj = dsp.mean_rank_trend()
linregobj = dsp.linear_trend()

In [4]:
linregobj.get_trend_vars(labeled_df)
linregobj.regression_vars

Index(['search_conducted_false', 'search_conducted_true',
       'search_conducted_rate', 'contraband_found_false',
       'contraband_found_true', 'contraband_found_rate', 'hit_false',
       'hit_true', 'hit_rate', 'num_stops'],
      dtype='object', name='variable')

The tren objects above will compute all pairs of given types, but what if we want to define custom trends?  We can do that by overloading existing types.  We'll overload only the get_trend_vars() function for now, but the other functions can also be overloaded or a totally new trend can be added as long as it is compatible.

In [5]:
class min_lin_reg(dsp.linear_trend):
    symmetric_vars = False # tell it not to do combinations
    
    def get_trend_vars(self,labeled_df=None):
        """
        """
        x_vars = ['year']
        y_vars = ['search_conducted_rate','contraband_found_rate','hit_rate','num_stops','search_conducted_true']
        
        self.regression_vars = list(itertools.product(x_vars,y_vars))
        return self.regression_vars
    
min_lin_reg_obj = min_lin_reg()
min_lin_reg_obj.get_trend_vars()

[('year', 'search_conducted_rate'),
 ('year', 'contraband_found_rate'),
 ('year', 'hit_rate'),
 ('year', 'num_stops'),
 ('year', 'search_conducted_true')]

In [6]:
labeled_df.get_subgroup_trends_1lev([min_lin_reg_obj])
labeled_df.result_df.sample(10)

Unnamed: 0,feat1,feat2,subgroup_trend,subgroup,subgroup_trendquality,group_feat,trend_type,agg_trend,agg_trendquality
92,year,num_stops,-1123.356643,SC,-0.039445,state,lin_reg,-1560.861569,-0.043965
11,year,search_conducted_rate,-0.003524,WA,-0.345375,state,lin_reg,-0.000353,-0.03903
79,year,hit_rate,,1,,medical,lin_reg,,
17,year,search_conducted_rate,-0.001483,Hispanic,-0.177694,driver_race,lin_reg,-0.000353,-0.03903
40,year,contraband_found_rate,,WI,,state,lin_reg,,
35,year,contraband_found_rate,,RI,,state,lin_reg,,
102,year,num_stops,91.512747,Other,0.07518,driver_race,lin_reg,-1560.861569,-0.043965
123,year,search_conducted_true,-347.096429,WA,-0.195117,state,lin_reg,-55.628632,-0.062242
104,year,num_stops,637.750633,0,0.025548,decriminalization,lin_reg,-1560.861569,-0.043965
85,year,num_stops,-5034.006633,CO,-0.166912,state,lin_reg,-1560.861569,-0.043965


# Component-wise

We can also use the components of trends to construct custom trends

In [7]:
from detect_simpsons_paradox import trend_components as tcomp

In [8]:
def w_median(df,mcol,wcol):
    """
    """
    if pd.isna(wcol):
        wmed = df[mcol].median()
    else:
        reps = [int(n) for n in df[wcol].values]
        repd_mcol = np.repeat(df[mcol].values,reps)
        wmed = np.median(df[mcol]*df[wcol])
    return wmed
    
    
class weightedMedianRank(tcomp.weightedMeanRank):
    """
    common parts for all continuous variable trends
    """
    # remove self
    my_stat = lambda self, d,m,w :w_median(d,m,w )

#     def get_trend_vars(self,labeled_df):
#         """
#         """
#         # maybe not counts

#         self.target = labeled_df.get_vars_per_roletype('trend',['binary','continuous'])
#         self.trendgroup = labeled_df.get_vars_per_roletype(['trend','explanatory'],'categorical')
#         self.var_weight_list = labeled_df.get_weightcol_per_var(self.target)
#         return self.target, self.trendgroup

class median_rank_trend(tcomp.rankTrend,weightedMedianRank,tcomp.trend):
    name = 'median_rank_trend'

In [9]:
medianrankobj = median_rank_trend()
labeled_df.get_subgroup_trends_1lev([min_lin_reg_obj,medianrankobj,rankobj])
labeled_df.result_df.sample(10)

Unnamed: 0,feat1,feat2,group_feat,subgroup,subgroup_trend,subgroup_trendquality,trend_type,agg_trend,agg_trendquality
442,contraband_found_true,state,recreational,1,"[CO, WA]",,median_rank_trend,"[VT, MD, CO, NC, WI, CT, RI, MA, WA, IL, AZ, S...",
11,year,search_conducted_rate,state,WA,-0.00352446,-0.345375,lin_reg,-0.000353381,-0.03903
100,year,num_stops,driver_race,Black,-2262.72,-0.169689,lin_reg,-1560.86,-0.043965
179,search_conducted_true,driver_race,decriminalization,1,"[Asian, Other, Hispanic, Black, White]",,median_rank_trend,"[Asian, Other, Hispanic, Black, White]",
53,year,contraband_found_rate,recreational,1,,,lin_reg,,
190,search_conducted_rate,driver_race,state,MA,"[Asian, Other, White, Black, Hispanic]",,median_rank_trend,"[Asian, White, Other, Hispanic, Black]",
116,year,search_conducted_true,state,MA,-24.8859,-0.075316,lin_reg,-55.6286,-0.062242
329,hit_rate,driver_race,state,MD,"[Hispanic, Black, Asian, Other, White]",,median_rank_trend,"[Hispanic, Other, Asian, Black, White]",
46,year,contraband_found_rate,driver_race,Other,,,lin_reg,,
467,hit_false,state,decriminalization,0,"[VT, WI, MA, MD, RI, IL, AZ, WA, SC]",,median_rank_trend,"[VT, WI, CO, MD, RI, CT, MA, NC, IL, AZ, WA, S...",
