In [1]:
import pandas as pd
import os
import detect_simpsons_paradox as dsp
import numpy as np
import itertools

In [2]:
labeled_df = dsp.labeledDataFrame('../data/ldf_state_hit_rate_min_cols_COCTFLILMDMAMOMTNENCOHRISCTXVTWAWI')
labeled_df.meta_df

Unnamed: 0_level_0,dtype,var_type,role,isCount,weighting_var
variable,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
state,object,categorical,"[explanatory, groupby]",False,
year,int64,ordinal,[trend],False,
driver_gender,object,binary,"[explanatory, groupby]",False,
driver_race,object,categorical,"[explanatory, groupby]",False,
decriminalization,int64,binary,"[explanatory, groupby]",False,
medical,int64,binary,"[explanatory, groupby]",False,
recreational,int64,binary,"[explanatory, groupby]",False,
no_reforms,int64,binary,"[explanatory, groupby]",False,
search_conducted_false,float64,continuous,[trend],True,
search_conducted_true,float64,continuous,[trend],True,


In [3]:
rankobj = dsp.mean_rank_trend()
linregobj = dsp.linear_trend()

The tren objects above will compute all pairs of given types, but what if we want to define custom trends?  We can do that by overloading existing types.  We'll overload only the get_trend_vars() function for now, but the other functions can also be overloaded or a totally new trend can be added as long as it is compatible.

In [4]:
class min_lin_reg(dsp.linear_trend):
    symmetric_vars = False # tell it not to do combinations
    
    def get_trend_vars(self,labeled_df=None):
        """
        """
        x_vars = ['year']
        y_vars = ['search_conducted_rate','contraband_found_rate','hit_rate','num_stops','search_conducted_true']
        
        self.regression_vars = list(itertools.product(x_vars,y_vars))
        return self.regression_vars
    
min_lin_reg_obj = min_lin_reg()
# min_lin_reg_obj.get_trend_vars()

In [5]:
labeled_df.get_subgroup_trends_1lev([min_lin_reg_obj])
labeled_df.result_df.sample(10)

Unnamed: 0,feat1,feat2,subgroup_trend,subgroup,subgroup_trendquality,group_feat,trend_type,agg_trend,agg_trendquality
79,year,hit_rate,,1,,medical,lin_reg,,
116,year,search_conducted_true,-24.885912,MA,-0.075316,state,lin_reg,-55.628632,-0.062242
16,year,search_conducted_rate,-0.000556,Black,-0.039174,driver_race,lin_reg,-0.000353,-0.03903
42,year,contraband_found_rate,,M,,driver_gender,lin_reg,,
107,year,num_stops,925.636733,1,0.031944,medical,lin_reg,-1560.861569,-0.043965
121,year,search_conducted_true,-592.846667,TX,-0.227839,state,lin_reg,-55.628632,-0.062242
47,year,contraband_found_rate,,White,,driver_race,lin_reg,,
103,year,num_stops,-5591.729931,White,-0.097383,driver_race,lin_reg,-1560.861569,-0.043965
76,year,hit_rate,,0,,decriminalization,lin_reg,,
6,year,search_conducted_rate,-0.000406,NC,-0.287439,state,lin_reg,-0.000353,-0.03903


# Component-wise

We can also use the components of trends to construct custom trends

In [6]:
from detect_simpsons_paradox import trend_components as tcomp

In [7]:
def w_median(df,mcol,wcol):
    """
    """
    if pd.isna(wcol):
        wmed = df[mcol].median()
    else:
        reps = [int(n) for n in df[wcol].values]
        repd_mcol = np.repeat(df[mcol].values,reps)
        wmed = np.median(df[mcol]*df[wcol])
    return wmed
    
    
class weightedMedianRank(tcomp.weightedMeanRank):
    """
    common parts for all continuous variable trends
    """
    # remove self
    my_stat = lambda self, d,m,w :w_median(d,m,w )

#     def get_trend_vars(self,labeled_df):
#         """
#         """
#         # maybe not counts

#         self.target = labeled_df.get_vars_per_roletype('trend',['binary','continuous'])
#         self.trendgroup = labeled_df.get_vars_per_roletype(['trend','explanatory'],'categorical')
#         self.var_weight_list = labeled_df.get_weightcol_per_var(self.target)
#         return self.target, self.trendgroup

class median_rank_trend(tcomp.rankTrend,weightedMedianRank,tcomp.trend):
    name = 'median_rank_trend'

In [8]:
medianrankobj = median_rank_trend()
labeled_df.get_subgroup_trends_1lev([min_lin_reg_obj,medianrankobj])
labeled_df.result_df.sample(10)

Unnamed: 0,feat1,feat2,group_feat,subgroup,subgroup_trend,subgroup_trendquality,trend_type,agg_trend,agg_trendquality
342,hit_rate,driver_race,medical,1,"[Other, Asian, Hispanic, Black, White]",,medain_rank_trend,"[Hispanic, Other, Asian, Black, White]",
199,search_conducted_rate,driver_race,driver_gender,F,"[Asian, White, Other, Black, Hispanic]",,medain_rank_trend,"[Asian, White, Other, Hispanic, Black]",
169,search_conducted_true,driver_race,state,NC,"[Asian, Other, Hispanic, Black, White]",,medain_rank_trend,"[Asian, Other, Hispanic, Black, White]",
428,contraband_found_false,state,no_reforms,0,"[VT, RI, WI, CT, MD, CO, MA, AZ, IL, NC, WA, S...",,medain_rank_trend,"[VT, RI, WI, CT, MD, CO, MA, AZ, IL, NC, WA, S...",
318,hit_true,driver_race,medical,0,"[Asian, Other, Hispanic, Black, White]",,medain_rank_trend,"[Asian, Other, Hispanic, Black, White]",
505,num_stops,state,driver_gender,F,"[VT, RI, WI, CT, MD, CO, IL, MA, AZ, NC, WA, S...",,medain_rank_trend,"[VT, RI, WI, CT, MD, CO, MA, AZ, IL, NC, WA, S...",
15,year,search_conducted_rate,driver_race,Asian,-0.000317509,-0.120567,lin_reg,-0.000353381,-0.03903
402,search_conducted_rate,state,driver_race,Asian,"[CO, NC, CT, VT, MA, TX, MD, WI, IL, WA, RI, AZ]",,medain_rank_trend,"[CO, NC, MD, CT, MA, TX, SC, VT, WI, RI, IL, W...",
203,search_conducted_rate,driver_race,medical,0,"[Asian, White, Other, Hispanic, Black]",,medain_rank_trend,"[Asian, White, Other, Hispanic, Black]",
171,search_conducted_true,driver_race,state,SC,"[Other, Hispanic, Black, White]",,medain_rank_trend,"[Asian, Other, Hispanic, Black, White]",
