In [1]:
import pandas as pd
import os
import wiggum as wg
import numpy as np
import itertools

In [2]:
labeled_df = wg.LabeledDataFrame('../data/ldf_state_hit_rate_min_cols_COCTFLILMDMAMOMTNENCOHRISCTXVTWAWI')
labeled_df.meta_df

Unnamed: 0_level_0,dtype,var_type,role,isCount,weighting_var
variable,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
state,object,categorical,"[trend, groupby]",False,
year,int64,ordinal,[trend],False,
driver_gender,object,categorical,"[trend, groupby]",False,
driver_race,object,categorical,"[trend, groupby]",False,
decriminalization,int64,binary,[groupby],False,
medical,int64,binary,[groupby],False,
recreational,int64,binary,[groupby],False,
no_reforms,int64,binary,[groupby],False,
search_conducted_false,float64,continuous,[ignore],True,
search_conducted_true,float64,continuous,[ignore],True,


In [3]:
rankobj = wg.Mean_Rank_Trend()
linregobj = wg.Linear_Trend()

In [4]:
linregobj.get_trend_vars(labeled_df)
linregobj.regression_vars

['search_conducted_rate', 'contraband_found_rate', 'hit_rate', 'num_stops']

The tren objects above will compute all pairs of given types, but what if we want to define custom trends?  We can do that by overloading existing types.  We'll overload only the get_trend_vars() function for now, but the other functions can also be overloaded or a totally new trend can be added as long as it is compatible.

In [5]:
class min_lin_reg(wg.Linear_Trend):
    symmetric_vars = False # tell it not to do combinations
    
    def get_trend_vars(self,labeled_df=None):
        """
        """
        x_vars = ['year']
        y_vars = ['search_conducted_rate','contraband_found_rate','hit_rate','num_stops','search_conducted_true']
        
        self.regression_vars = list(itertools.product(x_vars,y_vars))
        return self.regression_vars
    
min_lin_reg_obj = min_lin_reg()
min_lin_reg_obj.get_trend_vars()

[('year', 'search_conducted_rate'),
 ('year', 'contraband_found_rate'),
 ('year', 'hit_rate'),
 ('year', 'num_stops'),
 ('year', 'search_conducted_true')]

In [6]:
# labeled_df.get_subgroup_trends_1lev([min_lin_reg_obj])
# labeled_df.result_df.sample(10)

# Component-wise

We can also use the components of trends to construct custom trends

In [7]:
from detect_simpsons_paradox import trend_components as tcomp

In [9]:
def w_median(df,mcol,wcol):
    """
    """
    if pd.isna(wcol):
        wmed ,upper,lower = np.quantile(df[mcol],[.5,.25,.75])
    else:
        reps = [int(n) for n in df[wcol].values]
        reps_mcol = np.repeat(df[mcol].values,reps)
        wmed,upper,lower =np.quantile( reps_mcol,[.5,.25,.75])

    return pd.Series([wmed ,upper,lower],index=['stat','max','min'])
    
    
class weightedMedianRank(tcomp.WeightedRank):
    """
    common parts for all continuous variable trends
    """
    # remove self
    my_stat = lambda self, d,m,w :w_median(d,m,w )

#     def get_trend_vars(self,labeled_df):
#         """
#         """
#         # maybe not counts

#         self.target = labeled_df.get_vars_per_roletype('trend',['binary','continuous'])
#         self.trendgroup = labeled_df.get_vars_per_roletype(['trend','explanatory'],'categorical')
#         self.var_weight_list = labeled_df.get_weightcol_per_var(self.target)
#         return self.target, self.trendgroup

class custuom_Median_Rank_Trend(tcomp.StatRankTrend,weightedMedianRank,tcomp.trend):
    name = 'Median_Rank_Trend'

In [10]:
medianrankobj = custuom_Median_Rank_Trend()
labeled_df.get_subgroup_trends_1lev([medianrankobj,rankobj])
view_sample = labeled_df.result_df.sample(10)
view_idx = view_sample.index
view_sample

  interpolation=interpolation)


Unnamed: 0,feat1,feat2,subgroup_trend,subgroup_trend_quality,subgroup,group_feat,trend_type,agg_trend,agg_trend_quality
261,search_conducted_rate,driver_gender,"[F, M]",0.187,MD,state,rank_trend,"[F, M]",0.2519
350,contraband_found_rate,driver_race,"[Asian, White, Other, Hispanic, Black]",0.0534,0,recreational,rank_trend,"[Asian, White, Other, Hispanic, Black]",0.0716
16,search_conducted_rate,driver_gender,"[F, M]",0.1447,Other,driver_race,Median_Rank_Trend,"[F, M]",0.2519
414,num_stops,driver_gender,"[F, M]",0.45,WA,state,rank_trend,"[F, M]",0.1164
427,num_stops,driver_gender,"[F, M]",0.108,0,no_reforms,rank_trend,"[F, M]",0.1164
125,hit_rate,driver_race,"[Hispanic, White, Asian, Black, Other]",0.3141,CO,state,Median_Rank_Trend,"[Asian, Black, Hispanic, Other, White]",0.0444
352,contraband_found_rate,driver_race,"[Asian, White, Other, Hispanic, Black]",0.1521,0,no_reforms,rank_trend,"[Asian, White, Other, Hispanic, Black]",0.0716
146,hit_rate,driver_race,"[Asian, Black, Hispanic, Other, White]",0.0694,1,no_reforms,Median_Rank_Trend,"[Asian, Black, Hispanic, Other, White]",0.0444
164,num_stops,driver_gender,"[F, M]",0.3621,White,driver_race,Median_Rank_Trend,"[F, M]",0.1164
233,hit_rate,state,"[WA, AZ, IL, MA, MD, RI, SC, VT, WI]",0.4854,0,decriminalization,Median_Rank_Trend,"[WA, AZ, CO, CT, IL, MA, MD, NC, RI, SC, TX, V...",0.2468


This is general enough that is is actually included and available generally, but other stats could be computed similarly

In [11]:
med_rank_obj = wg.Median_Rank_Trend()
labeled_df.get_subgroup_trends_1lev([rankobj,med_rank_obj])
labeled_df.result_df.iloc[view_idx]

Unnamed: 0,feat1,feat2,subgroup_trend,subgroup_trend_quality,subgroup,group_feat,trend_type,agg_trend,agg_trend_quality
261,search_conducted_rate,driver_gender,"[F, M]",0.187,MD,state,Median_Rank_Trend,"[F, M]",0.2519
350,contraband_found_rate,driver_race,"[Asian, White, Other, Hispanic, Black]",0.0534,0,recreational,Median_Rank_Trend,"[Asian, White, Other, Hispanic, Black]",0.0716
16,search_conducted_rate,driver_gender,"[F, M]",0.1447,Other,driver_race,rank_trend,"[F, M]",0.2519
414,num_stops,driver_gender,"[F, M]",0.45,WA,state,Median_Rank_Trend,"[F, M]",0.1164
427,num_stops,driver_gender,"[F, M]",0.108,0,no_reforms,Median_Rank_Trend,"[F, M]",0.1164
125,hit_rate,driver_race,"[Hispanic, Black, Asian, White, Other]",0.492,CO,state,rank_trend,"[Asian, Hispanic, Other, Black, White]",0.0023
352,contraband_found_rate,driver_race,"[Asian, White, Other, Hispanic, Black]",0.1521,0,no_reforms,Median_Rank_Trend,"[Asian, White, Other, Hispanic, Black]",0.0716
146,hit_rate,driver_race,"[Hispanic, Asian, Black, Other, White]",0.0155,1,no_reforms,rank_trend,"[Asian, Hispanic, Other, Black, White]",0.0023
164,num_stops,driver_gender,"[F, M]",0.3621,White,driver_race,rank_trend,"[F, M]",0.1164
233,hit_rate,state,"[WA, IL, AZ, MD, SC, RI, MA, WI, VT]",0.4449,0,decriminalization,rank_trend,"[WA, NC, IL, AZ, SC, MD, TX, CT, RI, MA, WI, C...",0.2193


In [12]:
help(med_rank_obj.my_stat)

Help on method <lambda> in module detect_simpsons_paradox.trends:

<lambda> lambda d, m, w method of detect_simpsons_paradox.trends.Median_Rank_Trend instance



In [13]:
help(tcomp.w_median)

Help on function w_median in module detect_simpsons_paradox.trend_components.base_getvars:

w_median(df, mcol, wcol)
    compute the median or median with replication according to weights, gives a
    confidence interval specified by the middle 50%
    
    Parameters
    ----------
    df : DataFrame or DataFrameGroupBy
        passed as the source of apply, the data to extract columns from for
        computing a weighted average
    mcol : string
        name of column in df to take the average of
    wcol : string
        name of column in df to use for weighting
    
    Returns
    -------
    wmed : float
        median of df[avcol] weighted row wise by df[wcol]

