In [1]:
import pandas as pd
import os
import wiggum as wg
import numpy as np

We'll first load some data that has both regression and rate type trends. We will load it two ways and check that the structure is the same.

In [2]:
# First way: construct the LabeledDataFrame from the path of the data CSV itself.
labeled_df_file = wg.LabeledDataFrame('../data/wages_gender_rank_time_regression2/df.csv')

In [3]:
# Second way: construct the LabeledDataFrame from the directory path instead of the CSV path.
labeled_df_dir = wg.LabeledDataFrame('../data/wages_gender_rank_time_regression2')

In [4]:
# Both loading routes must yield the same column labels in the same order.
# `.all()` replaces `np.product`, which is deprecated and removed in NumPy 2.0.
assert (labeled_df_file.df.columns == labeled_df_dir.df.columns).all()

In [5]:
# The data loaded both ways must have the same number of rows and columns.
assert labeled_df_file.df.shape == labeled_df_dir.df.shape

In [6]:
# Element-wise comparison of the two data frames: a column matches fully when
# its count of True cells equals the number of rows.
# `.all()` replaces `np.product`, which is deprecated and removed in NumPy 2.0.
compare_df = labeled_df_file.df == labeled_df_dir.df
assert (compare_df.sum() == len(labeled_df_file.df)).all()

Next, we can infer the variable types and assign the roles then check that those match what was read from the saved copy

In [7]:
# Start from the inferred variable types, then apply the explicit settings.
labeled_df_file.infer_var_types()

# Set the count flag to False for every column of the data.
labeled_df_file.set_counts(dict.fromkeys(labeled_df_file.df.columns, False))

# Role(s) each variable plays when trends are computed.
roles = {
    'department': ['independent', 'splitby'],
    'year': ['independent'],
    'pay': ['dependent'],
    'gender': ['independent', 'splitby'],
}
labeled_df_file.set_roles(roles)

# Explicit variable-type setting for gender.
var_types = {'gender': 'categorical'}
labeled_df_file.set_var_types(var_types)

In [8]:
# Display the metadata table produced by the settings applied to the file-loaded frame.
labeled_df_file.meta_df

Unnamed: 0_level_0,dtype,var_type,role,isCount,weighting_var
variable,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
pay,float64,continuous,[dependent],False,
year,float64,continuous,[independent],False,
department,object,categorical,"[independent, splitby]",False,
gender,object,categorical,"[independent, splitby]",False,


In [9]:
# Display the metadata table of the directory-loaded frame for comparison.
labeled_df_dir.meta_df

Unnamed: 0_level_0,dtype,var_type,role,isCount,weighting_var
variable,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
pay,float64,continuous,[dependent],False,
year,float64,continuous,[independent],False,
department,object,categorical,"[independent, splitby]",False,
gender,object,categorical,"[independent, splitby]",False,


In [10]:
# The metadata tables must have the same column labels in the same order.
# `.all()` replaces `np.product`, which is deprecated and removed in NumPy 2.0.
assert (labeled_df_file.meta_df.columns == labeled_df_dir.meta_df.columns).all()

In [11]:
# The metadata tables must have the same number of rows and columns.
assert labeled_df_file.meta_df.shape == labeled_df_dir.meta_df.shape

In [12]:
# Compare only the columns that contain no NaN, because NaN never compares
# equal to NaN. A column matches fully when its count of True cells equals
# the number of metadata rows.
# `.all()` replaces `np.product`, which is deprecated and removed in NumPy 2.0.
compare_meta_df = labeled_df_file.meta_df.dropna(axis=1) == labeled_df_dir.meta_df.dropna(axis=1)
assert (compare_meta_df.sum() == len(labeled_df_dir.meta_df)).all()

Now that we've set this up, we can also save these configurations to load them in directly in the future.

In [13]:
# Run the save outside the `assert` so the files are still written when
# assertions are disabled (`python -O`); later cells read this directory.
saved_ok = labeled_df_file.to_csvs('../data/wages_test')
assert saved_ok

Now confirm that all the files were written correctly.

In [14]:
# os.listdir returns entries in arbitrary order, so sort before comparing
# against the expected file names.
assert sorted(os.listdir('../data/wages_test/')) == ['df.csv', 'meta.csv', 'result_df.csv']

It writes each of the three DataFrames out to its own .csv file in that directory. If that directory exists, it will overwrite the files without warning; if not, it also creates the directory.

Now, we can also load the data back

In [15]:
# Reload from the directory that was just written and display its metadata table.
labeled_df = wg.LabeledDataFrame('../data/wages_test')
labeled_df.meta_df

Unnamed: 0_level_0,dtype,var_type,role,isCount,weighting_var
variable,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
pay,float64,continuous,[dependent],False,
year,float64,continuous,[independent],False,
department,object,categorical,"[independent, splitby]",False,
gender,object,categorical,"[independent, splitby]",False,


And confirm that this is the same as what was written. First confirm the column headings are the same

In [16]:
# The reloaded metadata must have the same column labels as the reference copy.
# `.all()` replaces `np.product`, which is deprecated and removed in NumPy 2.0.
assert (labeled_df.meta_df.columns == labeled_df_dir.meta_df.columns).all()

Then confirm the shape is the same

In [17]:
# The reloaded metadata must have the same number of rows and columns as the reference copy.
assert labeled_df.meta_df.shape == labeled_df_dir.meta_df.shape

Then check that the non-NaN values are all the same. Combined with the checks above, the NaNs must be in the same locations, but they cannot be compared directly because `np.nan == np.nan` evaluates to False.

In [18]:
# Compare only the columns that contain no NaN, because NaN never compares
# equal to NaN. A column matches fully when its count of True cells equals
# the number of metadata rows.
# `.all()` replaces `np.product`, which is deprecated and removed in NumPy 2.0.
compare_meta_df = labeled_df.meta_df.dropna(axis=1) == labeled_df_dir.meta_df.dropna(axis=1)
assert (compare_meta_df.sum() == len(labeled_df_dir.meta_df)).all()

In [19]:
# The reloaded data must have the same column labels as the reference copy.
# `.all()` replaces `np.product`, which is deprecated and removed in NumPy 2.0.
assert (labeled_df.df.columns == labeled_df_dir.df.columns).all()

In [20]:
# The reloaded data must have the same number of rows and columns as the reference copy.
assert labeled_df.df.shape == labeled_df_dir.df.shape

In [21]:
# Compare only the columns that contain no NaN, because NaN never compares
# equal to NaN. A column matches fully when its count of True cells equals
# the number of data rows.
# `.all()` replaces `np.product`, which is deprecated and removed in NumPy 2.0.
compare_df = labeled_df.df.dropna(axis=1) == labeled_df_dir.df.dropna(axis=1)
assert (compare_df.sum() == len(labeled_df_dir.df)).all()

In [22]:
# Columns to combine into a single intersectional variable.
intersect_cols= ['gender','department']
# Adds a 'gender_department' column (see the output) whose values join the
# source values with '_', e.g. 'F_Support'.
labeled_df.add_intersectional(intersect_cols)

Unnamed: 0,pay,year,department,gender,gender_department
,,,,,
0,22.63,0.0,Support,F,F_Support
1,24.74,0.0,Support,F,F_Support
2,21.16,0.0,Support,M,M_Support
3,21.11,0.0,Support,M,M_Support
4,19.39,0.0,Support,M,M_Support
5,21.07,0.0,Support,M,M_Support
6,23.14,0.0,Support,F,F_Support
7,23.14,0.0,Support,F,F_Support
8,24.06,0.0,Support,F,F_Support


In [23]:
# The new column is named by joining the source column names with '_'.
intersectional_col_name = '_'.join(intersect_cols)


def intersectional_correct(row):
    """Return True when the row's intersectional value equals its source values joined by '_'."""
    return row[intersectional_col_name] == '_'.join(row[icol] for icol in intersect_cols)


# Check every row. `.all()` replaces `np.product`, which is deprecated and
# removed in NumPy 2.0.
icol_correct = labeled_df.df.apply(intersectional_correct, axis=1)
assert icol_correct.all()

In [24]:
# Adds a 'payquantiles' column (see the output) labelling each row's pay, e.g. 'low' or 'mid'.
labeled_df.add_quantile(['pay'])
labeled_df.df.head()

Unnamed: 0,pay,year,department,gender,gender_department,payquantiles
,,,,,,
0.0,22.63,0.0,Support,F,F_Support,mid
1.0,24.74,0.0,Support,F,F_Support,mid
2.0,21.16,0.0,Support,M,M_Support,low
3.0,21.11,0.0,Support,M,M_Support,low
4.0,19.39,0.0,Support,M,M_Support,low


In [25]:
# Upper pay limit for each quantile label: the 25th percentile for 'low',
# the 75th for 'mid' and the maximum for 'high'.
q_limits = np.quantile(labeled_df.df['pay'], [.25, .75, 1])
limits = dict(zip(['low', 'mid', 'high'], q_limits))

# Every row in a quantile group must sit at or below that group's upper limit.
# Only the upper bound is checked here.
# `.all()` replaces `np.product`, which is deprecated and removed in NumPy 2.0.
for q, df in labeled_df.df.groupby('payquantiles'):
    assert (df['pay'] <= limits[q]).all()

In [26]:
# The two added columns must be reported as categorical along with the original ones.
assert labeled_df.get_vars_per_type('categorical') == ['department', 'gender', 'gender_department', 'payquantiles']

In [27]:
# Display the metadata table, which now includes rows for the two added columns.
labeled_df.meta_df

Unnamed: 0_level_0,dtype,var_type,role,isCount,weighting_var
variable,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
pay,float64,continuous,[dependent],False,
year,float64,continuous,[independent],False,
department,object,categorical,"[independent, splitby]",False,
gender,object,categorical,"[independent, splitby]",False,
gender_department,object,categorical,splitby,False,
payquantiles,object,categorical,splitby,False,


In [28]:
# Expected metadata for the added intersectional column, keyed by meta_df column.
expected_meta = {
    'dtype': 'object',
    'var_type': 'categorical',
    'role': 'splitby',
    'isCount': False,
}
for meta_col, expected_value in expected_meta.items():
    assert labeled_df.meta_df.loc['gender_department', meta_col] == expected_value

Check the utility functions

In [29]:
# Expected variable list for each role, in the order the lookup returns them.
expected_vars_by_role = {
    'splitby': ['department', 'gender', 'gender_department', 'payquantiles'],
    'independent': ['year', 'department', 'gender'],
    'dependent': ['pay'],
}
for role_name, expected_vars in expected_vars_by_role.items():
    assert labeled_df.get_vars_per_role(role_name) == expected_vars

In [30]:
# One summary string per variable: a max/min range for the continuous ones
# and a list of values for the categorical ones.
expected_data_sample = [
    'Max: 51.04 Min: 13.52',
    'Max: 50.0 Min: 0.0',
    'Support, Sales, Management, R&D',
    'F, M',
    'F_Support, M_Support, M_Sales, F_Sales, M_Management',
    'mid, low, high',
]
assert labeled_df.get_data_sample() == expected_data_sample

In [31]:
# Expected variable list for each variable type.
expected_vars_by_type = {
    'categorical': ['department', 'gender', 'gender_department', 'payquantiles'],
    'continuous': ['pay', 'year'],
}
for type_name, expected_vars in expected_vars_by_type.items():
    assert labeled_df.get_vars_per_type(type_name) == expected_vars

In [32]:
# Expected variable list for each (role, variable type) combination.
expected_vars_by_roletype = {
    ('independent', 'continuous'): ['year'],
    ('independent', 'categorical'): ['department', 'gender'],
}
for (role_name, type_name), expected_vars in expected_vars_by_roletype.items():
    assert labeled_df.get_vars_per_roletype(role_name, type_name) == expected_vars

# Using Trends

Trend objects define their name, how to compute the trend, and how to choose which variables to use.

An extension will allow variable lists to be passed in, to reduce which ones are computed.

In [33]:
# Let the Pearson trend object pick its variables from the labeled data.
corrobj = wg.All_Pearson()
corrobj.get_trend_vars(labeled_df)

# The object must record that its variables are set, select the single
# (independent, dependent) pair, and hold one weight entry per pair.
assert corrobj.set_vars == True
assert corrobj.regression_vars == [('year', 'pay')]
assert len(corrobj.var_weight_list) == len(corrobj.regression_vars)

In [34]:
rankobj = wg.Mean_Rank_Trend()

# Run the variable selection outside the `assert` so the object's state is
# still set when assertions are disabled (`python -O`); later cells pass
# rankobj to the trend computations.
rank_vars_found = rankobj.get_trend_vars(labeled_df)
assert rank_vars_found

# The rank trend targets the dependent variable and groups by the categorical
# independent variables, with one weight entry per target.
assert rankobj.target == ['pay']
assert rankobj.trendgroup == ['department', 'gender']
assert rankobj.set_vars == True
assert len(rankobj.var_weight_list) == len(rankobj.target)

In [35]:
# Let the linear-regression trend object pick its variables from the labeled data.
linreg_obj = wg.All_Linear_Trend()
linreg_obj.get_trend_vars(labeled_df)

# The object must record that its variables are set, select the single
# (independent, dependent) pair, and hold one weight entry per pair.
assert linreg_obj.set_vars == True
assert linreg_obj.regression_vars == [('year', 'pay')]
assert len(linreg_obj.var_weight_list) == len(linreg_obj.regression_vars)


# Computing Trends on a LabeledDataFrame

There are two ways: we can use the default settings and pass the name of the trend type, or we can pass a trend object.

In [36]:
# Request the trend by name; the computed rows are then available in result_df.
labeled_df.get_subgroup_trends_1lev(['pearson_corr'])
labeled_df.result_df

Unnamed: 0,independent,dependent,group_feat,subgroup,agg_trend,agg_trend_strength,subgroup_trend,subgroup_trend_strength,trend_type,comparison_type
0,year,pay,department,Management,0.039252,0.039252,-0.179974,0.179974,pearson_corr,aggregate-subgroup
1,year,pay,department,R&D,0.039252,0.039252,-0.711535,0.711535,pearson_corr,aggregate-subgroup
2,year,pay,department,Sales,0.039252,0.039252,-0.730384,0.730384,pearson_corr,aggregate-subgroup
3,year,pay,department,Support,0.039252,0.039252,-0.806535,0.806535,pearson_corr,aggregate-subgroup
4,year,pay,gender,F,0.039252,0.039252,0.0194,0.0194,pearson_corr,aggregate-subgroup
5,year,pay,gender,M,0.039252,0.039252,0.053508,0.053508,pearson_corr,aggregate-subgroup
6,year,pay,gender_department,F_Management,0.039252,0.039252,-0.293309,0.293309,pearson_corr,aggregate-subgroup
7,year,pay,gender_department,F_R&D,0.039252,0.039252,-0.833092,0.833092,pearson_corr,aggregate-subgroup
8,year,pay,gender_department,F_Sales,0.039252,0.039252,-0.867825,0.867825,pearson_corr,aggregate-subgroup
9,year,pay,gender_department,F_Support,0.039252,0.039252,-0.906787,0.906787,pearson_corr,aggregate-subgroup


In [37]:
# Column layout expected in result_df after an aggregate-subgroup computation.
expected_result_columns = [
    'independent', 'dependent', 'group_feat', 'subgroup', 'agg_trend',
    'agg_trend_strength', 'subgroup_trend', 'subgroup_trend_strength',
    'trend_type', 'comparison_type',
]
# A plain list comparison replaces `np.product` (deprecated, removed in
# NumPy 2.0) and fails cleanly if the number of columns differs.
assert list(labeled_df.result_df.columns) == expected_result_columns

In [38]:
# result_df has 10 fixed columns and one row per regression pair for every
# subgroup of every grouping variable.
num_reg_pairs, num_depts, num_genders, num_quantiles = 1, 4, 2, 3
num_dept_genders = num_depts * num_genders
num_subgroups_all = num_depts + num_genders + num_dept_genders + num_quantiles
num_pearson = num_reg_pairs * num_subgroups_all
expected_pearson_shape = (num_pearson, 10)
assert labeled_df.result_df.shape == expected_pearson_shape

Now we can use a list of objects and apply multiple trends

In [39]:
# Pass trend objects instead of names; their rows are added to result_df.
labeled_df.get_subgroup_trends_1lev([rankobj,linreg_obj])
# NOTE(review): sample() is unseeded, so the displayed rows differ between runs.
labeled_df.result_df.sample(10)

Unnamed: 0,independent,dependent,group_feat,subgroup,agg_trend,agg_trend_strength,subgroup_trend,subgroup_trend_strength,trend_type,comparison_type
36,year,pay,gender_department,F_Sales,0.0270634,0.039252,-0.0851314,0.867825,lin_reg,aggregate-subgroup
17,department,pay,gender_department,F_Management,"[Support, Sales, R&D, Management]",0.9488,[Management],,rank_trend,aggregate-subgroup
34,year,pay,gender_department,F_Management,0.0270634,0.039252,-0.015061,0.293309,lin_reg,aggregate-subgroup
42,year,pay,payquantiles,high,0.0270634,0.039252,-0.0118062,0.155876,lin_reg,aggregate-subgroup
12,year,pay,gender_department,M_Sales,0.0392519,0.039252,-0.87263,0.87263,pearson_corr,aggregate-subgroup
13,gender,pay,payquantiles,low,"[F, M]",0.1774,"[M, F]",0.2234,rank_trend,aggregate-subgroup
2,gender,pay,department,Sales,"[F, M]",0.1774,"[M, F]",0.4282,rank_trend,aggregate-subgroup
28,year,pay,department,Management,0.0270634,0.039252,-0.0145483,0.179974,lin_reg,aggregate-subgroup
37,year,pay,gender_department,F_Support,0.0270634,0.039252,-0.101315,0.906787,lin_reg,aggregate-subgroup
23,department,pay,gender_department,M_Sales,"[Support, Sales, R&D, Management]",0.9488,[Sales],,rank_trend,aggregate-subgroup


Now check that the correct number of trends was found

In [40]:
# The linear trend is counted with the same number of rows as the Pearson trend.
num_lin = num_pearson

# For the rank trend, each categorical independent variable is split by every
# other grouping variable, but not by itself.
num_other_groupings = num_dept_genders + num_quantiles
num_gender_idep = num_depts + num_other_groupings
num_dept_indep = num_genders + num_other_groupings
num_rank = num_gender_idep + num_dept_indep

total_rows_agg_sg = num_pearson + num_lin + num_rank
expected_agg_sg_shape = (total_rows_agg_sg, 10)
assert labeled_df.result_df.shape == expected_agg_sg_shape

We can see what types of trends were computed from `result_df`

In [41]:
# Trend types appear in the order they were computed. A plain list comparison
# replaces `np.product` (deprecated, removed in NumPy 2.0) and fails cleanly
# if the number of trend types differs.
assert list(pd.unique(labeled_df.result_df['trend_type'])) == ['pearson_corr', 'rank_trend', 'lin_reg']

In [42]:
# Only aggregate-subgroup comparisons exist so far. Compare as a list:
# asserting on the raw array raises an ambiguous-truth-value error instead of
# a clean assertion failure when more than one comparison type is present.
assert list(pd.unique(labeled_df.result_df['comparison_type'])) == ['aggregate-subgroup']

We can also add trends that are structured for pairwise comparisons

In [43]:
# Compute pairwise trends for the same trend objects; result_df gains the
# subgroup2 / subgroup_trend2 / subgroup_trend_strength2 columns (see the output).
labeled_df.get_pairwise_trends_1lev([rankobj,linreg_obj])
labeled_df.result_df.head()

Unnamed: 0,independent,dependent,group_feat,subgroup,subgroup2,agg_trend,agg_trend_strength,subgroup_trend,subgroup_trend_strength,subgroup_trend2,subgroup_trend_strength2,trend_type,comparison_type
0,year,pay,department,Management,,0.0392519,0.039252,-0.179974,0.179974,,,pearson_corr,aggregate-subgroup
1,year,pay,department,R&D,,0.0392519,0.039252,-0.711535,0.711535,,,pearson_corr,aggregate-subgroup
2,year,pay,department,Sales,,0.0392519,0.039252,-0.730384,0.730384,,,pearson_corr,aggregate-subgroup
3,year,pay,department,Support,,0.0392519,0.039252,-0.806535,0.806535,,,pearson_corr,aggregate-subgroup
4,year,pay,gender,F,,0.0392519,0.039252,0.0194004,0.0194,,,pearson_corr,aggregate-subgroup


Again, check the infrastructure by confirming that the number of rows is correct

In [44]:
def _num_pairs(num_levels):
    """Return the number of unordered pairs among `num_levels` items: n * (n - 1) / 2."""
    return num_levels * (num_levels - 1) // 2


# Number of subgroup pairs within each grouping variable.
num_dept_pairs = _num_pairs(num_depts)
num_gender_pairs = _num_pairs(num_genders)
num_dept_genders_pairs = _num_pairs(num_dept_genders)
num_quantile_pairs = _num_pairs(num_quantiles)

# For the rank trend, the independent variable is not used to group itself.
gender_indep_pairwise_rows = num_dept_pairs + num_dept_genders_pairs + num_quantile_pairs
dept_indep_pairwise_rows = num_gender_pairs + num_dept_genders_pairs + num_quantile_pairs
rank_pairwise_rows = gender_indep_pairwise_rows + dept_indep_pairwise_rows

# The linear trend is compared within every grouping variable.
lin_reg_pairwise_rows = num_dept_pairs + num_gender_pairs + num_dept_genders_pairs + num_quantile_pairs

# The pairwise rows are appended to the aggregate-subgroup rows, and the
# pairwise computation brings the column count to 13.
total_rows = total_rows_agg_sg + lin_reg_pairwise_rows + rank_pairwise_rows
assert labeled_df.result_df.shape == (total_rows, 13)


In [45]:
# Both comparison types must now be present, in the order they were computed.
assert list(pd.unique(labeled_df.result_df['comparison_type'])) ==['aggregate-subgroup', 'pairwise']

The object also stores the trend objects that have been applied; they can be used for mapping to get the distance functions that are appropriate for each trend

In [46]:
# Display the trend objects recorded so far, one entry per trend applied.
labeled_df.trend_list

[<wiggum.trends.All_Pearson at 0x7f5342f31898>,
 <wiggum.trends.Mean_Rank_Trend at 0x7f5342f2d6d8>,
 <wiggum.trends.All_Linear_Trend at 0x7f5342f31f28>,
 <wiggum.trends.Mean_Rank_Trend at 0x7f5342f2d6d8>,
 <wiggum.trends.All_Linear_Trend at 0x7f5342f31f28>]

In [47]:
# Compute the distance for each row of result_df (row_wise=True); this adds
# one column, taking the frame from 13 to 14 columns.
labeled_df.add_distance(row_wise=True) #('subgroup_trend','subgroup_trend2')
assert labeled_df.result_df.shape == (total_rows,14)

  size * (size - 1) * (size - 2))


Each trend object has a trend_precompute dictionary as a property that stores the intermediate values (tables of the weighted rates for ranks and correlation matrices for Pearson correlation; TODO: what do we need for linreg). These can be used in visualization.

# Saving with trends

In [48]:
# Run the save outside the `assert` so the files are still written when
# assertions are disabled (`python -O`); later cells read this directory.
saved_all_ok = labeled_df.save_all('../data/wages_test_all')
assert saved_all_ok

In [49]:
# Saving with trends writes trends.json alongside the three CSV files.
# The listing is sorted because os.listdir returns entries in arbitrary order.
assert sorted(os.listdir('../data/wages_test_all/')) == ['df.csv', 'meta.csv', 'result_df.csv', 'trends.json']

In [50]:
# Reload from the directory written by save_all, including the saved trend list.
labeled_df_tl = wg.LabeledDataFrame('../data/wages_test_all')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[k1] = value[k2]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  df['trend_type'] = tt


That save function calls the save function tested above, so we only need to test that the trend list loaded correctly

In [51]:
# Intermediate values stored on the first trend object of the in-memory frame.
labeled_df.trend_list[0].trend_precompute

{'pearson_corr_agg_trend': 'year,pay\n1.0,0.03925185926574491\n0.03925185926574491,1.0\n',
 'pearson_corr_subgroup_trend': 'year,pay\n1.0,-0.15587584568473287\n-0.15587584568473287,1.0\n1.0,-0.586587942060879\n-0.586587942060879,1.0\n1.0,0.008008267893450967\n0.008008267893450967,1.0\n'}

In [52]:
# The same intermediate values on the reloaded frame, for comparison with the previous cell.
labeled_df_tl.trend_list[0].trend_precompute

{'pearson_corr_agg_trend':        year       pay
 0  1.000000  0.039252
 1  0.039252  1.000000, 'pearson_corr_subgroup_trend':        year       pay
 0  1.000000 -0.155876
 1 -0.155876  1.000000
 2  1.000000 -0.586588
 3 -0.586588  1.000000
 4  1.000000  0.008008
 5  0.008008  1.000000}

# Filtering

Test for each filter variable, one at a time and several pairs

In [53]:
# Filter on the variables that define a trend.
year_df = labeled_df.get_trend_rows(independent='year')
pay_df = labeled_df.get_trend_rows(dependent='pay')

# Filter on the grouping variable and on individual subgroups.
dept_df = labeled_df.get_trend_rows(group_feat='department')
mgmt_df = labeled_df.get_trend_rows(subgroup='Management')
sales_df = labeled_df.get_trend_rows(subgroup2='Sales')

# Filter on the kind of trend and the kind of comparison.
linreg_df = labeled_df.get_trend_rows(trend_type='lin_reg')
pair_df = labeled_df.get_trend_rows(comparison_type='pairwise')

72  total rows meet the criteria
169  total rows meet the criteria
24  total rows meet the criteria
12  total rows meet the criteria
4  total rows meet the criteria
55  total rows meet the criteria
107  total rows meet the criteria


TODO: manually verify these counts

In [54]:
# Each filtered frame paired with the number of rows it must contain.
expected_filter_counts = [
    (year_df, 72),
    (pay_df, 169),
    (dept_df, 24),
    (mgmt_df, 12),
    (sales_df, 4),
    (linreg_df, 55),
    # The pairwise filter must return exactly the rows added by the pairwise computation.
    (pair_df, lin_reg_pairwise_rows + rank_pairwise_rows),
]
for filtered_rows, expected_count in expected_filter_counts:
    assert len(filtered_rows) == expected_count

Now test two conditions and passing a list to a condition

In [55]:
# Two conditions at once, with a list of accepted values for the subgroup.
y_sm_df = labeled_df.get_trend_rows(independent='year',subgroup=['Management','Sales'])
# Two conditions at once, each with a single value.
pay_rank = labeled_df.get_trend_rows(dependent='pay',trend_type='rank_trend')

8  total rows meet the criteria
97  total rows meet the criteria


We can also filter based on SP detections with `get_SP_rows`

In [56]:
# Detect with a numeric threshold; the output shows a new 'SP_thresh0.2' column.
labeled_df.get_SP_rows(thresh=.2)

Unnamed: 0,independent,dependent,group_feat,subgroup,subgroup2,agg_trend,agg_trend_strength,subgroup_trend,subgroup_trend_strength,subgroup_trend2,subgroup_trend_strength2,trend_type,comparison_type,distance,SP_thresh0.2
0,year,pay,department,Management,,0.0392519,0.039252,-0.179974,0.179974,,,pearson_corr,aggregate-subgroup,1.0,True
1,year,pay,department,R&D,,0.0392519,0.039252,-0.711535,0.711535,,,pearson_corr,aggregate-subgroup,1.0,True
2,year,pay,department,Sales,,0.0392519,0.039252,-0.730384,0.730384,,,pearson_corr,aggregate-subgroup,1.0,True
3,year,pay,department,Support,,0.0392519,0.039252,-0.806535,0.806535,,,pearson_corr,aggregate-subgroup,1.0,True
6,year,pay,gender_department,F_Management,,0.0392519,0.039252,-0.293309,0.293309,,,pearson_corr,aggregate-subgroup,1.0,True
7,year,pay,gender_department,F_R&D,,0.0392519,0.039252,-0.833092,0.833092,,,pearson_corr,aggregate-subgroup,1.0,True
8,year,pay,gender_department,F_Sales,,0.0392519,0.039252,-0.867825,0.867825,,,pearson_corr,aggregate-subgroup,1.0,True
9,year,pay,gender_department,F_Support,,0.0392519,0.039252,-0.906787,0.906787,,,pearson_corr,aggregate-subgroup,1.0,True
10,year,pay,gender_department,M_Management,,0.0392519,0.039252,-0.295762,0.295762,,,pearson_corr,aggregate-subgroup,1.0,True
11,year,pay,gender_department,M_R&D,,0.0392519,0.039252,-0.840029,0.840029,,,pearson_corr,aggregate-subgroup,1.0,True


In [57]:
# The detection added one column to result_df (14 -> 15) and no rows.
assert labeled_df.result_df.shape == (total_rows,15)

## Detection

Detection via `get_SP_rows` happens in two steps:
1. label the rows
2. filter by that column to return

Labeling the rows can happen in a number of ways too, the detection accepts a number of forms of input, custom detections can be built in many ways

When filter_thresh is a dictionary, the filtering happens by taking the intersection of each row by the threshold provided. Some defaults are also built in, accessible by string.

In [58]:
# Use a built-in detection selected by its string name; it adds one more
# column to result_df (15 -> 16).
labeled_df.get_SP_rows('default_qual_sp')
assert labeled_df.result_df.shape == (total_rows,16)

Basic type checks on detections, TODO: accuracy on detections

In [59]:
# Each detection column must hold booleans.
assert labeled_df.result_df['SP_thresh0.2'].dtype ==bool
assert labeled_df.result_df['default_qual_sp'].dtype ==bool

In [60]:
# Another detection selected by string name; it adds a boolean 'SP' column
# (16 -> 17 columns).
labeled_df.get_SP_rows('SP')
assert labeled_df.result_df.shape == (total_rows,17)
assert labeled_df.result_df['SP'].dtype ==bool

We can also define our own detection filters, using any available column

In [61]:
# Custom detection: 'name' labels the new result column (see the output) and
# the remaining keys give a value for a result_df column.
rank_only_qual = {
    'name': 'rank_only_qual_sp',
    'distance': .2,
    'agg_trend_strength': .05,
    'subgroup_trend_strength': .05,
    'trend_type': 'rank_trend',
}
labeled_df.get_SP_rows(rank_only_qual, replace=True)

Unnamed: 0,independent,dependent,group_feat,subgroup,subgroup2,agg_trend,agg_trend_strength,subgroup_trend,subgroup_trend_strength,subgroup_trend2,subgroup_trend_strength2,trend_type,comparison_type,distance,SP_thresh0.2,default_qual_sp,SP,rank_only_qual_sp
0,gender,pay,department,Management,,"[F, M]",0.1774,"[M, F]",0.7962,,,rank_trend,aggregate-subgroup,1.0,True,True,False,True
1,gender,pay,department,R&D,,"[F, M]",0.1774,"[M, F]",0.463,,,rank_trend,aggregate-subgroup,1.0,True,True,False,True
2,gender,pay,department,Sales,,"[F, M]",0.1774,"[M, F]",0.4282,,,rank_trend,aggregate-subgroup,1.0,True,True,False,True
3,gender,pay,department,Support,,"[F, M]",0.1774,"[M, F]",0.3546,,,rank_trend,aggregate-subgroup,1.0,True,True,False,True
12,gender,pay,payquantiles,high,,"[F, M]",0.1774,"[M, F]",0.791,,,rank_trend,aggregate-subgroup,1.0,True,True,False,True
13,gender,pay,payquantiles,low,,"[F, M]",0.1774,"[M, F]",0.2234,,,rank_trend,aggregate-subgroup,1.0,True,True,False,True
26,department,pay,payquantiles,low,,"[Support, Sales, R&D, Management]",0.9488,"[Support, Sales]",0.202,,,rank_trend,aggregate-subgroup,0.4087,True,True,False,True


# Ranking

In [62]:
# Rank the result rows by view in descending order and show the first 20;
# the output shows a new 'mean_view_distance' column.
labeled_df.rank_occurences_by_view(ascending=False).head(20)

Index(['independent', 'dependent', 'group_feat', 'subgroup', 'subgroup2',
       'agg_trend', 'agg_trend_strength', 'subgroup_trend',
       'subgroup_trend_strength', 'subgroup_trend2',
       'subgroup_trend_strength2', 'trend_type', 'comparison_type', 'distance',
       'SP_thresh0.2', 'default_qual_sp', 'SP', 'rank_only_qual_sp',
       'mean_view_distance'],
      dtype='object')


Unnamed: 0,independent,dependent,group_feat,subgroup,subgroup2,agg_trend,agg_trend_strength,subgroup_trend,subgroup_trend_strength,subgroup_trend2,subgroup_trend_strength2,trend_type,comparison_type,distance,SP_thresh0.2,default_qual_sp,SP,rank_only_qual_sp,mean_view_distance
72,gender,pay,department,Management,,"[F, M]",0.1774,"[M, F]",0.7962,,,rank_trend,aggregate-subgroup,1.0,True,True,False,True,0.5
73,gender,pay,department,R&D,,"[F, M]",0.1774,"[M, F]",0.463,,,rank_trend,aggregate-subgroup,1.0,True,True,False,True,0.5
74,gender,pay,department,Sales,,"[F, M]",0.1774,"[M, F]",0.4282,,,rank_trend,aggregate-subgroup,1.0,True,True,False,True,0.5
75,gender,pay,department,Support,,"[F, M]",0.1774,"[M, F]",0.3546,,,rank_trend,aggregate-subgroup,1.0,True,True,False,True,0.5
84,gender,pay,payquantiles,high,,"[F, M]",0.1774,"[M, F]",0.791,,,rank_trend,aggregate-subgroup,1.0,True,True,False,True,0.5
85,gender,pay,payquantiles,low,,"[F, M]",0.1774,"[M, F]",0.2234,,,rank_trend,aggregate-subgroup,1.0,True,True,False,True,0.5
122,gender,pay,payquantiles,high,mid,,,"[M, F]",0.791,"[F, M]",0.1072,rank_trend,pairwise,1.0,True,False,False,False,0.5
123,gender,pay,payquantiles,low,mid,,,"[M, F]",0.2234,"[F, M]",0.1072,rank_trend,pairwise,1.0,True,False,False,False,0.5
86,gender,pay,payquantiles,mid,,"[F, M]",0.1774,"[F, M]",0.1072,,,rank_trend,aggregate-subgroup,0.0,False,False,False,False,0.5
87,gender,pay,department,Management,R&D,,,"[M, F]",0.7962,"[M, F]",0.463,rank_trend,pairwise,0.0,False,False,False,False,0.5


In [63]:
# Add a per-view score that aggregates the 'SP_thresh0.2' column with a sum;
# the output shows the new 'sum_view_SP_thresh0.2' column.
labeled_df.add_view_score('SP_thresh0.2',agg_type='sum',colored=False)

Unnamed: 0,independent,dependent,group_feat,subgroup,subgroup2,agg_trend,agg_trend_strength,subgroup_trend,subgroup_trend_strength,subgroup_trend2,subgroup_trend_strength2,trend_type,comparison_type,distance,SP_thresh0.2,default_qual_sp,SP,rank_only_qual_sp,mean_view_distance,sum_view_SP_thresh0.2
0,gender,pay,department,Management,,"[F, M]",0.177400,"[M, F]",0.796200,,,rank_trend,aggregate-subgroup,1.000000,True,True,False,True,0.500000,8.0
1,gender,pay,department,R&D,,"[F, M]",0.177400,"[M, F]",0.463000,,,rank_trend,aggregate-subgroup,1.000000,True,True,False,True,0.500000,8.0
2,gender,pay,department,Sales,,"[F, M]",0.177400,"[M, F]",0.428200,,,rank_trend,aggregate-subgroup,1.000000,True,True,False,True,0.500000,8.0
3,gender,pay,department,Support,,"[F, M]",0.177400,"[M, F]",0.354600,,,rank_trend,aggregate-subgroup,1.000000,True,True,False,True,0.500000,8.0
4,gender,pay,payquantiles,high,,"[F, M]",0.177400,"[M, F]",0.791000,,,rank_trend,aggregate-subgroup,1.000000,True,True,False,True,0.500000,8.0
5,gender,pay,payquantiles,low,,"[F, M]",0.177400,"[M, F]",0.223400,,,rank_trend,aggregate-subgroup,1.000000,True,True,False,True,0.500000,8.0
6,gender,pay,payquantiles,high,mid,,,"[M, F]",0.791000,"[F, M]",0.107200,rank_trend,pairwise,1.000000,True,False,False,False,0.500000,8.0
7,gender,pay,payquantiles,low,mid,,,"[M, F]",0.223400,"[F, M]",0.107200,rank_trend,pairwise,1.000000,True,False,False,False,0.500000,8.0
8,gender,pay,payquantiles,mid,,"[F, M]",0.177400,"[F, M]",0.107200,,,rank_trend,aggregate-subgroup,0.000000,False,False,False,False,0.500000,8.0
9,gender,pay,department,Management,R&D,,,"[M, F]",0.796200,"[M, F]",0.463000,rank_trend,pairwise,0.000000,False,False,False,False,0.500000,8.0


In [64]:
# Rank using the view score added above together with the 'SP_thresh0.2' column.
# NOTE(review): presumably the first argument ranks the views and the second
# ranks rows within a view — confirm against the rank_occurences_by_view signature.
labeled_df.rank_occurences_by_view('sum_view_SP_thresh0.2','SP_thresh0.2')

Index(['independent', 'dependent', 'group_feat', 'subgroup', 'subgroup2',
       'agg_trend', 'agg_trend_strength', 'subgroup_trend',
       'subgroup_trend_strength', 'subgroup_trend2',
       'subgroup_trend_strength2', 'trend_type', 'comparison_type', 'distance',
       'SP_thresh0.2', 'default_qual_sp', 'SP', 'rank_only_qual_sp',
       'mean_view_distance', 'sum_view_SP_thresh0.2'],
      dtype='object')


Unnamed: 0,independent,dependent,group_feat,subgroup,subgroup2,agg_trend,agg_trend_strength,subgroup_trend,subgroup_trend_strength,subgroup_trend2,subgroup_trend_strength2,trend_type,comparison_type,distance,SP_thresh0.2,default_qual_sp,SP,rank_only_qual_sp,mean_view_distance,sum_view_SP_thresh0.2
97,year,pay,department,Management,,0.0392519,0.039252,-0.179974,0.179974,,,pearson_corr,aggregate-subgroup,1.000000,True,False,False,False,0.219486,14.0
98,year,pay,department,R&D,,0.0392519,0.039252,-0.711535,0.711535,,,pearson_corr,aggregate-subgroup,1.000000,True,False,False,False,0.219486,14.0
99,year,pay,department,Sales,,0.0392519,0.039252,-0.730384,0.730384,,,pearson_corr,aggregate-subgroup,1.000000,True,False,False,False,0.219486,14.0
100,year,pay,department,Support,,0.0392519,0.039252,-0.806535,0.806535,,,pearson_corr,aggregate-subgroup,1.000000,True,False,False,False,0.219486,14.0
101,year,pay,gender_department,F_Management,,0.0392519,0.039252,-0.293309,0.293309,,,pearson_corr,aggregate-subgroup,1.000000,True,False,False,False,0.219486,14.0
102,year,pay,gender_department,F_R&D,,0.0392519,0.039252,-0.833092,0.833092,,,pearson_corr,aggregate-subgroup,1.000000,True,False,False,False,0.219486,14.0
103,year,pay,gender_department,F_Sales,,0.0392519,0.039252,-0.867825,0.867825,,,pearson_corr,aggregate-subgroup,1.000000,True,False,False,False,0.219486,14.0
104,year,pay,gender_department,F_Support,,0.0392519,0.039252,-0.906787,0.906787,,,pearson_corr,aggregate-subgroup,1.000000,True,False,False,False,0.219486,14.0
105,year,pay,gender_department,M_Management,,0.0392519,0.039252,-0.295762,0.295762,,,pearson_corr,aggregate-subgroup,1.000000,True,False,False,False,0.219486,14.0
106,year,pay,gender_department,M_R&D,,0.0392519,0.039252,-0.840029,0.840029,,,pearson_corr,aggregate-subgroup,1.000000,True,False,False,False,0.219486,14.0
