# Correlation Calculation
This notebook compares the execution speed of two different functions for computing correlation, both defined in the statistical.py file. Both functions have been incorporated into the `CorrelationBase()` class. The original function is called with `compute_correlation_table`, while the second is called with `compute_correlation_table_V2` (for version 2).

In [1]:
import pandas as pd
import wiggum as wg
import numpy

In [2]:
# Load the labeled dataset stored under data/ and display its metadata table
# (one row per variable: dtype, var_type, role, isCount, weighting_var).
labeled_df = wg.LabeledDataFrame('data/ldf_state_hit_rate_min_cols_COCTFLILMDMAMOMTNENCOHRISCTXVTWAWI')
labeled_df.meta_df

Unnamed: 0_level_0,dtype,var_type,role,isCount,weighting_var
variable,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
state,object,categorical,"[independent, splitby]",False,
year,int64,ordinal,[independent],False,
driver_gender,object,categorical,"[independent, splitby]",False,
driver_race,object,categorical,"[independent, splitby]",False,
decriminalization,int64,categorical,[splitby],False,
medical,int64,categorical,[splitby],False,
recreational,int64,categorical,[splitby],False,
no_reforms,int64,False,[splitby],False,
search_conducted_false,float64,continuous,[ignore],True,
search_conducted_true,float64,continuous,[ignore],True,


# All_Pearson() Test

In [3]:
# Build the baseline trend object and have it pick its variables from the
# labeled data; the last line displays the resulting variable pairs.
pearson = wg.All_Pearson()
pearson.get_trend_vars(labeled_df)
pearson.regression_vars

[('year', 'search_conducted_rate'),
 ('year', 'contraband_found_rate'),
 ('year', 'hit_rate'),
 ('year', 'num_stops')]

In [4]:
# Display the trend object's symmetric_vars attribute.
pearson.symmetric_vars

False

In [5]:
# Original implementation, called with the DataFrame and the string
# 'agg_trend'. In the recorded output it yields one 4-tuple per pair listed in
# regression_vars: (var, var, correlation, '').
pearson.compute_correlation_table(labeled_df.df, 'agg_trend')

[('year', 'search_conducted_rate', -0.03902973146563629, ''),
 ('year', 'contraband_found_rate', 0.041129162824156094, ''),
 ('year', 'hit_rate', 0.31619623383297996, ''),
 ('year', 'num_stops', -0.04396500060379762, '')]

In [6]:
# Version 2, called with the DataFrame only (no second argument). Its recorded
# output matches the original's to within floating-point rounding (~1e-15).
pearson.compute_correlation_table_V2(labeled_df.df)

[('year', 'search_conducted_rate', -0.039029731465636305, ''),
 ('year', 'contraband_found_rate', 0.04112916282415568, ''),
 ('year', 'hit_rate', 0.3161962338329784, ''),
 ('year', 'num_stops', -0.043965000603797585, '')]

# compute_correlation_table

In [7]:
# Baseline timing: original compute_correlation_table, 10 repeats (-r10).
%timeit -r10 pearson.compute_correlation_table(labeled_df.df, 'agg_trend')

2.55 ms ± 437 µs per loop (mean ± std. dev. of 10 runs, 100 loops each)


In [8]:
# Comparison timing: compute_correlation_table_V2 on the same DataFrame,
# 10 repeats (-r10).
%timeit -r10 pearson.compute_correlation_table_V2(labeled_df.df)

1.42 ms ± 92.6 µs per loop (mean ± std. dev. of 10 runs, 1000 loops each)


# get_subgroup_trends_1lev

In [11]:
# Baseline timing: get_subgroup_trends_1lev driven by the original
# All_Pearson trend object, 10 repeats (-r10).
%timeit -r10 labeled_df.get_subgroup_trends_1lev([pearson])

3.8 ms ± 75.6 µs per loop (mean ± std. dev. of 10 runs, 100 loops each)


In [7]:
# Build the V2 trend object (wg.trends.All_Pearson_V2) and select its
# variables from the same labeled data, mirroring the `pearson` setup; the
# last line displays its variable pairs for comparison.
pearson_2 = wg.trends.All_Pearson_V2()
pearson_2.get_trend_vars(labeled_df)
pearson_2.regression_vars

[('year', 'search_conducted_rate'),
 ('year', 'contraband_found_rate'),
 ('year', 'hit_rate'),
 ('year', 'num_stops')]

In [8]:
# Comparison timing: same get_subgroup_trends_1lev call with the V2 trend
# object, 10 repeats (-r10).
# NOTE(review): the recorded run used 1 loop per repeat versus 100 for the
# baseline cell and was not faster (4.03 ms vs 3.8 ms). Consider pinning the
# loop count with -n so the two measurements are like-for-like.
%timeit -r10 labeled_df.get_subgroup_trends_1lev([pearson_2])

4.03 ms ± 131 µs per loop (mean ± std. dev. of 10 runs, 1 loop each)


# get_pairwise_trends_1lev

In [29]:
# Baseline timing: get_pairwise_trends_1lev with the original All_Pearson
# trend object, 10 repeats (-r10).
%timeit -r10 labeled_df.get_pairwise_trends_1lev([pearson])

171 ms ± 4.33 ms per loop (mean ± std. dev. of 10 runs, 10 loops each)


In [30]:
# Comparison timing: get_pairwise_trends_1lev with the V2 trend object,
# 10 repeats (-r10).
# NOTE(review): the recorded run is slower than the baseline (268 ms vs
# 171 ms) and used 1 loop per repeat versus 10 — confirm before treating V2
# as an optimization for this path.
%timeit -r10 labeled_df.get_pairwise_trends_1lev([pearson_2])

268 ms ± 14.3 ms per loop (mean ± std. dev. of 10 runs, 1 loop each)


# get_trends

In [9]:
# Baseline timing: get_trends on the original trend object with 'agg_trend',
# 10 repeats (-r10).
%timeit -r10 pearson.get_trends(labeled_df.df, 'agg_trend')

4.12 ms ± 191 µs per loop (mean ± std. dev. of 10 runs, 100 loops each)


In [10]:
# Comparison timing: get_trends on the V2 trend object with 'agg_trend',
# 10 repeats (-r10).
%timeit -r10 pearson_2.get_trends(labeled_df.df, 'agg_trend')

3.65 ms ± 63.5 µs per loop (mean ± std. dev. of 10 runs, 100 loops each)


In [34]:
# Baseline timing: get_trends on the original trend object with
# 'subgroup_trend', 10 repeats (-r10).
%timeit -r10 pearson.get_trends(labeled_df.df, 'subgroup_trend')

4.09 ms ± 74.6 µs per loop (mean ± std. dev. of 10 runs, 100 loops each)


In [36]:
# Comparison timing: get_trends on the V2 trend object with 'subgroup_trend',
# 10 repeats (-r10).
%timeit -r10 pearson_2.get_trends(labeled_df.df, 'subgroup_trend')

3.71 ms ± 99.9 µs per loop (mean ± std. dev. of 10 runs, 100 loops each)
