In [2]:
import wiggum as wg
import pandas as pd
import numpy as np
import timeit
import seaborn as sns
import warnings
from itertools import chain
warnings.filterwarnings('ignore')

In [9]:
# Accumulator for all timing results; the data-generation loop further down
# concatenates one result frame onto it per parameter combination.
final=pd.DataFrame()

# First trend implementation to benchmark.
# NOTE(review): `labeled_df` is not defined anywhere above this cell. It is
# created (and deleted again) by the data-generation loop further down, so this
# cell presumably relied on kernel state from an earlier run and is likely to
# raise NameError on a fresh Restart & Run All -- confirm the intended order.
pearson = wg.All_Pearson()
pearson.get_trend_vars(labeled_df)
# bare attribute access; nothing is displayed because it is not the cell's last expression
pearson.regression_vars
    
# Second trend implementation, configured the same way, to compare against the first.
pearson_2 = wg.trends.All_Pearson_V2()
pearson_2.get_trend_vars(labeled_df)
# bare attribute access; nothing is displayed because it is not the cell's last expression
pearson_2.regression_vars

# Names (as strings) of the objects above; test_scalability substitutes each
# one into the `{}` placeholder of every template in `commands`.
objs=['pearson', 'pearson_2']
# Setup code handed to timeit.repeat so the timed statements can see the notebook's globals.
prereqs='from __main__ import labeled_df, pearson, pearson_2, objs, commands'
# Statement templates to time. Their order must match `function_names` in
# create_df, which labels the timings positionally.
commands=["{}.compute_correlation_table(labeled_df.df, 'agg_trend')",
          'labeled_df.get_subgroup_trends_1lev([{}])',
         "{}.get_trends(labeled_df.df, 'agg_trend')",
         "{}.get_trends(labeled_df.df, 'sub_trend')"]

def test_scalability():
    """Time every command template against every trend object.

    Each template in the module-level ``commands`` is formatted with each name
    in ``objs`` (templates in the outer position, objects in the inner one).
    Every resulting statement is echoed, timed with ``timeit.repeat`` using
    ``prereqs`` as setup code (10 repeats of 100 executions each), and its
    list of timings is echoed as well.

    Returns:
        list of float: all timings in one flat list, ordered by command, then
        by object, then by repeat. Each value is the total time in seconds for
        100 executions of the statement.
    """
    statements = [template.format(obj_name)
                  for template in commands
                  for obj_name in objs]

    flat_timings = []
    for stmt in statements:
        print(stmt)
        batch = timeit.repeat(stmt, setup=prereqs, repeat=10, number=100)
        # extending directly yields the same flat ordering as collecting the
        # per-statement lists and flattening them afterwards
        flat_timings.extend(batch)
        print(batch)
    return flat_timings

def create_df(N, num_dep_indep, number_cluster, num_splitby):
    """Run the benchmark and label every timing with its experiment settings.

    Calls ``test_scalability()`` and arranges its flat list of timings into a
    DataFrame with one row per timing. The label columns are generated
    positionally and assume the layout produced by ``test_scalability``:
    four functions x two versions x ten repeats.

    Args:
        N: value stored in the ``size`` column and used in the ``data`` label.
        num_dep_indep: value stored in the ``dep_indep_vars`` column.
        number_cluster: value stored in the ``number_cluster`` column.
        num_splitby: value stored in the ``num_splitby`` column.

    Returns:
        pandas.DataFrame with columns ``times``, ``data``, ``function``,
        ``trial``, ``version``, ``size``, ``dep_indep_vars``,
        ``number_cluster`` and ``num_splitby``.
    """
    timings = test_scalability()
    function_names = ['cct', 'get_subgroup_trends', 'get_trends (agg)', 'get_trends (sub)']
    n_functions = len(function_names)

    # 20 consecutive rows per function: 10 repeats for each of the two versions
    function = [name for name in function_names for _ in range(20)]
    # inside each function block: ten 'v1' rows followed by ten 'v2' rows
    version = [tag
               for _ in range(n_functions)
               for tag in ('v1', 'v2')
               for _ in range(10)]
    # repeat index 0..9, restarted for every (function, version) pair
    trial = [rep for _ in range(n_functions * 2) for rep in range(10)]

    # scalar entries are broadcast to the length of the list-valued columns
    return pd.DataFrame({
        'times': timings,
        'data': 'synthetic {}'.format(N),
        'function': function,
        'trial': trial,
        'version': version,
        'size': N,
        'dep_indep_vars': num_dep_indep,
        'number_cluster': number_cluster,
        'num_splitby': num_splitby,
    })


In [4]:
import mlsim
from mlsim import sp_plot

In [5]:
# Seed NumPy's global RNG so the np.random draws made later in the notebook
# (e.g. the random role selection in the generation loop) are repeatable.
np.random.seed(20210627)


# Settings passed to mlsim.geometric_indep_views_gmm_sp in the generation loop.
r_clusters =     [-.8, .5, .4, .7, -.6, .5, .4,-.9]  # correlation coefficient of each cluster
cluster_spread = [ .3,-.2, .1,-.4, .2, -.1, .2,.4] # pearson correlation of the cluster means
p_sp_clusters =  [  1,  0,  0,  0,  1,   0,  0,  1] # portion of clusters with SP: 1 where r is negative, 0 where r is positive
cluster_size =   [2,  3]# NOTE(review): meaning of the two values is not shown here -- confirm against mlsim
domain_range = [0, 20, 0, 20] # range of all data; presumably [xmin, xmax, ymin, ymax] -- TODO confirm

In [11]:
# Parameter grid for the scalability experiment. The generation loop visits
# every combination of these four lists (3 * 5 * 3 * 3 = 135 runs).
n_list = [1000, 5000, 10000]  # passed as N to the mlsim generator; stored in the 'size' column
number_cluster_list=[2, 4, 8, 16, 32]  # number of clusters used for every view
num_dep_indep_list = [4, 8, 16]  # how many x variables are drawn for the independent/dependent roles
num_splitby_list = [4, 8, 16]  # how many non-x variables are drawn for the splitby role

In [12]:
# Generate one synthetic dataset per parameter combination, assign variable
# roles at random, time the trend computations on it, and append the timings
# to `final`.
for N in n_list:
    for number_cluster in number_cluster_list:
        for num_dep_indep in num_dep_indep_list:
            for num_splitby in num_splitby_list:
                # 40 views, all with the same number of equally weighted clusters
                k = [number_cluster]*40
                n_view = len(k)
                p_clusters = [[1/k_i]*k_i for k_i in k]
                many_sp_df = mlsim.geometric_indep_views_gmm_sp(n_view,r_clusters,cluster_size,cluster_spread,p_sp_clusters,
                            domain_range,k,N,p_clusters)
                labeled_df = wg.LabeledDataFrame(many_sp_df)

                # the code below treats the frame as three columns per view:
                # two 'x<i>' variables plus one non-'x' (splitby candidate) column
                n_view = int(len(labeled_df.df.columns) / 3)

                # Set dependent and independent for some xi, ignore the rest.
                # replace=False: sampling with replacement (the default) can draw
                # the same index twice, leaving fewer distinct variables than the
                # `num_dep_indep` value recorded in the results. It also raises
                # instead of silently under-sampling when too few candidates exist.
                dep_indep_list = np.random.choice(n_view*2, num_dep_indep, replace=False)
                roles = {'x'+str(i+1):['ignore'] if i not in dep_indep_list else ['independent','dependent']
                            for i in range(n_view*2)}

                # vars without 'x' in them are splitbys
                splitby_var_list = [cn for cn in labeled_df.df.columns if not('x' in cn)]
                # set splitby for some variables (again without duplicates), ignore the rest
                splitby_list = np.random.choice(splitby_var_list, num_splitby, replace=False)
                roles.update( {c:['splitby'] if c in splitby_list else ['ignore'] for c in splitby_var_list})

                count_list = []

                var_types = {'x'+str(i+1):'continuous' for i in range(n_view*2)}
                var_types.update( {c:'categorical' for c in splitby_var_list})

                labeled_df.set_counts(count_list)
                labeled_df.set_roles(roles)
                labeled_df.set_var_types(var_types)

                # Re-bind both trend objects to the frame that is about to be
                # timed; they were otherwise configured only once, outside this
                # loop, from a different frame.
                # NOTE(review): assumes get_trend_vars may be called repeatedly
                # on the same trend object -- confirm against wiggum.
                pearson.get_trend_vars(labeled_df)
                pearson_2.get_trend_vars(labeled_df)

                print('N:', N)
                print('Number of clusters:', number_cluster)
                print('Number of dep indep vars:', num_dep_indep)
                print('Number of sliptby vars:', num_splitby)

                result_df=create_df(N, num_dep_indep, number_cluster, num_splitby)
                # concatenate inside the loop so partial results survive an interrupted run
                final=pd.concat([final, result_df])
                # drop the reference so the large frame can be freed before the next iteration
                del labeled_df



N: 5000
Number of clusters: 2
Number of dep indep vars: 4
Number of sliptby vars: 4
pearson.compute_correlation_table(labeled_df.df, 'agg_trend')
[0.3607352000000219, 0.320321300000046, 0.310594100000003, 0.3078242000000273, 0.3189363000000185, 0.3074754000000439, 0.31456920000005084, 0.3150163999999904, 0.3869733000000224, 0.36477530000001934]
pearson_2.compute_correlation_table(labeled_df.df, 'agg_trend')
[0.31916729999989, 0.35681529999999384, 0.40308570000001964, 0.39563889999999446, 0.40087449999998626, 0.3793122999999241, 0.3843717000000879, 0.37638590000005934, 0.38574849999997696, 0.4095012999999881]
labeled_df.get_subgroup_trends_1lev([pearson])
[2.629111499999908, 1.8885291999999936, 1.8500917999999729, 1.565436200000022, 1.4103406000001542, 2.18812099999991, 1.3865743000001203, 1.6321749000001091, 1.5353920000000016, 1.3943429000000833]
labeled_df.get_subgroup_trends_1lev([pearson_2])
[1.460756100000026, 1.4177142999999433, 1.40242760000001, 1.4098344999999881, 1.43477589999

[0.8349505999999565, 0.8117045999999846, 0.8070617000000766, 0.8133920999998736, 0.820162400000072, 0.8252335999998195, 1.3504794000000402, 1.4047821999999996, 1.002007299999832, 0.9495448000000124]
N: 5000
Number of clusters: 2
Number of dep indep vars: 8
Number of sliptby vars: 8
pearson.compute_correlation_table(labeled_df.df, 'agg_trend')
[0.5575630999999248, 0.5093664000000899, 0.5152537999999822, 0.5192026999998234, 0.5044585000000552, 0.5117448999999397, 0.5105356999999913, 0.5146466999999575, 0.5029078999998546, 0.5035864999999831]
pearson_2.compute_correlation_table(labeled_df.df, 'agg_trend')
[0.26799239999991187, 0.25189530000011473, 0.2667010999998638, 0.26071460000002844, 0.2605260999998791, 0.2531149999999798, 0.26286019999997734, 0.2624759999998787, 0.2611939999999322, 0.25603650000016387]
labeled_df.get_subgroup_trends_1lev([pearson])
[1.832949699999972, 1.4357465999999022, 1.443173999999999, 1.4694848999999977, 1.5371420999999827, 1.4329979999999978, 1.4811495999999806

[0.8341711999999006, 0.8181111000001238, 0.8233700000000681, 0.8250027999999929, 0.8129272000001038, 0.816960999999992, 0.8334683000000496, 0.8225607999997919, 0.8062989999998535, 0.8260271999999986]
N: 5000
Number of clusters: 2
Number of dep indep vars: 16
Number of sliptby vars: 16
pearson.compute_correlation_table(labeled_df.df, 'agg_trend')
[1.130695199999991, 1.2358120999999755, 1.111195700000053, 1.1262132000001657, 1.1299432999999226, 1.1093439999999646, 1.1719874999998865, 1.1027662000001328, 1.1072219000000132, 1.097107699999924]
pearson_2.compute_correlation_table(labeled_df.df, 'agg_trend')
[0.27174489999993057, 0.2522459000001618, 0.2517737999999099, 0.2508241000000453, 0.2663121000000501, 0.2591701000001194, 0.25663250000002336, 0.24911629999996876, 0.26243270000009034, 0.2525225000001683]
labeled_df.get_subgroup_trends_1lev([pearson])
[3.1014213999999356, 1.4618739999998525, 1.4880298999999013, 1.4695550999999796, 1.4764001999999437, 1.4633627000000615, 1.460883199999898

[0.8200056000000586, 0.8264137000001028, 0.8035175999998501, 0.7970804999999928, 0.980796900000314, 0.838679499999671, 0.8231065999998464, 0.810002600000189, 0.8100847000000613, 0.8110664999999244]
N: 5000
Number of clusters: 4
Number of dep indep vars: 8
Number of sliptby vars: 4
pearson.compute_correlation_table(labeled_df.df, 'agg_trend')
[0.3781493000001319, 0.330943699999807, 0.24954489999981888, 0.21338539999987916, 0.23285759999998845, 0.21418050000011135, 0.21131070000001273, 0.2093030999999428, 0.21311430000014298, 0.22865920000003825]
pearson_2.compute_correlation_table(labeled_df.df, 'agg_trend')
[0.27518849999978556, 0.252038900000116, 0.2469756000000416, 0.26223049999998693, 0.26148830000011003, 0.25398939999968206, 0.24911030000021128, 0.2667514999998275, 0.2784183999997367, 0.3209170999998605]
labeled_df.get_subgroup_trends_1lev([pearson])
[1.913803800000096, 1.3867794000002505, 1.3916581000003134, 1.3822105999997802, 1.4597054999999273, 1.3818077999999332, 1.38800050000

[0.8428475999999137, 0.813033699999778, 0.8396667000001798, 0.810958699999901, 0.8061122999997679, 0.8041828000000351, 0.8173965999999382, 0.8114906999999221, 0.8065486000000419, 0.7966208999996525]
N: 5000
Number of clusters: 4
Number of dep indep vars: 16
Number of sliptby vars: 8
pearson.compute_correlation_table(labeled_df.df, 'agg_trend')
[1.257984000000306, 1.2506846000001133, 1.3971198999997796, 1.5044284999999036, 1.5114785999999185, 1.5189331000001403, 1.4634954000002836, 1.4747710999999981, 1.426517899999908, 1.244142600000032]
pearson_2.compute_correlation_table(labeled_df.df, 'agg_trend')
[0.26197959999990417, 0.24959230000013122, 0.2787220000000161, 0.3955063000003065, 0.893493000000035, 0.25035479999996824, 0.25428580000016154, 0.2569058000003679, 0.258691799999724, 0.2501121999998759]
labeled_df.get_subgroup_trends_1lev([pearson])
[3.2560692000001836, 1.4866164000000026, 1.4958473000001504, 1.4827519999998913, 1.4578118999997969, 1.4953513999998904, 1.474395799999911, 1.

[0.911561200000051, 1.2239125999999487, 1.9983877999998185, 1.9742836999998872, 1.7995912000001226, 0.931612700000187, 0.8371200000001409, 0.9888916999998401, 0.8358060000000478, 0.8378709999997227]
N: 5000
Number of clusters: 8
Number of dep indep vars: 4
Number of sliptby vars: 16
pearson.compute_correlation_table(labeled_df.df, 'agg_trend')
[0.2814879000002293, 0.2621110000000044, 0.25752650000003996, 0.2736912000000302, 0.2646012999998675, 0.26069369999959235, 0.2613216999998258, 0.26799149999988003, 0.26034260000005816, 0.25857879999966826]
pearson_2.compute_correlation_table(labeled_df.df, 'agg_trend')
[0.26111549999995987, 0.27059159999998883, 0.25452050000012605, 0.25418260000014925, 0.25218690000019706, 0.2684439999998176, 0.25414020000016535, 0.2533579999999347, 0.2519988999997622, 0.2642405999999937]
labeled_df.get_subgroup_trends_1lev([pearson])
[2.1294955999997, 1.4497431999998298, 1.408042099999875, 1.4321492999997645, 1.4304859999997461, 1.437155999999959, 1.426752800000

[0.8360089000002517, 0.8344796999999744, 0.8238726999998107, 0.8138889000001654, 0.8148203000000649, 0.8332906000000548, 0.8167422000001352, 0.8563755999998648, 0.7976871999999275, 0.8264178999997966]
N: 5000
Number of clusters: 8
Number of dep indep vars: 16
Number of sliptby vars: 4
pearson.compute_correlation_table(labeled_df.df, 'agg_trend')
[0.5116487000000234, 0.5059788999997181, 0.49489489999996295, 0.4902985999997327, 0.5040822999999364, 0.4939017000001513, 0.5036694999998872, 0.4977729000002, 0.5010505999998713, 0.5023989000001166]
pearson_2.compute_correlation_table(labeled_df.df, 'agg_trend')
[0.28159480000022086, 0.2543314000004102, 0.25648899999987407, 0.24637170000005426, 0.2604240000000573, 0.25016909999976633, 0.2504697999997916, 0.2550743000001603, 0.26114000000006854, 0.2598377000003893]
labeled_df.get_subgroup_trends_1lev([pearson])
[3.25219640000023, 1.4736594000000878, 1.4804801999998745, 1.4825498999998672, 1.4722727999997005, 1.4991011999995862, 1.455569700000069

[0.8299850000003062, 0.8175989999999729, 0.8223280000001978, 0.7985727999998744, 0.8208998000000065, 0.8490882000000965, 0.8388866999998754, 0.8109008000001268, 0.8072204000000056, 0.9900594000000638]
N: 5000
Number of clusters: 16
Number of dep indep vars: 4
Number of sliptby vars: 8
pearson.compute_correlation_table(labeled_df.df, 'agg_trend')
[0.27549520000002303, 0.2567834000001312, 0.2616491000003407, 0.2738453000001755, 0.26291700000001583, 0.33375200000000405, 0.3806744999997136, 0.27095120000012685, 0.26395830000001297, 0.2568185000000085]
pearson_2.compute_correlation_table(labeled_df.df, 'agg_trend')
[0.2738273999998455, 0.26364990000001853, 0.25748079999993934, 0.25501959999974133, 0.2716700999999375, 0.25833920000013677, 0.2480291000001671, 0.2507838999999876, 0.2576038000001972, 0.25577430000021195]
labeled_df.get_subgroup_trends_1lev([pearson])
[2.0340989999999692, 1.4123509000000922, 1.4305282000000261, 1.4412343999997574, 1.4180876000000353, 1.4423658000000614, 1.405266

[0.8652262999994491, 0.8177494999999908, 0.8273000999997748, 0.8298248999999487, 0.8283562999995411, 0.8377485000000888, 0.8205385000001115, 0.8411501000000499, 0.8360941000000821, 0.8266878000003999]
N: 5000
Number of clusters: 16
Number of dep indep vars: 8
Number of sliptby vars: 16
pearson.compute_correlation_table(labeled_df.df, 'agg_trend')
[0.8480878000000303, 1.0301546999999118, 1.0237092999996094, 1.112188299999616, 1.114511400000083, 0.8703042000006462, 0.9406103000001167, 0.9615524999999252, 0.5647877000001245, 0.5243650000002162]
pearson_2.compute_correlation_table(labeled_df.df, 'agg_trend')
[0.27091199999995297, 0.27148849999957747, 0.2699520999995002, 0.25526799999988725, 0.31728190000012546, 0.33691910000015923, 0.34946560000025784, 0.31036690000019007, 0.2548663999996279, 0.36661180000010063]
labeled_df.get_subgroup_trends_1lev([pearson])
[5.527088499999991, 1.5709233999996286, 1.5814614000000802, 1.5603167999997822, 1.566986700000598, 1.564447900000232, 1.572305300000

[0.8622074000004432, 0.8345836999997118, 0.8271455000003698, 0.8285334999991392, 0.8249795000001541, 0.8131290999999692, 0.8262650000006033, 0.7955646000000343, 0.820992799999658, 0.8283463999996457]
N: 5000
Number of clusters: 32
Number of dep indep vars: 4
Number of sliptby vars: 4
pearson.compute_correlation_table(labeled_df.df, 'agg_trend')
[1.398452599999473, 1.4107033999998748, 1.3994570000004387, 1.508784800000285, 1.5089097000000038, 1.5137186000001748, 1.4304212999995798, 1.390903200000139, 1.384689299999991, 1.3951922999995077]
pearson_2.compute_correlation_table(labeled_df.df, 'agg_trend')
[0.26204890000008163, 0.2541090999993685, 0.24717940000027738, 0.277358400000594, 0.32273780000014085, 0.2727414000000863, 0.27922990000024583, 0.2632347999997364, 0.257999600000403, 0.2559776999996757]
labeled_df.get_subgroup_trends_1lev([pearson])
[2.064364399999249, 1.443255500000305, 1.4180919999998878, 1.4567164000000048, 1.4114564999999857, 1.4238844000001336, 1.4500538000002052, 1.4

[0.8489650999999867, 0.8271750000003522, 0.8213581999998496, 0.8376483000001826, 0.8192072000001644, 0.81181849999939, 0.8317784000000756, 0.8130037000000812, 0.8238332999999329, 0.846007200000713]
N: 5000
Number of clusters: 32
Number of dep indep vars: 8
Number of sliptby vars: 8
pearson.compute_correlation_table(labeled_df.df, 'agg_trend')
[0.5055829000002632, 0.5025366999998369, 0.4998153000005914, 0.5082535999999891, 0.5078432000000248, 0.49464050000005955, 0.4934693999994124, 0.5021667999999408, 0.5044903999996677, 0.4983476999996128]
pearson_2.compute_correlation_table(labeled_df.df, 'agg_trend')
[0.2650635999998485, 0.24958650000007765, 0.26083440000002156, 0.26146889999927225, 0.25980600000002596, 0.25074839999979304, 0.2544578000006368, 0.26482840000062424, 0.2579571999995096, 0.2555191000001287]
labeled_df.get_subgroup_trends_1lev([pearson])
[5.419325899999421, 1.5416946999994252, 1.5492270999993707, 1.5734928000001673, 1.545038199999908, 1.5964953000002424, 1.88517120000051

[0.8676515000006475, 0.8310920000003534, 0.8426505999996152, 0.8339295000005222, 0.8143966999996337, 0.8538820000003398, 0.8807627000005596, 0.8305116000001362, 0.82417710000027, 0.8306006000002526]
N: 5000
Number of clusters: 32
Number of dep indep vars: 16
Number of sliptby vars: 16
pearson.compute_correlation_table(labeled_df.df, 'agg_trend')
[1.1492864000001646, 1.2598110999997516, 1.1851441999997405, 1.1231336999999257, 1.097325599999749, 1.1091678000002503, 1.1041100000002189, 1.1020846000001256, 1.1167273999999452, 1.0994791000002806]
pearson_2.compute_correlation_table(labeled_df.df, 'agg_trend')
[0.2610068999993018, 0.24732550000044284, 0.26709480000045005, 0.2558835999998337, 0.2511606000007305, 0.2484279999998762, 0.2634507999991911, 0.26085909999983414, 0.3160684000004039, 0.28274080000028334]
labeled_df.get_subgroup_trends_1lev([pearson])
[25.149144799999704, 2.724837199999456, 2.550408199999765, 2.9943702000000485, 2.2386397000000215, 2.1957727999997587, 2.217329699999936

In [147]:
# Write the accumulated timing results to disk as CSV (the path has no file
# extension; the DataFrame index is written too, since to_csv does so by default).
final.to_csv('data/final_data')

In [22]:
# Select the rows belonging to the N=1000 and N=10000 runs by their 'data' label.
# NOTE(review): `data` is not defined anywhere in the visible notebook;
# presumably it was loaded from previously saved results in a cell that has
# since been removed -- confirm, otherwise this fails on a fresh kernel.
df_1000=data[data['data']=='synthetic 1000']
df_10000=data[data['data']=='synthetic 10000']

In [24]:
# Stack the three result sets in order: df_1000, then `final`, then df_10000.
# Row indices are kept as-is (no ignore_index), so index labels may repeat.
final_result=pd.concat([df_1000, final, df_10000])

In [26]:
# Save the merged results. This is the same path the earlier
# `final.to_csv('data/final_data')` cell wrote to, so that file is overwritten.
final_result.to_csv('data/final_data')