In [5]:
import numpy as np
import pandas as pd
import string
import wiggum as wg
import statistics
import time
import os
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

In [6]:
# Timestamp identifying this run; add_meta() and the result-saving cell use it
# as the name of the output directory under ../data/scalability_test/.
# (A `global` statement is a no-op at module level, so none is needed here.)
date_time = datetime.now().strftime("%Y_%m_%d-%I_%M_%S_%p")

In [7]:
def add_meta(data, random_seed, n_view, num_dep_indep, num_splitby, trial):
    """Wrap ``data`` in a wiggum LabeledDataFrame with randomly assigned roles.

    Columns ``x1 .. x{2*n_view}`` are typed as continuous; a random subset of
    them is given the roles ``['independent', 'dependent']`` and the rest are
    set to ``['ignore']``. Every column whose name does not contain ``'x'`` is
    typed as categorical; a random subset of those gets the ``['splitby']``
    role and the rest are ignored. The resulting ``meta_df`` is written to
    ``../data/scalability_test/<date_time>/meta_<rows>_<num_dep_indep>_<num_splitby>/meta<trial>.csv``.

    Parameters
    ----------
    data : pandas.DataFrame
        Data set to label.
    random_seed : int
        Seed passed to ``np.random.seed`` (re-seeds NumPy's global RNG).
    n_view : int
        Number of views; ``2 * n_view`` continuous ``x`` columns are assumed.
    num_dep_indep : int
        Number of draws used to pick dependent/independent variables.
    num_splitby : int
        Number of draws used to pick splitby variables.
    trial : int
        Trial index, used only in the metadata file name.

    Returns
    -------
    wg.LabeledDataFrame
        The labeled data frame with counts, roles and variable types set.

    Notes
    -----
    Reads the module-level ``date_time`` string for the output directory.
    """
    np.random.seed(random_seed)
    labeled_df = wg.LabeledDataFrame(data)

    n_continuous = n_view * 2

    # set dependent and independent for some xi, ignore for the rest
    # NOTE(review): np.random.choice samples WITH replacement by default, so
    # fewer than num_dep_indep distinct variables may be selected. Kept as-is
    # so that seeded runs stay reproducible -- confirm whether replace=False
    # was intended.
    dep_indep_idx = set(np.random.choice(n_continuous, num_dep_indep).tolist())
    roles = {
        'x' + str(i + 1): ['independent', 'dependent'] if i in dep_indep_idx else ['ignore']
        for i in range(n_continuous)
    }

    # vars without 'x' in them are splitbys
    splitby_var_list = [cn for cn in data.columns if 'x' not in cn]
    # set splitby for some variable, ignore for the rest
    # NOTE(review): also sampled with replacement -- see note above.
    chosen_splitbys = set(np.random.choice(splitby_var_list, num_splitby).tolist())
    roles.update({c: ['splitby'] if c in chosen_splitbys else ['ignore']
                  for c in splitby_var_list})

    var_types = {'x' + str(i + 1): 'continuous' for i in range(n_continuous)}
    var_types.update({c: 'categorical' for c in splitby_var_list})

    # no count variables are used in this experiment
    labeled_df.set_counts([])
    labeled_df.set_roles(roles)
    labeled_df.set_var_types(var_types)

    # save metadata for tracking results
    data_size = len(data)
    directory = '../data/scalability_test/' + date_time
    save_directory = (directory + '/meta_' + str(data_size) + '_'
                      + str(num_dep_indep) + '_' + str(num_splitby))
    # makedirs creates the run directory and any missing parents in one call
    # and does not fail if the directory already exists.
    os.makedirs(save_directory, exist_ok=True)

    meta_file = os.path.join(save_directory, 'meta' + str(trial) + '.csv')
    labeled_df.meta_df.to_csv(meta_file)

    return labeled_df

In [8]:
def test_scalability(data, n_view, num_dep_indep, num_splitby, num_trial):
    temp_result = pd.DataFrame(columns=['size', 'num_dep_indep', 'number_splitby', 
                                        'cluster', 'trial', 'time'])
    
    random_seed_list = np.random.randint(100000, size=(num_trial))

    for i in range(num_trial):
        random_seed = random_seed_list[i]
        labeled_df = add_meta(data, random_seed, n_view, num_dep_indep, num_splitby, i)
        
        all_pearson_obj = wg.All_Pearson()
        
        # timing
        time = %timeit -or1 -n100 -q labeled_df.get_subgroup_trends_1lev([all_pearson_obj])

        row = {'size':len(data), 'num_dep_indep':num_dep_indep, 'number_splitby':num_splitby, 'cluster': 0,
                   'trial':i+1, 'time': time.timings[0]}

        temp_result = temp_result.append(row, ignore_index=True)

    return temp_result

# Running test

In [9]:
# Full experiment grid. For a quick smoke test, shrink it to e.g.
# data_size_list=[1000], cluster_list=[8], num_dep_indep_list=[4],
# num_splitby_list=[4], num_trial=2.
data_size_list = [1000, 10000, 100000]
cluster_list = [2, 4, 8, 16, 32]
num_dep_indep_list = [4, 8, 16, 32]
num_splitby_list = [4, 8, 16, 32]
num_trial = 10

result_columns = ['size', 'num_dep_indep', 'number_splitby',
                  'cluster', 'trial', 'time']

# perf_counter is a monotonic clock, so the elapsed time cannot be distorted
# by system clock adjustments during this multi-hour run.
start_time = time.perf_counter()

# Gather the per-configuration frames and concatenate once at the end;
# DataFrame.append was removed in pandas 2.0 and is quadratic in a loop.
result_frames = []

for data_size in data_size_list:
    for cluster in cluster_list:
        file = '../data/scalability_test/synthetic_scalability_' + str(data_size) + \
                    '_cluster' + str(cluster) + '.csv'
        data = pd.read_csv(file)

        # the number of views is taken to be a third of the column count
        n_view = int(len(data.columns) / 3)

        for num_dep_indep in num_dep_indep_list:
            for num_splitby in num_splitby_list:

                temp_result = test_scalability(data, n_view, num_dep_indep, num_splitby, num_trial)
                # test_scalability does not know the cluster count; fill it in here
                temp_result["cluster"] = cluster

                result_frames.append(temp_result)

# ignore_index gives every row a unique label instead of repeating 0..num_trial-1
if result_frames:
    result = pd.concat(result_frames, ignore_index=True)
else:
    result = pd.DataFrame(columns=result_columns)

print("--- %s seconds ---" % (time.perf_counter() - start_time))

--- 34714.08281707764 seconds ---


In [10]:
# Display the collected timings: one row per (size, cluster, num_dep_indep,
# number_splitby, trial) combination.
result

Unnamed: 0,size,num_dep_indep,number_splitby,cluster,trial,time
0,1000.0,4.0,4.0,2,1.0,0.012822
1,1000.0,4.0,4.0,2,2.0,0.010422
2,1000.0,4.0,4.0,2,3.0,0.011563
3,1000.0,4.0,4.0,2,4.0,0.010744
4,1000.0,4.0,4.0,2,5.0,0.010521
...,...,...,...,...,...,...
5,100000.0,32.0,32.0,32,6.0,3.070362
6,100000.0,32.0,32.0,32,7.0,2.385688
7,100000.0,32.0,32.0,32,8.0,1.715906
8,100000.0,32.0,32.0,32,9.0,2.004553


In [11]:
# Write the combined timings into this run's timestamped directory.
# The directory is created explicitly so the save does not depend on
# add_meta() having created it as a side effect.
result_directory = os.path.join('../data/scalability_test', date_time)
os.makedirs(result_directory, exist_ok=True)

file = os.path.join(result_directory, 'result.csv')
result.to_csv(file, index=False)

In [12]:
# Show the run timestamp, i.e. the name of the directory holding this run's output.
date_time

'2021_07_01-01_27_00_AM'