In [1]:
import numpy as np
import pandas as pd
import os
import glob
from analysis_helper_exp3 import *
from IPython.display import clear_output
%load_ext autoreload
%autoreload 2
        
# --- Experiment configuration ------------------------------------------------
iter_max = 50
task_col = None
cluster_col = 'BT_0.4 ID'
run_threshold = 0

# Number of parameter settings in each selector family.
hs_params = 3
benchmark_params = 4
custom_params = 1

# Expected job totals are settings * 107 * 11. 107 matches the task count used
# by the plotting cells further down; 11 is presumably the number of rf_id runs
# per task -- TODO confirm.
hs_job_count, benchmark_job_count, custom_job_count = [
    n_params * 107 * 11
    for n_params in (hs_params, benchmark_params, custom_params)
]

# Result directories, one glob per selector family.
root_dir = '../../../aldd_results/params_results_exp3_July_2_2020//params_results\\'
hs_dir, custom_dir, benchmark_dir = [
    glob.glob(root_dir + pattern)
    for pattern in ('sampled_hyparams/*/*/*/*/',
                    'custom_cbws/*/*/*/*/',
                    'benchmarks/*/*/*/*/')
]

# When True the combined results frame is read from the cached CSV; the cells
# that call get_results are guarded by `if not df_from_file` and are skipped.
df_from_file = True
if df_from_file:
    cdf = pd.read_csv('./exp3/cdf.csv.gz')

In [2]:
# Parse the sampled-hyperparameter result directories and report job counts.
# Skipped entirely when the cached cdf is used (df_from_file is True).
if not df_from_file:
    (all_96_hs, all_384_hs, all_1536_hs, all_df_hs,
     successful_jobs, failed_jobs) = get_results(
        hs_dir, iter_max, task_col, cluster_col, run_threshold, True)

    # A setting is identified here by the first two '_'-separated tokens of
    # the job name.
    hs_unique = np.unique(['_'.join(job.split('_')[:2]) for job in successful_jobs])

    print('----------------------------------------------------------------------------',
          'HS Jobs:',
          'Total jobs: {}'.format(hs_job_count),
          'Failed jobs: {}'.format(len(failed_jobs)),
          'Successful jobs: {}'.format(len(successful_jobs)),
          'Total HS: {}'.format(hs_params),
          'Successful HS: {}'.format(len(hs_unique)),
          sep='\n')

----------------------------------------------------------------------------
HS Jobs:
Total jobs: 3531
Failed jobs: 0
Successful jobs: 3531
Total HS: 3
Successful HS: 3


In [3]:
# Parse the benchmark result directories and report job counts.
# Skipped entirely when the cached cdf is used (df_from_file is True).
if not df_from_file:
    (all_96_bm, all_384_bm, all_1536_bm, all_df_bm,
     successful_jobs, failed_jobs) = get_results(
        benchmark_dir, iter_max, task_col, cluster_col, run_threshold, True)

    # NOTE(review): the recorded output of this cell reports 24 unique prefixes
    # against 4 expected settings, so the first two '_' tokens presumably do
    # not identify a benchmark setting -- confirm the job-name format.
    hs_unique = np.unique(['_'.join(job.split('_')[:2]) for job in successful_jobs])

    print('----------------------------------------------------------------------------',
          'Benchmark Jobs:',
          'Total jobs: {}'.format(benchmark_job_count),
          'Failed jobs: {}'.format(len(failed_jobs)),
          'Successful jobs: {}'.format(len(successful_jobs)),
          'Total HS: {}'.format(benchmark_params),
          'Successful HS: {}'.format(len(hs_unique)),
          sep='\n')

----------------------------------------------------------------------------
Benchmark Jobs:
Total jobs: 4708
Failed jobs: 0
Successful jobs: 4708
Total HS: 4
Successful HS: 24


In [4]:
# Parse the custom-selector result directories and report job counts.
# Skipped entirely when the cached cdf is used (df_from_file is True).
if not df_from_file:
    (all_96_cs, all_384_cs, all_1536_cs, all_df_cs,
     successful_jobs, failed_jobs) = get_results(
        custom_dir, iter_max, task_col, cluster_col, run_threshold, True)

    # Unlike the other two report cells, the setting prefix here spans the
    # first three '_'-separated tokens of the job name.
    hs_unique = np.unique(['_'.join(job.split('_')[:3]) for job in successful_jobs])

    print('----------------------------------------------------------------------------',
          'Custom Jobs:',
          'Total jobs: {}'.format(custom_job_count),
          'Failed jobs: {}'.format(len(failed_jobs)),
          'Successful jobs: {}'.format(len(successful_jobs)),
          'Total HS: {}'.format(custom_params),
          'Successful HS: {}'.format(len(hs_unique)),
          sep='\n')

----------------------------------------------------------------------------
Custom Jobs:
Total jobs: 1177
Failed jobs: 0
Successful jobs: 1177
Total HS: 1
Successful HS: 1


In [2]:
# Build (or load) task_hit_dict: task name -> (hit_limit, unique_hit_limit, cpd_count).
recompute_task_info=False
if recompute_task_info:
    # Each dataset directory is named "<task>_cv_96"; strip the 6-character
    # "_cv_96" suffix to recover the task name. os.path is used instead of
    # splitting on '\\' so this works with either path separator.
    task_names = [os.path.basename(os.path.normpath(task_dir))[:-6]
                  for task_dir in glob.glob('../datasets/pcba/*_cv_96/')]
    task_hit_dict = {}
    # The loop variable is deliberately not called `task_col`: that name holds
    # the configuration value passed to get_results and must not be clobbered.
    for task_name in task_names:
        task_df = pd.concat([pd.read_csv(x) for x in glob.glob('../datasets/pcba/{}_cv_96/unlabeled_*.csv'.format(task_name))])
        cpd_count = task_df.shape[0]
        hit_limit = task_df[task_name].sum()
        unique_hit_limit = task_df[task_df[task_name] == 1][cluster_col].unique().shape[0]
        task_hit_dict[task_name] = (hit_limit, unique_hit_limit, cpd_count)
else:
    import pickle
    # NOTE(review): pickle.load executes arbitrary code from the file; only
    # load a task_info_dict.pickle that was produced locally.
    with open('task_info_dict.pickle', 'rb') as handle:
        task_hit_dict = pickle.load(handle)

In [3]:
# Columns carried into the per-iteration summaries.
des_cols = [
    'hs_id', 'rf_id', 'max_iter',
    'exploitation_hits', 'exploration_hits', 'total_hits', 'total_unique_hits',
    'total_batch_size', 'hs_group', 'task_col',
]

if not df_from_file:
    cdf = pd.concat([all_96_hs, all_96_bm, all_96_cs])

# Look up the (hit_limit, unique_hit_limit, cpd_count) triple of every row's
# task, then split the triples into one list per new column.
per_row_limits = [task_hit_dict[task] for task in cdf['task_col'].tolist()]
hit_limit_list = [hits for hits, _, _ in per_row_limits]
uhit_limit_list = [unique_hits for _, unique_hits, _ in per_row_limits]
cpd_count_list = [cpds for _, _, cpds in per_row_limits]

cdf['hit_limit'] = hit_limit_list
cdf['unique_hit_limit'] = uhit_limit_list
cdf['cpd_count'] = cpd_count_list

---
# Summary per 10, 20, 30, 40, 50 iterations

In [4]:
def helper_agg(col):
    """Aggregate one column of a group, choosing the reducer from the column name.

    - 'rf_id' / 'task_col': returns the placeholder '-'.
    - 'hs_id' / 'hs_group': returns the first unique value of the column.
    - any name containing '_std': returns the standard deviation.
    - anything else: returns the mean.
    """
    name = col.name
    if name in ('rf_id', 'task_col'):
        return '-'
    if name in ('hs_id', 'hs_group'):
        return col.unique()[0]
    return col.std() if '_std' in name else col.mean()

def get_last_iter_summary(results_df, iter_max, group_cols=None):
    """Summarise the rows of one iteration checkpoint per group.

    Parameters
    ----------
    results_df : DataFrame
        Must contain 'iter_num' and every column listed in the notebook-level
        ``des_cols``.
    iter_max : value compared against ``results_df['iter_num']`` to pick rows.
    group_cols : list of str, optional
        Grouping keys; defaults to ['hs_id', 'rf_id']. Only 'hs_id' and
        'rf_id' are usable, because the std table keeps just those id columns
        next to the '*_hits' columns.

    Returns
    -------
    DataFrame
        Group means (via helper_agg), sorted by 'total_hits' descending,
        side by side with the matching '*_std' columns in the same row order.
        Non-grouped id columns appear in both halves, so e.g. 'rf_id' can
        occur twice when grouping by 'hs_id' only.
    """
    # None instead of a list literal avoids sharing one mutable default
    # between calls.
    if group_cols is None:
        group_cols = ['hs_id', 'rf_id']

    # Filter once; both halves of the summary start from the same rows.
    last_iter_df = results_df[results_df['iter_num'] == iter_max][des_cols]

    mean_df = (last_iter_df.groupby(group_cols)
               .agg(helper_agg)
               .sort_values('total_hits', ascending=False))
    sorted_hid_list = mean_df.index.tolist()

    # Renaming '*_hits' to '*_std' makes helper_agg take its std() branch.
    # The copy keeps the relabelling off the filtered frame.
    hit_cols = [c for c in last_iter_df.columns
                if ('_hits' in c or 'hs_id' in c or 'rf_id' in c)]
    std_df = last_iter_df[hit_cols].copy()
    std_df.columns = [c.replace('hits', 'std') for c in hit_cols]
    std_df = std_df.groupby(group_cols).agg(helper_agg).loc[sorted_hid_list]

    return pd.concat([mean_df, std_df], axis=1)

In [5]:
# Exclude every row whose rf_id is 'allinactive0'; the later summary cells
# reuse this filtered frame.
cdf_without_inactives = cdf.loc[cdf['rf_id'] != 'allinactive0']

# iter_num 9010 is taken to be the 10-iteration checkpoint named in the
# section heading -- TODO confirm the encoding.
x = get_last_iter_summary(cdf_without_inactives, 9010, group_cols=['hs_id'])
x.drop(columns=['rf_id', 'max_iter', 'total_batch_size', 'task_col',
                'hs_group', 'exploitation_std', 'exploration_std'])

Unnamed: 0_level_0,exploitation_hits,exploration_hits,total_hits,total_unique_hits,total_std,total_unique_std
hs_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
MABSelector_exploitive,124.229907,0.0,124.229907,65.233645,166.288651,75.708339
MABSelector_2,119.1,0.0,119.1,61.125234,156.482016,67.723501
ClusterBasedWCSelector_341,26.68785,90.093458,116.781308,68.445794,152.386886,82.626983
ClusterBasedWCSelector_609,0.0,103.291589,103.291589,82.908411,119.63412,103.874137
ClusterBasedWCSelector_55,0.0,93.659813,93.659813,82.534579,113.199313,107.314276
ClusterBasedWCSelector_custom_1,27.704673,46.95514,74.659813,58.816822,106.756361,84.646263
ClusterBasedRandom,0.0,17.931776,17.931776,17.892523,32.816781,32.719956
InstanceBasedRandom,0.0,14.091589,14.091589,14.028972,28.502458,28.321423


In [6]:
# iter_num 9020 is taken to be the 20-iteration checkpoint named in the
# section heading -- TODO confirm the encoding.
x = get_last_iter_summary(cdf_without_inactives, 9020, group_cols=['hs_id'])
x.drop(columns=['rf_id', 'max_iter', 'total_batch_size', 'task_col',
                'hs_group', 'exploitation_std', 'exploration_std'])

Unnamed: 0_level_0,exploitation_hits,exploration_hits,total_hits,total_unique_hits,total_std,total_unique_std
hs_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
MABSelector_exploitive,264.916822,0.0,264.916822,138.511215,345.194463,154.527613
MABSelector_2,250.838318,0.0,250.838318,126.51215,316.907155,131.890958
ClusterBasedWCSelector_341,59.546729,184.933645,244.480374,142.113084,306.301711,166.502264
ClusterBasedWCSelector_609,0.0,217.526168,217.526168,165.23271,242.726809,203.626396
ClusterBasedWCSelector_55,0.0,195.150467,195.150467,167.471028,227.27973,213.283731
ClusterBasedWCSelector_custom_1,75.709346,110.095327,185.804673,135.769159,243.521326,181.009879
ClusterBasedRandom,0.0,35.863551,35.863551,35.741121,65.790239,65.528615
InstanceBasedRandom,0.0,28.154206,28.154206,27.930841,56.750727,56.059894


In [7]:
# iter_num 9030 is taken to be the 30-iteration checkpoint named in the
# section heading -- TODO confirm the encoding.
x = get_last_iter_summary(cdf_without_inactives, 9030, group_cols=['hs_id'])
x.drop(columns=['rf_id', 'max_iter', 'total_batch_size', 'task_col',
                'hs_group', 'exploitation_std', 'exploration_std'])

Unnamed: 0_level_0,exploitation_hits,exploration_hits,total_hits,total_unique_hits,total_std,total_unique_std
hs_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
MABSelector_exploitive,389.485981,0.0,389.485981,204.550467,512.089996,227.711585
MABSelector_2,367.671963,0.0,367.671963,186.701869,462.224895,192.710201
ClusterBasedWCSelector_341,87.271028,269.742991,357.014019,208.042991,450.836411,245.133541
ClusterBasedWCSelector_609,0.0,319.35514,319.35514,236.503738,357.910126,294.608287
ClusterBasedWCSelector_custom_1,124.434579,167.378505,291.813084,206.369159,372.135554,268.886526
ClusterBasedWCSelector_55,0.0,282.976636,282.976636,239.835514,332.305494,310.55717
ClusterBasedRandom,0.0,53.726168,53.726168,53.448598,98.052375,97.458742
InstanceBasedRandom,0.0,42.156075,42.156075,41.633645,85.046275,83.465102


In [8]:
# iter_num 9040 is taken to be the 40-iteration checkpoint named in the
# section heading -- TODO confirm the encoding.
x = get_last_iter_summary(cdf_without_inactives, 9040, group_cols=['hs_id'])
x.drop(columns=['rf_id', 'max_iter', 'total_batch_size', 'task_col',
                'hs_group', 'exploitation_std', 'exploration_std'])

Unnamed: 0_level_0,exploitation_hits,exploration_hits,total_hits,total_unique_hits,total_std,total_unique_std
hs_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
MABSelector_exploitive,501.78972,0.0,501.78972,264.948598,670.734225,298.013222
MABSelector_2,472.374766,0.0,472.374766,240.400935,600.706185,249.861022
ClusterBasedWCSelector_341,110.138318,346.61215,456.750467,267.042056,584.146244,318.980821
ClusterBasedWCSelector_609,0.0,410.498131,410.498131,299.029907,466.90003,377.807244
ClusterBasedWCSelector_custom_1,170.302804,218.348598,388.651402,269.963551,493.236581,350.711835
ClusterBasedWCSelector_55,0.0,360.382243,360.382243,302.608411,427.893713,398.209095
ClusterBasedRandom,0.0,71.388785,71.388785,70.915888,130.345092,129.293324
InstanceBasedRandom,0.0,56.030841,56.030841,55.057009,113.463667,110.49388


In [9]:
# iter_num 9050 is taken to be the 50-iteration checkpoint named in the
# section heading -- TODO confirm the encoding.
x = get_last_iter_summary(cdf_without_inactives, 9050, group_cols=['hs_id'])
x.drop(columns=['rf_id', 'max_iter', 'total_batch_size', 'task_col',
                'hs_group', 'exploitation_std', 'exploration_std'])

Unnamed: 0_level_0,exploitation_hits,exploration_hits,total_hits,total_unique_hits,total_std,total_unique_std
hs_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
MABSelector_exploitive,604.433645,0.0,604.433645,320.63271,821.195904,365.942806
MABSelector_2,569.58785,0.0,569.58785,291.066355,733.889001,306.304361
ClusterBasedWCSelector_341,130.193458,417.948598,548.142056,321.11028,710.912387,388.984102
ClusterBasedWCSelector_609,0.0,493.204673,493.204673,354.290654,570.581205,453.304122
ClusterBasedWCSelector_custom_1,212.733645,264.552336,477.285981,327.690654,606.569295,425.84001
ClusterBasedWCSelector_55,0.0,430.511215,430.511215,358.271028,518.490409,478.47114
ClusterBasedRandom,0.0,89.419626,89.419626,88.7,162.782973,161.17838
InstanceBasedRandom,0.0,70.209346,70.209346,68.673832,141.748189,137.052485


---
# Per task activity ratio
### 0 <= active_ratio <= 0.1 (active_ratio = 100 * hit_limit / cpd_count, i.e. a percentage)

In [40]:
# Rows of the iter_num == 9050 checkpoint. The explicit .copy() makes task_dfs
# an independent frame, so adding a column below no longer triggers pandas'
# SettingWithCopyWarning (assignment on a slice of cdf_without_inactives).
task_dfs = cdf_without_inactives[cdf_without_inactives['iter_num']==9050].copy()
# np.log yields -inf for any row whose total_hits is 0.
task_dfs['total_hits_log'] = np.log(task_dfs['total_hits'].values)

# One row per task with its limits; copied for the same reason as above.
task_info = task_dfs[['task_col', 'hit_limit', 'unique_hit_limit', 'cpd_count']].drop_duplicates().copy()
# active_ratio is a percentage, rounded to two decimals.
task_info['active_ratio'] = np.around(100.0 * task_info['hit_limit'] / task_info['cpd_count'], decimals=2)
task_info['hit_limit'] = task_info['hit_limit'].astype(int)

# First bin: both edges inclusive (0 <= active_ratio <= 0.1).
lower_thresh, upper_thresh = 0.0, 0.1
qualifying_tasks = task_info[(task_info['active_ratio'] >= lower_thresh) & (task_info['active_ratio'] <= upper_thresh)]['task_col'].tolist()
ldf = task_dfs[task_dfs['task_col'].isin(qualifying_tasks)]
x = get_last_iter_summary(ldf, 9050, ['hs_id'])
x.drop(['rf_id', 'max_iter', 'total_batch_size', 'task_col', 'hs_group', 'exploitation_std', 'exploration_std'], axis=1)

  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0_level_0,exploitation_hits,exploration_hits,total_hits,total_unique_hits,total_std,total_unique_std
hs_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ClusterBasedWCSelector_341,0.120833,22.7,22.820833,16.6625,21.464118,15.155493
MABSelector_exploitive,22.6625,0.0,22.6625,16.129167,21.285238,14.662307
MABSelector_2,22.379167,0.0,22.379167,15.879167,21.010109,14.312632
ClusterBasedWCSelector_609,0.0,21.595833,21.595833,16.4875,17.942236,13.415155
ClusterBasedWCSelector_55,0.0,17.320833,17.320833,14.208333,13.107584,10.52027
ClusterBasedWCSelector_custom_1,9.641667,6.529167,16.170833,12.15,15.200748,10.617464
ClusterBasedRandom,0.0,3.816667,3.816667,3.808333,2.691948,2.682021
InstanceBasedRandom,0.0,2.683333,2.683333,2.6625,2.1178,2.091688


In [41]:
# Bin 0.1 < active_ratio <= 0.3. The lower edge is exclusive because the
# preceding bin already includes its upper edge (<= 0.1); with both edges
# inclusive, a task sitting exactly on 0.1 would be counted in two bins.
lower_thresh, upper_thresh = 0.1, 0.3
in_bin = (task_info['active_ratio'] > lower_thresh) & (task_info['active_ratio'] <= upper_thresh)
qualifying_tasks = task_info[in_bin]['task_col'].tolist()
ldf = task_dfs[task_dfs['task_col'].isin(qualifying_tasks)]
x = get_last_iter_summary(ldf, 9050, ['hs_id'])
x.drop(['rf_id', 'max_iter', 'total_batch_size', 'task_col', 'hs_group', 'exploitation_std', 'exploration_std'], axis=1)

Unnamed: 0_level_0,exploitation_hits,exploration_hits,total_hits,total_unique_hits,total_std,total_unique_std
hs_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
MABSelector_exploitive,125.365,0.0,125.365,86.86,71.637149,44.57054
MABSelector_2,121.19,0.0,121.19,84.4,71.863522,44.29481
ClusterBasedWCSelector_341,1.915,116.38,118.295,83.55,68.650509,43.294336
ClusterBasedWCSelector_609,0.0,115.55,115.55,84.405,67.645897,43.918277
ClusterBasedWCSelector_55,0.0,93.9,93.9,74.24,59.527621,43.744374
ClusterBasedWCSelector_custom_1,50.7,40.555,91.255,67.56,66.166608,43.907468
ClusterBasedRandom,0.0,14.95,14.95,14.905,5.462297,5.440032
InstanceBasedRandom,0.0,9.695,9.695,9.63,3.687814,3.641345


In [42]:
# Bin 0.3 < active_ratio <= 0.5. The lower edge is exclusive because the
# preceding bin already includes its upper edge (<= 0.3); with both edges
# inclusive, a task sitting exactly on 0.3 would be counted in two bins.
lower_thresh, upper_thresh = 0.3, 0.5
in_bin = (task_info['active_ratio'] > lower_thresh) & (task_info['active_ratio'] <= upper_thresh)
qualifying_tasks = task_info[in_bin]['task_col'].tolist()
ldf = task_dfs[task_dfs['task_col'].isin(qualifying_tasks)]
x = get_last_iter_summary(ldf, 9050, ['hs_id'])
x.drop(['rf_id', 'max_iter', 'total_batch_size', 'task_col', 'hs_group', 'exploitation_std', 'exploration_std'], axis=1)

Unnamed: 0_level_0,exploitation_hits,exploration_hits,total_hits,total_unique_hits,total_std,total_unique_std
hs_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
MABSelector_exploitive,260.84375,0.0,260.84375,166.81875,142.708285,88.279778
MABSelector_2,257.83125,0.0,257.83125,165.375,140.142615,86.113699
ClusterBasedWCSelector_341,6.68125,240.4375,247.11875,160.13125,136.771828,85.696636
ClusterBasedWCSelector_609,0.0,239.79375,239.79375,161.90625,142.456839,92.956526
ClusterBasedWCSelector_custom_1,118.8875,88.18125,207.06875,141.125,145.531346,94.722418
ClusterBasedWCSelector_55,0.0,200.55625,200.55625,149.1,122.27126,89.843484
ClusterBasedRandom,0.0,25.78125,25.78125,25.64375,8.112939,8.054333
InstanceBasedRandom,0.0,18.04375,18.04375,17.7875,5.855006,5.726895


In [43]:
# Bin 0.5 < active_ratio <= 0.7. The lower edge is exclusive because the
# preceding bin already includes its upper edge (<= 0.5); with both edges
# inclusive, a task sitting exactly on 0.5 would be counted in two bins.
lower_thresh, upper_thresh = 0.5, 0.7
in_bin = (task_info['active_ratio'] > lower_thresh) & (task_info['active_ratio'] <= upper_thresh)
qualifying_tasks = task_info[in_bin]['task_col'].tolist()
ldf = task_dfs[task_dfs['task_col'].isin(qualifying_tasks)]
x = get_last_iter_summary(ldf, 9050, ['hs_id'])
x.drop(['rf_id', 'max_iter', 'total_batch_size', 'task_col', 'hs_group', 'exploitation_std', 'exploration_std'], axis=1)

Unnamed: 0_level_0,exploitation_hits,exploration_hits,total_hits,total_unique_hits,total_std,total_unique_std
hs_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
MABSelector_2,418.185714,0.0,418.185714,277.471429,188.926586,110.023413
MABSelector_exploitive,417.342857,0.0,417.342857,277.771429,181.750098,105.092943
ClusterBasedWCSelector_609,0.0,394.328571,394.328571,275.142857,176.830937,111.463166
ClusterBasedWCSelector_341,8.085714,385.271429,393.357143,269.014286,179.245442,108.755609
ClusterBasedWCSelector_custom_1,197.1,172.557143,369.657143,259.485714,176.479739,113.899872
ClusterBasedWCSelector_55,0.0,343.657143,343.657143,264.928571,149.334843,110.449847
ClusterBasedRandom,0.0,42.028571,42.028571,41.757143,9.373059,9.331939
InstanceBasedRandom,0.0,28.471429,28.471429,28.185714,6.240869,6.125009


In [44]:
# Bin 0.7 < active_ratio <= 1.0. The lower edge is exclusive because the
# preceding bin already includes its upper edge (<= 0.7); with both edges
# inclusive, a task sitting exactly on 0.7 would be counted in two bins.
lower_thresh, upper_thresh = 0.7, 1.0
in_bin = (task_info['active_ratio'] > lower_thresh) & (task_info['active_ratio'] <= upper_thresh)
qualifying_tasks = task_info[in_bin]['task_col'].tolist()
ldf = task_dfs[task_dfs['task_col'].isin(qualifying_tasks)]
x = get_last_iter_summary(ldf, 9050, ['hs_id'])
x.drop(['rf_id', 'max_iter', 'total_batch_size', 'task_col', 'hs_group', 'exploitation_std', 'exploration_std'], axis=1)

Unnamed: 0_level_0,exploitation_hits,exploration_hits,total_hits,total_unique_hits,total_std,total_unique_std
hs_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
MABSelector_exploitive,528.875,0.0,528.875,337.55,189.604878,118.022803
MABSelector_2,521.875,0.0,521.875,331.85,197.924511,122.66058
ClusterBasedWCSelector_609,0.0,506.275,506.275,345.175,201.319237,143.550918
ClusterBasedWCSelector_341,12.9,492.95,505.85,322.8,183.364783,118.412404
ClusterBasedWCSelector_custom_1,218.0,227.425,445.425,315.525,202.018137,150.554188
ClusterBasedWCSelector_55,0.0,432.425,432.425,331.825,182.634971,152.422134
ClusterBasedRandom,0.0,62.2,62.2,61.9,17.264459,17.099708
InstanceBasedRandom,0.0,41.75,41.75,41.225,7.638465,7.433422


---
# Work in progress: the sections below are not finished. Ignore their results for now!
# Plot per Task Performance

## Total Hits

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
sns.set_context("paper")
sns.set(font_scale=1.5)

# Rows with iter_num == 9999 whose max_iter is 49, restricted to the summary
# columns plus the per-task limits. The trailing .copy() gives an independent
# frame so the column assignment below does not write to a slice of cdf.
summary_cols = des_cols + ['hit_limit', 'unique_hit_limit', 'cpd_count']
task_dfs = cdf[cdf['iter_num']==9999][summary_cols]
task_dfs = task_dfs[task_dfs['max_iter'] == 49].copy()
# np.log yields -inf for any row whose total_hits is 0.
task_dfs['total_hits_log'] = np.log(task_dfs['total_hits'].values)

# One row per task; active_ratio is a percentage rounded to two decimals.
task_info = task_dfs[['task_col', 'hit_limit', 'unique_hit_limit', 'cpd_count']].drop_duplicates().copy()
task_info['active_ratio'] = np.around(100.0 * task_info['hit_limit'] / task_info['cpd_count'], decimals=2)
task_info['hit_limit'] = task_info['hit_limit'].astype(int)

# The all-task box plots are large; they are only drawn when this is True.
run_large_fig = False
if run_large_fig:
    for col in ['cpd_count', 'hit_limit', 'unique_hit_limit', 'active_ratio']:
        # Sort once and reuse the result for both the order and the labels.
        sorted_info = task_info.sort_values(col)
        task_order = sorted_info['task_col'].tolist()
        task_values = sorted_info[col].values
        g = sns.catplot(x="task_col", y="total_hits", hue="hs_id", data=task_dfs,
                        order=task_order, kind="box", height=10, aspect=3)
        g.set_xticklabels(rotation=90)
        plt.setp(g.ax.lines, linewidth=0.5)
        # Tick labels show the sort key's value instead of the task name.
        g.set(xticklabels=task_values)
        plt.title('X-axis sorted by {}.'.format(col))

In [None]:
# Box plots of total_hits per task, ten tasks per figure, tasks ordered by
# active_ratio, with a per-selector mean line drawn on top.
col = 'active_ratio'
sorted_df = task_info.sort_values(col)
task_order = sorted_df['task_col'].tolist()
task_values = sorted_df[col].values
hitlimit_values = sorted_df['hit_limit'].values
# Derived from the data instead of a hard-coded 107, so no empty batch is
# plotted when task_info holds fewer tasks.
num_tasks = len(task_order)
batch_size = 10
# Fixed hue order so boxes and lines of one selector share a colour.
hue_order = list(task_dfs['hs_id'].unique())

for start in range(0, num_tasks, batch_size):
    c_order = task_order[start:start + batch_size]
    c_values = task_values[start:start + batch_size]

    c_df = task_dfs[task_dfs['task_col'].isin(c_order)]

    fig, ax = plt.subplots(figsize=(22, 6))
    sns.boxplot(x="task_col", y="total_hits", hue="hs_id", data=c_df,
                order=c_order, hue_order=hue_order, width=0.5, ax=ax)
    plt.setp(ax.lines, linewidth=0.5)
    ax.legend(bbox_to_anchor=(1.01, 1), loc=2, borderaxespad=0.)

    # sns.lineplot has no `order`/`width` arguments (unknown keywords are
    # forwarded to matplotlib and rejected). The boxes sit at integer
    # positions 0..n-1 in c_order, so the tasks are mapped to those positions
    # and the line is drawn against them on the same axes.
    task_pos = {task: pos for pos, task in enumerate(c_order)}
    line_df = c_df.assign(task_pos=c_df['task_col'].map(task_pos))
    sns.lineplot(x="task_pos", y="total_hits", hue="hs_id", data=line_df,
                 hue_order=hue_order, legend=False, ax=ax)

    # Tick labels show each task's active_ratio instead of its name.
    ax.set_xticks(range(len(c_order)))
    ax.set_xticklabels(c_values, rotation=90)
    ax.set_xlabel(col)
    ax.set_title('X-axis sorted by {}.'.format(col))
    plt.show()

In [None]:
# Box plots of total_hits per task, ten tasks per figure. Tasks are ordered by
# active_ratio, but the tick labels show each task's hit_limit.
col = 'active_ratio'
sorted_df = task_info.sort_values(col)
task_order = sorted_df['task_col'].tolist()
task_values = sorted_df[col].values
hitlimit_values = sorted_df['hit_limit'].values
# Derived from the data instead of a hard-coded 107, so no empty batch is
# plotted when task_info holds fewer tasks.
num_tasks = len(task_order)
batch_size = 10

for start in range(0, num_tasks, batch_size):
    c_order = task_order[start:start + batch_size]
    c_values = task_values[start:start + batch_size]
    c_hvalues = hitlimit_values[start:start + batch_size]

    c_df = task_dfs[task_dfs['task_col'].isin(c_order)]

    fig, ax = plt.subplots(figsize=(22, 6))
    sns.boxplot(x="task_col", y="total_hits", hue="hs_id", data=c_df,
                order=c_order, width=0.5, ax=ax)
    # Labels are set once (the original set them twice) and the unused
    # lines[0] lookup, which fails on an axes without lines, is dropped.
    ax.set_xticklabels(c_hvalues, rotation=90)
    plt.setp(ax.lines, linewidth=0.5)
    ax.set_xlabel('hit_limit')
    ax.set_title('X-axis sorted by {}.'.format(col))
    ax.legend(bbox_to_anchor=(1.01, 1), loc=2, borderaxespad=0.)
    plt.show()

## Total Hits Log-scale

In [None]:
# Point plots of log(total_hits) per task, one figure per sort key.
# The log column does not depend on the sort key, so it is computed once.
# np.log yields -inf for any row whose total_hits is 0.
task_dfs['total_hits_log'] = np.log(task_dfs['total_hits'].values)

for col in ['cpd_count', 'hit_limit', 'unique_hit_limit', 'active_ratio']:
    sorted_info = task_info.sort_values(col)
    task_order = sorted_info['task_col'].tolist()
    task_values = sorted_info[col].values
    # scatter_kws is a regplot/lmplot argument; the point plot behind
    # kind="point" does not accept it, so it is not passed here.
    g = sns.catplot(x="task_col", y="total_hits_log", hue="hs_id", kind="point", data=task_dfs,
                    order=task_order,
                    linestyles='-', height=10, aspect=5.0)
    g.set_xticklabels(rotation=90)
    plt.setp(g.ax.lines, linewidth=0.5)
    # Tick labels show the sort key's value instead of the task name.
    g.set(xticklabels=task_values)
    plt.title('X-axis sorted by {}.'.format(col))

## Total Unique Hits

In [None]:
# Point plots of total_unique_hits per task, one figure per sort key.
for col in ['hit_limit', 'unique_hit_limit', 'active_ratio']:
    sorted_info = task_info.sort_values(col)
    task_order = sorted_info['task_col'].tolist()
    task_values = sorted_info[col].values
    # scatter_kws is a regplot/lmplot argument; the point plot behind
    # kind="point" does not accept it, so it is not passed here.
    g = sns.catplot(x="task_col", y="total_unique_hits", hue="hs_id", kind="point", data=task_dfs,
                    order=task_order,
                    linestyles='-', height=10, aspect=5.0)
    g.set_xticklabels(rotation=90)
    plt.setp(g.ax.lines, linewidth=0.5)
    # Tick labels show the sort key's value instead of the task name.
    g.set(xticklabels=task_values)
    plt.title('X-axis sorted by {}.'.format(col))

---

# Win Comparisons

In [None]:
def get_win_mat(task_dfs, metric, win_factor=1.1):
    """Pairwise per-task win matrix between selectors.

    Parameters
    ----------
    task_dfs : DataFrame with 'hs_id', 'task_col' and the `metric` column.
    metric : str
        Column to compare.
    win_factor : float, default 1.1
        Selector i beats selector j on a task when
        ``value_i > win_factor * value_j``.

    Returns
    -------
    ndarray of shape (n_hs, n_hs, n_tasks)
        Entry [i, j, t] is 1.0 when selector i beats selector j on task t,
        else 0.0. Axes follow the order of first appearance given by
        ``task_dfs['hs_id'].unique()`` and ``task_dfs['task_col'].unique()``.

    When a (task, selector) pair has several rows, their mean is compared; a
    scalar ``.loc`` lookup cannot handle that case. A selector with no rows
    for a task neither wins nor loses there instead of raising KeyError.
    """
    hs_ids = task_dfs['hs_id'].unique()
    task_cols = task_dfs['task_col'].unique()
    if len(hs_ids) == 0 or len(task_cols) == 0:
        return np.zeros((len(hs_ids), len(hs_ids), len(task_cols)))

    # Tasks x selectors table of the metric, in first-appearance order.
    per_task = (task_dfs.groupby(['task_col', 'hs_id'], sort=False)[metric]
                .mean()
                .unstack('hs_id')
                .reindex(index=task_cols, columns=hs_ids))
    vals = per_task.to_numpy(dtype=float)

    # wins[t, i, j] = vals[t, i] > win_factor * vals[t, j]; comparisons that
    # involve a missing (NaN) value are False.
    wins = vals[:, :, None] > win_factor * vals[:, None, :]
    return np.transpose(wins, (1, 2, 0)).astype(float)

comp_mat = get_win_mat(task_dfs, metric='total_hits')

# get_win_mat keeps hs_ids/task_cols as locals, so they are rebuilt here with
# the same unique() ordering it uses for the axes of comp_mat.
hs_ids = task_dfs['hs_id'].unique()
task_cols = task_dfs['task_col'].unique()

# For each task, record the selector with the most pairwise wins; tasks on
# which nobody wins are skipped.
toplist = []
for task_i in range(len(task_cols)):
    winsum = comp_mat[:, :, task_i].sum(axis=1)
    if np.sum(winsum) > 0:
        toplist.append(hs_ids[np.argsort(winsum)[::-1]][0])

# Number of tasks won per selector, plus the number of tasks counted.
u, c = np.unique(toplist, return_counts=True)
u, c, np.sum(c)

In [None]:
comp_mat = get_win_mat(task_dfs, metric='total_unique_hits')

# get_win_mat keeps hs_ids/task_cols as locals, so they are rebuilt here with
# the same unique() ordering it uses for the axes of comp_mat.
hs_ids = task_dfs['hs_id'].unique()
task_cols = task_dfs['task_col'].unique()

# For each task, record the selector with the most pairwise wins; tasks on
# which nobody wins are skipped.
toplist = []
for task_i in range(len(task_cols)):
    winsum = comp_mat[:, :, task_i].sum(axis=1)
    if np.sum(winsum) > 0:
        toplist.append(hs_ids[np.argsort(winsum)[::-1]][0])

# Number of tasks won per selector, plus the number of tasks counted.
u, c = np.unique(toplist, return_counts=True)
u, c, np.sum(c)