In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt
# Global matplotlib text settings applied to every figure in this notebook.
plt.rcParams['font.family'] = 'Times New Roman'
plt.rcParams['font.size'] = 20

In [None]:
# Directory holding the experiment outputs read by the cells below
# (the per-run result CSVs and benchmark.xlsx).
# The trailing slash matters: paths are built by plain string concatenation.
results_dir = 'results/'

In [None]:
# Collect the result CSVs found in results_dir, leaving out any file whose
# name contains 'test.csv'.
#
# - endswith('.csv') is used instead of a substring match so that names such
#   as 'foo.csv.bak' are not picked up by accident.
# - The names are sorted because os.listdir returns entries in arbitrary
#   order; sorting makes the load order (and thus the row order of the
#   combined frame) reproducible across machines and runs.
files = sorted(
    name
    for name in os.listdir(results_dir)
    if name.endswith('.csv') and 'test.csv' not in name
)
files

In [None]:
# Load every result file into one long DataFrame `df`.
#
# Two columns are derived from each file name, which is split on '_':
#   data -> the part before the first underscore
#   c    -> the second part with its first character and its last four
#           characters removed (e.g. 'c5.csv' -> '5'); note that the value
#           is kept as a *string*
# NOTE(review): this assumes names of the form '<data>_c<value>.csv' — confirm.
frames = []
for f in files:
    df_tmp = pd.read_csv(results_dir+f, index_col=False)
    s_tmp = f.split('_')
    df_tmp['data'] = s_tmp[0]
    df_tmp['c'] = s_tmp[1][1:-4]
    frames.append(df_tmp)

if not frames:
    # Fail early with a clear message instead of an unrelated KeyError below.
    raise FileNotFoundError('no result CSV files found in ' + results_dir)

# Concatenate once (growing a frame inside the loop copies all previous rows
# on every pass), then drop the index column that was written into the CSVs
# and normalise the 'fn_rates' column name.
df = (
    pd.concat(frames, ignore_index=True)
    .drop(columns=['Unnamed: 0'])
    .rename(columns={'fn_rates': 'fn_rate'})
)

# 1 where the row's state is 'RWR', 0 otherwise.
df['rwr'] = (df['state'] == 'RWR').astype('int64')

df

In [None]:
# Summary over the first 50 CAIPI iterations.
# Step 1: within each (exp_iter, data, c) run take the best value reached
#         (min of the error rates, max of the corr_ces_* ratios) and count
#         the RWR iterations.
# Step 2: report mean and std of those per-run values for each (data, c).
columns_not_needed = ['caipi_iter', 'state', 'labeled_size', 'unlabeled_size',
                      'precision', 'recall', 'accuracy']
df_tmp = df.loc[df['caipi_iter'] <= 50].drop(columns=columns_not_needed)

per_run_aggregations = {
    'fp_rate': 'min',
    'fn_rate': 'min',
    'corr_ces_pos': 'max',
    'corr_ces_neg': 'max',
    'rwr': 'sum',
}
df_tmp_group = df_tmp.groupby(['exp_iter', 'data', 'c']).agg(per_run_aggregations)

df_tmp_group.groupby(['data', 'c']).agg(['mean', 'std']).round(4)

In [None]:
# Snapshot at CAIPI iteration 50 only: mean and std of the remaining
# columns for each (data, c) combination.
rows_at_iter_50 = df.loc[df['caipi_iter'] == 50]

df_tmp = rows_at_iter_50.drop(
    columns=['exp_iter', 'rwr', 'caipi_iter', 'state', 'labeled_size',
             'unlabeled_size', 'precision', 'recall', 'accuracy']
)
df_tmp.groupby(['data', 'c']).agg(['mean', 'std']).round(4)

In [None]:
# Benchmark results: mean and std of the remaining columns per experiment.
df_benchmark = pd.read_excel(results_dir+'benchmark.xlsx')

# Keep only the part of each experiment label before the first underscore,
# so that rows sharing that prefix fall into the same group.
exp_names = [label.split('_')[0] for label in df_benchmark['experiment']]
df_benchmark['experiment'] = exp_names

df_benchmark = df_benchmark.drop(columns=['precision', 'recall', 'accuracy'])

df_benchmark.groupby(['experiment']).agg(['mean', 'std'])

In [None]:
# False positive rate per CAIPI iteration for the 'credit' data, exp_iter 1,
# comparing c=5 against c=0.
n_iter = 50          # number of CAIPI iterations shown on the x-axis
metric = 'fp_rate'

# Select each run once instead of re-filtering the whole frame inside the loop.
# The 'c' column holds strings, hence the comparisons against '5' and '0'.
in_run = (df['data'] == 'credit') & (df['exp_iter'] == 1) & (df['caipi_iter'] <= n_iter)
run_c5 = df[in_run & (df['c'] == '5')]
run_c0 = df[in_run & (df['c'] == '0')]

fig, ax = plt.subplots(figsize=(10, 10), dpi=350)

iterations = range(1, n_iter + 1)
ax.plot(iterations, run_c5[metric], '-o', alpha=0.5, color='tab:orange', label='c=5')
ax.plot(iterations, run_c0[metric], '-o', alpha=0.5, color='tab:blue', label='c=0')

# NOTE(review): hard-coded reference level, presumably the benchmark's mean
# false positive rate — confirm against the benchmark table.
ax.hlines(0.15324, 1, n_iter + 1, color='tab:gray', linestyle='dashed', label='benchmark')

# Dashed marker from 0 up to the curve at every c=5 iteration flagged as RWR.
# Positions are 1-based row positions within the run, matching the x-axis.
rwr_marks = [
    (position, value)
    for position, (flag, value) in enumerate(zip(run_c5['rwr'], run_c5[metric]), start=1)
    if flag == 1
]
if rwr_marks:
    rwr_x, rwr_y = zip(*rwr_marks)
    ax.vlines(rwr_x, 0, rwr_y, linestyle='dashed', color='moccasin',
              label='RWR iteration of c=5')

# Legend entries come from the label= of each artist, so they cannot drift
# out of sync with the plotting order.
ax.legend()

ax.set_xlabel('iterations')
ax.set_ylabel('false positive rate')

plt.show()

In [None]:
# False negative rate per CAIPI iteration for the 'credit' data, exp_iter 1,
# comparing c=5 against c=0.
n_iter = 50          # number of CAIPI iterations shown on the x-axis
metric = 'fn_rate'

# Select each run once instead of re-filtering the whole frame inside the loop.
# The 'c' column holds strings, hence the comparisons against '5' and '0'.
in_run = (df['data'] == 'credit') & (df['exp_iter'] == 1) & (df['caipi_iter'] <= n_iter)
run_c5 = df[in_run & (df['c'] == '5')]
run_c0 = df[in_run & (df['c'] == '0')]

fig, ax = plt.subplots(figsize=(10, 10), dpi=350)

iterations = range(1, n_iter + 1)
ax.plot(iterations, run_c5[metric], '-o', alpha=0.5, color='tab:orange', label='c=5')
ax.plot(iterations, run_c0[metric], '-o', alpha=0.5, color='tab:blue', label='c=0')

# NOTE(review): hard-coded reference level, presumably the benchmark's mean
# false negative rate — confirm against the benchmark table.
ax.hlines(0.068034, 1, n_iter + 1, color='tab:gray', linestyle='dashed', label='benchmark')

# Dashed marker from 0 up to the curve at every c=5 iteration flagged as RWR.
# Positions are 1-based row positions within the run, matching the x-axis.
rwr_marks = [
    (position, value)
    for position, (flag, value) in enumerate(zip(run_c5['rwr'], run_c5[metric]), start=1)
    if flag == 1
]
if rwr_marks:
    rwr_x, rwr_y = zip(*rwr_marks)
    ax.vlines(rwr_x, 0, rwr_y, linestyle='dashed', color='moccasin',
              label='RWR iteration of c=5')

# Legend entries come from the label= of each artist, so they cannot drift
# out of sync with the plotting order.
ax.legend()

ax.set_xlabel('iterations')
ax.set_ylabel('false negative rate')

plt.show()

In [None]:
# Ratio of correctly explained positive predictions per CAIPI iteration for
# the 'credit' data, exp_iter 1, comparing c=5 against c=0.
n_iter = 50          # number of CAIPI iterations shown on the x-axis
metric = 'corr_ces_pos'

# Select each run once instead of re-filtering the whole frame inside the loop.
# The 'c' column holds strings, hence the comparisons against '5' and '0'.
in_run = (df['data'] == 'credit') & (df['exp_iter'] == 1) & (df['caipi_iter'] <= n_iter)
run_c5 = df[in_run & (df['c'] == '5')]
run_c0 = df[in_run & (df['c'] == '0')]

fig, ax = plt.subplots(figsize=(10, 10), dpi=350)

iterations = range(1, n_iter + 1)
ax.plot(iterations, run_c5[metric], '-o', alpha=0.5, color='tab:orange', label='c=5')
ax.plot(iterations, run_c0[metric], '-o', alpha=0.5, color='tab:blue', label='c=0')

# NOTE(review): hard-coded reference level, presumably the benchmark's mean
# corr_ces_pos value — confirm against the benchmark table.
ax.hlines(0.57938, 1, n_iter + 1, color='tab:gray', linestyle='dashed', label='benchmark')

# Dashed marker from 0 up to the curve at every c=5 iteration flagged as RWR.
# Positions are 1-based row positions within the run, matching the x-axis.
rwr_marks = [
    (position, value)
    for position, (flag, value) in enumerate(zip(run_c5['rwr'], run_c5[metric]), start=1)
    if flag == 1
]
if rwr_marks:
    rwr_x, rwr_y = zip(*rwr_marks)
    ax.vlines(rwr_x, 0, rwr_y, linestyle='dashed', color='moccasin',
              label='RWR iteration of c=5')

# Legend entries come from the label= of each artist, so they cannot drift
# out of sync with the plotting order.
ax.legend()

ax.set_xlabel('iterations')
ax.set_ylabel('ratio of correctly explained positive predictions')

plt.show()

In [None]:
# Ratio of correctly explained negative predictions per CAIPI iteration for
# the 'credit' data, exp_iter 1, comparing c=5 against c=0.
n_iter = 50          # number of CAIPI iterations shown on the x-axis
metric = 'corr_ces_neg'

# Select each run once instead of re-filtering the whole frame inside the loop.
# The 'c' column holds strings, hence the comparisons against '5' and '0'.
in_run = (df['data'] == 'credit') & (df['exp_iter'] == 1) & (df['caipi_iter'] <= n_iter)
run_c5 = df[in_run & (df['c'] == '5')]
run_c0 = df[in_run & (df['c'] == '0')]

fig, ax = plt.subplots(figsize=(10, 10), dpi=350)

iterations = range(1, n_iter + 1)
ax.plot(iterations, run_c5[metric], '-o', alpha=0.5, color='tab:orange', label='c=5')
ax.plot(iterations, run_c0[metric], '-o', alpha=0.5, color='tab:blue', label='c=0')

# NOTE(review): hard-coded reference level, presumably the benchmark's mean
# corr_ces_neg value — confirm against the benchmark table.
ax.hlines(0.29930, 1, n_iter + 1, color='tab:gray', linestyle='dashed', label='benchmark')

# Dashed marker from 0 up to the curve at every c=5 iteration flagged as RWR.
# Positions are 1-based row positions within the run, matching the x-axis.
rwr_marks = [
    (position, value)
    for position, (flag, value) in enumerate(zip(run_c5['rwr'], run_c5[metric]), start=1)
    if flag == 1
]
if rwr_marks:
    rwr_x, rwr_y = zip(*rwr_marks)
    ax.vlines(rwr_x, 0, rwr_y, linestyle='dashed', color='moccasin',
              label='RWR iteration of c=5')

# Legend entries come from the label= of each artist, so they cannot drift
# out of sync with the plotting order.
ax.legend()

ax.set_xlabel('iterations')
ax.set_ylabel('ratio of correctly explained negative predictions')

plt.show()