In [66]:
import pandas as pd
import matplotlib.pyplot as plt
import os
import seaborn as sns


"""
Sample input
[SV][ETD], l:8, AC:0.92, SN:0.98, SP:0.91, PPV:0.72, NPV:1.00, F1:0.83, GM:0.944
[SV][ETD], l:7, AC:0.91, SN:0.98, SP:0.89, PPV:0.69, NPV:0.99, F1:0.81, GM:0.934
"""

'\nSample input\n[SV][ETD], l:8, AC:0.92, SN:0.98, SP:0.91, PPV:0.72, NPV:1.00, F1:0.83, GM:0.944\n[SV][ETD], l:7, AC:0.91, SN:0.98, SP:0.89, PPV:0.69, NPV:0.99, F1:0.81, GM:0.934\n'

In [112]:

def parse_etd_line(res_line):
    ''' Extract the level index and the G-mean from one ETD result line

    Parameters
    ----------
    res_line : a line of test data evaluation, made of comma-separated
    ``name:value`` fields
    e.g. '[SV][ETD], l:1, AC:0.95, SN:0.83, SP:0.98,'+\
            'PPV:0.92, NPV:0.96, F1:0.87, GM:0.902'

    Returns
    -------
    level_id : integer level index, read from the 2nd field (``l:<n>``)
    gm : string gmean value, read from the 9th field (``GM:<value>``)
    '''

    # Spaces are dropped first so that every field is exactly "name:value"
    tokens = res_line.rstrip().replace(' ', '').split(',')

    # The tuple is evaluated left to right: level first, then the G-mean
    return int(tokens[1].split(':')[1]), tokens[8].split(':')[-1]

# Create a temp .csv file named fname from the ML and AML result lines
def create_csv_files(ml_lines, aml_lines, fname, verbose=False):
    ''' Align ML and AML ETD result lines by level and write them as CSV

    The file starts with the header ``Level,ML,AML`` followed by one row per
    written level. A column holds ``NaN`` when that run has no result for
    the row's level.

    Parameters
    ----------
    ml_lines : list of ETD result lines of the ML run
    aml_lines : list of ETD result lines of the AML run
    fname : path of the .csv file to write (opened in 'w' mode)
    verbose : when True, print how each pair of lines was aligned

    Returns
    -------
    None
    '''
    ml_id, aml_id = 0, 0
    ml_size = len(ml_lines)
    aml_size = len(aml_lines)

    # `with` closes the file even if a malformed line makes the parser raise
    with open(fname, mode='w') as csv_file:
        csv_file.write("Level,ML,AML\n")

        # Stop as soon as either list is exhausted; indexing ml_lines without
        # this bound raised IndexError when the ML lines ran out first.
        while aml_id < aml_size and ml_id < ml_size:
            ml_level, ml_gm = parse_etd_line(ml_lines[ml_id].strip())
            aml_level, aml_gm = parse_etd_line(aml_lines[aml_id].strip())

            # both levels match
            if ml_level == aml_level:
                if verbose:
                    print(f'match \t{ml_level}, {aml_level}\t {ml_gm}, {aml_gm}')
                csv_file.write(f'{ml_level},{ml_gm},{aml_gm}\n')
                aml_id += 1

            # different coarsening !!!  #WARNING
            elif ml_level > aml_level:
                if verbose:
                    print('WARNING: Coarsening is different which is not expected!')
                csv_file.write(f'{ml_level},{ml_gm},NaN\n')
                # don't increment the aml_id

            # ml_level < aml_level which means aml is in next iteration
            elif ml_level == 1:
                if verbose:
                    print(f'aml is in next iteration {ml_level}, {aml_level}')
                # NOTE(review): no row is written here, so this ML level-1
                # result is dropped - confirm that this is intended.

            # ml_level < aml_level, aml skipped this level
            else:
                if verbose:
                    print(f'aml skipped the level {ml_level}, {aml_level}')
                # AML has no result at this ML level, so the AML column is
                # NaN. The pending AML line is not consumed and is paired
                # with a later ML line instead of being written twice.
                csv_file.write(f'{ml_level},{ml_gm},NaN\n')

            ml_id += 1

        # add the remaining ml results for the levels AML skipped
        for ml_line in ml_lines[ml_id:]:
            ml_level, ml_gm = parse_etd_line(ml_line.strip())
            csv_file.write(f'{ml_level},{ml_gm},NaN\n')

        # add the remaining aml results when the ML lines ran out first
        for aml_line in aml_lines[aml_id:]:
            aml_level, aml_gm = parse_etd_line(aml_line.strip())
            csv_file.write(f'{aml_level},NaN,{aml_gm}\n')

In [84]:
# Hard-coded AML log filename; the single-experiment cell below reads
# `aml_fname` when it builds its AML grep command.
aml_fname = 'buzz_x1_k10_eps0.001_r1_we0.001_mv2_rf2nd0_prs5000_prm1000_v0.1_s11_102319_1214.log'

In [118]:
# Single-experiment walkthrough: build the Level/ML/AML table for one
# experiment key and load it into `df`.
# NOTE(review): this cell reads `ml_imp_part`, `grep_prefix`, `ml_log_path`
# and `aml_log_path`, which are assigned in other cells of this notebook, so
# those cells have to be executed first.
ml_exp = 'buzz_x1_k10_eps0.0010_r1_we0.0010_mv2_rf2nd0_prs5000_prm1000_v0.1000_s11'

# Log filename = experiment key + the suffix stored for it in ml_imp_part
ml_fname = f'{ml_exp}_{ml_imp_part[ml_exp]}'
# Shell pipeline: the grep_prefix command runs on the log file and a second
# grep keeps only the lines containing 'ETD'
ml_grep = f"{grep_prefix} {ml_log_path}/{ml_fname} | grep 'ETD'"
                
ml_lines = os.popen(ml_grep).readlines()
ml_csv_fname = "ml_" + ml_exp + ".csv"

# The AML filename is not derived from aml_imp_part here; the value of
# `aml_fname` already present in the notebook namespace is used instead.
# aml_fname = f'{ml_exp}_{aml_imp_part[ml_exp]}'
aml_grep = f"{grep_prefix} {aml_log_path}/{aml_fname} | grep 'ETD'"

aml_lines = os.popen(aml_grep).readlines()

# The 4th positional argument is `verbose`: print each alignment decision
create_csv_files(ml_lines, aml_lines, ml_csv_fname, True)

# Load the CSV that create_csv_files just wrote (it is not deleted here)
df = pd.read_csv(ml_csv_fname, engine='python')

match 	7, 7	 0.949, 0.951
match 	6, 6	 0.882, 0.944
match 	5, 5	 0.938, 0.944
match 	4, 4	 0.939, 0.898
match 	3, 3	 0.834, 0.944
match 	2, 2	 0.762, 0.900
match 	1, 1	 0.788, 0.621
match 	7, 7	 0.954, 0.952
match 	6, 6	 0.936, 0.953
match 	5, 5	 0.705, 0.944
match 	4, 4	 0.787, 0.744
match 	3, 3	 0.863, 0.941
match 	2, 2	 0.934, 0.947
match 	1, 1	 0.946, 0.936
match 	7, 7	 0.561, 0.950
match 	6, 6	 0.940, 0.948
match 	5, 5	 0.931, 0.943
match 	4, 4	 0.703, 0.945
match 	3, 3	 0.944, 0.690
match 	2, 2	 0.943, 0.946
match 	1, 1	 0.735, 0.935
match 	7, 7	 0.948, 0.946
match 	6, 6	 0.930, 0.943
match 	5, 5	 0.889, 0.933
match 	4, 4	 0.936, 0.913
match 	3, 3	 0.927, 0.924
match 	2, 2	 0.927, 0.617
aml is in next iteration 1, 7
match 	7, 7	 0.927, 0.947
match 	6, 6	 0.940, 0.930
match 	5, 5	 0.857, 0.944
match 	4, 4	 0.817, 0.900
match 	3, 3	 0.430, 0.923
match 	2, 2	 0.947, 0.949
match 	1, 1	 0.725, 0.941
match 	7, 7	 0.952, 0.953
match 	6, 6	 0.943, 0.944
match 	5, 5	 0.677, 0.945
match 	4

In [114]:
# Display the Level/ML/AML table held in `df`
df

Unnamed: 0,Level,ML,AML
0,7,0.949,0.951
1,6,0.882,0.944
2,5,0.938,0.944
3,4,0.939,0.898
4,3,0.834,0.944
5,2,0.762,0.900
6,1,0.788,0.621
7,7,0.954,0.952
8,6,0.936,0.953
9,5,0.705,0.944


In [116]:
# Show the last five ML ETD lines
ml_lines[-5:]

['[SV][ETD], l:5, AC:0.93, SN:0.68, SP:0.99, PPV:0.96, NPV:0.93, F1:0.80, GM:0.821\n',
 '[SV][ETD], l:4, AC:0.95, SN:0.78, SP:0.99, PPV:0.94, NPV:0.95, F1:0.85, GM:0.878\n',
 '[SV][ETD], l:3, AC:0.92, SN:0.97, SP:0.91, PPV:0.73, NPV:0.99, F1:0.83, GM:0.939\n',
 '[SV][ETD], l:2, AC:0.67, SN:0.97, SP:0.60, PPV:0.37, NPV:0.99, F1:0.54, GM:0.762\n',
 '[SV][ETD], l:1, AC:0.95, SN:0.83, SP:0.98, PPV:0.92, NPV:0.96, F1:0.87, GM:0.902\n']

In [117]:
# Show the last five AML ETD lines, for comparison with the ML ones
aml_lines[-5:]

['[SV][ETD], l:1, AC:0.95, SN:0.86, SP:0.98, PPV:0.91, NPV:0.97, F1:0.88, GM:0.916\n',
 '[SV][ETD], l:7, AC:0.94, SN:0.97, SP:0.93, PPV:0.77, NPV:0.99, F1:0.86, GM:0.950\n',
 '[SV][ETD], l:6, AC:0.89, SN:0.98, SP:0.87, PPV:0.64, NPV:0.99, F1:0.78, GM:0.922\n',
 '[SV][ETD], l:5, AC:0.95, SN:0.96, SP:0.95, PPV:0.82, NPV:0.99, F1:0.88, GM:0.953\n',
 '[SV][ETD], l:4, AC:0.93, SN:0.98, SP:0.92, PPV:0.74, NPV:0.99, F1:0.84, GM:0.945\n']

In [None]:
# Directories that hold the ML and the AML log files
ml_log_path = "./logs/ml_svm_1.1.3/"
aml_log_path = "./logs/aml_svm_1.2.1/"


# List the log filenames of each directory: one `ls` output line per entry,
# each still carrying its trailing newline
ml_files = os.popen(f"ls {ml_log_path}").readlines()
aml_files = os.popen(f"ls {aml_log_path}").readlines()

## To make the filenames comparable, remove the date and time from the end of each filename

In [54]:
def get_divided_filename(filename):
    ''' Split a log filename into its experiment key and the remainder

    Parameters
    ----------
    filename : log filename made of '_'-separated parts; trailing
    whitespace (e.g. the newline of an `ls` output line) is ignored

    Returns
    -------
    key : the first 12 parts joined back with '_'
    val : the remaining parts joined back with '_' ('' when there are none)
    '''
    tokens = filename.rstrip().split('_')
    head, tail = tokens[:12], tokens[12:]
    return '_'.join(head), '_'.join(tail)

In [55]:
# Example: a filename as `ls` returns it (with trailing newline) and its
# (key, remainder) split
a = 'advertisement_x1_k10_eps0.001_r1_we0.001_mv2_rf2nd0_prs5000_prm1000_v0.1_s10_110719_1755.log\n'
get_divided_filename(a)


('advertisement_x1_k10_eps0.001_r1_we0.001_mv2_rf2nd0_prs5000_prm1000_v0.1_s10',
 '110719_1755.log')

In [42]:
def cmp_files(left_fname, right_fname):
    ''' Tell whether two log filenames share the same experiment key

    Parameters
    ----------
    left_fname, right_fname : log filenames, with or without the trailing
    date/time part

    Returns
    -------
    True when the keys returned by get_divided_filename are equal
    '''
    # Use the key from get_divided_filename: the previously called
    # `get_imp_parts` is not defined in the visible notebook and raised
    # NameError on a fresh kernel.
    left_key, _ = get_divided_filename(left_fname)
    right_key, _ = get_divided_filename(right_fname)
    return left_key == right_key

In [56]:
# Map each ML experiment key to the remainder of its log filename.
# When two filenames share a key, the one listed later in ml_files wins.
# NOTE(review): the loop variables `fname`, `k` and `v` stay defined as
# notebook globals after this loop.
ml_imp_part = dict()
for fname in ml_files:
    k, v = get_divided_filename(fname)
    ml_imp_part[k] = v

In [57]:
# Preview the first three (key, remainder) pairs
list(ml_imp_part.items())[:3]

[('advertisement_x1_k10_eps0.0010_r1_we0.0010_mv2_rf2nd0_prs5000_prm1000_v0.1000_s11',
  '110719_1755.log'),
 ('advertisement_x1_k10_eps0.0010_r1_we0.0100_mv2_rf2nd0_prs5000_prm1000_v0.1000_s11',
  '110719_1755.log'),
 ('advertisement_x1_k10_eps0.0010_r1_we0.0100_mv2_rf2nd0_prs5000_prm1000_v0.3000_s11',
  '110719_1755.log')]

In [64]:
# Map each AML experiment key to the remainder of its log filename.
# When two filenames share a key, the one listed later in aml_files wins.
# NOTE(review): the loop variable `fname` stays defined as a notebook global
# after this loop; the plotting cell at the end of the notebook reads it.
aml_imp_part = dict()
for fname in aml_files:
    k, v = get_divided_filename(fname)
    aml_imp_part[k] = v

In [65]:
# Number of distinct AML experiment keys
len(aml_imp_part.items())

5442

In [61]:
# Example: the second name only adds a 13th '_'-separated part to the first
cmp_files('advertisement_x1_k10_eps0.001_r1_we0.001_mv2_rf2nd0_prs5000_prm1000_v0.1_s10', 
          'advertisement_x1_k10_eps0.001_r1_we0.001_mv2_rf2nd0_prs5000_prm1000_v0.1_s10_237498')

True

### Make a set for faster lookup

In [36]:
# a = '_'.join(ml_files[0].split('_')[:12])
# a
# Output : 
# 'advertisement_x1_k10_eps0.0010_r1_we0.0010_mv2_rf2nd0_prs5000_prm1000_v0.1000_s11'

In [71]:
# Print the grep command prefix to see it after Python has processed the
# backslash escapes (i.e. as the shell will receive it)
print("grep -Poz '.*?UDSV.*?\\n(.+\\n)+?(\\n){0,1}"+\
                  "(.*?|[ \\/]+)Refinement(.*?|[ \\/]+)\\n' ")

grep -Poz '.*?UDSV.*?\n(.+\n)+?(\n){0,1}(.*?|[ \/]+)Refinement(.*?|[ \/]+)\n' 


In [70]:
verbose = False

# Command prefix shared by the ML and AML log extraction. With GNU grep,
# -P enables Perl-style regexes, -o prints only the matched text and -z
# reads NUL-separated records so the pattern can span several lines.
# The pattern starts at a line containing "UDSV" and ends at a line
# containing "Refinement".
grep_prefix = "grep -Poz '.*?UDSV.*?\\n(.+\\n)+?(\\n){0,1}"+\
                  "(.*?|[ \\/]+)Refinement(.*?|[ \\/]+)\\n' "

# Start with the ML logs and plot every experiment that also has an AML log
for ml_exp in ml_imp_part:
    # No corresponding AML log file was found, so there is nothing to compare
    if ml_exp not in aml_imp_part:
        if verbose:
            print("No matching log file for: " + f'{ml_exp}')
        continue

    ml_fname = f'{ml_exp}_{ml_imp_part[ml_exp]}'
    if verbose:
        print(f'{ml_fname} matched')

    # Keep only the 'ETD' lines of the extracted blocks; `with` closes the
    # pipe once the output has been read
    ml_grep = f"{grep_prefix} {ml_log_path}/{ml_fname} | grep 'ETD'"
    with os.popen(ml_grep) as ml_pipe:
        ml_lines = ml_pipe.readlines()

    aml_fname = f'{ml_exp}_{aml_imp_part[ml_exp]}'
    aml_grep = f"{grep_prefix} {aml_log_path}/{aml_fname} | grep 'ETD'"
    with os.popen(aml_grep) as aml_pipe:
        aml_lines = aml_pipe.readlines()

    # Write the aligned Level/ML/AML rows to an intermediary .csv file, load
    # it, and remove the file again even if loading fails
    ml_csv_fname = "ml_" + ml_exp + ".csv"
    create_csv_files(ml_lines, aml_lines, ml_csv_fname)
    try:
        df = pd.read_csv(ml_csv_fname, engine='python')
    finally:
        os.remove(ml_csv_fname)

    # Long format: one row per (Level, SVMs, value) so that `hue` can split
    # the boxes by solver
    dd = pd.melt(df, id_vars=['Level'],
                 value_vars=['ML', 'AML'], var_name='SVMs')

    fig, ax = plt.subplots()
    bp = sns.boxplot(x='Level', y='value', width=0.8, data=dd, hue='SVMs',
                     showfliers=True, ax=ax)
    # Title with the current experiment key; the former `fname` was a loop
    # variable left over from another cell, not this experiment
    bp.set_title(ml_exp)
    bp.set_ylabel("Gmeans")

    # Render the figure now and release it, so that figures do not pile up
    # in memory over the whole loop
    plt.show()
    plt.close(fig)
