# Specifications

Execute a single ALMSER experiment

In [1]:
import os
# Experiment configuration: everything a reader might want to tune lives here.

# Directory holding one feature-vector .csv file per datasource pair.
path = "../datasets/lspc_computers/feature_vector_files/"
# Directory the train/test pair files are read from and the results are written to.
output_path = "../datasets/lspc_computers/almser/"
# Separator placed between the two data source names of a datasource pair.
fv_splitter = "_"

# Active Learning Settings
max_queries = 30   # passed to ALMSER_EXP as the query budget of one run
runs = 1           # number of repetitions of the active learning experiment
# Options listed by the author: almser_gb, uncertainty, disagreeement, almser_group, random
# NOTE(review): 'disagreeement' is probably meant as 'disagreement' -- confirm
# the exact spelling against ALMSER_EXP before selecting that strategy.
query_strategy = 'almser_gb'

# Datasource pair names = names of the .csv files in `path` without extension.
# Non-csv directory entries (e.g. .ipynb_checkpoints, .DS_Store) are skipped and
# the list is sorted because os.listdir returns entries in arbitrary order,
# which would otherwise make runs differ between machines.
files = sorted(
    os.path.splitext(entry)[0]
    for entry in os.listdir(path)
    if entry.endswith('.csv')
)

# Passive Learning Results

In [2]:
from learningutils import *
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from collections import Counter 

# Passive learning baseline: fit a classifier on the complete labeled training
# set and evaluate it once on the test set.
# NOTE(review): `pd` and `getClassifier` are not imported explicitly in this
# notebook; they presumably come from `from learningutils import *` -- confirm.

# os.path.join instead of string concatenation: the paths stay valid whether or
# not `output_path` ends with a path separator.
pairs_fv_train = pd.read_csv(os.path.join(output_path, "train_pairs_fv.csv"))
pairs_fv_test = pd.read_csv(os.path.join(output_path, "test_pairs_fv.csv"))

# Columns that identify or describe a pair (including the label itself) and
# therefore must not be used as classifier features.
metadata_columns = ['source_id', 'target_id', 'pair_id', 'agg_score', 'source', 'target', 'label']

train_X = pairs_fv_train.drop(columns=metadata_columns)
train_y = pairs_fv_train['label']

test_X = pairs_fv_test.drop(columns=metadata_columns)
test_y = pairs_fv_test['label']

# 'rf' is the classifier key understood by getClassifier
# (presumably a random forest -- TODO confirm in learningutils).
model = getClassifier('rf')
model.fit(train_X, train_y)
predictions = model.predict(test_X)

# average='binary' reports the scores of the positive class only.
prec, recall, fscore, support = precision_recall_fscore_support(test_y, predictions, average='binary')

print("Passive learning results: %f P, %f R, %f F1" % (prec, recall, fscore))

FileNotFoundError: [Errno 2] No such file or directory: '../datasets/lspc_computers/almsertrain_pairs_fv.csv'

# Load the stored files and start ALMSER

In [None]:
from scoreaggregation import *
from ALMSER import *
from ALMSER_EXP import *
from ALMSER_log import *

# Run the ALMSER active learning experiment `runs` times and aggregate the
# per-iteration scores of all runs into `results_all` (mean and std per row index).
# NOTE(review): `pd` and `ALMSER_EXP` are presumably provided by the star
# imports above this cell -- confirm.

almser_path = output_path
print(almser_path)


def _add_datasource_pair(pairs_fv):
    """Add a 'datasource_pair' column derived from 'source' and 'target'.

    Each 'source'/'target' value is cut at its last '_' (the part before it is
    kept) and the two prefixes are joined with ``fv_splitter``. The frame is
    modified in place and returned for convenience.
    """
    # n is passed by keyword: positional use is deprecated in recent pandas.
    source_name = pairs_fv['source'].str.rsplit('_', n=1).str[0]
    target_name = pairs_fv['target'].str.rsplit('_', n=1).str[0]
    pairs_fv['datasource_pair'] = source_name + fv_splitter + target_name
    return pairs_fv


def _match_nodes(pairs_fv):
    """Return the set of 'source' and 'target' values of all rows with a truthy label."""
    # astype(bool) makes this a boolean mask even if the labels were stored as 0/1;
    # a plain df[df.label] would then be interpreted as a column selection.
    matching_pairs = pairs_fv[pairs_fv['label'].astype(bool)]
    return set(matching_pairs['source'].values) | set(matching_pairs['target'].values)


pairs_fv_train = _add_datasource_pair(pd.read_csv(os.path.join(almser_path, "train_pairs_fv.csv")))
pairs_fv_test = _add_datasource_pair(pd.read_csv(os.path.join(almser_path, "test_pairs_fv.csv")))

if query_strategy == 'almser_group':
    # Alternative input mentioned by the author: "task_relatedness.csv" in the same directory.
    try:
        rltd = pd.read_csv(os.path.join(almser_path, "heatmap.csv"), index_col=0)
    except FileNotFoundError as err:
        # Fail here with the helpful message instead of printing it and later
        # crashing with a NameError because `rltd` was never assigned.
        raise FileNotFoundError(
            "ALMSERgroup query strategy needs a relatedness/ heatmap .csv file. Please check."
        ) from err
else:
    rltd = None

# Records that take part in at least one matching pair, per split
# (kept for the optional train/test overlap check below).
all_nodes_test_match = _match_nodes(pairs_fv_test)
all_nodes_train_match = _match_nodes(pairs_fv_train)

#print("Intersection:", all_nodes_train_match.intersection(all_nodes_test_match))

unique_source_pairs = files

# Columns of almser_exp.results that are aggregated over the runs.
result_columns = ['P_model', 'R_model', 'F1_model_micro', 'F1_model_macro',
                  'F1_model_micro_boot', 'F1_model_micro_boost_graph',
                  'F1_model_macro_boost_graph']

# Collect the per-run frames and concatenate once after the loop
# (concatenating inside the loop copies all earlier rows on every run).
per_run_results = []
for run in range(runs):
    print("RUN %i" % run)
    almser_exp = ALMSER_EXP(pairs_fv_train, pairs_fv_test, unique_source_pairs, max_queries, 'rf',
                            query_strategy, fv_splitter, rltd, bootstrap=True)

    almser_exp.run_AL()

    per_run_results.append(almser_exp.results[result_columns])

results_concat = pd.concat(per_run_results)

# Rows sharing an index value across the runs are aggregated together.
results_concat_by_row_index = results_concat.groupby(results_concat.index)
results_concat_mean = results_concat_by_row_index.mean(numeric_only=False)
# Population standard deviation (ddof=0), i.e. what np.std computes; unlike the
# pandas default (ddof=1) it yields 0.0 rather than NaN for a single run.
results_concat_std = results_concat_by_row_index.std(ddof=0)

# (output column, aggregated result column); every metric also gets a
# "<output column>_std" column right after its mean column.
summary_columns = [
    ('P', 'P_model'),
    ('R', 'R_model'),
    ('F1_micro', 'F1_model_micro'),
    ('F1_macro', 'F1_model_macro'),
    ('F1_micro_boot', 'F1_model_micro_boot'),
    ('F1_model_micro_boost_graph', 'F1_model_micro_boost_graph'),
    ('F1_model_macro_boost_graph', 'F1_model_macro_boost_graph'),
]

results_all = pd.DataFrame()
for summary_name, metric in summary_columns:
    results_all[summary_name] = results_concat_mean[metric]
    results_all[summary_name + '_std'] = results_concat_std[metric]


In [None]:
#write results
from datetime import datetime

# Persist the experiment outputs as .csv files in `output_path`. All files of
# one execution share the prefix
# "<runs>_runs_<max_queries>_iter_<query_strategy>_<day>_<month>_<hour>_<minute>".
now = datetime.now()
timestamp = now.strftime("%d_%m_%H_%M")
filename = "%i_runs_%i_iter_%s_%s" % (runs, max_queries, query_strategy, timestamp)

#log files
# `almser_exp` is the experiment object of the LAST run only, so these four
# logs describe that single run, not the aggregate over all runs.
log_tables = {
    "_ALL.csv": almser_exp.results,
    "_LABELED_SET_INFO.csv": almser_exp.labeled_set,
    "_INFORMANTS_EVAL.csv": almser_exp.informants_eval,
    "_LOG_INFO.csv": almser_exp.log.log_info,
}
for suffix, table in log_tables.items():
    # os.path.join keeps the files inside `output_path` even when it has no
    # trailing separator (plain concatenation would write to a sibling path).
    table.to_csv(os.path.join(output_path, filename + suffix), index=False)

#actual results
# Mean/std over all runs; the row index is not written (index=False).
results_all.to_csv(os.path.join(output_path, filename + ".csv"), index=False)
