In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sn

import matplotlib.pyplot as plt
import numpy as np

import pandas as pd

# Use Confusion matrix for computing Prec/Recall/Count of predictions (TP/FP)

In [None]:
def show_confusionmatrix(y_true,y_pred):
    """Compute flattened recall, precision and count matrices for one run.

    The label set always contains the classes 0, 1 and 2 and is extended
    with any other label that appears in ``y_true`` or ``y_pred`` (e.g. 3,
    the "undefined" code produced by ``label_to_num``).  The same sorted
    label list is passed to ``confusion_matrix`` so the matrix shape always
    agrees with the reported number of classes; previously the labels were
    taken from ``y_pred`` only, which could disagree with the matrix that
    scikit-learn built from the union of both inputs.

    Args:
        y_true: iterable of ground-truth labels.
            NOTE(review): assumed to be integer class ids -- confirm.
        y_pred: iterable of predicted labels, same length as ``y_true``.

    Returns:
        Tuple ``(recall, precision, counts, n_classes)`` where the first
        three items are row-major flattened ``n_classes x n_classes`` lists
        (rows = true class, columns = predicted class).  Recall is the count
        matrix normalised per row, precision is normalised per column; a
        row/column with no samples yields NaN entries.
    """
    # Sorted explicitly: set iteration order is not a documented guarantee.
    classes = tuple(sorted({0, 1, 2} | set(y_true) | set(y_pred)))

    cf_matrix = confusion_matrix(y_true, y_pred, labels=list(classes))

    # 0/0 is expected for classes that never occur or are never predicted;
    # keep the resulting NaN but silence the numpy runtime warning.
    with np.errstate(divide='ignore', invalid='ignore'):
        recall_matrix = cf_matrix / cf_matrix.sum(axis=1, keepdims=True)
        prec_matrix = cf_matrix / cf_matrix.sum(axis=0, keepdims=True)

    flatten_df_cm_recall = recall_matrix.flatten().tolist()
    flatten_df_cm_prec = prec_matrix.flatten().tolist()
    flatten_cf_matrix = cf_matrix.flatten().tolist()

    return flatten_df_cm_recall, flatten_df_cm_prec,  flatten_cf_matrix, len(classes)

# Extract Label in COT settings

In [None]:
def label_to_num(text_label):
    '''
    Convert a text-based label to its numeric code.

    The text is matched case-insensitively by substring, checked in this
    order: 'passive' -> 0, 'active' -> 1, 'constructive' -> 2.  When none
    of the keywords occurs, a message is printed and 3 is returned.
    '''
    lowered = text_label.lower()

    # Order matters: the first keyword found decides the label.
    for keyword, number in (('passive', 0), ('active', 1), ('constructive', 2)):
        if keyword in lowered:
            return number

    print('Undefined label found: ', text_label)
    return 3


def extract_pred_label_COT(response):
  '''
  Extract the numeric label from a model response.

  Expects the following format (used in pureFew shot)
   "Question:
    "Statement:
    "Label:
    "Chain-of-thought:

  The first line containing "label:" (case-insensitive) is passed to
  label_to_num and its result is returned.  If no such line exists, or if
  the response is not a string (e.g. None, or the NaN pandas uses for a
  missing CSV cell), 3 ("undefined") is returned.
  '''
  label_num = 3 #undefined

  # A missing response cannot contain a label; report it as undefined
  # instead of failing on .splitlines().
  if not isinstance(response, str):
    return label_num

  for text in response.splitlines():
    if "label:" in text.lower():
      label_num = label_to_num(text)
      break

  return label_num

# Accuracy

In [None]:
def accuracy(y_true,y_pred):
    """Return the fraction of positions where ``y_true`` and ``y_pred`` agree.

    Args:
        y_true: sized sequence of ground-truth labels.
        y_pred: sized sequence of predicted labels.

    Returns:
        Number of equal pairs divided by ``len(y_true)``, as a float.

    Raises:
        ValueError: if the two sequences differ in length (``zip`` would
            otherwise drop the extra items silently and skew the score).
        ZeroDivisionError: if both sequences are empty.
    """
    if len(y_true) != len(y_pred):
        raise ValueError(
            'y_true and y_pred must have the same length, got {0} and {1}'.format(
                len(y_true), len(y_pred)))

    n_correct = sum(1 for x, y in zip(y_true, y_pred) if x == y)
    return n_correct / float(len(y_true))

# Output to CSV for Prec/Recall/Accuracy

In [None]:
from itertools import compress
def create_dataframe(list_2d, num_cls_list, output_csv, gpt_models_all, prompt_vers_all):
    """Write flattened confusion-matrix rows to a CSV and return the frame.

    Rows are grouped by their class count: runs with 4 classes come first,
    followed by runs with 3 classes.  Runs with any other class count are
    not included.  Metric columns are named ``<true>_<pred>``; 3-class rows
    have no value in the ``*_3`` / ``3_*`` columns.

    Args:
        list_2d: one flattened (row-major) matrix per run.
        num_cls_list: number of classes of each run, parallel to ``list_2d``.
        output_csv: destination path of the CSV file.
        gpt_models_all: model name of each run, parallel to ``list_2d``.
        prompt_vers_all: prompt version of each run, parallel to ``list_2d``.

    Returns:
        The concatenated DataFrame that was written to ``output_csv``.
    """
    def _frame_for(n_cls):
        # Select the runs with exactly n_cls classes and label their cells.
        mask = [k == n_cls for k in num_cls_list]
        cell_names = ['{0}_{1}'.format(t, p) for t in range(n_cls) for p in range(n_cls)]

        frame = pd.DataFrame(list(compress(list_2d, mask)), columns=cell_names)
        frame.insert(0, 'model', list(compress(gpt_models_all, mask)))
        frame.insert(1, 'prompt', list(compress(prompt_vers_all, mask)))
        return frame

    parts = [_frame_for(4), _frame_for(3)]
    df = pd.concat(parts, ignore_index=True, sort=False)
    df.to_csv(output_csv, index=False)
    return df

def create_dataframe_acc(acc_list, num_class_all, output_csv, gpt_models_all, prompt_vers_all):
    """Write one accuracy row per run to a CSV and return the frame.

    Args:
        acc_list: accuracy of each run.
        num_class_all: number of classes of each run.
        output_csv: destination path of the CSV file.
        gpt_models_all: model name of each run.
        prompt_vers_all: prompt version of each run.

    Returns:
        DataFrame with the columns ``model``, ``prompt``, ``accuracy`` and
        ``num_cls`` (in that order), as written to ``output_csv``.
    """
    table = pd.DataFrame({
        'model': gpt_models_all,
        'prompt': prompt_vers_all,
        'accuracy': acc_list,
        'num_cls': num_class_all,
    })
    table.to_csv(output_csv, index=False)
    return table

# F1 score and simplified prec/recall csv
Micro F1 Average = Accuracy

Macro F1 Average = Average of per-class F1 score

Weighted F1 Average = Sum of (per-class F1 score * (actual occurrences of the (true) class in the dataset / total test data))

In [None]:
def compute_f1(precision_csv, recall_csv, count_csv, output_f1_csv, output_prec_recall_simple_csv, num_class = 3):
  """Compute per-class, macro and weighted F1 from the metric CSVs.

  The three input CSVs must share the layout ``model, prompt, <true>_<pred>...``
  and are combined row by row (row i of each file is treated as the same run).

  For each class k in ``range(num_class)``:
    * precision / recall are read from the diagonal column ``k_k``;
    * ``k_perclass_f1 = 2 * prec * recall / (prec + recall)`` (NaN when
      both are 0 or either is missing);
    * ``k_weighted_f1 = k_perclass_f1 * (samples with true class k / all samples)``.

  Summary columns:
    * ``macro_average_skipNA``: mean of the defined per-class F1 values.
    * ``weighted_average_skipNA``: weighted F1 over the classes whose F1 is
      defined, with the class weights renormalised to sum to 1 over those
      classes (NaN when no class has a defined F1).  A plain NaN-skipping
      sum would be identical to the ``NAas0`` variant.
    * ``macro_average_NAas0`` / ``weighted_average_NAas0``: same averages
      with an undefined F1 counted as 0.

  Args:
    precision_csv: path of the precision CSV.
    recall_csv: path of the recall CSV.
    count_csv: path of the raw confusion-count CSV.
    output_f1_csv: path the F1 table is written to.
    output_prec_recall_simple_csv: path the per-class precision/recall
      table is written to.
    num_class: number of classes (0 .. num_class-1) to evaluate.

  Returns:
    Tuple ``(perclass_f1_df, prec_recall_df)`` of the two written frames.
  """
  df_prec = pd.read_csv(precision_csv)
  df_recall = pd.read_csv(recall_csv)
  df_count = pd.read_csv(count_csv)

  id_cols = ['model', 'prompt']

  # Explicit copies: columns are added below and must not touch df_prec.
  perclass_f1_df = df_prec.loc[:, id_cols].copy()
  prec_recall_df = df_prec.loc[:, id_cols].copy()

  # Count columns are named '<true>_<pred>'; parse the true class once,
  # using the full prefix so multi-digit class ids are handled as well.
  count_cols = [col for col in df_count.columns if col not in id_cols]
  true_class_of = {col: int(col.split('_')[0]) for col in count_cols}

  all_count = df_count[count_cols].sum(axis=1)

  col_perclass_f1_list = []
  col_weighted_f1_list = []
  # Sum of the class proportions whose F1 is defined, per run.
  defined_weight = pd.Series(0.0, index=df_count.index)

  for k in range(num_class):

    # Get precision and Recall [0_0], [1_1], [2_2]===============
    col_name = '{0}_{1}'.format(k,k)

    prec = df_prec[col_name]
    recall = df_recall[col_name]

    prec_recall_df[str(k)+'_prec'] = prec
    prec_recall_df[str(k)+'_recall'] = recall

    #per class f1===========================
    perclass_f1 = 2*prec*recall / (prec+recall)

    col_name_f1 = '{0}_perclass_f1'.format(k)
    col_perclass_f1_list.append(col_name_f1)

    perclass_f1_df[col_name_f1] = perclass_f1

    #weighted f1===========================
    col_list = [col for col in count_cols if true_class_of[col] == k]

    num_data_per_class = df_count[col_list].sum(axis=1)
    proportion= num_data_per_class / all_count

    col_name_weighted = '{0}_weighted_f1'.format(k)
    col_weighted_f1_list.append(col_name_weighted)

    perclass_f1_df[col_name_weighted] =  perclass_f1_df[col_name_f1] *  proportion

    defined_weight = defined_weight + proportion.where(perclass_f1_df[col_name_f1].notna(), 0.0)

  # Average Skip NA ==========================
  perclass_f1_df['macro_average_skipNA'] =  perclass_f1_df[col_perclass_f1_list].mean(axis=1)
  # sum() ignores NaN, so divide by the weight that was actually used.
  perclass_f1_df['weighted_average_skipNA'] =  perclass_f1_df[col_weighted_f1_list].sum(axis=1) / defined_weight

  # Average replace NA with 0==========================
  perclass_f1_df['macro_average_NAas0'] =  perclass_f1_df[col_perclass_f1_list].fillna(0).mean(axis=1)
  perclass_f1_df['weighted_average_NAas0'] =  perclass_f1_df[col_weighted_f1_list].fillna(0).sum(axis=1)

  perclass_f1_df.to_csv(output_f1_csv, index=False)
  prec_recall_df.to_csv(output_prec_recall_simple_csv, index=False)

  return perclass_f1_df, prec_recall_df








Just for testing

In [None]:
# Ad-hoc check of compute_f1 against metric CSVs already stored on Drive.
# Previously used directory:
#   '/content/drive/MyDrive/Classification Model/COT/Evaluation/updated_test_wquestion_9_13_23_preds/updated_test_wquestion_9_13_23_eval_metrics/'
out_dir = '/content/drive/MyDrive/Classification Model/COT/COT refinement 9_14_2023/Evaluation_ASST1/updated_test_wquestion_9_13_23_ASST1_preds/'

# Inputs: per-run precision / recall / count tables.
prec_csv, recall_csv, count_csv = [
    out_dir + file_name
    for file_name in ('prec_all.csv', 'recall_all.csv', 'count_all.csv')
]

# Outputs: F1 table and simplified precision/recall table.
output_f1_csv, output_prec_recall_simple_csv = [
    out_dir + file_name
    for file_name in ('f1_score.csv', 'prec_recall_simple.csv')
]

compute_f1(
    prec_csv,
    recall_csv,
    count_csv,
    output_f1_csv,
    output_prec_recall_simple_csv,
    num_class=3,
)



(   model prompt  0_perclass_f1  0_weighted_f1  1_perclass_f1  1_weighted_f1  \
 0  gpt-4  1a_v2       0.728682       0.383233       0.701493       0.306578   
 
    2_perclass_f1  2_weighted_f1  macro_average_skipNA  \
 0            NaN            NaN              0.715087   
 
    weighted_average_skipNA  macro_average_NAas0  weighted_average_NAas0  
 0                 0.689811             0.476725                0.689811  ,
    model prompt    0_prec  0_recall    1_prec  1_recall  2_prec  2_recall
 0  gpt-4  1a_v2  0.810345  0.661972  0.626667   0.79661     0.0       0.0)

# Read Data

In [None]:
from google.colab import drive

import pandas as pd
from google.colab import files
import io

# Option 1: read the data from Google Drive.
# (Plain comments are used instead of bare string literals so the cell does
# not echo a stray string as its output.)
drive.mount('/content/drive')
# file_path = '/content/drive/MyDrive/Classification Model/COT/Evaluation/mergeddata_withQuestion_9_13_23/2a_v2_gpt-4_index0_to_41.csv' #add the file
# df = pd.read_csv(file_path)

# Option 2: read the data from a manual upload.
# uploaded = files.upload()
# df = pd.read_csv(io.BytesIO(uploaded['EDM_master - EDM_master.csv']))


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


'\nby Upload\n'

In [None]:
# Parallel accumulators: entry i of every list describes the same
# (model, prompt) run.  A run is only recorded once all of its metrics have
# been computed, so the lists always stay the same length.
recall_all = []
prec_all = []
count_all = []
num_class_all = []
accuracy_all = []
gpt_models_all = []
prompt_vers_all  = []

# Suffix selecting which prediction file to load.
# Previously used: '_index0_to_41_v2', '_index0_to_134'
index_name  = '_index0_to_110_v0'

# Models / prompt versions to evaluate.
# Previously used: gpt_models = ['gpt-3.5-turbo', 'gpt-4']
#                  prompt_vers = ['1a_v1', '2a_v1','1a_v2', '2a_v2']
#                  prompt_vers = ['1a_v2_assertion_dos_and_donts']
#                  prompt_vers = ['pure_fewshot_specificCOT_genCOT']
gpt_models = ['gpt-4']
prompt_vers = [ 'pure_fewshot_specificCOT', 'pure_fewshot_specificCOT_genCOT', 'pure_fewshot_specificCOT_assertion', 'pure_fewshot_specificCOT_genCOT_assertion']

# Directory holding the prediction CSVs.
# Previously used:
#   '/content/drive/MyDrive/Classification Model/COT/Evaluation/mergeddata_withQuestion_9_13_23_preds/csv_files/'
#   '/content/drive/MyDrive/Classification Model/COT/Evaluation/updated_test_wquestion_9_13_23_preds/'
pred_dir = '/content/drive/My Drive/Data_ClassificationModel/'

for gpt_model in gpt_models:
  for prompt_ver in prompt_vers:
    try:
      pred_path =  pred_dir + prompt_ver + "_" + gpt_model + index_name + '.csv'
      df = pd.read_csv(pred_path)

      print(pred_path)

      y_true = df['label'].tolist()

      # Re-derive the predicted label from the raw response text
      # (instead of using the stored 'pred_label' column).
      df['pred_label_fixed'] = df['pred_response'].apply(extract_pred_label_COT)
      y_pred = df['pred_label_fixed'].tolist()

      df.to_csv(pred_dir + prompt_ver + "_" + gpt_model + index_name +'_fixed_label.csv', index=False)

      # Debug output: label sets seen in the truth and in the predictions.
      print('----')
      classes = tuple(set(y_pred))
      print(tuple(set(y_true)))
      print(classes)
      if len(classes) < 4 and max(classes)==3 :
        classes = (0,1,2,3)
      print(classes)

      # Compute every metric first ...
      recall_list, prec_list,  count_list, num_cls = show_confusionmatrix(y_true,y_pred)
      run_accuracy = accuracy(y_true,y_pred)

      # ... and record the run only after all of them succeeded.
      recall_all.append(recall_list)
      prec_all.append(prec_list)
      count_all.append(count_list)
      num_class_all.append(num_cls)

      gpt_models_all.append(gpt_model)
      prompt_vers_all.append(prompt_ver)

      accuracy_all.append(run_accuracy)

    except Exception as error:
       # Best effort: report the failing run and continue with the next one.
       print("An exception occurred:", error)

out_dir = '/content/drive/My Drive/Data_ClassificationModel/'


create_dataframe(recall_all, num_class_all, out_dir+'recall_all.csv', gpt_models_all, prompt_vers_all )
create_dataframe(prec_all, num_class_all, out_dir + 'prec_all.csv', gpt_models_all, prompt_vers_all)
create_dataframe(count_all, num_class_all,out_dir + 'count_all.csv', gpt_models_all, prompt_vers_all)
create_dataframe_acc(accuracy_all, num_class_all, out_dir +'acc_all.csv', gpt_models_all, prompt_vers_all)

prec_csv = out_dir + 'prec_all.csv'
recall_csv = out_dir + 'recall_all.csv'
count_csv = out_dir + 'count_all.csv'

output_f1_csv = out_dir + 'f1_score.csv'
output_prec_recall_simple_csv =  out_dir + 'prec_recall_simple.csv'


compute_f1(prec_csv, recall_csv, count_csv, output_f1_csv, output_prec_recall_simple_csv, num_class=3)


/content/drive/My Drive/Data_ClassificationModel/pure_fewshot_specificCOT_gpt-4_index0_to_110_v0.csv
----
(0, 1, 2)
(0, 1, 2)
(0, 1, 2)
/content/drive/My Drive/Data_ClassificationModel/pure_fewshot_specificCOT_genCOT_gpt-4_index0_to_110_v0.csv
----
(0, 1, 2)
(0, 1, 2)
(0, 1, 2)
/content/drive/My Drive/Data_ClassificationModel/pure_fewshot_specificCOT_assertion_gpt-4_index0_to_110_v0.csv
----
(0, 1, 2)
(0, 1, 2)
(0, 1, 2)
/content/drive/My Drive/Data_ClassificationModel/pure_fewshot_specificCOT_genCOT_assertion_gpt-4_index0_to_110_v0.csv
----
(0, 1, 2)
(0, 1, 2)
(0, 1, 2)


(   model                                     prompt  0_perclass_f1  \
 0  gpt-4                   pure_fewshot_specificCOT       0.909091   
 1  gpt-4            pure_fewshot_specificCOT_genCOT       0.909091   
 2  gpt-4         pure_fewshot_specificCOT_assertion       0.909091   
 3  gpt-4  pure_fewshot_specificCOT_genCOT_assertion       0.909091   
 
    0_weighted_f1  1_perclass_f1  1_weighted_f1  2_perclass_f1  2_weighted_f1  \
 0        0.30303       0.615385       0.205128       0.333333       0.111111   
 1        0.30303       0.750000       0.250000       0.909091       0.303030   
 2        0.30303       0.666667       0.222222       0.800000       0.266667   
 3        0.30303       0.750000       0.250000       0.909091       0.303030   
 
    macro_average_skipNA  weighted_average_skipNA  macro_average_NAas0  \
 0              0.619270                 0.619270             0.619270   
 1              0.856061                 0.856061             0.856061   
 2            

In [None]:
# out_dir = '/content/drive/MyDrive/Classification Model/COT/COT refinement 9_14_2023/Evaluation_ASST1/updated_test_wquestion_9_13_23_ASST1_preds/'
# out_dir = '/content/drive/My Drive/Data_ClassificationModel/'

# prec_csv = out_dir + 'prec_all.csv'
# recall_csv = out_dir + 'recall_all.csv'
# count_csv = out_dir + 'count_all.csv'

# output_f1_csv = out_dir + 'f1_score.csv'
# output_prec_recall_simple_csv =  out_dir + 'prec_recall_simple.csv'

# compute_f1(prec_csv, recall_csv, count_csv, output_f1_csv, output_prec_recall_simple_csv, num_class=3)

In [None]:
# Merge the predictions of three repeated runs (v1..v3) of one prompt into a
# single CSV, tagging each row with the run it came from.
# NOTE: relies on `index_name` having been set by an earlier cell.
gpt_model = 'gpt-4'
prompt_ver = 'pure_fewshot_specificCOT'

pred_dir = '/content/drive/My Drive/Data_ClassificationModel/'

version_frames = []
for ver in (1, 2, 3):
  pred_path = pred_dir + prompt_ver + "_" + gpt_model + index_name + "_v" + str(ver) + '.csv'
  version_frame = pd.read_csv(pred_path)
  version_frame['ver'] = 'ver' + str(ver)
  version_frames.append(version_frame)

# Keep the per-version names available for any later cell that uses them.
df_1, df_2, df_3 = version_frames

# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
df = pd.concat(version_frames, ignore_index=True)

df = df.sort_values(by=['id_internal'])

df.to_csv(pred_dir+'allpred.csv', index=False)

  df = df_1.append(df_2, ignore_index=True)
  df = df.append(df_3, ignore_index=True)


In [None]:
# Stitch two partial prediction files (rows 0-57 and 57-134) of one prompt
# into a single file covering rows 0-134.
gpt_model = 'gpt-4'
prompt_ver = 'pure_fewshot_specificCOT_genCOT'

pred_dir = '/content/drive/My Drive/Data_ClassificationModel/'


index_name  = '_index0_to_57'
pred_path = pred_dir + prompt_ver + "_" + gpt_model + index_name  + '.csv'
df_1 = pd.read_csv(pred_path)

index_name  = '_index57_to_134'
pred_path = pred_dir + prompt_ver + "_" + gpt_model + index_name + '.csv'
df_2 = pd.read_csv(pred_path)

# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
df = pd.concat([df_1, df_2], ignore_index=True)

index_name  = '_index0_to_134'
df.to_csv(pred_dir + prompt_ver + "_" + gpt_model + index_name  + '.csv', index=False)

  df = df_1.append(df_2, ignore_index=True)
