# Get accuracy and F1 for all models

In [1]:
from sklearn.metrics import classification_report
import pandas as pd
import numpy as np
import re
import io
import pickle

## from text

In [2]:
test_set=pd.read_csv("./test_yelp_1k.csv")
gpt3=pd.read_csv("./text-davinci-003_test_1k.csv")
llama=pd.read_csv("./pkl_pred/llama7b_test_1k.csv")
alpaca=pd.read_csv("./pkl_pred/alpaca7b_plain_test_1k.csv")
gpt2=pd.read_csv("./pkl_pred/gpt2xl_test_1k.csv")
gpt35=pd.read_csv("./gpt-3.5-turbo_test_1k.csv")
df_subsets=pd.read_csv("./test_1k_subsets.csv")
gpt4=pd.read_csv("./gpt-4_test_1k.csv")
prompts=pd.read_csv("./prompts_test_1k.csv")


In [3]:
def parse_prediction(df,start=None,end=None):
    if start:
        df['pred_label']=df.pred.str[start:]
    elif end:
        df['pred_label']=df.pred.str[:end]
    else:
        df['pred_label']=df.pred
    df['pred_label'] = df['pred_label'].str.replace("one","1").replace("two","2").replace("three","3").replace("four","4").replace("five","5")
    df['pred_label'] = df['pred_label'].str.extract(r'(\d)')
    #df['pred_label']=df['pred_label'].str.extract(r'(\d+)')
    #df=df.loc[df.pred_label.isin(["1",'2',"3",'4','5'])]
    df['pred_label']=df['pred_label'].fillna(99)
    df['pred_label']=df['pred_label'].apply(int)
    df['pred_label']=df['pred_label']-1
    return df

In [4]:
def replace_prompt(df,prompts):
    df=df.merge(prompts,on=['review_id','prompt_type','prompt_id'],how='left')
    escaped_values = [re.escape(value) for value in df['prompt']]
    pattern = '|'.join(escaped_values)
    # Replace content of column2 with an empty string if it exists in column1
    df['pred'] = df['pred'].str.replace(pattern, '', regex=True)
    df=df.loc[:,['pred', 'review_id', 'prompt_type', 'prompt_id']]
    return df

## Clean predictions

In [5]:
#alpaca['pred'] = alpaca['pred'].str.extract(r'### Response:(.*)')
alpaca=replace_prompt(alpaca,prompts)

llama=replace_prompt(llama,prompts)

gpt2=replace_prompt(gpt2,prompts)

In [6]:
alpaca=parse_prediction(alpaca.copy(),end=45)
llama=parse_prediction(llama.copy(),end=45)
gpt2=parse_prediction(gpt2.copy(),end=15)
gpt3=parse_prediction(gpt3.copy())
gpt35=parse_prediction(gpt35.copy())
gpt4=parse_prediction(gpt4.copy())

In [7]:
alpaca=alpaca.loc[:,['review_id','prompt_type','prompt_id','pred_label']].rename(columns={'pred_label':'pred_alpaca'})
llama=llama.loc[:,['review_id','prompt_type','prompt_id','pred_label']].rename(columns={'pred_label':'pred_llama'})
gpt2=gpt2.loc[:,['review_id','prompt_type','prompt_id','pred_label']].rename(columns={'pred_label':'pred_gpt2'})
gpt3=gpt3.loc[:,['review_id','prompt_type','prompt_id','pred_label']].rename(columns={'pred_label':'pred_gpt3'})
gpt35=gpt35.loc[:,['review_id','prompt_type','prompt_id','pred_label']].rename(columns={'pred_label':'pred_gpt35'})
gpt4=gpt4.loc[:,['review_id','prompt_type','prompt_id','pred_label']].rename(columns={'pred_label':'pred_gpt4'})

In [8]:
all_pred=gpt3.merge(gpt2,on=['review_id','prompt_type','prompt_id'],how='outer').merge(llama,on=['review_id','prompt_type','prompt_id'],how='outer').merge(alpaca,on=['review_id','prompt_type','prompt_id'],how='outer').merge(gpt35,on=['review_id','prompt_type','prompt_id'],how='outer').merge(gpt4,on=['review_id','prompt_type','prompt_id'],how='outer')

In [9]:
test_set=test_set.merge(all_pred,on=['review_id'],how='left')

In [10]:
test_set.shape

(15000, 11)

## get info subsets

In [11]:
test_set=test_set.merge(df_subsets,on='review_id',how='left')

In [12]:
seed_value = 0
np.random.seed(seed_value)

random_values = np.random.randint(0, 5, size=test_set.shape[0])

test_set['pred_random']=random_values

### gpt2_proba
- for gpt2 get probabilities

In [13]:
class CPU_Unpickler(pickle.Unpickler):
    def find_class(self, module, name):
        if module == 'torch.storage' and name == '_load_from_bytes':
            return lambda b: torch.load(io.BytesIO(b), map_location='cpu')
        else:
            return super().find_class(module, name)


def get_key_with_max_value(dictionary):
    try:
        key=max(dictionary, key=dictionary.get)
    except:
        print(dictionary)
        return None
    return key

def get_key_with_max_value_options(dictionary):
    try:
        new_dict={}
        new_dict[' 1']=dictionary.get('1',0)+dictionary.get(' 1',0)+dictionary.get('one',0)+dictionary.get(' one',0)
        new_dict[' 2']=dictionary.get('2',0)+dictionary.get(' 2',0)+dictionary.get('two',0)+dictionary.get(' two',0)
        new_dict[' 3']=dictionary.get('3',0)+dictionary.get(' 3',0)+dictionary.get('three',0)+dictionary.get(' three',0)
        new_dict[' 4']=dictionary.get('4',0)+dictionary.get(' 4',0)+dictionary.get('four',0)+dictionary.get(' four',0)
        new_dict[' 5']=dictionary.get('5',0)+dictionary.get(' 5',0)+dictionary.get('five',0)+dictionary.get(' five',0)
        key=max(new_dict, key=new_dict.get)
    except Exception as e:
        print(e)
        print(dictionary)
        return None
    return key

def get_key_value_with_max_value(dictionary):
    max_key = max(dictionary, key=dictionary.get)
    max_value = dictionary[max_key]
    return max_value


In [14]:
with open('./pkl_pred/res_gpt2xl_test_1k.pkl', 'rb') as file:
    #res_c1 = pickle.load(file)
    contents = CPU_Unpickler(file).load()

In [15]:
responses=pd.DataFrame(contents)

gt=test_set.loc[:,['review_id','prompt_type', 'prompt_id','label']].copy()

responses=gt.merge(responses,on=['review_id','prompt_type', 'prompt_id'],how='left')

p0=responses.probas_0.apply(pd.Series)

In [16]:
responses['max_value_0'] = responses['probas_0'].apply(lambda x: get_key_with_max_value_options(x))

'float' object has no attribute 'get'
nan
'float' object has no attribute 'get'
nan
'float' object has no attribute 'get'
nan
'float' object has no attribute 'get'
nan
'float' object has no attribute 'get'
nan
'float' object has no attribute 'get'
nan
'float' object has no attribute 'get'
nan
'float' object has no attribute 'get'
nan
'float' object has no attribute 'get'
nan
'float' object has no attribute 'get'
nan
'float' object has no attribute 'get'
nan
'float' object has no attribute 'get'
nan
'float' object has no attribute 'get'
nan
'float' object has no attribute 'get'
nan
'float' object has no attribute 'get'
nan
'float' object has no attribute 'get'
nan
'float' object has no attribute 'get'
nan
'float' object has no attribute 'get'
nan
'float' object has no attribute 'get'
nan
'float' object has no attribute 'get'
nan
'float' object has no attribute 'get'
nan
'float' object has no attribute 'get'
nan
'float' object has no attribute 'get'
nan
'float' object has no attribute 'g

In [17]:
map={' 1':1, ' 5':5, ' 3':3, ' 4':4, ' 2':2, ' one':1, ' five':5, ' three':3,' four':4, ' two':2}

responses['pred_gpt2_proba']=responses['max_value_0'].map(map)
responses['pred_gpt2_proba']=responses['pred_gpt2_proba'].fillna(99)
responses['pred_gpt2_proba']=responses['pred_gpt2_proba'].apply(int)
responses['pred_gpt2_proba']=responses['pred_gpt2_proba']-1

responses_proba_gpt=responses.loc[:,['review_id','prompt_type', 'prompt_id','pred_gpt2_proba']]

## Predictions prompt c0

In [18]:
test_set=test_set.merge(responses_proba_gpt,on=['review_id','prompt_type','prompt_id'],how='left')

In [19]:
working_df=test_set.loc[test_set.prompt_type==0].copy()

In [None]:
accuracies=[]
f1s=[]
models=[]
prompt_ids=[]
for model in ["random","gpt2","gpt3","llama","alpaca","gpt35","gpt4","gpt2_proba"]:
    for prompt_id in [0,1,2,3,4]:
        print("prompt_id ",prompt_id)
        df_temp=working_df.loc[( (working_df.prompt_id==prompt_id)& (working_df['pred_'+model].isin([0,1,2,3,4])))]
        print(model)
        print(classification_report(df_temp.label,df_temp['pred_'+model]))
        rep=classification_report(df_temp.label,df_temp['pred_'+model],output_dict=True,digits=4)
        accuracies.append(rep['accuracy']*100)
        f1s.append(rep['weighted avg']['f1-score']*100)
        models.append(model)
        prompt_ids.append(prompt_id)

In [21]:
results={'model':models,'prompt_id':prompt_ids,'accuracy':accuracies,'f1':f1s}
results_df=pd.DataFrame(results)

results_df=results_df.groupby(['model']).agg({'accuracy':[np.mean,np.std],'f1':[np.mean,np.std]})

results_df=round(results_df,2)

results_df.columns =['_'.join(col) for col in results_df.columns]

results_df=results_df.reset_index()
order={"random":0,'gpt2_proba':1,'llama':2,'alpaca':3,'gpt3':4,'gpt35':5,'gpt4':6,'gpt2':11,"llama13":12}
results_df['order']=results_df.model.map(order)
results_df=results_df.sort_values('order')
table = results_df.apply(lambda x: "{:.2f} {{\\tiny$\pm${:.2f}}}".format(x['accuracy_mean'], x['accuracy_std']), axis=1)

table_f1 = results_df.apply(lambda x: "{:.2f} {{\\tiny$\pm${:.2f}}}".format(x['f1_mean'], x['f1_std']), axis=1)


In [22]:
results_df

Unnamed: 0,model,accuracy_mean,accuracy_std,f1_mean,f1_std,order
7,random,19.84,0.54,19.86,0.6,0
2,gpt2_proba,23.06,2.1,10.23,4.12,1
6,llama,39.28,5.07,31.78,5.32,2
0,alpaca,47.72,4.19,46.01,5.35,3
3,gpt3,53.22,1.35,52.71,1.73,4
4,gpt35,58.36,4.13,57.98,5.11,5
5,gpt4,59.84,4.17,59.54,4.69,6
1,gpt2,22.89,2.2,10.26,3.81,11


In [23]:
print(' & '.join(table_f1))

19.86 {\tiny$\pm$0.60} & 10.23 {\tiny$\pm$4.12} & 31.78 {\tiny$\pm$5.32} & 46.01 {\tiny$\pm$5.35} & 52.71 {\tiny$\pm$1.73} & 57.98 {\tiny$\pm$5.11} & 59.54 {\tiny$\pm$4.69} & 10.26 {\tiny$\pm$3.81}


In [24]:
print(' & '.join(table))

19.84 {\tiny$\pm$0.54} & 23.06 {\tiny$\pm$2.10} & 39.28 {\tiny$\pm$5.07} & 47.72 {\tiny$\pm$4.19} & 53.22 {\tiny$\pm$1.35} & 58.36 {\tiny$\pm$4.13} & 59.84 {\tiny$\pm$4.17} & 22.89 {\tiny$\pm$2.20}


## results by subset

In [25]:
test_set_c1=test_set.loc[((test_set.pred_class=='C1') & (test_set.prompt_type==0)),:]
test_set_c2=test_set.loc[((test_set.pred_class=='C2') & (test_set.prompt_type==0)),:]

In [26]:
working_df_sub=test_set_c2.copy()

In [None]:
accuracies=[]
f1s=[]
models=[]
prompt_ids=[]
for model in ["random","gpt2","gpt3","llama","alpaca","gpt35","gpt4","gpt2_proba"]:
    for prompt_id in [0,1,2,3,4]:
        print("prompt_id ",prompt_id)
        df_temp=working_df_sub.loc[( (working_df_sub.prompt_id==prompt_id)& (working_df_sub['pred_'+model].isin([0,1,2,3,4])))]
        print(model)
        print(classification_report(df_temp.label,df_temp['pred_'+model]))
        rep=classification_report(df_temp.label,df_temp['pred_'+model],output_dict=True,digits=4)
        accuracies.append(rep['accuracy']*100)
        f1s.append(rep['weighted avg']['f1-score']*100)
        models.append(model)
        prompt_ids.append(prompt_id)

In [28]:
results={'model':models,'prompt_id':prompt_ids,'accuracy':accuracies,'f1':f1s}
results_df=pd.DataFrame(results)

results_df=results_df.groupby(['model']).agg({'accuracy':[np.mean,np.std],'f1':[np.mean,np.std]})

results_df=round(results_df,2)

results_df.columns =['_'.join(col) for col in results_df.columns]

results_df=results_df.reset_index()
order={"random":0,'gpt2_proba':1,'llama':2,'alpaca':3,'gpt3':4,'gpt35':5,'gpt4':6,'gpt2':11,"llama13":12}
results_df['order']=results_df.model.map(order)
results_df=results_df.sort_values('order')
table = results_df.apply(lambda x: "{:.2f} {{\\tiny$\pm${:.2f}}}".format(x['accuracy_mean'], x['accuracy_std']), axis=1)

table_f1 = results_df.apply(lambda x: "{:.2f} {{\\tiny$\pm${:.2f}}}".format(x['f1_mean'], x['f1_std']), axis=1)


In [29]:
results_df

Unnamed: 0,model,accuracy_mean,accuracy_std,f1_mean,f1_std,order
7,random,19.85,1.92,20.69,2.04,0
2,gpt2_proba,27.09,2.59,13.43,5.08,1
6,llama,37.4,6.34,31.44,6.31,2
0,alpaca,49.22,4.73,50.06,5.19,3
3,gpt3,61.89,1.0,62.77,1.03,4
4,gpt35,55.81,6.65,57.18,7.7,5
5,gpt4,58.78,6.36,60.38,6.42,6
1,gpt2,26.99,2.56,13.66,4.58,11


In [30]:
print(' & '.join(table_f1))

20.69 {\tiny$\pm$2.04} & 13.43 {\tiny$\pm$5.08} & 31.44 {\tiny$\pm$6.31} & 50.06 {\tiny$\pm$5.19} & 62.77 {\tiny$\pm$1.03} & 57.18 {\tiny$\pm$7.70} & 60.38 {\tiny$\pm$6.42} & 13.66 {\tiny$\pm$4.58}


In [31]:
print(' & '.join(table))

19.85 {\tiny$\pm$1.92} & 27.09 {\tiny$\pm$2.59} & 37.40 {\tiny$\pm$6.34} & 49.22 {\tiny$\pm$4.73} & 61.89 {\tiny$\pm$1.00} & 55.81 {\tiny$\pm$6.65} & 58.78 {\tiny$\pm$6.36} & 26.99 {\tiny$\pm$2.56}


## prompt c1 & c2

In [32]:
df_subsets_500=df_subsets.head(500)

In [33]:
test_set_c1=test_set.loc[((test_set.pred_class=='C1') & (test_set.prompt_type==2)),:]
test_set_c2=test_set.loc[((test_set.pred_class=='C2') & (test_set.prompt_type==2)),:]

In [34]:
working_df_sub=test_set_c1.loc[test_set_c1.review_id.isin(df_subsets_500.review_id.unique())].copy()

In [None]:
accuracies=[]
f1s=[]
models=[]
prompt_ids=[]
for model in ["random","gpt2","gpt3","llama","alpaca","gpt35","gpt4","gpt2_proba"]:
    for prompt_id in [0,1,2,3,4]:
        print("prompt_id ",prompt_id)
        df_temp=working_df_sub.loc[( (working_df_sub.prompt_id==prompt_id)& (working_df_sub['pred_'+model].isin([0,1,2,3,4])))]
        print(model)
        print(classification_report(df_temp.label,df_temp['pred_'+model]))
        rep=classification_report(df_temp.label,df_temp['pred_'+model],output_dict=True,digits=4)
        accuracies.append(rep['accuracy']*100)
        f1s.append(rep['weighted avg']['f1-score']*100)
        models.append(model)
        prompt_ids.append(prompt_id)

In [36]:
results={'model':models,'prompt_id':prompt_ids,'accuracy':accuracies,'f1':f1s}
results_df=pd.DataFrame(results)

results_df=results_df.groupby(['model']).agg({'accuracy':[np.mean,np.std],'f1':[np.mean,np.std]})

results_df=round(results_df,2)

results_df.columns =['_'.join(col) for col in results_df.columns]

results_df=results_df.reset_index()
order={"random":0,'gpt2_proba':1,'llama':2,'alpaca':3,'gpt3':4,'gpt35':5,'gpt4':6,'gpt2':11,"llama13":12}
results_df['order']=results_df.model.map(order)
results_df=results_df.sort_values('order')
table = results_df.apply(lambda x: "{:.2f} {{\\tiny$\pm${:.2f}}}".format(x['accuracy_mean'], x['accuracy_std']), axis=1)

table_f1 = results_df.apply(lambda x: "{:.2f} {{\\tiny$\pm${:.2f}}}".format(x['f1_mean'], x['f1_std']), axis=1)


In [37]:
results_df

Unnamed: 0,model,accuracy_mean,accuracy_std,f1_mean,f1_std,order
7,random,19.73,2.1,20.17,2.3,0
2,gpt2_proba,25.69,6.74,18.91,4.95,1
6,llama,48.32,8.38,41.83,8.77,2
0,alpaca,46.74,3.8,41.26,4.36,3
3,gpt3,59.8,1.92,57.82,1.11,4
4,gpt35,55.01,5.99,54.38,6.43,5
5,gpt4,60.13,3.27,59.96,2.92,6
1,gpt2,30.35,5.53,21.51,4.35,11


In [38]:
print(' & '.join(table_f1))

20.17 {\tiny$\pm$2.30} & 18.91 {\tiny$\pm$4.95} & 41.83 {\tiny$\pm$8.77} & 41.26 {\tiny$\pm$4.36} & 57.82 {\tiny$\pm$1.11} & 54.38 {\tiny$\pm$6.43} & 59.96 {\tiny$\pm$2.92} & 21.51 {\tiny$\pm$4.35}


In [39]:
print(' & '.join(table))

19.73 {\tiny$\pm$2.10} & 25.69 {\tiny$\pm$6.74} & 48.32 {\tiny$\pm$8.38} & 46.74 {\tiny$\pm$3.80} & 59.80 {\tiny$\pm$1.92} & 55.01 {\tiny$\pm$5.99} & 60.13 {\tiny$\pm$3.27} & 30.35 {\tiny$\pm$5.53}
