In [13]:
import pandas as pd

# Model display names and the validation-prediction CSV of each model.
# The two lists are positional: files[i] holds the predictions of models[i].
models = ["BERTweet", "HGBC", "SVM", "CNN"]
files = [
    "data/val_predictions_bertweet.csv",
    "data/val_predictions_tf_idf_hgbc.csv",
    "data/val_predictions_tf_idf_svm.csv",
    "data/val_predictions_cnn.csv",
]

# Sampling configuration: a fixed seed makes the sampled error table
# reproducible across kernel restarts.
N_ERROR_SAMPLES = 10
SAMPLE_SEED = 42

# Step 1: Read the CSV files (only the columns used below)
dfs = [pd.read_csv(file, usecols=["tweet", "label", "prediction"]) for file in files]

# Step 2: Concatenate the DataFrames
# Assuming the 'tweet' and 'label' columns are identical across files, so that
# indexing on them lines up each model's 'prediction' column side by side.
merged_df = pd.concat([df.set_index(['tweet', 'label']) for df in dfs], axis=1)
merged_df.columns = models

# Step 3: Create Error Columns
# "<model>_first"  -> label is 0 but the model predicted 1 (false positive)
# "<model>_second" -> label is 1 but the model predicted 0 (false negative)
labels = merged_df.index.get_level_values('label')
error_columns = []
for model in models:
    merged_df[f"{model}_first"] = (merged_df[model] == 1) & (labels == 0)
    merged_df[f"{model}_second"] = (merged_df[model] == 0) & (labels == 1)
    error_columns += [f"{model}_first", f"{model}_second"]

# Step 4: Filter Rows with Errors
# Keep every row on which at least one model made either kind of error.
# Derived from `models`, so the filter stays in sync if the model list changes.
error_rows = merged_df[merged_df[error_columns].any(axis=1)]

# Resetting index to bring 'tweet' and 'label' back as columns
final_df = error_rows.reset_index()

# Draw a reproducible sample; clamp the size so the cell does not raise when
# there are fewer error rows than requested.
df_10 = final_df.sample(n=min(N_ERROR_SAMPLES, len(final_df)), random_state=SAMPLE_SEED)
df_10

Unnamed: 0,tweet,label,BERTweet,HGBC,SVM,CNN,BERTweet_first,BERTweet_second,HGBC_first,HGBC_second,SVM_first,SVM_second,CNN_first,CNN_second
265,Neil_Eastwood77: I AM A KNOBHEAD!! Bin Laden f...,1,0,1,1,1,False,True,False,False,False,False,False,False
882,Rt hirochii0: There is no country that making ...,1,0,1,1,1,False,True,False,False,False,False,False,False
674,she's a natural disaster she's the last of the...,0,1,0,0,0,True,False,False,False,False,False,False,False
513,@eeenice221 true because of the truck that cau...,1,1,0,0,1,False,False,False,True,False,True,False,False
862,@KurtSchlichter He's already done it by negot...,1,0,0,0,1,False,True,False,True,False,True,False,False
802,In your eyes I see the hope\r\nI once knew.\r\...,0,1,0,0,0,True,False,False,False,False,False,False,False
666,13 reasons why we love women in the military ...,0,1,0,0,0,True,False,False,False,False,False,False,False
153,E-Hutch is da bomb ?? http://t.co/aqmpxzo3V1,0,0,0,0,1,False,False,False,False,False,False,True,False
619,My baby girls car wreak this afternoon thank G...,1,0,0,0,0,False,True,False,True,False,True,False,True
917,They turned Jasmines house into a war zone. ??...,0,1,0,0,0,True,False,False,False,False,False,False,False


In [19]:
# Columns that hold 0/1 class ids: the gold label followed by one
# prediction column per model.
class_columns = ["label", "BERTweet", "HGBC", "SVM", "CNN"]

# Reduce the sample to the tweet text plus the class-id columns and
# order the rows by gold label.
df_10_small = df_10[["tweet"] + class_columns].sort_values(by="label")

# Swap the numeric class ids for readable names
# (0 -> 'no disaster', 1 -> 'disaster') in every class-id column.
for column in class_columns:
    df_10_small[column] = df_10_small[column].replace(
        [0, 1], ['no disaster', 'disaster']
    )

df_10_small

Unnamed: 0,tweet,label,BERTweet,HGBC,SVM,CNN
674,she's a natural disaster she's the last of the...,no disaster,disaster,no disaster,no disaster,no disaster
802,In your eyes I see the hope\r\nI once knew.\r\...,no disaster,disaster,no disaster,no disaster,no disaster
666,13 reasons why we love women in the military ...,no disaster,disaster,no disaster,no disaster,no disaster
153,E-Hutch is da bomb ?? http://t.co/aqmpxzo3V1,no disaster,no disaster,no disaster,no disaster,disaster
917,They turned Jasmines house into a war zone. ??...,no disaster,disaster,no disaster,no disaster,no disaster
265,Neil_Eastwood77: I AM A KNOBHEAD!! Bin Laden f...,disaster,no disaster,disaster,disaster,disaster
882,Rt hirochii0: There is no country that making ...,disaster,no disaster,disaster,disaster,disaster
513,@eeenice221 true because of the truck that cau...,disaster,disaster,no disaster,no disaster,disaster
862,@KurtSchlichter He's already done it by negot...,disaster,no disaster,no disaster,no disaster,disaster
619,My baby girls car wreak this afternoon thank G...,disaster,no disaster,no disaster,no disaster,no disaster


In [18]:
# Persist the relabelled error sample as CSV, without the row index.
# NOTE(review): the execution counts show this cell (In [18]) last ran before
# the relabelling cell (In [19]); re-run the notebook top to bottom so the file
# on disk matches the table displayed above.
df_10_small.to_csv(path_or_buf="data/val_errors.csv", index=False)