In [2]:
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import load_breast_cancer
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score
from sklearn.preprocessing import LabelEncoder


In [3]:
# Colab-only: mount Google Drive into the runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# Per-model class scores (columns modelN_Class0 / modelN_Class1) for each split.
train_boosting_df = pd.read_csv('../Boosting Data/Train_v2.csv')
val_boosting_df = pd.read_csv('../Boosting Data/Validation_v2.csv')
# Original Constraint dataset splits.
train_df = pd.read_csv('../../data/Constraint_Train.csv')
val_df = pd.read_csv('../../data/Constraint_Val.csv')
test_df = pd.read_csv('../../data/Constraint_Test.csv')
# Fixed: the path used to start with '/', which made it absolute ('/../../data'
# resolves to '/data') instead of pointing at the same relative data directory
# as the other Constraint files above.
test_label_df = pd.read_csv('../../data/english_test_with_labels.csv')
test_boosting_df = pd.read_csv('../Boosting Data/Test_v2.csv')


In [5]:
# (rows, columns) of the train / validation / test stacking frames.
tuple(frame.shape for frame in (train_boosting_df, val_boosting_df, test_boosting_df))

((6420, 10), (2140, 10), (2140, 9))

In [10]:
# ERNIE 2.0 outputs are stored under `Model4_*` column names; relabel them as
# `Model5_*` so they do not clash with the `model4_*` columns of the stacking frames.
ernie_val = (
    pd.read_csv('../Intermediate ERNIE2.0/ERNIE/val_ernie2.0_results.csv')
    .rename(columns={'Model4_class0': 'Model5_class0', 'Model4_class1': 'Model5_class1'})
    [["Model5_class0", "Model5_class1", "Label"]]
)
# The test results are read without a `Label` column.
ernie_test = (
    pd.read_csv('../Intermediate ERNIE2.0/ERNIE/test_ernie2.0_results.csv')
    .rename(columns={'Model4_class0': 'Model5_class0', 'Model4_class1': 'Model5_class1'})
    [["Model5_class0", "Model5_class1"]]
)


In [11]:
# Preview the first three rows of the test stacking frame.
test_boosting_df.head(n=3)

Unnamed: 0,id,model1_Class0,model1_Class1,model2_Class0,model2_Class1,model3_Class0,model3_Class1,model4_Class0,model4_Class1
0,1,0.999851,0.000149,0.999063,0.000937,0.999967,3.3e-05,0.999963,3.7e-05
1,2,0.000422,0.999578,0.000902,0.999098,4.5e-05,0.999955,0.000112,0.999888
2,3,0.000401,0.9996,0.000954,0.999046,3.3e-05,0.999967,0.00017,0.99983


In [None]:
# Append the ERNIE 2.0 (Model5) class scores to the validation frame and keep
# the ground-truth `Label` as the final column.
val_labels = val_boosting_df['Label'].values
val_boosting_df = pd.concat(
    [
        # Drop `Label` by name rather than by position, and drop any Model5
        # columns left by a previous run so re-executing the cell does not
        # create duplicate columns.
        val_boosting_df.drop(
            columns=["Label", "Model5_class0", "Model5_class1"], errors="ignore"
        ),
        ernie_val[["Model5_class0", "Model5_class1"]],
    ],
    axis=1,  # must be a keyword: pandas >= 2.0 rejects a positional axis
)
val_boosting_df["Label"] = val_labels
val_boosting_df.head()


Unnamed: 0,id,model1_Class0,model1_Class1,model2_Class0,model2_Class1,model3_Class0,model3_Class1,model4_Class0,model4_Class1,Model5_class0,Model5_class1,Label
0,1,0.001269,0.998731,0.000938,0.999062,0.000179,0.999821,0.028954,0.971046,0.00025,0.99975,1
1,2,0.991674,0.008326,0.000884,0.999116,0.010019,0.989981,0.002718,0.997282,0.005722,0.994278,1
2,3,0.000385,0.999615,0.000958,0.999042,8.4e-05,0.999916,5.3e-05,0.999947,0.000249,0.999751,1
3,4,0.000416,0.999584,0.000936,0.999065,0.000109,0.999891,0.001268,0.998732,0.000251,0.999749,1
4,5,0.999851,0.000149,0.999052,0.000948,0.999784,0.000216,0.999951,4.9e-05,0.999373,0.000627,0


In [None]:
# Append the ERNIE 2.0 (Model5) class scores to the test frame.
test_boosting_df = pd.concat(
    [
        # Drop Model5 columns left by a previous run so re-executing the cell
        # does not create duplicate columns.
        test_boosting_df.drop(
            columns=["Model5_class0", "Model5_class1"], errors="ignore"
        ),
        ernie_test[["Model5_class0", "Model5_class1"]],
    ],
    axis=1,  # must be a keyword: pandas >= 2.0 rejects a positional axis
)
test_boosting_df.head()

Unnamed: 0,id,model1_Class0,model1_Class1,model2_Class0,model2_Class1,model3_Class0,model3_Class1,model4_Class0,model4_Class1,Model5_class0,Model5_class1
0,1,0.999851,0.000149,0.999063,0.000937,0.999967,3.3e-05,0.999963,3.7e-05,0.999371,0.000629
1,2,0.000422,0.999578,0.000902,0.999098,4.5e-05,0.999955,0.000112,0.999888,0.000257,0.999743
2,3,0.000401,0.9996,0.000954,0.999046,3.3e-05,0.999967,0.00017,0.99983,0.000247,0.999753
3,4,0.999858,0.000142,0.999082,0.000918,0.999967,3.3e-05,0.999956,4.4e-05,0.999376,0.000624
4,5,0.999852,0.000148,0.999076,0.000924,0.999953,4.7e-05,0.999912,8.8e-05,0.994812,0.005188


In [None]:
# (rows, columns) of the ERNIE validation / test result frames.
tuple(frame.shape for frame in (ernie_val, ernie_test))

((2140, 3), (2140, 2))

In [None]:
# (rows, columns) of the test stacking frame at this point in the notebook.
test_boosting_df.shape

(2140, 11)

In [None]:
# Preview the first five rows of the test stacking frame.
test_boosting_df.head(n=5)

Unnamed: 0,id,model1_Class0,model1_Class1,model2_Class0,model2_Class1,model3_Class0,model3_Class1,model4_Class0,model4_Class1,Model5_class0,Model5_class1
0,1,0.999851,0.000149,0.999063,0.000937,0.999967,3.3e-05,0.999963,3.7e-05,0.999371,0.000629
1,2,0.000422,0.999578,0.000902,0.999098,4.5e-05,0.999955,0.000112,0.999888,0.000257,0.999743
2,3,0.000401,0.9996,0.000954,0.999046,3.3e-05,0.999967,0.00017,0.99983,0.000247,0.999753
3,4,0.999858,0.000142,0.999082,0.000918,0.999967,3.3e-05,0.999956,4.4e-05,0.999376,0.000624
4,5,0.999852,0.000148,0.999076,0.000924,0.999953,4.7e-05,0.999912,8.8e-05,0.994812,0.005188


In [6]:
def soft_voting(row):
  """Return the class (0 or 1) with the larger mean score over model1..model4.

  `row` is indexed by the `modelN_Class0` / `modelN_Class1` names.
  Class 0 is returned only when its mean is strictly larger; ties give 1.
  Any `Model5_*` entries on the row are not used.
  """
  model_ids = (1, 2, 3, 4)
  mean_scores = []
  for cls in (0, 1):
    total = sum(row[f'model{m}_Class{cls}'] for m in model_ids)
    mean_scores.append(total / 4)
  if mean_scores[0] > mean_scores[1]:
    return 0
  return 1

# Store the row-wise soft-vote prediction as `Soft_Vote` on every split.
train_boosting_df['Soft_Vote'] = train_boosting_df.apply(soft_voting, axis=1)
val_boosting_df['Soft_Vote'] = val_boosting_df.apply(soft_voting, axis=1)
test_boosting_df['Soft_Vote'] = test_boosting_df.apply(soft_voting, axis=1)


In [7]:
# Preview the first two rows, including the new `Soft_Vote` column.
test_boosting_df.head(n=2)

Unnamed: 0,id,model1_Class0,model1_Class1,model2_Class0,model2_Class1,model3_Class0,model3_Class1,model4_Class0,model4_Class1,Soft_Vote
0,1,0.999851,0.000149,0.999063,0.000937,0.999967,3.3e-05,0.999963,3.7e-05,0
1,2,0.000422,0.999578,0.000902,0.999098,4.5e-05,0.999955,0.000112,0.999888,1


In [8]:
def hard_voting(row):
  """Majority vote over model1..model4, with a mean-score tie-break.

  Each model votes for the class with its larger score (an exact tie within
  a model counts as a vote for class 0, since argmax picks the first maximum).
  More than two votes for class 0 returns 0, fewer than two returns 1.
  On a 2-2 split the class with the strictly larger mean score wins, and an
  equal mean resolves to 1. Any `Model5_*` entries on the row are not used.
  """
  model_ids = (1, 2, 3, 4)
  picks = [
    np.argmax(np.array([row[f'model{m}_Class0'], row[f'model{m}_Class1']]))
    for m in model_ids
  ]
  votes_for_0 = sum(1 for pick in picks if pick == 0)

  if votes_for_0 > 2:
    return 0
  if votes_for_0 < 2:
    return 1

  # Exactly two votes each: fall back to comparing the mean scores.
  mean_0 = sum(row[f'model{m}_Class0'] for m in model_ids) / 4
  mean_1 = sum(row[f'model{m}_Class1'] for m in model_ids) / 4
  return 0 if mean_0 > mean_1 else 1

# Store the row-wise majority-vote prediction as `Hard_Vote` on every split.
train_boosting_df['Hard_Vote'] = train_boosting_df.apply(hard_voting, axis=1)
val_boosting_df['Hard_Vote'] = val_boosting_df.apply(hard_voting, axis=1)
test_boosting_df['Hard_Vote'] = test_boosting_df.apply(hard_voting, axis=1)


In [None]:
# def hard_voting(row):
#   a = np.argmax(np.array([[row['model1_Class0'], row['model1_Class1']]]), 1)[0]
#   b = np.argmax(np.array([[row['model2_Class0'], row['model2_Class1']]]), 1)[0]
#   c = np.argmax(np.array([[row['model3_Class0'], row['model3_Class1']]]), 1)[0]
#   d = np.argmax(np.array([[row['model4_Class0'], row['model4_Class1']]]), 1)[0]

#   cnt_0 = 0
#   for i in [a,b,c,d]:
#     if i==0:
#       cnt_0+=1
  
#   if cnt_0>2:
#     return 0
#   elif cnt_0==2:
#     class0_val = (row['model1_Class0'] + row['model2_Class0'] + row['model3_Class0'] + row['model4_Class0'])/4
#     class1_val = (row['model1_Class1'] + row['model2_Class1'] + row['model3_Class1'] + row['model4_Class1'])/4
#     return 0 if class0_val > class1_val else 1                                                                                                              
#   else:
#     return 1

# val_boosting_df['Hard_Vote'] = val_boosting_df.apply(lambda x: hard_voting(x), 1)
# test_boosting_df['Hard_Vote'] = test_boosting_df.apply(lambda x: hard_voting(x), 1)


In [9]:
def label_id_to_label(row, col):
  """Translate the numeric class stored at `row[col]` into its text label.

  A value equal to 0 maps to 'real'; any other value maps to 'fake'.
  """
  if row[col] == 0:
    return 'real'
  return 'fake'

# Text label ('real' / 'fake') derived from the soft-vote prediction on every split.
train_boosting_df['label'] = train_boosting_df.apply(label_id_to_label, axis=1, args=('Soft_Vote',))
val_boosting_df['label'] = val_boosting_df.apply(label_id_to_label, axis=1, args=('Soft_Vote',))
test_boosting_df['label'] = test_boosting_df.apply(label_id_to_label, axis=1, args=('Soft_Vote',))


In [18]:
# Keep only the id and the predicted text label.
# Note: this overwrites test_boosting_df, so the score columns are gone afterwards.
test_boosting_df = test_boosting_df.loc[:, ["id", "label"]]
test_boosting_df.head(n=5)

Unnamed: 0,id,label
0,1,real
1,2,fake
2,3,fake
3,4,real
4,5,real


In [None]:
# test_df.to_csv('/content/drive/MyDrive/Constraint/Submission_v3.csv', index=False)

In [None]:
# Validation confusion matrix for the soft-vote predictions.
confusion_matrix(
    y_true=val_boosting_df['Label'].values,
    y_pred=val_boosting_df['Soft_Vote'].values,
)

array([[1109,   11],
       [  26,  994]])

In [None]:
# Validation accuracy of the soft-vote predictions.
accuracy_score(
    y_true=val_boosting_df['Label'].values,
    y_pred=val_boosting_df['Soft_Vote'].values,
)

0.9827102803738318

In [None]:
# Validation confusion matrix for the hard-vote predictions.
confusion_matrix(
    y_true=val_boosting_df['Label'].values,
    y_pred=val_boosting_df['Hard_Vote'].values,
)

array([[1111,    9],
       [  27,  993]])

In [None]:
# Validation accuracy of the hard-vote predictions.
accuracy_score(
    y_true=val_boosting_df['Label'].values,
    y_pred=val_boosting_df['Hard_Vote'].values,
)

0.983177570093458

In [None]:
# Validation F1 score of the soft-vote predictions (default settings).
f1_score(
    y_true=val_boosting_df['Label'].values,
    y_pred=val_boosting_df['Soft_Vote'].values,
)

0.9817283950617283

In [None]:
def get_label(row):
  """Translate the numeric ground truth in `row["Label"]` into its text label.

  A value equal to 0 maps to 'real'; any other value maps to 'fake'.
  """
  if row["Label"] == 0:
    return 'real'
  return 'fake'

# Text version of the ground-truth label, for comparison with the predicted `label`.
val_boosting_df["Actual_Label"] = val_boosting_df.apply(get_label, axis=1)

In [None]:
# Preview the validation frame with its vote and label columns.
val_boosting_df.head(n=5)

Unnamed: 0,id,model1_Class0,model1_Class1,model2_Class0,model2_Class1,model3_Class0,model3_Class1,model4_Class0,model4_Class1,Model5_class0,Model5_class1,Label,Soft_Vote,Hard_Vote,label,Actual_Label
0,1,0.001269,0.998731,0.000938,0.999062,0.000179,0.999821,0.028954,0.971046,0.00025,0.99975,1,1,1,fake,fake
1,2,0.991674,0.008326,0.000884,0.999116,0.010019,0.989981,0.002718,0.997282,0.005722,0.994278,1,1,1,fake,fake
2,3,0.000385,0.999615,0.000958,0.999042,8.4e-05,0.999916,5.3e-05,0.999947,0.000249,0.999751,1,1,1,fake,fake
3,4,0.000416,0.999584,0.000936,0.999065,0.000109,0.999891,0.001268,0.998732,0.000251,0.999749,1,1,1,fake,fake
4,5,0.999851,0.000149,0.999052,0.000948,0.999784,0.000216,0.999951,4.9e-05,0.999373,0.000627,0,0,0,real,real


In [14]:
# Rename the ground-truth column of the labelled test set to `Actual_Label`.
test_label_df = test_label_df.rename({'label': 'Actual_Label'}, axis='columns')
test_label_df.head(n=5)

Unnamed: 0,id,tweet,Actual_Label
0,1,Our daily update is published. States reported...,real
1,2,Alfalfa is the only cure for COVID-19.,fake
2,3,President Trump Asked What He Would Do If He W...,fake
3,4,States reported 630 deaths. We are still seein...,real
4,5,This is the sixth time a global health emergen...,real


In [23]:
# temp = test_label_df.merge(test_boosting_df, on='id', how='left')
# temp = temp[temp["Actual_Label"]!=temp["label"]].reset_index(drop=True)
# temp.head()

Unnamed: 0,id,tweet,Actual_Label,label
0,11,Two interesting correlations:\n\n1) Children t...,fake,real
1,28,Govt has added #Corona disease in all existing...,fake,real
2,182,In response to the pandemic Pennsylvania Gover...,real,fake
3,185,*DNA Vaccine: injecting genetic material into ...,real,fake
4,268,Keep your newborn more than 6 feet away from y...,fake,real


In [24]:
# temp.shape

(36, 4)

In [None]:
# Validation rows where the predicted text label disagrees with the actual one;
# keep only the id and the prediction, renamed to `predicted_label`.
temp = (
    val_boosting_df
    .loc[val_boosting_df["Actual_Label"] != val_boosting_df["label"], ["id", "label"]]
    .reset_index(drop=True)
    .rename(columns={"label": "predicted_label"})
)

In [None]:
# Join the misclassified ids with the validation set on `id` (inner join).
wrong_df = temp.merge(right=val_df, how='inner', on='id')
wrong_df

Unnamed: 0,id,predicted_label,tweet,label
0,14,fake,Tomorrow April 6 we will pass 10000 coronaviru...,real
1,62,real,The Global #dietarysupplements market is antic...,fake
2,139,real,.@realdonaldtrump cherry-picked one stat to pr...,fake
3,146,real,There is no evidence that children have died b...,fake
4,149,real,The 2005 study found that chloroquine was effe...,fake
5,439,real,The N95 respirator offers the most protection ...,fake
6,497,real,In objection to WA's border closures @CliveFPa...,fake
7,610,fake,How deadly is the coronavirus #COVID19? Scient...,real
8,735,real,Health officials caution against the use of ch...,fake
9,758,real,How has alcohol consumption changed during loc...,fake


In [25]:
# Write wrong_df to CSV; index=False leaves the row index out of the file.
wrong_df.to_csv('../../Submissions/Wrong results/wrong_results_best_submission.csv', index=False)