In [1]:
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import load_breast_cancer
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score
from sklearn.preprocessing import LabelEncoder


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
train_boosting_df = pd.read_csv('../Boosting Data/Train_v3.csv')
val_boosting_df = pd.read_csv('../Boosting Data/Validation_v3.csv')
train_df = pd.read_csv('../../data/Constraint_Train.csv')
val_df = pd.read_csv('../../data/Constraint_Val.csv')
test_df = pd.read_csv('../../data/Constraint_Test.csv')
test_label_df = pd.read_csv('../../data/english_test_with_labels.csv')
test_boosting_df = pd.read_csv('../Boosting Data/Test_v3.csv')


In [4]:
train_boosting_df.shape, val_boosting_df.shape, test_boosting_df.shape

((6420, 12), (2140, 12), (2140, 11))

In [5]:
ernie_val = pd.read_csv('../Intermediate ERNIE2.0/ERNIE/val_ernie2.0_results.csv')
ernie_val = ernie_val.rename(columns={'Model4_class0': 'Model6_class0', 'Model4_class1': 'Model6_class1'})[["Model6_class0", "Model6_class1", "Label"]]
ernie_test = pd.read_csv('../Intermediate ERNIE2.0/ERNIE/test_ernie2.0_results.csv')
ernie_test = ernie_test.rename(columns={'Model4_class0': 'Model6_class0', 'Model4_class1': 'Model6_class1'})[["Model6_class0", "Model6_class1"]]


In [6]:
test_boosting_df.head(3)

Unnamed: 0,id,model1_Class0,model1_Class1,model2_Class0,model2_Class1,model3_Class0,model3_Class1,model4_Class0,model4_Class1,model5_Class0,model5_Class1
0,1,0.999851,0.000149,0.999063,0.000937,0.999967,3.3e-05,0.999963,3.7e-05,0.999984,1.6e-05
1,2,0.000422,0.999578,0.000902,0.999098,4.5e-05,0.999955,0.000112,0.999888,6.7e-05,0.999933
2,3,0.000401,0.9996,0.000954,0.999046,3.3e-05,0.999967,0.00017,0.99983,8.1e-05,0.999919


In [7]:
val_labels = val_boosting_df['Label'].values
val_boosting_df = pd.concat([val_boosting_df.iloc[:, :-1], ernie_val[["Model6_class0", "Model6_class1"]]], 1)
val_boosting_df["Label"] = val_labels
val_boosting_df.head()


Unnamed: 0,id,model1_Class0,model1_Class1,model2_Class0,model2_Class1,model3_Class0,model3_Class1,model4_Class0,model4_Class1,model5_Class0,model5_Class1,Model6_class0,Model6_class1,Label
0,1,0.001269,0.998731,0.000938,0.999062,0.000179,0.999821,0.028954,0.971046,7.4e-05,0.999926,0.00025,0.99975,1
1,2,0.991674,0.008326,0.000884,0.999116,0.010019,0.989981,0.002718,0.997282,0.958628,0.041371,0.005722,0.994278,1
2,3,0.000385,0.999615,0.000958,0.999042,8.4e-05,0.999916,5.3e-05,0.999947,4.8e-05,0.999952,0.000249,0.999751,1
3,4,0.000416,0.999584,0.000936,0.999065,0.000109,0.999891,0.001268,0.998732,0.001172,0.998828,0.000251,0.999749,1
4,5,0.999851,0.000149,0.999052,0.000948,0.999784,0.000216,0.999951,4.9e-05,0.999972,2.8e-05,0.999373,0.000627,0


In [8]:
test_boosting_df = pd.concat([test_boosting_df, ernie_test[["Model6_class0", "Model6_class1"]]], 1)
test_boosting_df.head()

Unnamed: 0,id,model1_Class0,model1_Class1,model2_Class0,model2_Class1,model3_Class0,model3_Class1,model4_Class0,model4_Class1,model5_Class0,model5_Class1,Model6_class0,Model6_class1
0,1,0.999851,0.000149,0.999063,0.000937,0.999967,3.3e-05,0.999963,3.7e-05,0.999984,1.6e-05,0.999371,0.000629
1,2,0.000422,0.999578,0.000902,0.999098,4.5e-05,0.999955,0.000112,0.999888,6.7e-05,0.999933,0.000257,0.999743
2,3,0.000401,0.9996,0.000954,0.999046,3.3e-05,0.999967,0.00017,0.99983,8.1e-05,0.999919,0.000247,0.999753
3,4,0.999858,0.000142,0.999082,0.000918,0.999967,3.3e-05,0.999956,4.4e-05,0.999984,1.6e-05,0.999376,0.000624
4,5,0.999852,0.000148,0.999076,0.000924,0.999953,4.7e-05,0.999912,8.8e-05,0.999984,1.6e-05,0.994812,0.005188


In [9]:
ernie_val.shape, ernie_test.shape

((2140, 3), (2140, 2))

In [10]:
test_boosting_df.shape

(2140, 13)

In [11]:
test_boosting_df.head()

Unnamed: 0,id,model1_Class0,model1_Class1,model2_Class0,model2_Class1,model3_Class0,model3_Class1,model4_Class0,model4_Class1,model5_Class0,model5_Class1,Model6_class0,Model6_class1
0,1,0.999851,0.000149,0.999063,0.000937,0.999967,3.3e-05,0.999963,3.7e-05,0.999984,1.6e-05,0.999371,0.000629
1,2,0.000422,0.999578,0.000902,0.999098,4.5e-05,0.999955,0.000112,0.999888,6.7e-05,0.999933,0.000257,0.999743
2,3,0.000401,0.9996,0.000954,0.999046,3.3e-05,0.999967,0.00017,0.99983,8.1e-05,0.999919,0.000247,0.999753
3,4,0.999858,0.000142,0.999082,0.000918,0.999967,3.3e-05,0.999956,4.4e-05,0.999984,1.6e-05,0.999376,0.000624
4,5,0.999852,0.000148,0.999076,0.000924,0.999953,4.7e-05,0.999912,8.8e-05,0.999984,1.6e-05,0.994812,0.005188


# Taking various permutations of models for Soft Voting

In [12]:
def soft_voting(row):
  # class0_val = (row['model1_Class0'] + row['model2_Class0'] + row['model3_Class0'] + row['model4_Class0'] + row['Model5_class0'])/5
  # class1_val = (row['model1_Class1'] + row['model2_Class1'] + row['model3_Class1'] + row['model4_Class1'] + row['Model5_class1'])/5
  class0_val = (row['model1_Class0'] + row['model2_Class0'] + row['model3_Class0'] + row['model5_Class0'] + row['Model6_class0'])/5
  class1_val = (row['model1_Class1'] + row['model2_Class1'] + row['model3_Class1'] + row['model5_Class1'] + row['Model6_class1'])/5
  return 0 if class0_val > class1_val else 1

# train_boosting_df['Soft_Vote'] = train_boosting_df.apply(lambda x: soft_voting(x), 1)
val_boosting_df['Soft_Vote'] = val_boosting_df.apply(lambda x: soft_voting(x), 1)
test_boosting_df['Soft_Vote'] = test_boosting_df.apply(lambda x: soft_voting(x), 1)


In [13]:
test_boosting_df.head(2)

Unnamed: 0,id,model1_Class0,model1_Class1,model2_Class0,model2_Class1,model3_Class0,model3_Class1,model4_Class0,model4_Class1,model5_Class0,model5_Class1,Model6_class0,Model6_class1,Soft_Vote
0,1,0.999851,0.000149,0.999063,0.000937,0.999967,3.3e-05,0.999963,3.7e-05,0.999984,1.6e-05,0.999371,0.000629,0
1,2,0.000422,0.999578,0.000902,0.999098,4.5e-05,0.999955,0.000112,0.999888,6.7e-05,0.999933,0.000257,0.999743,1


# Taking various permutations of models for Hard Voting

In [14]:
def hard_voting(row, l):
  a = np.argmax(np.array([[row['model1_Class0'], row['model1_Class1']]]), 1)[0]
  b = np.argmax(np.array([[row['model2_Class0'], row['model2_Class1']]]), 1)[0]
  c = np.argmax(np.array([[row['model3_Class0'], row['model3_Class1']]]), 1)[0]
  d = np.argmax(np.array([[row['model4_Class0'], row['model4_Class1']]]), 1)[0]
  e = np.argmax(np.array([[row['model5_Class0'], row['model5_Class0']]]), 1)[0]
  f = np.argmax(np.array([[row['Model6_class0'], row['Model6_class1']]]), 1)[0]

  # cnt_0 = 0
  # for i in [a,b,c,d,e]:
  #   if i==0:
  #     cnt_0+=1
  
  # if cnt_0>=3:
  #   return 0
  # else:
  #   return 1
  cnt_0 = 0
  for i in [a,b,c,e,f]:
    if i==0:
      cnt_0+=1
  
  if l%2==0:
    if cnt_0==l:
      class0_val = (row['model1_Class0'] + row['model2_Class0'] + row['model3_Class0'] + row['model5_Class0'] + row['Model6_class0'])/5
      class1_val = (row['model1_Class1'] + row['model2_Class1'] + row['model3_Class1'] + row['model5_Class1'] + row['Model6_class0'])/5
      return 0 if class0_val > class1_val else 1
    elif cnt_0>l:
      return 0
    else:
      return 1
  else:
    if cnt_0>l//2:
      return 0
    else:
      return 1

no_of_models = 5
# train_boosting_df['Hard_Vote'] = train_boosting_df.apply(lambda x: hard_voting(x, no_of_models), 1)
val_boosting_df['Hard_Vote'] = val_boosting_df.apply(lambda x: hard_voting(x, no_of_models), 1)
test_boosting_df['Hard_Vote'] = test_boosting_df.apply(lambda x: hard_voting(x, no_of_models), 1)


In [15]:
def label_id_to_label(row, col):
  return 'real' if row[col]==0 else 'fake'

# train_boosting_df['predicted_label_Soft'] = train_boosting_df.apply(lambda x: label_id_to_label(x, 'Soft_Vote'), 1)
val_boosting_df['predicted_label_Soft'] = val_boosting_df.apply(lambda x: label_id_to_label(x, 'Soft_Vote'), 1)
test_boosting_df['predicted_label_Soft'] = test_boosting_df.apply(lambda x: label_id_to_label(x, 'Soft_Vote'), 1)

# train_boosting_df['predicted_label_Hard'] = train_boosting_df.apply(lambda x: label_id_to_label(x, 'Hard_Vote'), 1)
val_boosting_df['predicted_label_Hard'] = val_boosting_df.apply(lambda x: label_id_to_label(x, 'Hard_Vote'), 1)
test_boosting_df['predicted_label_Hard'] = test_boosting_df.apply(lambda x: label_id_to_label(x, 'Hard_Vote'), 1)


In [16]:
val_boosting_df = val_boosting_df[["id", "predicted_label_Soft", "predicted_label_Hard"]]
temp_val = val_df.merge(val_boosting_df, on='id', how='left')
temp_val.head()

Unnamed: 0,id,tweet,label,predicted_label_Soft,predicted_label_Hard
0,1,Chinese converting to Islam after realising th...,fake,fake,fake
1,2,11 out of 13 people (from the Diamond Princess...,fake,fake,fake
2,3,"COVID-19 Is Caused By A Bacterium, Not Virus A...",fake,fake,fake
3,4,Mike Pence in RNC speech praises Donald Trumpâ€™...,fake,fake,fake
4,5,6/10 Sky's @EdConwaySky explains the latest #C...,real,real,real


In [17]:
accuracy_score(temp_val['label'].values, temp_val['predicted_label_Soft'].values)

0.9841121495327103

In [18]:
precision_score(temp_val['label'].values, temp_val['predicted_label_Soft'].values, average='micro')

0.9841121495327103

In [19]:
recall_score(temp_val['label'].values, temp_val['predicted_label_Soft'].values, average='micro')

0.9841121495327103

In [20]:
f1_score(temp_val['label'].values, temp_val['predicted_label_Soft'].values, average='micro')

0.9841121495327103

In [21]:
accuracy_score(temp_val['label'].values, temp_val['predicted_label_Hard'].values)

0.9822429906542056

In [22]:
precision_score(temp_val['label'].values, temp_val['predicted_label_Hard'].values, average='micro')

0.9822429906542056

In [23]:
recall_score(temp_val['label'].values, temp_val['predicted_label_Hard'].values, average='micro')

0.9822429906542056

In [24]:
f1_score(temp_val['label'].values, temp_val['predicted_label_Hard'].values, average='micro')

0.9822429906542056

In [25]:
test_boosting_df = test_boosting_df[["id", "predicted_label_Soft", "predicted_label_Hard"]]
temp = test_label_df.merge(test_boosting_df, on='id', how='left')
temp.head()

Unnamed: 0,id,tweet,label,predicted_label_Soft,predicted_label_Hard
0,1,Our daily update is published. States reported...,real,real,real
1,2,Alfalfa is the only cure for COVID-19.,fake,fake,fake
2,3,President Trump Asked What He Would Do If He W...,fake,fake,fake
3,4,States reported 630 deaths. We are still seein...,real,real,real
4,5,This is the sixth time a global health emergen...,real,real,real


# Results

In [26]:
accuracy_score(temp['label'].values, temp['predicted_label_Soft'].values)

0.9808411214953271

In [27]:
precision_score(temp['label'].values, temp['predicted_label_Soft'].values, average='micro')

0.9808411214953271

In [28]:
recall_score(temp['label'].values, temp['predicted_label_Soft'].values, average='micro')

0.9808411214953271

In [29]:
f1_score(temp['label'].values, temp['predicted_label_Soft'].values, average='micro')

0.9808411214953271

In [30]:
accuracy_score(temp['label'].values, temp['predicted_label_Hard'].values)

0.9766355140186916

In [31]:
precision_score(temp['label'].values, temp['predicted_label_Hard'].values, average='micro')

0.9766355140186916

In [32]:
recall_score(temp['label'].values, temp['predicted_label_Hard'].values, average='micro')

0.9766355140186916

In [33]:
f1_score(temp['label'].values, temp['predicted_label_Hard'].values, average='micro')

0.9766355140186916