In [1]:
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import load_breast_cancer
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.preprocessing import LabelEncoder


In [2]:
# Load the per-model class-probability tables for each split, plus the raw test
# tweets whose `id` column is later reused to build the submission file.
# NOTE(review): train/validation are read from '../Boosting Data/' while both test
# files are read from '../../data/' -- confirm the two relative locations are intended.
# NOTE(review): train_boosting_df is not referenced by any of the visible cells.
train_boosting_df, val_boosting_df = (
  pd.read_csv(csv_path)
  for csv_path in ('../Boosting Data/Train_v1.csv', '../Boosting Data/Validation_v1.csv')
)
test_df = pd.read_csv('../../data/Constraint_Test.csv')
test_boosting_df = pd.read_csv('../../data/Boosting Data/Test_v1.csv')
test_df.head()

Unnamed: 0,id,tweet
0,1,Our daily update is published. States reported...
1,2,Alfalfa is the only cure for COVID-19.
2,3,President Trump Asked What He Would Do If He W...
3,4,States reported 630 deaths. We are still seein...
4,5,This is the sixth time a global health emergen...


In [3]:
def soft_voting(row, models=('model1', 'model2', 'model3')):
  """Predict a class id by averaging per-model class probabilities (soft voting).

  Args:
    row: Mapping-like record (a dict or a DataFrame row) that exposes a
      '<model>_Class0' and a '<model>_Class1' entry for every prefix in `models`.
    models: Column prefixes of the models to average. Defaults to the three
      models used by this notebook; pass a longer sequence (for example one that
      also contains 'model4') to include additional models in the vote.

  Returns:
    0 when the mean Class0 score is strictly greater than the mean Class1
    score, otherwise 1. An exact tie therefore resolves to class 1.

  Raises:
    ValueError: if `models` is empty.
  """
  models = tuple(models)
  if not models:
    raise ValueError('models must contain at least one model prefix')

  # Accumulate left to right so the default three-model case evaluates exactly
  # (m1 + m2 + m3) / 3 for each class.
  class0_total = 0.0
  class1_total = 0.0
  for model in models:
    class0_total += row[model + '_Class0']
    class1_total += row[model + '_Class1']

  class0_val = class0_total / len(models)
  class1_val = class1_total / len(models)
  return 0 if class0_val > class1_val else 1

# Store the row-wise soft-vote class id (0 or 1) on the validation and test frames.
val_boosting_df['Soft_Vote'] = val_boosting_df.apply(soft_voting, axis=1)
test_boosting_df['Soft_Vote'] = test_boosting_df.apply(soft_voting, axis=1)


In [4]:
# Show the first two test rows to eyeball the freshly added Soft_Vote column.
test_boosting_df.head(n=2)

Unnamed: 0,id,model1_Class0,model1_Class1,model2_Class0,model2_Class1,model3_Class0,model3_Class1,model4_Class0,model4_Class1,Soft_Vote
0,1,0.999851,0.000149,0.999063,0.000937,0.999967,3.3e-05,0.999963,3.7e-05,0
1,2,0.000422,0.999578,0.000902,0.999098,4.5e-05,0.999955,0.000112,0.999888,1


In [5]:
def hard_voting(row, models=('model1', 'model2', 'model3')):
  """Predict a class id by majority vote over each model's top class (hard voting).

  Args:
    row: Mapping-like record (a dict or a DataFrame row) that exposes a
      '<model>_Class0' and a '<model>_Class1' entry for every prefix in `models`.
    models: Column prefixes of the models that take part in the vote. Defaults
      to the three models used by this notebook; pass a longer sequence (for
      example one that also contains 'model4') to include additional models.

  Returns:
    0 when at least half of the models vote for class 0, otherwise 1. With the
    default three models this is the "two or more votes" rule; with an even
    number of models an even split resolves to class 0.

  Raises:
    ValueError: if `models` is empty.
  """
  models = tuple(models)
  if not models:
    raise ValueError('models must contain at least one model prefix')

  votes_for_0 = 0
  for model in models:
    scores = [row[model + '_Class0'], row[model + '_Class1']]
    # np.argmax returns the first maximal index, so a model whose two scores are
    # equal counts as a vote for class 0.
    if np.argmax(scores) == 0:
      votes_for_0 += 1

  return 0 if 2 * votes_for_0 >= len(models) else 1

# Store the row-wise hard-vote class id (0 or 1) on the validation and test frames.
val_boosting_df['Hard_Vote'] = val_boosting_df.apply(hard_voting, axis=1)
test_boosting_df['Hard_Vote'] = test_boosting_df.apply(hard_voting, axis=1)


In [6]:
def label_id_to_label(row):
  """Translate a row's soft-vote class id into its text label.

  Args:
    row: Mapping-like record with a 'Soft_Vote' entry.

  Returns:
    'real' when row['Soft_Vote'] equals 0, and 'fake' for any other value.
  """
  if row['Soft_Vote'] == 0:
    return 'real'
  return 'fake'

# Add the text label derived from Soft_Vote. This lowercase 'label' column is
# distinct from the capitalised 'Label' column that the evaluation cells read.
val_boosting_df['label'] = val_boosting_df.apply(label_id_to_label, axis=1)
test_boosting_df['label'] = test_boosting_df.apply(label_id_to_label, axis=1)


In [7]:
# Build the submission frame: tweet ids from the raw test file next to the
# predicted text labels. Column-wise concat aligns the two frames on their index.
# NOTE(review): assumes both test files list the tweets in the same row order -- confirm.
# `axis` must be passed by keyword: pandas >= 2.0 accepts only `objs` positionally
# in pd.concat, so the former `pd.concat([...], 1)` raises TypeError there.
test_df = pd.concat([test_df[["id"]], test_boosting_df[["label"]]], axis=1)

In [8]:
# Preview the first five rows of the submission frame (id + predicted label).
test_df.head(n=5)

Unnamed: 0,id,label
0,1,real
1,2,fake
2,3,fake
3,4,real
4,5,real


In [9]:
# Write the submission file without the DataFrame index column.
test_df.to_csv(path_or_buf='../../Submission_v1.csv', index=False)

In [10]:
# Validation confusion matrix for the soft-vote ensemble
# (scikit-learn convention: rows are true classes, columns are predicted classes).
confusion_matrix(
  y_true=val_boosting_df['Label'].values,
  y_pred=val_boosting_df['Soft_Vote'].values,
)

array([[1109,   11],
       [  26,  994]])

In [11]:
# Validation accuracy of the soft-vote ensemble.
accuracy_score(
  y_true=val_boosting_df['Label'].values,
  y_pred=val_boosting_df['Soft_Vote'].values,
)

0.9827102803738318

In [12]:
# Validation confusion matrix for the hard-vote ensemble
# (scikit-learn convention: rows are true classes, columns are predicted classes).
confusion_matrix(
  y_true=val_boosting_df['Label'].values,
  y_pred=val_boosting_df['Hard_Vote'].values,
)

array([[1109,   11],
       [  28,  992]])

In [13]:
# Validation accuracy of the hard-vote ensemble.
accuracy_score(
  y_true=val_boosting_df['Label'].values,
  y_pred=val_boosting_df['Hard_Vote'].values,
)

0.9817757009345794