In [70]:
#Importing necessary libraries

import re
import scipy
import pandas         as pd
import io
import numpy          as np
import copy

import torch

from sklearn.metrics                  import classification_report
from sklearn.feature_extraction.text  import TfidfVectorizer
from sklearn.model_selection import train_test_split
from torch                            import nn, optim
from torch.utils                      import data

#Seeding for deterministic results
RANDOM_SEED = 16
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
HIDDEN_LAYER_UNITS = 60

CLASS_NAMES = ['no-fake', 'fake']
EPOCHS      = 10


In [17]:
train = pd.read_csv('preprocessed.csv')
train.drop(columns=['id', 'statement','subject','speaker'], inplace=True)

In [18]:
train.head()

Unnamed: 0,label,economy,health-care,taxes,federal-budget,education,jobs,state-budget,candidates-biography,elections,...,state_info_Virginia,"state_info_Washington, D.C.",state_info_Wisconsin,state_info_other,party_affiliation_democrat,party_affiliation_independent,party_affiliation_none,party_affiliation_organization,party_affiliation_other,party_affiliation_republican
0,1.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0.0,0,1,0,0,0,0,0,0,0,...,0,0,0,1,1,0,0,0,0,0
2,0.0,0,0,1,0,0,0,0,0,0,...,0,1,0,0,1,0,0,0,0,0
3,1.0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
4,0.0,0,1,0,0,0,0,1,0,0,...,0,0,1,0,0,0,0,0,0,1


In [19]:
X_train, X_test, y_train, y_test = train_test_split(train.drop('label', axis=1), train.label, test_size=0.2, random_state=RANDOM_SEED)

In [20]:
y_test = y_test.astype(int)
y_train = y_train.astype(int)

In [21]:
X_train, X_dev, y_train, y_dev = train_test_split(X_train, y_train, test_size=0.2, random_state=RANDOM_SEED)

In [24]:
y_train= y_train.tolist()
y_test= y_test.tolist()
y_dev= y_dev.tolist()

In [33]:
x_train = torch.tensor(X_train.values).float()
x_dev = torch.tensor(X_dev.values).float()
x_test = torch.tensor(X_test.values).float()

In [34]:
#Converting prections for train, dev and test data to tensors
y_train = torch.tensor(y_train)
y_dev   = torch.tensor(y_dev)
y_test  = torch.tensor(y_test)

In [37]:
x_train.shape[1]

119

In [84]:
class Nn(nn.Module):
    def __init__(self):
        super().__init__()
        
        # Inputs to hidden layer linear transformation
        self.hidden  = nn.Linear(x_train.shape[1], HIDDEN_LAYER_UNITS)
        # Output layer
        self.output  =  nn.Linear(HIDDEN_LAYER_UNITS, len(CLASS_NAMES))
        self.dropout = nn.Dropout(0.1)
        
        # Defining tanh activation and softmax output 
        self.tanh    = nn.Tanh()                                     #Using tanh as it performed better than ReLu during hyper-param optimisation
        self.softmax = nn.Softmax(dim=1)
        
    def forward(self, x):
        # Pass the input tensor through each of the below operations
        x = self.hidden(x)

        #print(x.shape)
        y = self.tanh(x)
        #print(y.shape)
        z = self.dropout(y)
        #print(z.shape)
        z = self.output(z)
        #print(z.shape)
        z = self.softmax(z)
        
        #returning the output from hidden layer and the output layer
        return  y, z
    

In [85]:
#Defining the model
model = Nn()

# Defining the loss
'''Using class-weights to accomodate heavily imbalanced data. 
These weights were learnt by running several experiments using 
other weights and the weights that produced the best results have
 finally been used here'''

weights       = [1.0, 1.0]
class_weights = torch.FloatTensor(weights)
criterion     = nn.CrossEntropyLoss(weight = class_weights)


# Forward pass, get our logits
hidden_state_output, classfier_output = model(x_train)
print(classfier_output)
print(classfier_output[0].shape)

loss = criterion(classfier_output, y_train)

loss.backward()

# Optimizers require the parameters to optimize and a learning rate
optimizer = optim.Adam(model.parameters(), lr=0.02)

tensor([[0.5226, 0.4774],
        [0.5081, 0.4919],
        [0.4573, 0.5427],
        ...,
        [0.4282, 0.5718],
        [0.4289, 0.5711],
        [0.4538, 0.5462]], grad_fn=<SoftmaxBackward>)
torch.Size([2])


In [86]:
#Training the model on training data and evaluating it on development set
#%%time
def train_model():
  train_losses = []
  dev_losses = []
  dev_accuracies = []

  for e in range(EPOCHS):
    correct_predictions = 0
    optimizer.zero_grad()

    hidden_layer_output, classifier_output = model.forward(x_train)

    loss = criterion(classifier_output, y_train)
    loss.backward()
    train_loss = loss.item()
    train_losses.append(train_loss)
    
    optimizer.step()
    with torch.no_grad():
        model.eval()

        #Getting hidden layer and softmax output from model for dev data
        hidden_layer_output, classifier_output = model(x_dev)
        
        #Calculating loss
        dev_loss = criterion(classifier_output, y_dev)
        dev_losses.append(dev_loss)

        #Calculating values predicted by the model
        _, preds = torch.max(classifier_output, dim=1)
        correct_predictions += torch.sum(preds == y_dev)
        
        #Calculating accuracy
        dev_accuracy = correct_predictions.double() / len(y_dev)
        dev_accuracies.append(dev_accuracy)

    model.train()

    print(f"Epoch: {e+1}/{EPOCHS}.. ",
          f"Training Loss: {dev_loss:.3f}.. ",
          f"Dev Loss: {dev_loss:.3f}.. ",
          f"Dev Accuracy: {dev_accuracy:.3f}")


train_model()

Epoch: 1/10..  Training Loss: 0.648..  Dev Loss: 0.648..  Dev Accuracy: 0.654
Epoch: 2/10..  Training Loss: 0.639..  Dev Loss: 0.639..  Dev Accuracy: 0.654
Epoch: 3/10..  Training Loss: 0.639..  Dev Loss: 0.639..  Dev Accuracy: 0.654
Epoch: 4/10..  Training Loss: 0.637..  Dev Loss: 0.637..  Dev Accuracy: 0.655
Epoch: 5/10..  Training Loss: 0.633..  Dev Loss: 0.633..  Dev Accuracy: 0.654
Epoch: 6/10..  Training Loss: 0.632..  Dev Loss: 0.632..  Dev Accuracy: 0.654
Epoch: 7/10..  Training Loss: 0.636..  Dev Loss: 0.636..  Dev Accuracy: 0.631
Epoch: 8/10..  Training Loss: 0.639..  Dev Loss: 0.639..  Dev Accuracy: 0.627
Epoch: 9/10..  Training Loss: 0.636..  Dev Loss: 0.636..  Dev Accuracy: 0.632
Epoch: 10/10..  Training Loss: 0.633..  Dev Loss: 0.633..  Dev Accuracy: 0.652


In [87]:
'''This function gets the predictions for each data point 
in the deevelopment and the training set'''

def get_predictions(model, x_test, y_test):

  predictions = []
  prediction_probs = []
  real_values = []
  with torch.no_grad():
    model.eval()
    labels = y_test

    #Currently, not interested in the hidden layer outputs.
    _,classifier_output = model(x_test)

    #Not interested in the maximum values, interested with the indices of these max values
    _, preds = torch.max(classifier_output, dim=1)

    predictions.extend(preds)
    prediction_probs.extend(classifier_output)
    real_values.extend(labels)
  predictions = torch.stack(predictions)

  prediction_probs = torch.stack(prediction_probs)
  real_values = torch.stack(real_values)
  return  predictions, prediction_probs, real_values

In [88]:
#Getting predictions for the development set
y_pred_dev, y_pred_probs, y_true_dev = get_predictions(
  model,
  x_dev, 
  y_dev
)

In [89]:
#Printing the classifictaion report for the Development set
print(classification_report(y_true_dev, y_pred_dev ,digits =4, target_names=CLASS_NAMES))

              precision    recall  f1-score   support

     no-fake     0.4933    0.2238    0.3079       496
        fake     0.6810    0.8782    0.7671       936

    accuracy                         0.6515      1432
   macro avg     0.5872    0.5510    0.5375      1432
weighted avg     0.6160    0.6515    0.6081      1432



In [90]:
#Getting the predictions for the test set
y_pred_test, y_pred_probs, y_true_test = get_predictions(
  model,
  x_test, 
  y_test
)

In [91]:
print(classification_report(y_true_test, y_pred_test , digits = 4,  target_names=CLASS_NAMES))

              precision    recall  f1-score   support

     no-fake     0.5157    0.2256    0.3139       656
        fake     0.6620    0.8774    0.7546      1134

    accuracy                         0.6385      1790
   macro avg     0.5888    0.5515    0.5343      1790
weighted avg     0.6084    0.6385    0.5931      1790

