In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer

  from .autonotebook import tqdm as notebook_tqdm


## Data Preprocessing

In [2]:
# Load in the data
fake_news = pd.read_csv('data/a2_Fake.csv')
true_news = pd.read_csv('data/a1_True.csv')

# Add labels to each dataframe where 1 is fake and 0 is true
fake_news['label'] = 1
true_news['label'] = 0

# Concatenate the two dataframes
news = pd.concat([fake_news, true_news], axis=0)

In [3]:
# Split the data into training, validation, and testing sets
train_df, temp_df = train_test_split(news, test_size=0.3, random_state=42, shuffle=True)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42, shuffle=True)

In [4]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [5]:
class FakeNewsDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length):
        self.inputs = dataframe['title']
        self.labels = dataframe['label']
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.inputs)
    
    def __getitem__(self, idx):
        input_text = self.inputs.iloc[idx]
        tokenized_inputs = self.tokenizer.encode(
            input_text,
            max_length=self.max_length,
            truncation=True,
            padding='max_length',
            return_tensors='pt'
        )
        label = self.labels.iloc[idx]
        return tokenized_inputs, torch.tensor(label, dtype=torch.long)

In [6]:
train_dataset = FakeNewsDataset(train_df, tokenizer, 64)
val_dataset = FakeNewsDataset(val_df, tokenizer, 64)
test_dataset = FakeNewsDataset(test_df, tokenizer, 64)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

## Model Arcitecture

In [7]:
class LSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super(LSTM, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, num_layers=2, dropout=0.2, bidirectional=True)
        self.batchnorm = nn.BatchNorm1d(hidden_dim * 2)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)

    def forward(self, x):
        embedded = self.embedding(x)
        lstm_out, _ = self.lstm(embedded) # Don't need to save the final hidden states from the LSTM block
        lstm_out = self.batchnorm(lstm_out.permute(0, 2, 1)).permute(0, 2, 1)
        out = self.fc(lstm_out[:, -1, :])
        return out

## Train and Test Loops

In [8]:
def trainloop(dataloader, model, loss_func, optimizer):
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    num_batches = len(dataloader)
    train_loss, train_acc = 0. , 0.
    model.train()
    for inputs, labels in dataloader:
        inputs = inputs.squeeze(1).to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = loss_func(outputs, labels)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        train_acc += (outputs.argmax(dim=1) == labels).sum().item()
    return train_loss / num_batches, train_acc / len(dataloader.dataset)
    

def testloop(dataloader, model, loss_func):
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    num_batches = len(dataloader)
    test_loss, test_acc = 0. , 0.
    model.eval()
    with torch.no_grad():
        for inputs, labels in dataloader:
            inputs = inputs.squeeze(1).to(device)
            labels = labels.to(device)

            outputs = model(inputs)
            loss = loss_func(outputs, labels)

            test_loss += loss.item()
            test_acc += (outputs.argmax(dim=1) == labels).sum().item()
    return test_loss / num_batches, test_acc / len(dataloader.dataset)

In [9]:
# Hyperparameters
lr = 1e-4
batch_size = 32
epochs = 10
max_len = 64

In [10]:
model = LSTM(vocab_size=tokenizer.vocab_size, embedding_dim=128, hidden_dim=64, output_dim=2)
optimizer = optim.Adam(model.parameters(), lr=lr)
loss_func = nn.CrossEntropyLoss()

model.to('cuda' if torch.cuda.is_available() else 'cpu')

LSTM(
  (embedding): Embedding(30522, 128)
  (lstm): LSTM(128, 64, batch_first=True)
  (fc): Linear(in_features=64, out_features=2, bias=True)
)

In [11]:
for epoch in range(epochs):
    print(f"---Epoch {epoch+1}/{epochs}:---")
    train_loss, train_acc = trainloop(train_loader, model, loss_func, optimizer)
    val_loss, val_acc = testloop(val_loader, model, loss_func)
    print(f'Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}.')
    print(f'Validation Loss: {val_loss:.4f}, Validation Acc: {val_acc:.4f}.')


---Epoch 1/10:---
Train Loss: 0.6595, Train Acc: 0.5672.
Validation Loss: 0.5944, Validation Acc: 0.6998.
---Epoch 2/10:---
Train Loss: 0.2576, Train Acc: 0.8938.
Validation Loss: 0.1726, Validation Acc: 0.9326.
---Epoch 3/10:---
Train Loss: 0.1316, Train Acc: 0.9512.
Validation Loss: 0.1330, Validation Acc: 0.9504.
---Epoch 4/10:---
Train Loss: 0.1000, Train Acc: 0.9638.
Validation Loss: 0.1327, Validation Acc: 0.9549.
---Epoch 5/10:---
Train Loss: 0.0804, Train Acc: 0.9710.
Validation Loss: 0.0985, Validation Acc: 0.9626.
---Epoch 6/10:---
Train Loss: 0.0653, Train Acc: 0.9771.
Validation Loss: 0.0931, Validation Acc: 0.9660.
---Epoch 7/10:---
Train Loss: 0.0529, Train Acc: 0.9820.
Validation Loss: 0.1092, Validation Acc: 0.9596.
---Epoch 8/10:---
Train Loss: 0.0427, Train Acc: 0.9863.
Validation Loss: 0.0945, Validation Acc: 0.9706.
---Epoch 9/10:---
Train Loss: 0.0336, Train Acc: 0.9897.
Validation Loss: 0.0966, Validation Acc: 0.9706.
---Epoch 10/10:---
Train Loss: 0.0268, Train A

In [12]:
test_loss, test_acc = testloop(test_loader, model, loss_func)
print(f"Test Loss: {test_loss}, Test Accuracy: {test_acc}")

Test Loss: 0.09186164992287649, Test Accuracy: 0.9738678544914625


In [14]:
torch.save(model.state_dict(), "./model_parameters/LSTM_single_block.pth")