# Dependency Installation and Repository Cloning

### Run this cell only if you are using this notebook in Google Colab

In [None]:
# Fetch the course repository, then switch to the hw4 branch and folder.
!git clone 'https://github.com/dakopecky/nlp-course-itmo.git'

%cd nlp-course-itmo
!git checkout hw4
%cd hw4

# Install the project's dependencies with Poetry.
!pip install poetry
# Drop torch from the project's dependency list before installing
# (presumably so the PyTorch build that Colab already ships is kept — TODO confirm).
!poetry remove torch
# Tell Poetry not to create its own virtualenv for the install below.
!poetry config virtualenvs.create false
!poetry install --no-ansi

# Classification implementation

Import dependencies

In [9]:
# This code includes software developed by the following open-source projects:
# - nltk (License: Apache License 2.0, Authors: Steven Bird, Edward Loper, Ewan Klein)
# - gensim (License: LGPL-2.1 License, Authors: Radim Rehurek, Petr Sojka and Gensim Contributors)
# - numpy (License: BSD-3-Clause license, Authors: NumPy Developers)
# - Pandas (License: BSD-3-Clause License, Authors: Pandas Development Team)
# - tqdm (License: MIT License, Authors: Noam Yorav-Raphael)
# - scikit-learn (License: BSD License, Authors: scikit-learn Developers)
# - PyTorch (License: Custom License, Authors: Facebook, Inc, Adam Paszke, Soumith Chintala, Ronan Collobert, Koray Kavukcuoglu, Clement Farabet, and other contributors)
# - Jupyter Notebook (License: Modified BSD License, Authors: Project Jupyter)
# For the full license information, please see the `licenses` directory.

import numpy as np
import pandas as pd

import re
from gensim.models import Word2Vec
import nltk
from nltk.tokenize import word_tokenize

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

Download the "Fake and real news" dataset

In [3]:
# Download both CSV files from Dropbox into the working directory
# (-q silences wget's output, -O sets the local file name).
!wget -q 'https://www.dropbox.com/scl/fi/ywll03cyd4e95ztcbe3gn/Fake.csv?rlkey=ucfg2ztrv8ub7w4yd6gx259p2&dl=1' -O 'Fake.csv'
!wget -q 'https://www.dropbox.com/scl/fi/fiwo2lnbd1y8ywvas12jd/True.csv?rlkey=5fxgepi5uvd4sr9y0tk46aw5o&dl=1' -O 'True.csv'

Define the random state for reproducibility

In [None]:
# Single seed reused by the train/test split and by PyTorch.
RANDOM_STATE = 42
# Seeds PyTorch's global RNG, which drives weight initialisation and the
# shuffling done by the training DataLoader.
torch.manual_seed(RANDOM_STATE)

## Load the dataset

In [5]:
# Read each CSV and tag its rows with the class label: 0 = fake, 1 = true.
fake_news_df = pd.read_csv('Fake.csv').assign(label=0)
true_news_df = pd.read_csv('True.csv').assign(label=1)

# Stack both frames into one dataset with a fresh 0..n-1 index.
news_df = pd.concat([fake_news_df, true_news_df], ignore_index=True)

news_df.head()

Unnamed: 0,title,text,subject,date,label
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",0
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",0
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",0
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",0
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",0


## Preprocessing and Vectorization

Clean

In [None]:
# Tokenizer data required by nltk's word_tokenize. Older NLTK releases load the
# 'punkt' resource, while recent releases (3.9+) look for 'punkt_tab' instead,
# so fetch both to keep the notebook working whichever version is installed.
nltk.download('punkt')
nltk.download('punkt_tab')

In [11]:
# Matches every character that is neither a word character nor whitespace.
_PUNCTUATION_RE = re.compile(r'[^\w\s]')


def clean_and_tokenize(text):
    """Lowercase ``text``, strip punctuation and split it into tokens.

    Args:
        text: Raw article text (a ``str``).

    Returns:
        The list of tokens produced by NLTK's ``word_tokenize`` on the
        lowercased, punctuation-free text.
    """
    normalized = _PUNCTUATION_RE.sub('', text.lower())
    return word_tokenize(normalized)


# Headline and body are classified together as one document.
news_df['full_text'] = news_df['title'] + ' ' + news_df['text']
news_df['tokenized_text'] = news_df['full_text'].apply(clean_and_tokenize)

Split into training and test sets

In [12]:
# Hold out 20% of the articles for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    news_df['tokenized_text'],
    news_df['label'],
    test_size=0.2,
    random_state=RANDOM_STATE,
)

Vectorize

In [13]:
# Fit 100-dimensional Word2Vec embeddings on the training tokens only, so no
# test-set text is seen while learning the vectors. min_count=1 keeps every
# word that occurs at least once.
# NOTE(review): no `seed` is passed, so gensim's own default seed is used rather
# than RANDOM_STATE — confirm this is intended.
word2vec_model = Word2Vec(sentences=X_train, vector_size=100, window=5, min_count=1, workers=1) # single worker for reproducibility

def text_to_sequence(tokenized_text, max_length, model=None):
    """Turn a token list into a fixed-size matrix of word embeddings.

    The first ``max_length`` tokens are looked up in the embedding model; row
    ``i`` holds the vector of token ``i``. Tokens missing from the vocabulary
    and positions past the end of a short text stay all-zero (padding).

    Args:
        tokenized_text: Sequence of string tokens.
        max_length: Number of rows in the result (tokens beyond it are dropped).
        model: Object exposing ``wv`` (supports ``in`` and ``[]``) and
            ``vector_size``. Defaults to the notebook-level ``word2vec_model``.

    Returns:
        ``numpy.ndarray`` of shape ``(max_length, model.vector_size)`` and
        dtype float32.
    """
    if model is None:
        model = word2vec_model
    vectors = model.wv
    # Take the width from the model instead of repeating the literal 100, and
    # allocate float32: the sequences are cast to float32 tensors downstream,
    # so a float64 buffer would only double the memory footprint.
    sequence = np.zeros((max_length, model.vector_size), dtype=np.float32)
    for i, word in enumerate(tokenized_text[:max_length]):
        if word in vectors:
            sequence[i] = vectors[word]
    return sequence

# Number of leading tokens kept per article; shorter texts are zero-padded.
max_length = 100

# Embed every article of each split, training split first.
X_train_seq, X_test_seq = (
    np.array([text_to_sequence(tokens, max_length) for tokens in split])
    for split in (X_train, X_test)
)

# Labels as plain NumPy arrays.
y_train_seq, y_test_seq = y_train.values, y_test.values

## Training

In [14]:
def _as_float_tensor(values):
    """Copy array-like ``values`` into a new float32 tensor."""
    return torch.tensor(values, dtype=torch.float32)


# Features and labels are both float32; BCELoss needs float targets.
X_train_tensor, y_train_tensor = _as_float_tensor(X_train_seq), _as_float_tensor(y_train_seq)
X_test_tensor, y_test_tensor = _as_float_tensor(X_test_seq), _as_float_tensor(y_test_seq)

# Shuffled mini-batches of 32 training samples.
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)

CNN

In [15]:
class CNN(nn.Module):
    """1-D convolutional binary classifier over sequences of word embeddings.

    Architecture: Conv1d -> ReLU -> MaxPool1d -> flatten -> Linear -> sigmoid.

    Args:
        embedding_dim: Number of input channels (size of one word vector).
        num_filters: Number of convolution output channels.
        kernel_size: Width of the convolution window.
        pool_size: Window (and stride) of the max-pooling layer.
        seq_len: Length of the input sequences. When omitted, the
            notebook-level ``max_length`` is used, so ``CNN()`` behaves as before.

    Raises:
        ValueError: If the sequence is too short to leave any pooled position.
    """

    def __init__(self, embedding_dim=100, num_filters=128, kernel_size=5, pool_size=5, seq_len=None):
        super().__init__()
        if seq_len is None:
            seq_len = max_length
        # A valid (unpadded) convolution shortens the sequence by kernel_size - 1;
        # pooling then keeps one value per full window of pool_size positions.
        conv_len = seq_len - kernel_size + 1
        pooled_len = conv_len // pool_size
        if pooled_len < 1:
            raise ValueError(
                f"seq_len={seq_len} is too short for kernel_size={kernel_size} "
                f"and pool_size={pool_size}"
            )
        self.conv1 = nn.Conv1d(embedding_dim, num_filters, kernel_size)
        self.pool = nn.MaxPool1d(pool_size)
        self.fc1 = nn.Linear(num_filters * pooled_len, 1)

    def forward(self, x):
        """Return probabilities of shape (batch, 1) for ``x`` of shape (batch, embedding_dim, seq_len)."""
        x = self.conv1(x)
        x = torch.relu(x)
        x = self.pool(x)
        x = torch.flatten(x, 1)  # keep the batch axis, merge channels and positions
        x = self.fc1(x)
        x = torch.sigmoid(x)
        return x


# Train the CNN with Adam on binary cross-entropy (the model already applies a sigmoid).
model_cnn = CNN()
optimizer = optim.Adam(model_cnn.parameters(), lr=0.001)
criterion = nn.BCELoss()

epochs = 5

model_cnn.train()
for epoch in range(epochs):
    epoch_loss = 0.0
    for batch_X, batch_y in tqdm(train_dataloader, desc=f"Epoch {epoch + 1}/{epochs}"):
        optimizer.zero_grad()
        # Conv1d expects (batch, channels, length); batches arrive as (batch, length, channels).
        outputs = model_cnn(batch_X.permute(0, 2, 1))
        # squeeze(1) removes only the output-feature axis. A bare squeeze() would
        # also drop the batch axis when the last batch holds a single sample,
        # and BCELoss would then reject the shape mismatch with the targets.
        loss = criterion(outputs.squeeze(1), batch_y)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()

    # Mean of the per-batch losses for this epoch.
    print(f'Avg Loss: {epoch_loss / len(train_dataloader)}')

Epoch 1/5: 100%|██████████| 1123/1123 [00:22<00:00, 50.44it/s]


Avg Loss: 0.020030665539246293


Epoch 2/5: 100%|██████████| 1123/1123 [00:21<00:00, 51.22it/s]


Avg Loss: 0.002669221999549429


Epoch 3/5: 100%|██████████| 1123/1123 [00:21<00:00, 51.18it/s]


Avg Loss: 0.0030203063233189315


Epoch 4/5: 100%|██████████| 1123/1123 [00:22<00:00, 49.78it/s]


Avg Loss: 0.0021146706733790248


Epoch 5/5: 100%|██████████| 1123/1123 [00:25<00:00, 44.13it/s]

Avg Loss: 0.0015718439696766982





LSTM

In [16]:
class LSTM(nn.Module):
    """Single-layer LSTM binary classifier over sequences of word embeddings.

    The hidden state at the last time step is passed through a linear layer
    and a sigmoid.

    Args:
        input_size: Size of one input vector (word-embedding dimension).
        hidden_size: Size of the LSTM hidden state.
    """

    def __init__(self, input_size=100, hidden_size=128):
        super().__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.fc1 = nn.Linear(hidden_size, 1)

    def forward(self, x):
        """Return probabilities of shape (batch, 1) for ``x`` of shape (batch, seq_len, input_size)."""
        # nn.LSTM starts from zero hidden/cell states when none are given, and
        # creates them on the input's device and dtype. Hand-built zero tensors
        # would be pinned to CPU/float32 and fail once the model is moved or cast.
        out, _ = self.lstm(x)
        out = self.fc1(out[:, -1, :])  # output at the last time step only
        out = torch.sigmoid(out)
        return out


# Train the LSTM with Adam on binary cross-entropy (the model already applies a sigmoid).
model_lstm = LSTM()
optimizer = optim.Adam(model_lstm.parameters(), lr=0.001)
criterion = nn.BCELoss()

epochs = 5

model_lstm.train()
for epoch in range(epochs):
    epoch_loss = 0.0
    for batch_X, batch_y in tqdm(train_dataloader, desc=f"Epoch {epoch + 1}/{epochs}"):
        optimizer.zero_grad()
        outputs = model_lstm(batch_X)
        # squeeze(1) removes only the output-feature axis. A bare squeeze() would
        # also drop the batch axis when the last batch holds a single sample,
        # and BCELoss would then reject the shape mismatch with the targets.
        loss = criterion(outputs.squeeze(1), batch_y)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()

    # Mean of the per-batch losses for this epoch.
    print(f'Avg Loss: {epoch_loss / len(train_dataloader)}')

Epoch 1/5: 100%|██████████| 1123/1123 [00:46<00:00, 24.07it/s]


Avg Loss: 0.25143002117017843


Epoch 2/5: 100%|██████████| 1123/1123 [00:52<00:00, 21.59it/s]


Avg Loss: 0.03036899444583276


Epoch 3/5: 100%|██████████| 1123/1123 [00:45<00:00, 24.57it/s]


Avg Loss: 0.012373617864394261


Epoch 4/5: 100%|██████████| 1123/1123 [00:45<00:00, 24.73it/s]


Avg Loss: 0.010049907859345982


Epoch 5/5: 100%|██████████| 1123/1123 [00:46<00:00, 24.33it/s]

Avg Loss: 0.008452688533269973





## Evaluation

In [17]:
def evaluate_model(model, X_test_tensor, y_test_tensor):
    """Score a trained binary classifier on the held-out set.

    Args:
        model: A ``CNN`` or ``LSTM`` instance returning probabilities of shape (batch, 1).
        X_test_tensor: Features of shape (batch, seq_len, embedding_dim);
            permuted to (batch, embedding_dim, seq_len) for ``CNN`` models.
        y_test_tensor: 0/1 labels of shape (batch,).

    Returns:
        Tuple ``(accuracy, precision, recall, f1)``, using 0.5 as the decision threshold.
    """
    inputs = X_test_tensor.permute(0, 2, 1) if isinstance(model, CNN) else X_test_tensor

    # Evaluate in inference mode, then restore whatever mode the model was in.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            outputs = model(inputs)
            # squeeze(1) keeps the batch axis even for a single-sample test set;
            # a bare squeeze() would yield a 0-d tensor that the metrics reject.
            predictions = (outputs.squeeze(1) > 0.5).float()
    finally:
        model.train(was_training)

    # Hand explicit CPU NumPy arrays to scikit-learn.
    y_true = y_test_tensor.detach().cpu().numpy()
    y_pred = predictions.cpu().numpy()

    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)

    return accuracy, precision, recall, f1


# Score both trained models on the same held-out data and report the metrics.
cnn_metrics = evaluate_model(model_cnn, X_test_tensor, y_test_tensor)
accuracy_cnn, precision_cnn, recall_cnn, f1_cnn = cnn_metrics
print(f"CNN - Accuracy: {accuracy_cnn}, Precision: {precision_cnn}, Recall: {recall_cnn}, F1: {f1_cnn}")

lstm_metrics = evaluate_model(model_lstm, X_test_tensor, y_test_tensor)
accuracy_lstm, precision_lstm, recall_lstm, f1_lstm = lstm_metrics
print(f"LSTM - Accuracy: {accuracy_lstm}, Precision: {precision_lstm}, Recall: {recall_lstm}, F1: {f1_lstm}")

CNN - Accuracy: 0.9989977728285078, Precision: 0.9995285242809995, Recall: 0.9983517777254532, F1: 0.9989398044528213
LSTM - Accuracy: 0.998218262806236, Precision: 0.9967128433904673, Recall: 0.9995290793501295, F1: 0.9981189748412885


In [18]:
def count_parameters(model):
    """Return the total number of elements in ``model``'s trainable parameters.

    Parameters whose ``requires_grad`` is False (frozen) are not counted.
    """
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total


# Compare model sizes by their number of trainable parameters.
params_cnn, params_lstm = (count_parameters(net) for net in (model_cnn, model_lstm))

print(f"Total trainable parameters for CNN: {params_cnn}")
print(f"Total trainable parameters for LSTM: {params_lstm}")

Total trainable parameters for CNN: 66561
Total trainable parameters for LSTM: 117889
