# CNN

## 0 - IMPORTS

In [14]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import pickle

## 1 - DATA LOAD

In [33]:
# 1.1 - load training data (FULL)
train_df = pd.read_csv('../data/processed/train_full.csv')
train_df = train_df.dropna(subset=['tweet'])
X = train_df['tweet']
y = train_df['label']

# 1.2 - split dataset into training, validation, and test
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42)
X_validation, X_test, y_validation, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)


## 2 - TOKENIZATION + PADDING

In [34]:
# 2.1 - tokenization
max_words = 10000
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)
sequence_train = tokenizer.texts_to_sequences(X_train)
sequence_val = tokenizer.texts_to_sequences(X_validation)
sequence_test = tokenizer.texts_to_sequences(X_test)
word2vec = tokenizer.word_index
V = len(word2vec)
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

# 2.2 - padding
data_train = pad_sequences(sequence_train)
T = data_train.shape[1]
data_val = pad_sequences(sequence_val, maxlen=T)
data_test = pad_sequences(sequence_test, maxlen=T)

# 2.3 - convert to PyTorch tensors
X_train = torch.tensor(data_train, dtype=torch.long)
y_train = torch.tensor(LabelEncoder().fit_transform(y_train), dtype=torch.float32)
X_val = torch.tensor(data_val, dtype=torch.long)
y_val = torch.tensor(LabelEncoder().fit_transform(y_validation), dtype=torch.float32)
X_test = torch.tensor(data_test, dtype=torch.long)
y_test = torch.tensor(LabelEncoder().fit_transform(y_test), dtype=torch.float32)

# 2.4 - create dataLoader
train_loader = DataLoader(TensorDataset(X_train, y_train), batch_size=32, shuffle=True)
val_loader = DataLoader(TensorDataset(X_val, y_val), batch_size=32)
test_loader = DataLoader(TensorDataset(X_test, y_test), batch_size=32)

## 3 - CNN

### 3.1 - MODEL

In [29]:
# 3.1.0 - model (v2)
class CNN(nn.Module):
    def __init__(self, vocab_size, embed_dim):
        super(CNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.conv1 = nn.Conv1d(in_channels=embed_dim, out_channels=32, kernel_size=3, padding=1)
        self.pool1 = nn.MaxPool1d(kernel_size=3, stride=2)
        self.conv2 = nn.Conv1d(in_channels=32, out_channels=64, kernel_size=3, padding=1)
        self.pool2 = nn.MaxPool1d(kernel_size=3, stride=2)
        self.conv3 = nn.Conv1d(in_channels=64, out_channels=128, kernel_size=3, padding=1)
        self.global_pool = nn.AdaptiveMaxPool1d(1)
        self.fc1 = nn.Linear(128, 256)
        self.fc2 = nn.Linear(256, 1)
        self.dropout = nn.Dropout(0.5)
    def forward(self, x):
        x = self.embedding(x).transpose(1, 2)
        x = self.pool1(F.relu(self.conv1(x)))
        x = self.pool2(F.relu(self.conv2(x)))
        x = self.global_pool(F.relu(self.conv3(x))).squeeze(2)
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.fc2(x)
        return x

In [30]:
# 3.1.1 - initialize model
model = CNN(vocab_size=V+1, embed_dim=20)
# 3.1.2 - define loss
criterion = nn.BCEWithLogitsLoss()
# 3.1.3- define optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

### 3.2 - TRAINING

In [19]:
# 3.2.0 - since overfitting was observed -> early stopping parameters
wait = 5
best_val_loss = np.inf
epochs_no_impr = 0
early_stop = False

# 3.2.1 - training loop
epochs = 50
for epoch in range(epochs):
    model.train()
    train_loss = 0.0
    correct_train = 0
    total_train = 0
    for texts, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(texts).squeeze(1)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        predicted = torch.round(torch.sigmoid(outputs))
        total_train += labels.size(0)
        correct_train += (predicted == labels).sum().item()
    
    train_accuracy = 100 * correct_train / total_train
    
    # 3.2.2 - validation
    model.eval()
    val_loss = 0.0
    correct_val = 0
    total_val = 0
    with torch.no_grad():
        for texts, labels in val_loader:
            outputs = model(texts).squeeze(1)
            loss = criterion(outputs, labels)
            val_loss += loss.item()
            predicted = torch.round(torch.sigmoid(outputs))
            total_val += labels.size(0)
            correct_val += (predicted == labels).sum().item()

    val_accuracy = 100 * correct_val / total_val
    avg_val_loss = val_loss / len(val_loader)

    print(f'Epoch {epoch+1}, Train Loss: {train_loss / len(train_loader)}, Train Accuracy: {train_accuracy}%, '
          f'Validation Loss: {avg_val_loss}, Validation Accuracy: {val_accuracy}%')

    # 3.2.3 - early stopping 
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        epochs_no_impr = 0
        torch.save(model.state_dict(), 'best_CNN_model.pt')
    else:
        epochs_no_impr += 1
        if epochs_no_impr >= wait:
            print('----> early stopped')
            early_stop = True
            break
    

Epoch 1, Train Loss: 0.4096545150249858, Train Accuracy: 80.74087002268898%, Validation Loss: 0.37800456500143437, Validation Accuracy: 82.56773192081806%
Epoch 2, Train Loss: 0.370668632401087, Train Accuracy: 83.22270558483014%, Validation Loss: 0.3640604670709347, Validation Accuracy: 83.32826577477636%
Epoch 3, Train Loss: 0.35998309602590445, Train Accuracy: 83.86466824927774%, Validation Loss: 0.35967883894184705, Validation Accuracy: 83.71113315943606%
Epoch 4, Train Loss: 0.3530455173528429, Train Accuracy: 84.28144139296447%, Validation Loss: 0.3569756394878785, Validation Accuracy: 83.74273872201508%
Epoch 5, Train Loss: 0.3476858452766666, Train Accuracy: 84.57804344662489%, Validation Loss: 0.354937363349195, Validation Accuracy: 84.03118948935013%
Epoch 6, Train Loss: 0.3438688021187192, Train Accuracy: 84.80278288839692%, Validation Loss: 0.35532579461519864, Validation Accuracy: 84.11440413512778%
Epoch 7, Train Loss: 0.34099982641178295, Train Accuracy: 85.0052684246085

### 3.3 - TESTING

In [31]:
# 3.3.0 - load best model
model.load_state_dict(torch.load('best_CNN_model.pt'))

# 3.3.1 - testing loop
model.eval()
test_loss = 0.0
correct_test = 0
total_test = 0
with torch.no_grad():
    for texts, labels in train_loader:
        outputs = model(texts).squeeze(1)
        loss = criterion(outputs, labels)
        test_loss += loss.item()
        predicted = torch.round(torch.sigmoid(outputs))
        total_test += labels.size(0)
        correct_test += (predicted == labels).sum().item()

test_accuracy = 100 * correct_test / total_test
print(f'Test Loss: {test_loss / len(train_loader)}, Test Accuracy: {test_accuracy}%')

Test Loss: 0.31956296302026455, Test Accuracy: 85.94333305495114%


## 4 - CNN-LSTM

### 4.1 - MODEL

In [20]:
# 4.1.0 - model (v3)
class CNN_LSTM(nn.Module):
    def __init__(self, vocab_size, embed_dim, lstm_hidden_dim, num_classes):
        super(CNN_LSTM, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.conv1 = nn.Conv1d(in_channels=embed_dim, out_channels=32, kernel_size=3, padding=1)
        self.pool1 = nn.MaxPool1d(kernel_size=3, stride=2)
        self.conv2 = nn.Conv1d(in_channels=32, out_channels=64, kernel_size=3, padding=1)
        self.pool2 = nn.MaxPool1d(kernel_size=3, stride=2)
        self.conv3 = nn.Conv1d(in_channels=64, out_channels=128, kernel_size=3, padding=1)
        self.global_pool = nn.AdaptiveMaxPool1d(1)
        self.lstm = nn.LSTM(128, lstm_hidden_dim, batch_first=True)
        self.fc1 = nn.Linear(lstm_hidden_dim, 256)
        self.fc2 = nn.Linear(256, num_classes)
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        x = self.embedding(x)
        x = x.transpose(1, 2)
        x = self.pool1(F.relu(self.conv1(x)))
        x = self.pool2(F.relu(self.conv2(x)))
        x = self.global_pool(F.relu(self.conv3(x))).squeeze(2).unsqueeze(1)
        x, _ = self.lstm(x)
        x = x[:, -1, :]
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.fc2(x)
        return x

In [21]:
# 4.1.1 - initialize model
model = CNN_LSTM(vocab_size=V+1, embed_dim=20, lstm_hidden_dim=128, num_classes=1)
# 4.1.2 - loss
criterion = nn.BCEWithLogitsLoss()
# 4.1.3 - optimizer
optimizer = optim.Adam(model.parameters(), lr=0.001)

### 4.2 - TRAINING

In [22]:
# 4.2.0 - since overfitting was observed -> early stopping parameters
wait = 5
best_val_loss = np.inf
epochs_no_impr = 0
early_stop = False
# 4.2.1 - training loop
epochs = 50
for epoch in range(epochs):
    model.train()
    train_loss = 0.0
    correct_train = 0
    total_train = 0
    for texts, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(texts).squeeze(1)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        predicted = torch.round(torch.sigmoid(outputs))
        total_train += labels.size(0)
        correct_train += (predicted == labels).sum().item()
    
    train_accuracy = 100 * correct_train / total_train
    
    # 4.2.2 - validation
    model.eval()
    val_loss = 0.0
    correct_val = 0
    total_val = 0
    with torch.no_grad():
        for texts, labels in val_loader:
            outputs = model(texts).squeeze(1)
            loss = criterion(outputs, labels)
            val_loss += loss.item()
            predicted = torch.round(torch.sigmoid(outputs))
            total_val += labels.size(0)
            correct_val += (predicted == labels).sum().item()

    val_accuracy = 100 * correct_val / total_val
    avg_val_loss = val_loss / len(val_loader)

    print(f'Epoch {epoch+1}, Train Loss: {train_loss / len(train_loader)}, Train Accuracy: {train_accuracy}%, '
          f'Validation Loss: {avg_val_loss}, Validation Accuracy: {val_accuracy}%')

    # 4.2.3 - early stopping 
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        epochs_no_impr = 0
        torch.save(model.state_dict(), 'best_CNN_LSTM_model.pt')
    else:
        epochs_no_impr += 1
        if epochs_no_impr >= wait:
            print('----> early stopped')
            early_stop = True
            break
    

Epoch 1, Train Loss: 0.40722891923143284, Train Accuracy: 80.90429870442263%, Validation Loss: 0.38796472878477173, Validation Accuracy: 82.18886524028228%
Epoch 2, Train Loss: 0.3695256621184631, Train Accuracy: 83.26476296590052%, Validation Loss: 0.3656213274897976, Validation Accuracy: 83.3150634511674%
Epoch 3, Train Loss: 0.36238822561832096, Train Accuracy: 83.7508982826486%, Validation Loss: 0.36247427750586764, Validation Accuracy: 83.41148042055401%
Epoch 4, Train Loss: 0.3621384003599201, Train Accuracy: 83.85841715221021%, Validation Loss: 0.3660665560080739, Validation Accuracy: 83.38867640704764%
Epoch 5, Train Loss: 0.3741848522914965, Train Accuracy: 83.35482877244957%, Validation Loss: 0.3900517421487015, Validation Accuracy: 81.76879130727008%
Epoch 6, Train Loss: 0.3801707312753247, Train Accuracy: 83.16184490378062%, Validation Loss: 0.38448392930385766, Validation Accuracy: 83.02181183888365%
Epoch 7, Train Loss: 0.40574306515199715, Train Accuracy: 82.286641305549

### 4.3 - TESTING

In [23]:
# 4.3.0 - load best model
model.load_state_dict(torch.load('best_CNN_LSTM_model.pt'))

# 4.3.1 - testing loop
model.eval()
test_loss = 0.0
correct_test = 0
total_test = 0
with torch.no_grad():
    for texts, labels in train_loader:
        outputs = model(texts).squeeze(1)
        loss = criterion(outputs, labels)
        test_loss += loss.item()
        predicted = torch.round(torch.sigmoid(outputs))
        total_test += labels.size(0)
        correct_test += (predicted == labels).sum().item()

test_accuracy = 100 * correct_test / total_test
print(f'Test Loss: {test_loss / len(train_loader)}, Test Accuracy: {test_accuracy}%')

Test Loss: 0.3526623520912514, Test Accuracy: 84.02204586905002%


## 5 - LSTM-CNN

### 5.1 - MODEL

In [24]:
# 5.1.0 - model (v3)
class LSTM_CNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, lstm_hidden_dim, num_classes):
        super(LSTM_CNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, lstm_hidden_dim, batch_first=True)
        self.conv1 = nn.Conv1d(in_channels=lstm_hidden_dim, out_channels=32, kernel_size=3, padding=1)
        self.pool1 = nn.MaxPool1d(kernel_size=3, stride=2)
        self.conv2 = nn.Conv1d(in_channels=32, out_channels=64, kernel_size=3, padding=1)
        self.pool2 = nn.MaxPool1d(kernel_size=3, stride=2)
        self.conv3 = nn.Conv1d(in_channels=64, out_channels=128, kernel_size=3, padding=1)
        self.global_pool = nn.AdaptiveMaxPool1d(1)
        self.fc1 = nn.Linear(128, 256)
        self.fc2 = nn.Linear(256, num_classes)
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        x = self.embedding(x)
        x, _ = self.lstm(x)
        x = x.transpose(1, 2)
        x = self.pool1(F.relu(self.conv1(x)))
        x = self.pool2(F.relu(self.conv2(x)))
        x = self.global_pool(F.relu(self.conv3(x))).squeeze(2)
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.fc2(x)
        return x

In [25]:
# 5.1.1 - initialize model
model = LSTM_CNN(vocab_size=V+1, embed_dim=20, lstm_hidden_dim=128, num_classes=1)
# 5.1.2- loss
criterion = nn.BCEWithLogitsLoss()
# 5.1.3 - optimizer
optimizer = optim.Adam(model.parameters(), lr=0.001)

### 5.2 - TRAINING

In [26]:
# 5.2.0 - since overfitting was observed -> early stopping parameters
wait = 5
best_val_loss = np.inf
epochs_no_impr = 0
early_stop = False

# 5.2.1 - training loop
epochs = 50
for epoch in range(epochs):
    model.train()
    train_loss = 0.0
    correct_train = 0
    total_train = 0
    for texts, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(texts).squeeze(1)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        predicted = torch.round(torch.sigmoid(outputs))
        total_train += labels.size(0)
        correct_train += (predicted == labels).sum().item()
    
    train_accuracy = 100 * correct_train / total_train
    
    # 5.2.2 - validation
    model.eval()
    val_loss = 0.0
    correct_val = 0
    total_val = 0
    with torch.no_grad():
        for texts, labels in val_loader:
            outputs = model(texts).squeeze(1)
            loss = criterion(outputs, labels)
            val_loss += loss.item()
            predicted = torch.round(torch.sigmoid(outputs))
            total_val += labels.size(0)
            correct_val += (predicted == labels).sum().item()

    val_accuracy = 100 * correct_val / total_val
    avg_val_loss = val_loss / len(val_loader)

    print(f'Epoch {epoch+1}, Train Loss: {train_loss / len(train_loader)}, Train Accuracy: {train_accuracy}%, '
          f'Validation Loss: {avg_val_loss}, Validation Accuracy: {val_accuracy}%')

    # 5.2.3 - early stopping 
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        epochs_no_impr = 0
        torch.save(model.state_dict(), 'best_LSTM_CNN_model.pt')
    else:
        epochs_no_impr += 1
        if epochs_no_impr >= wait:
            print('----> early stopped')
            early_stop = True
            break
    

Epoch 1, Train Loss: 0.3912068454026794, Train Accuracy: 81.89592273443989%, Validation Loss: 0.3576830866895864, Validation Accuracy: 83.80434956552354%
Epoch 2, Train Loss: 0.34587015811801614, Train Accuracy: 84.62010082769525%, Validation Loss: 0.3423587308535653, Validation Accuracy: 84.68530461361199%
Epoch 3, Train Loss: 0.33197585107717803, Train Accuracy: 85.33257586706468%, Validation Loss: 0.33768707914854923, Validation Accuracy: 84.81692777928916%
Epoch 4, Train Loss: 0.3235973163206976, Train Accuracy: 85.78415511922343%, Validation Loss: 0.3374705841474872, Validation Accuracy: 85.03296580198115%
Epoch 5, Train Loss: 0.31804365098092197, Train Accuracy: 86.11491316726085%, Validation Loss: 0.3338499033811211, Validation Accuracy: 85.08737537806654%
Epoch 6, Train Loss: 0.3132859758202003, Train Accuracy: 86.34105285477601%, Validation Loss: 0.3383933071302669, Validation Accuracy: 84.98935812703036%
Epoch 7, Train Loss: 0.30986553956688184, Train Accuracy: 86.51373316016

### 5.3 - TESTING

In [27]:
# 5.3.0 - load best model
model.load_state_dict(torch.load('best_LSTM_CNN_model.pt'))

# 5.3.1 - testing loop
model.eval()
test_loss = 0.0
correct_test = 0
total_test = 0
with torch.no_grad():
    for texts, labels in train_loader:
        outputs = model(texts).squeeze(1)
        loss = criterion(outputs, labels)
        test_loss += loss.item()
        predicted = torch.round(torch.sigmoid(outputs))
        total_test += labels.size(0)
        correct_test += (predicted == labels).sum().item()

test_accuracy = 100 * correct_test / total_test
print(f'Test Loss: {test_loss / len(train_loader)}, Test Accuracy: {test_accuracy}%')

Test Loss: 0.302466902563521, Test Accuracy: 86.78243031652055%
