# Section 1: Fundamentals in RNNs

## 1.1 Import Libraries

In [40]:
import torch
import torch.nn as nn
import numpy as np

## 1.2 Declare Variables

In [41]:
input_size = 5
seq_len = 4
batch_size = 8
hidden_size = 3
num_classes = 3

## 1.3 Create Random Inputs and Labels

In [42]:
# Create random inputs and labels
inputs = torch.randn(batch_size, seq_len, input_size, requires_grad=True)
random_labels = torch.randint(0, num_classes, (batch_size,))
print("Input shape:", inputs.shape)
print("Labels:", random_labels)

Input shape: torch.Size([8, 4, 5])
Labels: tensor([2, 0, 1, 0, 2, 2, 0, 1])


## 1.4 Initialize Model Parameters

In [43]:
# Initialize model parameters with requires_grad=True
# U: Input-to-hidden weights [input_size, hidden_size]
U = torch.randn(input_size, hidden_size, requires_grad=True)

# W: Hidden-to-hidden weights [hidden_size, hidden_size]
W = torch.randn(hidden_size, hidden_size, requires_grad=True)

# b: Hidden bias [hidden_size]
b = torch.zeros(hidden_size, requires_grad=True)

# V: Output weights [hidden_size, num_classes]
V = torch.randn(hidden_size, num_classes, requires_grad=True)

# c: Output bias [num_classes]
c = torch.zeros(num_classes, requires_grad=True)

print(f"U shape: {U.shape}")
print(f"W shape: {W.shape}")
print(f"b shape: {b.shape}")
print(f"V shape: {V.shape}")
print(f"c shape: {c.shape}")

U shape: torch.Size([5, 3])
W shape: torch.Size([3, 3])
b shape: torch.Size([3])
V shape: torch.Size([3, 3])
c shape: torch.Size([3])


## 1.5 Compute Hidden States

In [44]:
# Store hidden states in a list (better for autograd)
hidden_states = []

# Compute hidden states for each timestep
for t in range(seq_len):
    if t == 0:
        # First timestep: h_0 = tanh(x_0 @ U + b)
        h_t = torch.tanh(inputs[:, t, :] @ U + b)
    else:
        # Subsequent timesteps: h_t = tanh(x_t @ U + h_{t-1} @ W + b)
        h_prev = hidden_states[t-1]
        h_t = torch.tanh(inputs[:, t, :] @ U + h_prev @ W + b)

    hidden_states.append(h_t)

# Stack hidden states into a 3D tensor for convenience
# This maintains the computation graph
hiddens = torch.stack(hidden_states, dim=1)

print("Hiddens shape:", hiddens.shape)
print("Sample hidden state at last timestep:")
print(hiddens[0, -1, :])

Hiddens shape: torch.Size([8, 4, 3])
Sample hidden state at last timestep:
tensor([-1.0000,  0.8771, -0.9624], grad_fn=<SliceBackward0>)


## 1.6 Compute Logits

In [45]:
# Extract the last hidden state
last_hidden = hiddens[:, -1, :]  # [batch_size, hidden_size]

# Compute logits: logits = h_T @ V + c
logits = last_hidden @ V + c  # [batch_size, num_classes]

print("Logits shape:", logits.shape)
print("Sample logits:")
print(logits[0])

Logits shape: torch.Size([8, 3])
Sample logits:
tensor([-2.2371,  1.4742,  1.2297], grad_fn=<SelectBackward0>)


## 1.7 Compute Cross-Entropy Loss

In [46]:
# Define cross-entropy loss function
criterion = nn.CrossEntropyLoss()

# Compute loss
loss = criterion(logits, random_labels)

print(f"Cross-Entropy Loss: {loss.item():.4f}")

Cross-Entropy Loss: 1.0832


## 1.8 Backpropagation

In [47]:
# Perform backpropagation
# This computes gradients for all parameters with requires_grad=True
loss.backward()

# Display computed gradients
print("Gradient shapes:")
print(f"U.grad shape: {U.grad.shape}")
print(f"W.grad shape: {W.grad.shape}")
print(f"b.grad shape: {b.grad.shape}")
print(f"V.grad shape: {V.grad.shape}")
print(f"c.grad shape: {c.grad.shape}")

print("\nSample gradient values:")
print(f"W.grad sample:\n{W.grad[:2, :2]}")
print(f"\nb.grad: {b.grad}")

# Verify gradients exist
assert U.grad is not None, "U gradient not computed!"
assert W.grad is not None, "W gradient not computed!"
assert b.grad is not None, "b gradient not computed!"
assert V.grad is not None, "V gradient not computed!"
assert c.grad is not None, "c gradient not computed!"
print("\n✓ All gradients computed successfully!")

Gradient shapes:
U.grad shape: torch.Size([5, 3])
W.grad shape: torch.Size([3, 3])
b.grad shape: torch.Size([3])
V.grad shape: torch.Size([3, 3])
c.grad shape: torch.Size([3])

Sample gradient values:
W.grad sample:
tensor([[-0.1413,  0.0902],
        [ 0.5686, -0.5391]])

b.grad: tensor([ 0.4246, -0.2044, -0.1327])

✓ All gradients computed successfully!


## 1.9 Manual SGD Update

In [48]:
# Learning rate
learning_rate = 0.1

# Store old values for comparison
U_old = U.data.clone()
W_old = W.data.clone()
b_old = b.data.clone()
V_old = V.data.clone()
c_old = c.data.clone()

# Manual SGD update: θ_new = θ_old - η * ∇θ
# Use torch.no_grad() to prevent tracking these operations
with torch.no_grad():
    U -= learning_rate * U.grad
    W -= learning_rate * W.grad
    b -= learning_rate * b.grad
    V -= learning_rate * V.grad
    c -= learning_rate * c.grad

print("Parameters updated successfully!")
print("\nParameter changes (L2 norm of difference):")
print(f"U change norm: {torch.norm(U.data - U_old).item():.6f}")
print(f"W change norm: {torch.norm(W.data - W_old).item():.6f}")
print(f"b change norm: {torch.norm(b.data - b_old).item():.6f}")
print(f"V change norm: {torch.norm(V.data - V_old).item():.6f}")
print(f"c change norm: {torch.norm(c.data - c_old).item():.6f}")

# Show that parameters actually changed
print("\nExample: First element of W before and after:")
print(f"Before: {W_old[0, 0].item():.6f}")
print(f"After:  {W.data[0, 0].item():.6f}")
print(f"Change: {(W.data[0, 0] - W_old[0, 0]).item():.6f}")

Parameters updated successfully!

Parameter changes (L2 norm of difference):
U change norm: 0.168781
W change norm: 0.103965
b change norm: 0.048954
V change norm: 0.027173
c change norm: 0.012614

Example: First element of W before and after:
Before: 0.238311
After:  0.252442
Change: 0.014131


# Section 2: Deep Learning for Sequential Data

## 2.1 Import Libraries and Set Random Seeds

In [49]:
import os
import torch
import random
import requests
import pandas as pd
import numpy as np
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
from transformers import BertTokenizer
import os
from six.moves.urllib.request import urlretrieve
from sklearn import preprocessing
import matplotlib.pyplot as plt
plt.style.use('ggplot')

def seed_all(seed=1029):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # if you are using multi-GPU.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

seed_all(seed=1234)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


## 2.2 Download and Preprocess the Data
### DataManager Class Implementation

In [50]:
class DataManager:
    """
    This class manages and preprocesses a simple text dataset for a sentence classification task.

    Attributes:
        verbose (bool): Controls verbosity for printing information during data processing.
        max_sentence_len (int): The maximum length of a sentence in the dataset.
        str_questions (list): A list to store the string representations of the questions in the dataset.
        str_labels (list): A list to store the string representations of the labels in the dataset.
        numeral_labels (list): A list to store the numerical representations of the labels in the dataset.
        numeral_data (list): A list to store the numerical representations of the questions in the dataset.
        random_state (int): Seed value for random number generation to ensure reproducibility.
        random (np.random.RandomState): Random number generator object initialized with the given random_state.
    """

    def __init__(self, verbose=True, random_state=6789):
        self.verbose = verbose
        self.max_sentence_len = 0
        self.str_questions = list()
        self.str_labels = list()
        self.numeral_labels = list()
        self.maxlen = None
        self.numeral_data = list()
        self.random_state = random_state
        self.random = np.random.RandomState(random_state)

    @staticmethod
    def maybe_download(dir_name, file_name, url, verbose=True):
        if not os.path.exists(dir_name):
            os.mkdir(dir_name)
        if not os.path.exists(os.path.join(dir_name, file_name)):
            urlretrieve(url + file_name, os.path.join(dir_name, file_name))
        if verbose:
            print("Downloaded successfully {}".format(file_name))

    def read_data(self, dir_name, file_names):
        self.str_questions = list()
        self.str_labels = list()
        for file_name in file_names:
            file_path= os.path.join(dir_name, file_name)
            with open(file_path, "r", encoding="latin-1") as f:
                for row in f:
                    row_str = row.split(":")
                    label, question = row_str[0], row_str[1]
                    question = question.lower()
                    self.str_labels.append(label)
                    self.str_questions.append(question[0:-1])
                    if self.max_sentence_len < len(self.str_questions[-1]):
                        self.max_sentence_len = len(self.str_questions[-1])

        # turns labels into numbers
        le = preprocessing.LabelEncoder()
        le.fit(self.str_labels)
        self.numeral_labels = np.array(le.transform(self.str_labels))
        self.str_classes = le.classes_
        self.num_classes = len(self.str_classes)
        if self.verbose:
            print("\nSample questions and corresponding labels... \n")
            print(self.str_questions[0:5])
            print(self.str_labels[0:5])

    def manipulate_data(self):
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        vocab = self.tokenizer.get_vocab()
        self.word2idx = {w: i for i, w in enumerate(vocab)}
        self.idx2word = {i:w for w,i in self.word2idx.items()}
        self.vocab_size = len(self.word2idx)

        token_ids = []
        num_seqs = []
        for text in self.str_questions:  # iterate over the list of text
          text_seqs = self.tokenizer.tokenize(str(text))  # tokenize each text individually
          # Convert tokens to IDs
          token_ids = self.tokenizer.convert_tokens_to_ids(text_seqs)
          # Convert token IDs to a tensor of indices using your word2idx mapping
          seq_tensor = torch.LongTensor(token_ids)
          num_seqs.append(seq_tensor)  # append the tensor for each sequence

        # Pad the sequences and create a tensor
        if num_seqs:
          self.numeral_data = pad_sequence(num_seqs, batch_first=True)  # Pads to max length of the sequences
          self.num_sentences, self.maxlen = self.numeral_data.shape

    def train_valid_test_split(self, train_ratio=0.8, test_ratio = 0.1):
        train_size = int(self.num_sentences*train_ratio) +1
        test_size = int(self.num_sentences*test_ratio) +1
        valid_size = self.num_sentences - (train_size + test_size)
        data_indices = list(range(self.num_sentences))
        random.shuffle(data_indices)
        self.train_str_questions = [self.str_questions[i] for i in data_indices[:train_size]]
        self.train_numeral_labels = self.numeral_labels[data_indices[:train_size]]
        train_set_data = self.numeral_data[data_indices[:train_size]]
        train_set_labels = self.numeral_labels[data_indices[:train_size]]
        train_set_labels = torch.from_numpy(train_set_labels)
        train_set = torch.utils.data.TensorDataset(train_set_data, train_set_labels)
        self.test_str_questions = [self.str_questions[i] for i in data_indices[-test_size:]]
        self.test_numeral_labels = self.numeral_labels[data_indices[-test_size:]]
        test_set_data = self.numeral_data[data_indices[-test_size:]]
        test_set_labels = self.numeral_labels[data_indices[-test_size:]]
        test_set_labels = torch.from_numpy(test_set_labels)
        test_set = torch.utils.data.TensorDataset(test_set_data, test_set_labels)
        self.valid_str_questions = [self.str_questions[i] for i in data_indices[train_size:-test_size]]
        self.valid_numeral_labels = self.numeral_labels[data_indices[train_size:-test_size]]
        valid_set_data = self.numeral_data[data_indices[train_size:-test_size]]
        valid_set_labels = self.numeral_labels[data_indices[train_size:-test_size]]
        valid_set_labels = torch.from_numpy(valid_set_labels)
        valid_set = torch.utils.data.TensorDataset(valid_set_data, valid_set_labels)
        self.train_loader = DataLoader(train_set, batch_size=64, shuffle=True)
        self.test_loader = DataLoader(test_set, batch_size=64, shuffle=False)
        self.valid_loader = DataLoader(valid_set, batch_size=64, shuffle=False)

### Load and Process the Data

In [51]:
print('Loading data...')
# Try to download from the official URL, but if it doesn't exist, we'll use the local file
#try:
#    DataManager.maybe_download("data", "train_2000.label", "http://cogcomp.org/Data/QA/QC/")
#except:
#    print("Using local file...")

dm = DataManager()
# Use the practice file provided in the /final directory
dm.read_data("/content/", ["train_2000.label"])

Loading data...

Sample questions and corresponding labels... 

['manner how did serfdom develop in and then leave russia ?', 'cremat what films featured the character popeye doyle ?', "manner how can i find a list of celebrities ' real names ?", 'animal what fowl grabs the spotlight after the chinese year of the monkey ?', 'exp what is the full form of .com ?']
['DESC', 'ENTY', 'DESC', 'ENTY', 'ABBR']


In [52]:
dm.manipulate_data()
dm.train_valid_test_split(train_ratio=0.8, test_ratio = 0.1)

In [53]:
for x, y in dm.train_loader:
    print("Input batch shape:", x.shape)
    print("Label batch shape:", y.shape)
    break

Input batch shape: torch.Size([64, 36])
Label batch shape: torch.Size([64])


# Section 3: Using Word2Vec to Transform Texts to Vectors

### Import Required Libraries

In [54]:
import gensim.downloader as api
from gensim.models import Word2Vec
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score
import numpy as np

## 3.1 Download Word2Vec Model and Implement get_word_vector Function

In [55]:
# Download the pretrained Word2Vec model (glove-wiki-gigaword-100)
print("Downloading Word2Vec model... This may take a few minutes.")
word2vec_model = api.load('glove-wiki-gigaword-100')
print("Model downloaded successfully!")

Downloading Word2Vec model... This may take a few minutes.
Model downloaded successfully!


In [56]:
def get_word_vector(word, model):
    """
    Get the word vector for a given word using the pretrained Word2Vec model.
    Returns a zero vector if the word is not in the vocabulary.

    Args:
        word (str): The word to get the vector for
        model: The pretrained Word2Vec model

    Returns:
        numpy.ndarray: 100-dimensional word vector
    """
    try:
        vector = model[word]  # Get the word vector from the model
    except:
        vector = np.zeros(100)  # Return zero vector if word not in vocabulary
    return vector

# Test the function
test_word = "computer"
test_vector = get_word_vector(test_word, word2vec_model)
print(f"Vector for '{test_word}': {test_vector[:10]}...")  # Print first 10 dimensions
print(f"Vector shape: {test_vector.shape}")

Vector for 'computer': [-0.16298   0.30141   0.57978   0.066548  0.45835  -0.15329   0.43258
 -0.89215   0.57747   0.36375 ]...
Vector shape: (100,)


## 3.2 Implement get_sentence_vector Function

In [57]:
def get_sentence_vector(sentence, important_score=None, model=None):
    """
    Transform a sentence to a 100-dimensional vector using the pretrained Word2Vec model.

    Args:
        sentence (str): The sentence to transform
        important_score (list): List of importance scores for each word in the sentence
        model: The pretrained Word2Vec model

    Returns:
        numpy.ndarray: 100-dimensional sentence vector
    """
    # Tokenize the sentence (split by spaces and clean)
    words = sentence.lower().split()

    # If no importance scores provided, use equal weights (average)
    if important_score is None:
        important_score = [1.0] * len(words)

    # Make sure important_score has the same length as words
    if len(important_score) != len(words):
        important_score = [1.0] * len(words)

    # Apply softmax to get importance weights
    important_score = np.array(important_score)
    exp_scores = np.exp(important_score - np.max(important_score))  # Subtract max for numerical stability
    important_weight = exp_scores / np.sum(exp_scores)

    # Get word vectors and compute weighted sum
    feature_vector = np.zeros(100)
    for i, word in enumerate(words):
        word_vec = get_word_vector(word, model)
        feature_vector += important_weight[i] * word_vec

    return feature_vector

# Test the function
test_sentence = "What is machine learning?"
test_sent_vector = get_sentence_vector(test_sentence, model=word2vec_model)
print(f"Sentence: '{test_sentence}'")
print(f"Sentence vector shape: {test_sent_vector.shape}")
print(f"First 10 dimensions: {test_sent_vector[:10]}")

Sentence: 'What is machine learning?'
Sentence vector shape: (100,)
First 10 dimensions: [-0.33702249  0.32326     0.4157875  -0.25846751 -0.1422075   0.1532895
  0.16256901  0.17388548  0.207424    0.04120501]


## 3.3 Transform Training Questions to Feature Vectors

In [58]:
print("Transform training set to feature vectors...")

# Transform training questions to feature vectors
X_train = []
for question in dm.train_str_questions:
    # Create decaying importance scores: 1.0, 0.9, 0.81, 0.729, ...
    words = question.split()
    decay_rate = 0.9
    important_score = [decay_rate ** i for i in range(len(words))]

    # Get the sentence vector
    sent_vector = get_sentence_vector(question, important_score=important_score, model=word2vec_model)
    X_train.append(sent_vector)

X_train = np.array(X_train)
y_train = dm.train_numeral_labels

print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")

Transform training set to feature vectors...
X_train shape: (1601, 100)
y_train shape: (1601,)


In [59]:
print("Transform validation set to feature vectors...")

# Transform validation questions to feature vectors
X_valid = []
for question in dm.valid_str_questions:
    # Create decaying importance scores: 1.0, 0.9, 0.81, 0.729, ...
    words = question.split()
    decay_rate = 0.9
    important_score = [decay_rate ** i for i in range(len(words))]

    # Get the sentence vector
    sent_vector = get_sentence_vector(question, important_score=important_score, model=word2vec_model)
    X_valid.append(sent_vector)

X_valid = np.array(X_valid)
y_valid = dm.valid_numeral_labels

print(f"X_valid shape: {X_valid.shape}")
print(f"y_valid shape: {y_valid.shape}")

Transform validation set to feature vectors...
X_valid shape: (198, 100)
y_valid shape: (198,)


## 3.4 Scale Features Using MinMaxScaler

In [60]:
# Initialize the MinMaxScaler with feature range (-1, 1)
scaler = MinMaxScaler(feature_range=(-1, 1))

# Fit the scaler on training data and transform both training and validation sets
X_train_scaled = scaler.fit_transform(X_train)
X_valid_scaled = scaler.transform(X_valid)

print("Scaling completed!")
print(f"X_train_scaled range: [{X_train_scaled.min():.3f}, {X_train_scaled.max():.3f}]")
print(f"X_valid_scaled range: [{X_valid_scaled.min():.3f}, {X_valid_scaled.max():.3f}]")

Scaling completed!
X_train_scaled range: [-1.000, 1.000]
X_valid_scaled range: [-1.218, 1.251]


## 3.5 Train Logistic Regression and Evaluate

In [61]:
from sklearn.linear_model import LogisticRegression

# Initialize and train the Logistic Regression model
print("Training Logistic Regression model...")
lr_model = LogisticRegression(max_iter=1000, random_state=42)
lr_model.fit(X_train_scaled, y_train)
print("Training completed!")

# Make predictions on training and validation sets
y_train_pred = lr_model.predict(X_train_scaled)
y_valid_pred = lr_model.predict(X_valid_scaled)

Training Logistic Regression model...
Training completed!


In [62]:
from sklearn import metrics

# Evaluate on training set
train_accuracy = metrics.accuracy_score(y_train, y_train_pred)
print(f"\nTraining Set Performance:")
print(f"Accuracy: {train_accuracy:.4f}")

# Evaluate on validation set
valid_accuracy = metrics.accuracy_score(y_valid, y_valid_pred)
print(f"\nValidation Set Performance:")
print(f"Accuracy: {valid_accuracy:.4f}")

# Display classification report for validation set
print("\nValidation Set Classification Report:")
print(metrics.classification_report(y_valid, y_valid_pred, target_names=dm.str_classes))

# Display confusion matrix
print("\nValidation Set Confusion Matrix:")
print(metrics.confusion_matrix(y_valid, y_valid_pred))


Training Set Performance:
Accuracy: 0.9613

Validation Set Performance:
Accuracy: 0.9040

Validation Set Classification Report:
              precision    recall  f1-score   support

        ABBR       1.00      0.50      0.67         2
        DESC       0.91      0.86      0.89        36
        ENTY       0.89      0.87      0.88        54
         HUM       0.96      1.00      0.98        52
         LOC       0.84      0.90      0.87        29
         NUM       0.88      0.88      0.88        25

    accuracy                           0.90       198
   macro avg       0.91      0.83      0.86       198
weighted avg       0.90      0.90      0.90       198


Validation Set Confusion Matrix:
[[ 1  1  0  0  0  0]
 [ 0 31  2  1  1  1]
 [ 0  2 47  1  2  2]
 [ 0  0  0 52  0  0]
 [ 0  0  3  0 26  0]
 [ 0  0  1  0  2 22]]


## 3.6 BaseTrainer Class Implementation

In [63]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class BaseTrainer:
    def __init__(self, model, criterion, optimizer, train_loader, val_loader):
        self.model = model
        self.criterion = criterion  # the loss function
        self.optimizer = optimizer  # the optimizer
        self.train_loader = train_loader  # the train loader
        self.val_loader = val_loader  # the valid loader

    # the function to train the model in many epochs
    def fit(self, num_epochs):
        self.num_batches = len(self.train_loader)

        for epoch in range(num_epochs):
            print(f'Epoch {epoch + 1}/{num_epochs}')
            train_loss, train_accuracy = self.train_one_epoch()
            val_loss, val_accuracy = self.validate_one_epoch()
            print(
                f'{self.num_batches}/{self.num_batches} - train_loss: {train_loss:.4f} - train_accuracy: {train_accuracy*100:.4f}% \
                - val_loss: {val_loss:.4f} - val_accuracy: {val_accuracy*100:.4f}%')

    # train in one epoch, return the train_acc, train_loss
    def train_one_epoch(self):
        self.model.train()
        running_loss, correct, total = 0.0, 0, 0
        for i, data in enumerate(self.train_loader):
            inputs, labels = data
            inputs, labels = inputs.to(device), labels.to(device)
            self.optimizer.zero_grad()
            outputs = self.model(inputs)
            loss = self.criterion(outputs, labels)
            loss.backward()
            self.optimizer.step()

            running_loss += loss.item()
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
        train_accuracy = correct / total
        train_loss = running_loss / self.num_batches
        return train_loss, train_accuracy

    # evaluate on a loader and return the loss and accuracy
    def evaluate(self, loader):
        self.model.eval()
        loss, correct, total = 0.0, 0, 0
        with torch.no_grad():
            for data in loader:
                inputs, labels = data
                inputs, labels = inputs.to(device), labels.to(device)
                outputs = self.model(inputs)
                batch_loss = self.criterion(outputs, labels)
                loss += batch_loss.item()
                _, predicted = torch.max(outputs.data, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()

        accuracy = correct / total
        loss = loss / len(loader)
        return loss, accuracy

    # return the val_acc, val_loss, be called at the end of each epoch
    def validate_one_epoch(self):
      val_loss, val_accuracy = self.evaluate(self.val_loader)
      return val_loss, val_accuracy

print("BaseTrainer class defined successfully!")

BaseTrainer class defined successfully!


# Section 4: Text CNN for Sequence Modeling and Neural Embedding

## TextCNN Model Implementation

In [64]:
import torch.nn as nn
import torch.nn.functional as F

class TextCNN(torch.nn.Module):
    def __init__(self, embed_size=128, state_size=16, data_manager=None):
        super().__init__()
        self.data_manager = data_manager
        self.embed_size = embed_size
        self.state_size = state_size

        # Declare the necessary layers
        self.embed = nn.Embedding(self.data_manager.vocab_size, self.embed_size)

        # Three Conv1D layers with different kernel sizes (3, 5, 7)
        # Input channels = embed_size, Output channels = state_size
        self.conv1d_1 = nn.Conv1d(in_channels=self.embed_size,
                                   out_channels=self.state_size,
                                   kernel_size=3,
                                   padding=1)

        self.conv1d_2 = nn.Conv1d(in_channels=self.embed_size,
                                   out_channels=self.state_size,
                                   kernel_size=5,
                                   padding=2)

        self.conv1d_3 = nn.Conv1d(in_channels=self.embed_size,
                                   out_channels=self.state_size,
                                   kernel_size=7,
                                   padding=3)

        # Fully connected layer for classification
        self.fc = nn.Linear(state_size * 3, self.data_manager.num_classes)

    def forward(self, x):
        # Embedding layer: [batch_size, seq_len] -> [batch_size, seq_len, embed_size]
        e = self.embed(x)

        # Permute for Conv1D: [batch_size, seq_len, embed_size] -> [batch_size, embed_size, seq_len]
        e = e.permute(0, 2, 1)

        # Apply Conv1D with ReLU activation
        # Each convolution produces: [batch_size, state_size, seq_len]
        h1 = F.relu(self.conv1d_1(e))
        h2 = F.relu(self.conv1d_2(e))
        h3 = F.relu(self.conv1d_3(e))

        # Apply GlobalMaxPool1D over the sequence dimension
        # Output: [batch_size, state_size]
        h1 = F.max_pool1d(h1, kernel_size=h1.size(2)).squeeze(2)
        h2 = F.max_pool1d(h2, kernel_size=h2.size(2)).squeeze(2)
        h3 = F.max_pool1d(h3, kernel_size=h3.size(2)).squeeze(2)

        # Concatenate along the feature dimension
        # Output: [batch_size, state_size * 3]
        h = torch.cat([h1, h2, h3], dim=1)

        # Fully connected layer for classification
        h = self.fc(h)
        return h

print("TextCNN model defined successfully!")

TextCNN model defined successfully!


## 4.1 Train TextCNN Model

In [65]:
# Initialize the TextCNN model
text_cnn = TextCNN(data_manager=dm).to(device)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(text_cnn.parameters(), lr=0.001)

# Create trainer and train the model
trainer = BaseTrainer(model=text_cnn,
                      criterion=criterion,
                      optimizer=optimizer,
                      train_loader=dm.train_loader,
                      val_loader=dm.valid_loader)

print(f"\nTraining TextCNN model on {device}...")
print(f"Model parameters: {sum(p.numel() for p in text_cnn.parameters())}")
trainer.fit(num_epochs=50)


Training TextCNN model on cuda...
Model parameters: 3937878
Epoch 1/50
26/26 - train_loss: 1.5324 - train_accuracy: 41.4116%                 - val_loss: 1.0089 - val_accuracy: 74.2424%
Epoch 2/50
26/26 - train_loss: 0.6891 - train_accuracy: 84.0725%                 - val_loss: 0.5240 - val_accuracy: 83.3333%
Epoch 3/50
26/26 - train_loss: 0.3977 - train_accuracy: 92.0050%                 - val_loss: 0.3381 - val_accuracy: 92.4242%
Epoch 4/50
26/26 - train_loss: 0.2177 - train_accuracy: 95.5653%                 - val_loss: 0.2813 - val_accuracy: 92.4242%
Epoch 5/50
26/26 - train_loss: 0.1460 - train_accuracy: 97.6889%                 - val_loss: 0.2306 - val_accuracy: 93.4343%
Epoch 6/50
26/26 - train_loss: 0.1000 - train_accuracy: 98.8132%                 - val_loss: 0.2069 - val_accuracy: 93.4343%
Epoch 7/50
26/26 - train_loss: 0.0852 - train_accuracy: 99.6877%                 - val_loss: 0.1800 - val_accuracy: 93.4343%
Epoch 8/50
26/26 - train_loss: 0.0720 - train_accuracy: 98.6883%

## 4.2 Evaluate TextCNN on Test Set

In [66]:
# Evaluate the trained model on the testing set
test_loss, test_acc = trainer.evaluate(dm.test_loader)
print(f'\nFinal Test Set Performance:')
print(f'test_loss: {test_loss:.4f} - test_accuracy: {test_acc*100:.4f}%')


Final Test Set Performance:
test_loss: 0.1846 - test_accuracy: 97.0149%
