---

# **1. Imports, Loading the Datasets, Initializing**

In [1]:
!pip install contractions

import os
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import re
import nltk
import html
import emoji
import contractions
import random
import gensim.downloader as api
import numpy as np
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader, TensorDataset
from gensim.models import KeyedVectors
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import words

# Fetch the NLTK resources this notebook relies on (same order as before)
for nltk_resource in ('punkt', 'wordnet', 'stopwords'):
    nltk.download(nltk_resource)

# Shared text-processing helpers: lemmatizer and the English stop-word set
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Pretrained "word2vec-google-news-300" embeddings, fetched through gensim's downloader
w2v = api.load("word2vec-google-news-300")

# Read the three competition splits
train_data = pd.read_csv("/kaggle/input/ai-2-dl-for-nlp-2025-homework-2/train_dataset.csv")
validation_data = pd.read_csv("/kaggle/input/ai-2-dl-for-nlp-2025-homework-2/val_dataset.csv")
test_data = pd.read_csv("/kaggle/input/ai-2-dl-for-nlp-2025-homework-2/test_dataset.csv")

# Quick sanity check: show the first rows of the training split
train_data.head()

Collecting contractions
  Downloading contractions-0.1.73-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting textsearch>=0.0.21 (from contractions)
  Downloading textsearch-0.0.24-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting anyascii (from textsearch>=0.0.21->contractions)
  Downloading anyascii-0.3.2-py3-none-any.whl.metadata (1.5 kB)
Collecting pyahocorasick (from textsearch>=0.0.21->contractions)
  Downloading pyahocorasick-2.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading contractions-0.1.73-py2.py3-none-any.whl (8.7 kB)
Downloading textsearch-0.0.24-py2.py3-none-any.whl (7.6 kB)
Downloading anyascii-0.3.2-py3-none-any.whl (289 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m289.9/289.9 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyahocorasick-2.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (118 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m118.3/1

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!




Unnamed: 0,ID,Text,Label
0,189385,@whoisralphie dude I'm so bummed ur leaving!,0
1,58036,"oh my god, a severed foot was foun in a wheely...",0
2,190139,I end up &quot;dog dialing&quot; sumtimes. Wha...,1
3,99313,@_rachelx meeeee toooooo!,0
4,157825,"I was hoping I could stay home and work today,...",0


---

# **2. Text Preprocessing / Data cleaning**


The `preprocess_text` function cleans and preprocesses the tweets.

In [2]:

def preprocess_text(text):
    """Clean one raw tweet and turn it into a list of tokens.

    Steps, in order: lowercase, expand contractions, tag hashtags, anonymize
    e-mail addresses and phone numbers, tokenize, lemmatize, and replace
    tokens that are missing from the Word2Vec vocabulary with "<UNK>".

    Args:
        text: The raw tweet. Anything that is not a non-blank string
            (None, NaN from a missing CSV cell, "", whitespace only)
            is treated as an empty tweet.

    Returns:
        A list of token strings, or None when the tweet is missing/empty.
    """
    # 0. Drop missing or empty tweets.
    # The isinstance check matters: pandas represents a missing cell as a
    # float NaN, which is truthy and has no .strip(), so it must be rejected
    # before any string method is called.
    if not isinstance(text, str) or not text.strip():
        return None

    # 1. Lowercase
    text = text.lower()

    # 2. Expand contractions
    text = contractions.fix(text)

    # 3. Tag hashtags as both token and marker
    text = re.sub(r"#(\w+)", r"<HASHTAG_\1> \1", text)

    # 4. Anonymize emails
    text = re.sub(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}","<EMAIL>",text)

    # 5. Anonymize phone numbers
    text = re.sub(r"\+?\d[\d\-\s]{7,}\d","<PHONE>",text)

    # 6. Tokenize into words
    tokens = word_tokenize(text)

    # 7. Drop empty tokens
    tokens = [tok for tok in tokens if tok]

    # 8. Lemmatize each token
    tokens = [lemmatizer.lemmatize(tok) for tok in tokens]

    # 9. OOV handling: anything outside the Word2Vec vocabulary becomes "<UNK>"
    tokens = [tok if tok in w2v.key_to_index else "<UNK>" for tok in tokens]

    return tokens


**Greedy Selection Function**
A greedy pipeline builder that:

1. Starts with a minimal preprocessing setup.
2. Adds one preprocessing step at a time.
3. After each addition, runs a quick training/validation check.
4. Keeps the new step only if it improves the validation score.
5. Repeats until no further improvements are found.


# **3. Classes and Functions Definitions**


**Define all the classes and functions needed for the next steps**

A two-layer `PyTorch` model that:

1. Maps the 300-dimensional input to a hidden layer.
2. Applies ReLU activation and dropout.
3. Passes through a second hidden layer with ReLU and dropout.
4. Outputs a single logit for sentiment prediction.


In [3]:
class SimpleSentimentModel(nn.Module):
    """Feed-forward classifier with two hidden layers.

    Data flow: Linear -> ReLU -> Dropout -> Linear -> ReLU -> Dropout -> Linear.
    The forward pass returns the output layer's raw values (no sigmoid applied).
    """

    def __init__(self, input_size, hidden_size, output_size=1, dropout=0.0):
        """Build the layers.

        Args:
            input_size: Number of features in each input vector.
            hidden_size: Width of both hidden layers.
            output_size: Number of output units (default 1).
            dropout: Dropout probability used after each hidden layer.
        """
        super().__init__()

        # Input -> first hidden layer
        self.input_to_hidden = nn.Linear(input_size, hidden_size)

        # Non-linearity shared by both hidden layers
        self.relu = nn.ReLU()

        # Regularisation applied to the first hidden activation
        self.dropout1 = nn.Dropout(p=dropout)

        # First hidden -> second hidden layer
        self.hidden_to_hidden = nn.Linear(hidden_size, hidden_size)

        # Regularisation applied to the second hidden activation
        self.dropout2 = nn.Dropout(p=dropout)

        # Second hidden -> output layer
        self.hidden_to_output = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Run `x` through both hidden blocks and return the output layer's result."""
        # Hidden block 1: linear projection, activation, dropout
        first_hidden = self.dropout1(self.relu(self.input_to_hidden(x)))

        # Hidden block 2: same pattern on top of the first block
        second_hidden = self.dropout2(self.relu(self.hidden_to_hidden(first_hidden)))

        return self.hidden_to_output(second_hidden)


A PyTorch `Dataset` that:

1. Stores tweet feature vectors and labels as tensors.
2. `__len__`: Returns the number of samples.
3. `__getitem__`: Retrieves the `(features, label)` pair by index.


In [4]:
class TweetDataset(Dataset):
    """Map-style dataset that pairs each tweet vector with its label."""

    def __init__(self, tweets, labels):
        """Convert both inputs to float32 tensors and keep them on the instance.

        Args:
            tweets: Array-like of tweet feature vectors.
            labels: Array-like of labels, one per tweet.
        """
        # Feature vectors as a float32 tensor
        self.tweets = torch.tensor(data=tweets, dtype=torch.float32)
        # Labels are stored as float32 as well
        self.labels = torch.tensor(data=labels, dtype=torch.float32)

    def __len__(self):
        """Number of stored tweets."""
        return len(self.tweets)

    def __getitem__(self, idx):
        """Return the `(features, label)` pair at position `idx`."""
        features = self.tweets[idx]
        target = self.labels[idx]
        return features, target


A function to convert each tweet into the average of its word vectors


In [5]:
def vectorize_tweet(tokens, w2v_model, dim=300):
    """Represent a tweet as the mean of its word vectors.

    Args:
        tokens: Iterable of token strings, or None. `preprocess_text`
            returns None for empty tweets, so None is accepted here and
            treated like an empty token list instead of raising TypeError.
        w2v_model: Embedding lookup supporting `word in w2v_model` and
            `w2v_model[word]`.
        dim: Length of the zero vector returned when no token has a vector.

    Returns:
        The element-wise mean of the vectors of all known tokens, or
        `np.zeros(dim)` when there are no tokens or none of them is known.
    """
    # An empty/missing tweet still needs a row so features stay aligned with labels
    if tokens is None:
        return np.zeros(dim)

    # Tokens without an embedding (e.g. the "<UNK>" placeholder) are skipped
    vectors = [w2v_model[word] for word in tokens if word in w2v_model]
    if len(vectors) == 0:
        return np.zeros(dim)
    return np.mean(vectors, axis=0)


A helper function that takes lists of training losses, validation losses, and validation accuracies, then:

1. Plots and saves the training vs. validation loss curve.
2. Plots and saves the validation accuracy curve.

The `prefix` argument customizes the plot titles and filenames.  


In [6]:

def plot_learning_curves(train_losses, val_losses, val_accs, prefix="run"):
    """Draw, save and show the loss and accuracy curves of one training run.

    Two figures are produced: train vs. validation loss, and validation
    accuracy. Each is saved as a PNG whose name starts with the lower-cased
    `prefix`, then displayed.

    Args:
        train_losses: Per-epoch training losses; its length sets the x-axis.
        val_losses: Per-epoch validation losses.
        val_accs: Per-epoch validation accuracies (plotted on a "%" axis).
        prefix: Text used in the figure titles and output file names.
    """
    # Epochs are numbered from 1 on the x-axis
    epoch_axis = range(1, len(train_losses) + 1)

    # ---- Figure 1: training vs. validation loss ----
    plt.figure()
    for series, series_label in ((train_losses, "Train Loss"), (val_losses, "Val Loss")):
        plt.plot(epoch_axis, series, label=series_label)
    plt.xlabel("Epoch")
    plt.ylabel("Loss")
    plt.title(f"{prefix} Learning Curve – Loss")
    plt.legend()
    plt.tight_layout()
    plt.savefig(f"{prefix.lower()}_loss_curve.png")
    plt.show()

    # ---- Figure 2: validation accuracy ----
    plt.figure()
    plt.plot(epoch_axis, val_accs, label="Val Accuracy")
    plt.xlabel("Epoch")
    plt.ylabel("Accuracy (%)")
    plt.title(f"{prefix} Learning Curve – Accuracy")
    plt.tight_layout()
    plt.savefig(f"{prefix.lower()}_accuracy_curve.png")
    plt.show()


Greedy Selection Function used for testing

Commented out for submission

In [7]:

# from textblob import TextBlob

# def spell_correct(text):
#     try:
#         corrected = str(TextBlob(text).correct())
#     except Exception:
#         corrected = text  
#     return corrected

# english_vocab = set(words.words())

# def is_english_word(word):
#     return word.lower() in english_vocab

# # Define each preprocessing step as a function
# def drop_empty(text):
#     return text if text and text.strip() else None

# def lowercase(text):
#     return text.lower()

# def expand_contractions(text):
#     return contractions.fix(text)

# def html_unescape_step(text):
#     return html.unescape(text)

# def anonymize_url(text):
#     return re.sub(r"https?://\S+", "<URL>", text)

# def anonymize_email(text):
#     return re.sub(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}", "<EMAIL>", text)

# def anonymize_phone(text):
#     return re.sub(r"\+?\d[\d\-\s]{7,}\d", "<PHONE>", text)

# def tokenize(text):
#     return word_tokenize(text)

# def remove_empty_tokens(tokens):
#     return [tok for tok in tokens if tok]

# def spell_correct_step(tokens):
#     return [spell_correct(tok) for tok in tokens]

# def lemmatize_step(tokens):
#     return [lemmatizer.lemmatize(tok) for tok in tokens]

# def remove_stopwords_step(tokens):
#     return [tok for tok in tokens if tok not in stop_words]

# def english_filter_step(tokens):
#     return [tok for tok in tokens if is_english_word(tok, threshold=2.0)]

# # Apply each pipeline
# def apply_pipeline(text, pipeline):
#     data = text
#     for step in pipeline:
#         data = step(data)
#         if data is None:
#             return None
#     return data

# # Train 3 epochs and return validation accuracy 
# def run_experiment(pipeline):
    
#     # Preprocess & vectorize the train set 
#     X_train, y_train = [], []
#     for txt, lbl in zip(train_data["Text"], train_data["Label"]):
#         tokens = apply_pipeline(txt, pipeline)
#         if tokens is None: continue
#         X_train.append(vectorize_tweet(tokens, w2v))
#         y_train.append(lbl)
#     X_train, y_train = np.array(X_train), np.array(y_train)
#     train_loader = DataLoader(TweetDataset(X_train, y_train), batch_size=64, shuffle=True)
    
#     # Preprocess & vectorize the validation set 
#     X_val, y_val = [], []
#     for txt, lbl in zip(validation_data["Text"], validation_data["Label"]):
#         tokens = apply_pipeline(txt, pipeline)
#         if tokens is None: continue
#         X_val.append(vectorize_tweet(tokens, w2v))
#         y_val.append(lbl)
#     X_val, y_val = np.array(X_val), np.array(y_val)
#     val_loader = DataLoader(TweetDataset(X_val, y_val), batch_size=64)
    
#     # Set up the model, loss, and optimizer
#     model = SimpleSentimentModel(input_size=300, hidden_size=128)
#     criterion = nn.BCEWithLogitsLoss()
#     optimizer = optim.Adam(model.parameters(), lr=0.001)
    
#     # Train for 3 epochs 
#     model.train()
#     for epoch in range(3):
#         total_loss = 0.0
#         for vecs, labels in train_loader:
#             preds = model(vecs).squeeze()
#             loss  = criterion(preds, labels)
#             optimizer.zero_grad()
#             loss.backward()
#             optimizer.step()
#             total_loss += loss.item()
#         print(f"Epoch {epoch+1}, Loss: {total_loss/len(train_loader):.4f}")
    
#     # Evaluate on validation 
#     model.eval()
#     correct, total = 0, 0
#     with torch.no_grad():
#         for vecs, labels in val_loader:
#             preds = (model(vecs).squeeze() > 0.5).float()
#             correct += (preds == labels).sum().item()
#             total   += len(labels)
#     acc = 100 * correct / total
#     print(f"Validation Accuracy: {acc:.2f}%\n")
#     return acc

# # Greedy selection, add each step one by one and if something increases the score, keep it

# # Start with the minimal pipeline:
# best_pipeline = [drop_empty, lowercase]
# best_acc = run_experiment(best_pipeline)
# print(f"Baseline ({[fn.__name__ for fn in best_pipeline]}): {best_acc:.2f}%\n")

# # Candidate steps 
# candidates = [
#     expand_contractions,
#     html_unescape_step,
#     anonymize_url,
#     anonymize_email,
#     anonymize_phone,
#     tokenize,
#     remove_empty_tokens,
#     spell_correct_step,
#     lemmatize_step,
#     remove_stopwords_step,
#     english_filter_step,
# ]

# # Try adding each step one by one
# for step in candidates:
#     trial = best_pipeline + [step]
#     acc = run_experiment(trial)
#     print(f"Trial + {step.__name__:<20} → {acc:.2f}%")
#     if acc > best_acc:
#         best_acc      = acc
#         best_pipeline = trial
#         print(f"  → Keeping {step.__name__} (new best: {best_acc:.2f}%)\n")

# # Final pipeline
# print("Final pipeline order:")
# for fn in best_pipeline:
#     print(" -", fn.__name__)


# **4. Word Embeddings**


In [8]:
# Vectorize the train tweets.
# `preprocess_text` returns None for an empty tweet; `or []` turns that into an
# empty token list so the row becomes a zero vector (and stays aligned with its
# label) instead of crashing inside `vectorize_tweet`.
train_vectors = np.array([vectorize_tweet(preprocess_text(text) or [], w2v) for text in train_data["Text"]])

# Save to use later
np.save("train_vectors.npy", train_vectors)

train_vectors = np.load("train_vectors.npy")

# Vectorize the validation tweets
val_vectors   = np.array([vectorize_tweet(preprocess_text(text) or [], w2v) for text in validation_data["Text"]])
np.save("val_vectors.npy", val_vectors)

val_vectors = np.load("val_vectors.npy")

# Vectorize the test tweets
test_vectors  = np.array([vectorize_tweet(preprocess_text(text) or [], w2v) for text in test_data["Text"]])
np.save("test_vectors.npy", test_vectors)

test_vectors = np.load("test_vectors.npy")

Set the model hyperparameters:
- 300-dimensional input vectors
- a 128-neuron hidden layer
- 20% dropout
Then create an instance of `SimpleSentimentModel` with those settings.

In [9]:
# Hyperparameters of the default model
input_size = 300      # size of the tweet vector
hidden_size = 128     # number of neurons in the hidden layer
dropout_rate = 0.2    # dropout rate

# Instantiate the network with those settings
model = SimpleSentimentModel(
    input_size=input_size,
    hidden_size=hidden_size,
    output_size=1,
    dropout=dropout_rate,
)


Move model to GPU if it is available


In [10]:
# Use CUDA when PyTorch can see a GPU, otherwise stay on the CPU
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)

# Move the model's parameters to that device (the cell displays the model)
model.to(device)

SimpleSentimentModel(
  (input_to_hidden): Linear(in_features=300, out_features=128, bias=True)
  (relu): ReLU()
  (dropout1): Dropout(p=0.2, inplace=False)
  (hidden_to_hidden): Linear(in_features=128, out_features=128, bias=True)
  (dropout2): Dropout(p=0.2, inplace=False)
  (hidden_to_output): Linear(in_features=128, out_features=1, bias=True)
)

We use binary cross-entropy with logits as our loss (`BCEWithLogitsLoss`) and the Adam optimizer with a learning rate of 0.001 to train the model.  


In [11]:
# Binary cross-entropy computed directly on the model's raw outputs (logits)
criterion = nn.BCEWithLogitsLoss()

# Adam over all model parameters, learning rate 0.001
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)


Wrap the processed tweet vectors and their labels in `TweetDataset`, then create `DataLoader`s with a batch size of 64—shuffling only the training loader to mix up examples each epoch.  


In [12]:
# Labels as plain NumPy arrays, in the same row order as the vectors
train_labels = train_data['Label'].to_numpy()
val_labels = validation_data['Label'].to_numpy()

# Pair every tweet vector with its label
train_dataset = TweetDataset(tweets=train_vectors, labels=train_labels)
val_dataset = TweetDataset(tweets=val_vectors, labels=val_labels)

# Batches of 64; only the training loader shuffles
train_loader = DataLoader(dataset=train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=64)


A combined training and validation loop that, for each epoch:
1. Trains the model on all batches in `train_loader`, tracking the average training loss.
2. Evaluates on `val_loader` without gradient updates, computing validation loss and accuracy.
3. Prints and returns lists of train losses, validation losses, and validation accuracies over all epochs.


In [13]:
# def train_and_evaluate(model, optimizer, criterion, train_loader, val_loader, epochs):
    
#     train_losses, val_losses, val_accs = [], [], []    
    
#     for epoch in range(epochs):
        
#         # Training phase

#         # Set the model to training mode
#         model.train() 
#         running_loss = 0.0
    
#         for texts, labels in train_loader:
#             # Move data to device
#             texts, labels = texts.to(device), labels.to(device)
    
#             # Forward pass: get raw logits
#             logits = model(texts)
    
#             # Calculate loss on logits vs. labels
#             loss = criterion(logits.squeeze(), labels)
    
#             # Backward pass: calculate gradients and update parameters

#             # Clear previous gradients
#             optimizer.zero_grad()  
#             # Calculate new gradients
#             loss.backward()        
#             # Update model parameters
#             optimizer.step()       

#             # Keep track of the loss
#             running_loss += loss.item()  
    
#         # Compute and record average training loss
#         avg_train_loss = running_loss / len(train_loader)
#         train_losses.append(avg_train_loss)
    
#         # Validation phase

#         # Set the model to evaluation mode
#         model.eval() 
#         running_val_loss = 0.0
#         correct, total = 0, 0
    
#         with torch.no_grad():  
#             for texts, labels in val_loader:
#                 # Move data to device
#                 texts, labels = texts.to(device), labels.to(device)
    
#                 # Forward pass: get raw logits
#                 logits = model(texts).squeeze()
    
#                 # Calculate validation loss
#                 running_val_loss += criterion(logits, labels).item()
    
#                 # Classification: sigmoid + threshold
#                 probs = torch.sigmoid(logits)
#                 predicted = (probs > 0.5).float()
    
#                 total += labels.size(0)
#                 correct += (predicted == labels).sum().item()
    
#         # Compute and record average validation loss and accuracy
#         avg_val_loss = running_val_loss / len(val_loader)
#         val_losses.append(avg_val_loss)
    
#         val_acc = 100 * correct / total
#         val_accs.append(val_acc)
    
#         print(
#             f"Epoch {epoch+1}/{epochs}, "
#             f"Train Loss: {avg_train_loss:.4f}, "
#             f"Val Loss: {avg_val_loss:.4f}, "
#             f"Val Acc: {val_acc:.2f}%"
#         )
#     return train_losses, val_losses, val_accs


In [14]:

# # Define the learning_rates ---
# learning_rates = [1e-4, 5e-4, 1e-3]
# base_hparams = {
#     'hidden_size': 128,
#     'dropout':     0.2,
#     'batch_size':  64,
#     # Keep the epochs low for submitting
#     'epochs':      3
# }

# results = []

# # Run the learning_rates sweep 
# for lr in learning_rates:

#     model     = SimpleSentimentModel(
#                     input_size=300,
#                     hidden_size=base_hparams['hidden_size'],
#                     dropout=base_hparams['dropout']
#                ).to(device)
#     optimizer = optim.Adam(model.parameters(), lr=lr)
#     criterion = nn.BCEWithLogitsLoss()

#     # Train and evaluate
#     train_losses, val_losses, val_accs = train_and_evaluate(
#         model, optimizer, criterion,
#         train_loader, val_loader,
#         base_hparams['epochs']
#     )

#     # Plot learning curves for this configuration
#     plot_learning_curves(
#         train_losses, 
#         val_losses, 
#         val_accs, 
#         prefix=f"lr_{lr:.0e}"
#     )

#     # Record final validation accuracy
#     final_acc = val_accs[-1]
#     print(f"LR={lr:.0e} → Final Val Acc = {final_acc:.2f}%")
#     results.append((lr, final_acc))

# # Summarize results 
# print("\nLearning rate sweep results:")
# for lr, acc in results:
#     print(f"  {lr:.0e}: {acc:.2f}%")


In [15]:
# # Define the hidden size parameters ---
# hidden_sizes = [64, 128, 256, 512]
# base_hparams = {
#     'learning_rate': 1e-4,
#     'dropout':       0.2,
#     'batch_size':    64,
#     # Keep the epochs low for submitting
#     'epochs':        3
# }

# results = []

# # Run the hidden size sweep 
# for h in hidden_sizes:

#     model = SimpleSentimentModel(
#         input_size=300,
#         hidden_size=h,
#         dropout=base_hparams['dropout']
#     ).to(device)

#     optimizer = optim.Adam(model.parameters(), lr=base_hparams['learning_rate'])
#     criterion = nn.BCEWithLogitsLoss()

#     # Train and evaluate
#     train_losses, val_losses, val_accs = train_and_evaluate(
#         model, optimizer, criterion,
#         train_loader, val_loader,
#         base_hparams['epochs']
#     )

#     # Plot learning curves for this configuration
#     plot_learning_curves(
#         train_losses,
#         val_losses,
#         val_accs,
#         prefix=f"hidden_{h}"
#     )

#     # Record final validation accuracy
#     final_acc = val_accs[-1]
#     print(f"Hidden={h} → Final Val Acc = {final_acc:.2f}%")
#     results.append((h, final_acc))

# # Summarize results 
# print("\nHidden size sweep results:")
# for h, acc in results:
#     print(f"  Hidden={h}: {acc:.2f}%")


In [16]:
# # Define the dropout rate parameters ---
# dropout_rates = [0.0, 0.2, 0.4, 0.5]
# base_hparams = {
#     'learning_rate': 1e-4,
#     'hidden_size':   64,
#     'batch_size':    64,
#     # Keep the epochs low for submitting
#     'epochs':        3
# }

# results = []

# # Run the dropout sweep 
# for d in dropout_rates:

#     model = SimpleSentimentModel(
#         input_size=300,
#         hidden_size=base_hparams['hidden_size'],
#         dropout=d
#     ).to(device)

#     optimizer = optim.Adam(model.parameters(), lr=base_hparams['learning_rate'])
#     criterion = nn.BCEWithLogitsLoss()

#     # Train and evaluate
#     train_losses, val_losses, val_accs = train_and_evaluate(
#         model, optimizer, criterion,
#         train_loader, val_loader,
#         base_hparams['epochs']
#     )

#     # Plot learning curves for this configuration
#     plot_learning_curves(
#         train_losses,
#         val_losses,
#         val_accs,
#         prefix=f"dropout_{int(d * 100)}"
#     )

#     # Record final validation accuracy
#     final_acc = val_accs[-1]
#     print(f"Dropout={d:.1f} → Final Val Acc = {final_acc:.2f}%")
#     results.append((d, final_acc))

# # Summarize results
# print("\nDropout sweep results:")
# for d, acc in results:
#     print(f"  Dropout={d:.1f}: {acc:.2f}%")


In [17]:
# # Define the batch size parameters ---
# batch_sizes = [32, 64, 128, 256]
# base_hparams = {
#     'learning_rate': 1e-4,
#     'hidden_size':   64,
#     'dropout':       0.2,
#     'epochs':        30
# }

# results = []

# # Run the batch size sweep 
# for batch_size in batch_sizes:
    
#     # Create data loaders with current batch size
#     train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
#     val_loader   = DataLoader(val_dataset, batch_size=batch_size)

#     model = SimpleSentimentModel(
#         input_size=300,
#         hidden_size=base_hparams['hidden_size'],
#         dropout=base_hparams['dropout']
#     ).to(device)

#     optimizer = optim.Adam(model.parameters(), lr=base_hparams['learning_rate'])
#     criterion = nn.BCEWithLogitsLoss()

#     # Train and evaluate
#     train_losses, val_losses, val_accs = train_and_evaluate(
#         model, optimizer, criterion,
#         train_loader, val_loader,
#         base_hparams['epochs']
#     )

#     # Plot learning curves for this configuration
#     plot_learning_curves(
#         train_losses, 
#         val_losses, 
#         val_accs, 
#         prefix=f"batch_{batch_size}"
#     )

#     # Record final validation accuracy
#     final_acc = val_accs[-1]
#     print(f"Batch={batch_size} → Final Val Acc = {final_acc:.2f}%")
#     results.append((batch_size, final_acc))

# # Summarize results 
# print("\nBatch size sweep results:")
# for batch_size, acc in results:
#     print(f"  Batch={batch_size}: {acc:.2f}%")


In [18]:
def train_and_evaluate(model, optimizer, criterion, train_loader, val_loader, epochs, patience):
    """Train `model` with early stopping on the validation loss.

    Each epoch trains on every batch of `train_loader`, then evaluates on
    `val_loader` without gradient tracking. Training stops early once the
    validation loss has failed to improve for `patience` consecutive epochs.

    Args:
        model: Network returning one logit per sample.
        optimizer: Optimizer already bound to `model`'s parameters.
        criterion: Loss taking `(logits, labels)`, both of shape (batch,).
        train_loader: Iterable of `(features, labels)` training batches.
        val_loader: Iterable of `(features, labels)` validation batches.
        epochs: Maximum number of epochs to run.
        patience: Number of non-improving epochs tolerated before stopping.

    Returns:
        Tuple `(train_losses, val_losses, val_accs, best_model_state, best_epoch)`:
        the three per-epoch histories, an independent copy of the weights
        from the epoch with the lowest validation loss, and that epoch's
        1-based number. `best_model_state` is None and `best_epoch` is -1
        if no epoch improved on the initial loss (e.g. `epochs == 0`).
    """
    # Run on the device the model already lives on, so the function does not
    # depend on a notebook-level `device` variable.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device("cpu")

    train_losses, val_losses, val_accs = [], [], []

    # Early-stopping bookkeeping
    best_val_loss = float('inf')
    best_model_state = None
    best_epoch = -1
    epochs_without_improve = 0

    for epoch in range(epochs):

        # ---- Training phase ----
        model.train()
        running_loss = 0.0

        for texts, labels in train_loader:
            # Move data to the model's device
            texts, labels = texts.to(run_device), labels.to(run_device)

            # Forward pass: raw logits flattened to shape (batch,).
            # reshape(-1) rather than squeeze(): squeeze() would collapse a
            # batch of one sample to a 0-d tensor that no longer matches labels.
            logits = model(texts).reshape(-1)

            # Loss on logits vs. labels
            loss = criterion(logits, labels)

            # Backward pass: clear old gradients, compute new ones, update
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

        # Average training loss over the batches of this epoch
        avg_train_loss = running_loss / len(train_loader)
        train_losses.append(avg_train_loss)

        # ---- Validation phase ----
        model.eval()
        running_val_loss = 0.0
        correct, total = 0, 0

        with torch.no_grad():
            for texts, labels in val_loader:
                texts, labels = texts.to(run_device), labels.to(run_device)
                logits = model(texts).reshape(-1)
                running_val_loss += criterion(logits, labels).item()

                # Classification: sigmoid + 0.5 threshold
                probs = torch.sigmoid(logits)
                predicted = (probs > 0.5).float()

                total += labels.size(0)
                correct += (predicted == labels).sum().item()

        avg_val_loss = running_val_loss / len(val_loader)
        val_losses.append(avg_val_loss)

        val_acc = 100 * correct / total
        val_accs.append(val_acc)

        print(
            f"Epoch {epoch+1}/{epochs}, "
            f"Train Loss: {avg_train_loss:.4f}, "
            f"Val Loss: {avg_val_loss:.4f}, "
            f"Val Acc: {val_acc:.2f}%"
        )

        # ---- Early stopping ----
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            # state_dict() hands back references to the live parameter tensors,
            # which later optimizer steps overwrite in place. Clone them so the
            # snapshot really holds this epoch's weights.
            best_model_state = {
                name: tensor.detach().clone()
                for name, tensor in model.state_dict().items()
            }
            best_epoch = epoch + 1
            epochs_without_improve = 0
        else:
            epochs_without_improve += 1
            if epochs_without_improve >= patience:
                print(f"Stopping early at epoch {epoch+1} (no improvement in val loss for {patience} epochs).")
                break

    return train_losses, val_losses, val_accs, best_model_state, best_epoch


In [19]:
import torch.nn as nn

class SimpleFFNN(nn.Module):
    """Single-hidden-layer feed-forward network producing one output per sample.

    Data flow: Linear -> ReLU -> Dropout -> Linear. No sigmoid is applied
    inside the network.
    """

    def __init__(self, embedding_dim=300, hidden_size=64, dropout=0.2):
        """Create the layers.

        Args:
            embedding_dim: Number of input features.
            hidden_size: Width of the hidden layer.
            dropout: Dropout probability applied to the hidden activation.
        """
        super().__init__()

        # Input -> hidden projection
        self.fc1 = nn.Linear(in_features=embedding_dim, out_features=hidden_size)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=dropout)
        # Hidden -> single output unit
        self.fc2 = nn.Linear(in_features=hidden_size, out_features=1)

    def forward(self, x):
        """Return the output of the final linear layer for the batch `x`."""
        hidden = self.dropout(self.relu(self.fc1(x)))
        return self.fc2(hidden)


In [20]:

# Seed every random number generator in use so the run can be repeated
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

# Ask cuDNN for deterministic kernels and disable its auto-tuner
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Final architecture: one hidden layer of 64 units with 20% dropout
model = SimpleFFNN(embedding_dim=300, hidden_size=64, dropout=0.2)
model.to(device)

# Loss on raw logits, Adam with a learning rate of 1e-4
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-4)

# Up to 30 epochs; stop once validation loss stalls for 3 epochs
(
    train_losses,
    val_losses,
    val_accs,
    best_model_state,
    best_epoch,
) = train_and_evaluate(
    model, optimizer, criterion, train_loader, val_loader,
    epochs=30, patience=3
)

print(f"\nBest validation epoch: {best_epoch}")

# Put the weights returned as "best" back into the model
model.load_state_dict(best_model_state)


Epoch 1/30, Train Loss: 0.6049, Val Loss: 0.5469, Val Acc: 72.80%
Epoch 2/30, Train Loss: 0.5378, Val Loss: 0.5203, Val Acc: 74.21%
Epoch 3/30, Train Loss: 0.5231, Val Loss: 0.5118, Val Acc: 74.75%
Epoch 4/30, Train Loss: 0.5172, Val Loss: 0.5080, Val Acc: 74.91%
Epoch 5/30, Train Loss: 0.5134, Val Loss: 0.5050, Val Acc: 75.08%
Epoch 6/30, Train Loss: 0.5108, Val Loss: 0.5033, Val Acc: 75.18%
Epoch 7/30, Train Loss: 0.5085, Val Loss: 0.5012, Val Acc: 75.29%
Epoch 8/30, Train Loss: 0.5066, Val Loss: 0.4997, Val Acc: 75.40%
Epoch 9/30, Train Loss: 0.5052, Val Loss: 0.4996, Val Acc: 75.48%
Epoch 10/30, Train Loss: 0.5037, Val Loss: 0.4973, Val Acc: 75.60%
Epoch 11/30, Train Loss: 0.5023, Val Loss: 0.4967, Val Acc: 75.68%
Epoch 12/30, Train Loss: 0.5010, Val Loss: 0.4953, Val Acc: 75.75%
Epoch 13/30, Train Loss: 0.5001, Val Loss: 0.4945, Val Acc: 75.81%
Epoch 14/30, Train Loss: 0.4990, Val Loss: 0.4943, Val Acc: 75.85%
Epoch 15/30, Train Loss: 0.4983, Val Loss: 0.4929, Val Acc: 75.93%
Epoc

<All keys matched successfully>

In [21]:
# Wrap the test vectors in a loader (no labels, no shuffling, so row order is kept)
test_inputs = torch.tensor(test_vectors, dtype=torch.float32)
test_dataset = torch.utils.data.TensorDataset(test_inputs)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=256)

# Predict on test set and generate submission.csv
model.eval()
all_preds = []

with torch.no_grad():
    for (texts,) in test_loader:
        texts = texts.to(device)
        # reshape(-1) keeps the result 1-D even if the last batch holds a single
        # row; squeeze() would yield a 0-d array that cannot be iterated below.
        logits = model(texts).reshape(-1)
        probs = torch.sigmoid(logits)
        # Integer 0/1 labels, matching the format of the training labels
        preds = (probs > 0.5).long().cpu().numpy()
        all_preds.extend(preds.tolist())

# Use the real tweet IDs from the test file; a positional counter would only
# be correct if the IDs happened to be 0..N-1. Fall back to the counter when
# the file has no "ID" column.
if "ID" in test_data.columns:
    submission_ids = test_data["ID"].to_numpy()
else:
    submission_ids = range(len(all_preds))

# Save submission
submission_df = pd.DataFrame({"ID": submission_ids, "Label": all_preds})
submission_df.to_csv("submission.csv", index=False)
