In [None]:
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import precision_score, recall_score, f1_score
from tqdm import tqdm
import os

# Read the pre-processed train/test splits from disk
train = pd.read_csv('data/processed_train.csv')
test = pd.read_csv('data/processed_test.csv')

# Map sentiment labels to integer ids; the label set is learned from the
# training split only and then applied to both splits
label_encoder = LabelEncoder().fit(train['sentiment'])
y_train = label_encoder.transform(train['sentiment'])
y_test = label_encoder.transform(test['sentiment'])

# Vectorizer functions
def _vectorize_splits(vec):
    """Fit ``vec`` on the training text and return dense train/test matrices.

    The vectorizer is fitted on the module-level ``train['text']`` only;
    ``test['text']`` is transformed with what was learned from the training
    split. Both results are densified with ``.toarray()``.

    Args:
        vec: An unfitted vectorizer exposing ``fit_transform`` and ``transform``.

    Returns:
        Tuple ``(x_train, x_test)`` of dense arrays.
    """
    x_train = vec.fit_transform(train['text']).toarray()
    x_test = vec.transform(test['text']).toarray()
    return x_train, x_test


def tfidf_vectorizer(max_features=10000):
    """Return ``(x_train, x_test)`` features built with ``TfidfVectorizer``.

    Args:
        max_features: Forwarded to ``TfidfVectorizer(max_features=...)``.
    """
    return _vectorize_splits(TfidfVectorizer(max_features=max_features))


def bog_vectorizer(max_features=10000):
    """Return ``(x_train, x_test)`` features built with a plain ``CountVectorizer``.

    Args:
        max_features: Forwarded to ``CountVectorizer(max_features=...)``.
    """
    return _vectorize_splits(CountVectorizer(max_features=max_features))


def binary_vectorizer(max_features=10000):
    """Return ``(x_train, x_test)`` features built with ``CountVectorizer(binary=True)``.

    Args:
        max_features: Forwarded to ``CountVectorizer(max_features=...)``.
    """
    return _vectorize_splits(CountVectorizer(binary=True, max_features=max_features))

# Model definition
class SentimentMLP(nn.Module):
    """Two-layer perceptron: Linear -> ReLU -> Linear.

    Args:
        input_dim: Size of each input feature vector.
        hidden_dim: Width of the hidden layer.
        output_dim: Number of output logits.
    """

    def __init__(self, input_dim, hidden_dim=100, output_dim=3):
        super().__init__()
        # Attribute names are part of the state_dict keys; keep them stable.
        self.fc1 = nn.Linear(in_features=input_dim, out_features=hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(in_features=hidden_dim, out_features=output_dim)

    def forward(self, x):
        """Return the raw (un-normalised) logits for ``x``."""
        hidden = self.fc1(x)
        activated = self.relu(hidden)
        logits = self.fc2(activated)
        return logits


In [None]:

# Training and evaluation function
def train_and_evaluate(x_train, x_test, y_train, y_test, epochs, seed=None):
    """Train a fresh ``SentimentMLP`` per epoch budget and score it on the test split.

    For every entry in ``epochs`` a new model is created and trained from
    scratch with Adam (lr=0.001) and cross-entropy loss on mini-batches of 32,
    then evaluated once on the full test set.

    Args:
        x_train: 2-D training features; converted to a float32 tensor.
        x_test: 2-D test features; converted to a float32 tensor.
        y_train: Integer class ids for ``x_train``; converted to a long tensor.
        y_test: Integer class ids for ``x_test``; passed to the sklearn metrics as-is.
        epochs: Iterable of epoch counts, or a single int for one budget.
        seed: Optional int. When given, ``torch.manual_seed(seed)`` is called
            before each model is built, so weight initialisation and batch
            shuffling start from the same RNG state for every epoch budget.
            Note this reseeds torch's *global* RNG. ``None`` (default) leaves
            the RNG untouched.

    Returns:
        List of ``(epoch_count, precision, recall, f1)`` tuples (macro-averaged),
        one per entry of ``epochs``, in the same order.
    """
    # Allow a bare int as shorthand for a single epoch budget.
    if isinstance(epochs, int):
        epochs = [epochs]

    X_train_tensor = torch.tensor(x_train, dtype=torch.float32)
    y_train_tensor = torch.tensor(y_train, dtype=torch.long)
    X_test_tensor = torch.tensor(x_test, dtype=torch.float32)

    train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

    # Read the feature width from the tensor so plain nested lists work too.
    input_dim = X_train_tensor.shape[1]

    results = []
    for epoch_count in epochs:
        if seed is not None:
            # The DataLoader above has no explicit generator, so its shuffle
            # order is drawn from the global RNG seeded here.
            torch.manual_seed(seed)

        model = SentimentMLP(input_dim=input_dim)
        criterion = nn.CrossEntropyLoss()
        optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

        model.train()
        for _ in tqdm(range(epoch_count), desc=f"Training {epoch_count} epochs"):
            for batch_X, batch_y in train_loader:
                optimizer.zero_grad()
                outputs = model(batch_X)
                loss = criterion(outputs, batch_y)
                loss.backward()
                optimizer.step()

        # Evaluation: a single forward pass over the whole test set
        model.eval()
        with torch.no_grad():
            preds = model(X_test_tensor)
            predicted = torch.argmax(preds, dim=1).numpy()

        precision = precision_score(y_test, predicted, average='macro')
        recall = recall_score(y_test, predicted, average='macro')
        f1 = f1_score(y_test, predicted, average='macro')
        results.append((epoch_count, precision, recall, f1))
    return results

# Run the experiment grid: every vectorizer x every epoch budget
vectorizer_functions = {
    'TFIDF': tfidf_vectorizer,
    'BoG': bog_vectorizer,
    'Binary': binary_vectorizer
}
epoch_settings = [5, 10, 20, 50]
SEED = 42  # fixed seed so the metrics written to the CSV can be reproduced
all_results = []

# Create the output directory up front so a filesystem problem surfaces
# before the (long) training runs rather than after them.
os.makedirs("outputs", exist_ok=True)

for name, func in tqdm(vectorizer_functions.items(), desc="Vectorizers"):
    x_train_vec, x_test_vec = func()
    # Re-seed before every vectorizer so each run starts from the same RNG
    # state regardless of which vectorizers were trained before it.
    torch.manual_seed(SEED)
    results = train_and_evaluate(x_train_vec, x_test_vec, y_train, y_test, epoch_settings)
    for epoch, precision, recall, f1 in results:
        all_results.append({
            'Vectorizer': name,
            'Epochs': epoch,
            'Precision': precision,
            'Recall': recall,
            'F1 Score': f1
        })

# Save one row per (vectorizer, epoch budget) to CSV
df_results = pd.DataFrame(all_results)
df_results.to_csv("outputs/nn.csv", index=False)


inference

In [None]:
import os
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import precision_score, recall_score, f1_score
from tqdm import tqdm

def inference(text_list, model_weights_path="weights/binary_mlp.pt"):
    """Predict sentiment labels for raw texts with a saved ``SentimentMLP``.

    The binary ``CountVectorizer`` and the ``LabelEncoder`` are re-fitted on
    ``data/processed_train.csv`` on every call, then the model weights are
    loaded from ``model_weights_path`` and applied on CPU.

    Args:
        text_list: Iterable of text documents, or a single string (treated as
            one document).
        model_weights_path: Path to a ``state_dict`` file for ``SentimentMLP``.

    Returns:
        Array of sentiment labels (as found in the training CSV's
        ``sentiment`` column), one per input document.
    """
    # The vectorizer expects an iterable of documents and rejects a bare
    # string, so wrap a single text into a one-element list.
    if isinstance(text_list, str):
        text_list = [text_list]

    # Recreate vectorizer and label encoder by fitting on the training data again
    train = pd.read_csv('data/processed_train.csv')
    label_encoder = LabelEncoder().fit(train['sentiment'])
    vectorizer = CountVectorizer(binary=True, max_features=10000)
    vectorizer.fit(train['text'])

    x_input = vectorizer.transform(text_list).toarray()
    x_tensor = torch.tensor(x_input, dtype=torch.float32)

    # Load model and weights
    model = SentimentMLP(input_dim=x_input.shape[1])
    # map_location="cpu" lets a checkpoint saved on a GPU load on a CPU-only machine.
    # NOTE(security): torch.load unpickles the file; only load weights from a
    # trusted source.
    state_dict = torch.load(model_weights_path, map_location="cpu")
    model.load_state_dict(state_dict)
    model.eval()

    with torch.no_grad():
        outputs = model(x_tensor)
        preds = torch.argmax(outputs, dim=1).numpy()
    return label_encoder.inverse_transform(preds)

if __name__ == "__main__":
    # Demo: classify two hand-written sentences and print each prediction
    examples = [
        "I love this movie!",
        "Worst experience ever.",
    ]
    predictions = inference(examples)
    paired = zip(examples, predictions)
    for text, pred in paired:
        print(f"Text: {text}\nPredicted Sentiment: {pred}\n")
