<a href="https://colab.research.google.com/github/ggesa432/ggesa432/blob/master/deeplearning_final_fall23.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# CRSAS: Consolidated Recommendation and Sentiment Analysis System
### Team Members: Ansh Bhatnagar, Yams Gupta, Zeren Gesang

### Project Idea: Develop a sophisticated recommendation and sentiment analysis system using deep learning models like RoBERTa, focusing on user-generated content from platforms like Yelp.


In [1]:
#Step 1: Import Necessary Libraries
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.model_selection import train_test_split

In [2]:
#Step 2: Load the 20 Newsgroups Dataset

from sklearn.datasets import fetch_20newsgroups

# For simplicity, let's use only a few categories
categories = ['alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med']
newsgroups_train = fetch_20newsgroups(subset='train', categories=categories)
newsgroups_test = fetch_20newsgroups(subset='test', categories=categories)

#Preprocess the data
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(newsgroups_train.data)
X_test = vectorizer.transform(newsgroups_test.data)

y_train = newsgroups_train.target
y_test = newsgroups_test.target


# Naive Bayes Classifier (as a Baseline):

In [3]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

clf_nb = MultinomialNB()
clf_nb.fit(X_train, y_train)
y_pred_nb = clf_nb.predict(X_test)
print(classification_report(y_test, y_pred_nb))


              precision    recall  f1-score   support

           0       0.97      0.60      0.74       319
           1       0.96      0.89      0.92       389
           2       0.97      0.81      0.88       396
           3       0.65      0.99      0.78       398

    accuracy                           0.83      1502
   macro avg       0.89      0.82      0.83      1502
weighted avg       0.88      0.83      0.84      1502



# Step 3: Implementing RoBERTa Model

In [4]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW

class NewsgroupsDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=512):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        text = str(self.texts[item])
        label = self.labels[item]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )

        return {
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
train_dataset = NewsgroupsDataset(newsgroups_train.data, newsgroups_train.target, tokenizer)
test_dataset = NewsgroupsDataset(newsgroups_test.data, newsgroups_test.target, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16)

model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=len(newsgroups_train.target_names))


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
# Training setup goes here
# Note: Training a transformer model on a CPU can be very slow.

# Set up GPU/CPU usage
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Hyperparameters
epochs = 3
optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
total_steps = len(train_loader) * epochs

# Training loop
for epoch in range(epochs):
    model.train()
    total_train_loss = 0

    for batch in train_loader:
        # Move batch to device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Forward pass
        model.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)

        # Backward pass
        loss = outputs.loss
        total_train_loss += loss.item()
        loss.backward()

        # Clip the norm of the gradients to 1.0 to prevent "exploding gradients"
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()

    # Calculate average loss over the training data
    avg_train_loss = total_train_loss / len(train_loader)
    print(f"Epoch {epoch+1} / {epochs}, Average Training Loss: {avg_train_loss}")

# Save the model
model.save_pretrained('./roberta_newsgroups_model')








Epoch 1 / 3, Average Training Loss: 0.4578089004113588
Epoch 2 / 3, Average Training Loss: 0.12259184501238797
Epoch 3 / 3, Average Training Loss: 0.07601271564645928


In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from collections import Counter

class NewsgroupsDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.vocab = vocab
    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        text = str(self.texts[item])
        label = self.labels[item]

        # Tokenize and encode the text
        encoding = self.tokenizer(text)  # No need to pass vocab here, as it's included in the lambda
        encoding = encoding[:self.max_len]  # Truncate to max_length
        padding_length = self.max_len - len(encoding)
        encoding += [self.vocab.get("<PAD>")] * padding_length  # Pad with <PAD> token ID

        return {
            'input_ids': torch.tensor(encoding, dtype=torch.long),
            'labels': torch.tensor(label, dtype=torch.long)
        }


class LSTMClassifier(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, vocab_size, num_classes):
        super(LSTMClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        x = self.embedding(x)
        lstm_out, _ = self.lstm(x)
        # Take the output of the last time step
        last_time_step_out = lstm_out[:, -1, :]
        out = self.fc(last_time_step_out)
        return out

def build_vocab(texts, min_freq=1):
    # Tokenize the texts and count word frequencies
    tokenized_texts = [text.split() for text in texts]
    word_freq = Counter(word for tokens in tokenized_texts for word in tokens)

    # Build the vocabulary
    vocab = {"<PAD>": 0, "<UNK>": 1}
    idx = 2
    for word, freq in word_freq.items():
        if freq >= min_freq:
            vocab[word] = idx
            idx += 1
    return vocab

# Build the vocabulary from your training texts
vocab = build_vocab([text for text in newsgroups_train.data])


def basic_tokenizer(text, vocab):
    return [vocab.get(word, vocab["<UNK>"]) for word in text.split()]


max_length = 128  # Define a suitable maximum sequence length
train_dataset = NewsgroupsDataset(newsgroups_train.data, newsgroups_train.target, lambda text: basic_tokenizer(text, vocab), max_length)
test_dataset = NewsgroupsDataset(newsgroups_test.data, newsgroups_test.target, lambda text: basic_tokenizer(text, vocab), max_length)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16)







In [7]:
# Hyperparameters
embedding_dim = 128
hidden_dim = 256
vocab_size = len(vocab)
num_labels = len(newsgroups_train.target_names)

model = LSTMClassifier(embedding_dim, hidden_dim, vocab_size, num_labels)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)




LSTMClassifier(
  (embedding): Embedding(78700, 128)
  (lstm): LSTM(128, 256, batch_first=True)
  (fc): Linear(in_features=256, out_features=4, bias=True)
)

In [8]:

# Loss function
criterion = nn.CrossEntropyLoss()

# Optimizer (Adam as the common choice)
optimizer = optim.Adam(model.parameters(), lr=0.001)


# Number of training epochs
num_epochs = 5

for epoch in range(num_epochs):
    model.train()  # Set the model to training mode
    total_loss = 0

    for batch in train_loader:
        # Move batch data to the device
        input_ids = batch['input_ids'].to(device)
        labels = batch['labels'].to(device)

        # Forward pass
        optimizer.zero_grad()
        outputs = model(input_ids)
        loss = criterion(outputs, labels)

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss/len(train_loader)}")


# Save the model
torch.save(model.state_dict(), 'lstm_model.pth')





Epoch 1/5, Loss: 1.3544090167737344
Epoch 2/5, Loss: 1.2387442391523174
Epoch 3/5, Loss: 1.0442960211928463
Epoch 4/5, Loss: 0.7566560421821097
Epoch 5/5, Loss: 0.525377191917997
