In [6]:
# Mount Google Drive so files stored there are reachable from this Colab session
from google.colab import drive
import os
drive.mount('/content/drive')


# Switch the working directory to the project folder on Drive; relative paths
# used later in the notebook (e.g. the on-disk datasets) resolve against it
notebook_dir = '/content/drive/My Drive/Lighthouse Labs/LLM_Project_Sentiment_Analysis'
os.chdir(notebook_dir)

Mounted at /content/drive


In [2]:
!pip install transformers datasets torch accelerate evaluate wandb

Collecting datasets
  Downloading datasets-3.4.1-py3-none-any.whl.metadata (19 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)


In [3]:
# import required module
from datasets import load_dataset

# imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import string
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
import re
from nltk.stem import WordNetLemmatizer
from transformers import pipelines
from transformers import pipeline
from transformers import AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments, Trainer
import torch
import evaluate
from datasets import load_dataset


# Fetch the NLTK data used by the helper functions: the stop-word lists read via
# stopwords.words('english') and the WordNet data behind WordNetLemmatizer.
nltk.download('stopwords')
nltk.download('wordnet')
# NOTE(review): omw-1.4 is presumably needed by the WordNet reader — confirm it is still required
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [4]:
# functions

# function to clean text - removes punctuation, numbers and HTML tags, converts to lowercase, removes stop words and lemmatizes
def preprocess_text(text):
    """Clean a single text string.

    Steps, in order: strip HTML tags, delete punctuation, lowercase, delete
    digits, turn newlines into spaces, drop English stop words and lemmatize
    the remaining words.

    Args:
        text (str): one raw document.

    Returns:
        str: the remaining lemmatized words joined by single spaces.
    """
    # HTML tags must be removed BEFORE punctuation: the punctuation pass deletes
    # '<', '>' and '/', so running it first turns "<br />" into a stray "br"
    # token that the tag regex can no longer match. Tags are replaced with a
    # space (not "") so the words on either side are not glued together.
    text = re.sub(r"<.*?>", " ", text)
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Convert to lowercase
    text = text.lower()
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    # Newlines become spaces; split() below collapses any run of whitespace
    text = text.replace("\n", " ")

    # Remove stop words and lemmatize
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    kept_words = [lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words]
    return ' '.join(kept_words)

# Define text preprocessing function for batches
def preprocess_text_pretrained(batch):
    """Clean every string in ``batch["text"]`` without stop-word removal or lemmatization.

    Per text, in order: strip HTML tags, delete punctuation, lowercase, delete
    digits, turn newlines into spaces.

    Args:
        batch: mapping whose "text" entry is an iterable of strings.

    Returns:
        dict: ``{"text": [...]}`` holding the cleaned strings in input order.
    """
    # The translation table never changes, so build it once per batch instead
    # of once per text.
    strip_punctuation = str.maketrans('', '', string.punctuation)

    cleaned_texts = []
    for text in batch["text"]:  # Loop through each text in the batch
        # Strip HTML tags first (same regex as preprocess_text). It has to run
        # before the punctuation pass, which deletes '<', '>' and '/' and would
        # otherwise leave "<br />" behind as a stray "br" token.
        text = re.sub(r"<.*?>", " ", text)
        text = text.translate(strip_punctuation)  # Remove punctuation
        text = text.lower()  # Convert to lowercase
        text = re.sub(r"\d+", "", text)  # Remove numbers
        text = text.replace("\n", " ")  # Remove newlines
        cleaned_texts.append(text)
    return {"text": cleaned_texts}  # Return a dictionary

# Tokenization function
def tokenize_function(examples):
    """Clean ``examples["text"]`` in place, then tokenize the cleaned text.

    The cleaned string replaces the original value under the "text" key before
    ``tokenizer`` (a name expected to be defined elsewhere in the notebook) is
    called with max-length padding and truncation. Returns the tokenizer's
    result unchanged.
    """
    cleaned = preprocess_text(examples["text"])
    examples["text"] = cleaned  # the raw text is overwritten with its cleaned form
    tokenizer_options = {"padding": "max_length", "truncation": True}
    return tokenizer(examples["text"], **tokenizer_options)

# Function to preprocess & tokenize each dataset
def preprocess_and_tokenize(example):
    """Clean and tokenize one example, returning only the model inputs and label.

    ``example["text"]`` is passed through ``preprocess_text`` and then through
    ``tokenizer`` (expected to be defined elsewhere in the notebook) with
    max-length padding and truncation. The input mapping is not modified.

    Returns:
        dict with keys "input_ids", "attention_mask" and "label", in that order.
    """
    encoding = tokenizer(
        preprocess_text(example["text"]),
        padding="max_length",
        truncation=True,
    )
    # Copy over just the two tokenizer fields we keep, then attach the label.
    features = {field: encoding[field] for field in ("input_ids", "attention_mask")}
    features["label"] = example["label"]
    return features

# Function to compute accuracy
def compute_metrics(eval_pred):
    """Score predictions with the notebook-level ``accuracy`` metric object.

    Args:
        eval_pred: a pair ``(predictions, labels)``; ``predictions`` must
            support ``.argmax(axis=-1)``.

    Returns:
        Whatever ``accuracy.compute`` returns for the arg-maxed predictions and
        the reference labels.
    """
    scores, references = eval_pred
    # The index of the largest score along the last axis is the predicted class.
    predicted_classes = scores.argmax(axis=-1)
    return accuracy.compute(predictions=predicted_classes, references=references)

In [16]:
# Logistic Regression model
# Baseline: TF-IDF features fed to scikit-learn's LogisticRegression, scored on the test split.

from datasets import load_from_disk

# Load the train/test datasets from disk. The paths are relative, so they resolve
# against the current working directory.
# NOTE(review): presumably these were written by an earlier preprocessing step not shown here — confirm
ds_train_processed = load_from_disk("processed_train_dataset")
ds_test_processed = load_from_disk("processed_test_dataset")

# number of documents in each split
num_documents_train = len(ds_train_processed)
print(f"Number of documents in train dataset: {num_documents_train}") # 25000 in the recorded run
num_documents_test = len(ds_test_processed)
print(f"Number of documents in test dataset: {num_documents_test}") # 25000 in the recorded run

# initialize the TF-IDF vectorizer and fit it on the training text only;
# the test text is transformed with that same fitted vectorizer (never re-fitted)
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(ds_train_processed['text'])
X_test_tfidf = tfidf_vectorizer.transform(ds_test_processed['text'])

# vocabulary size = number of features learned by the vectorizer
vocab_size = len(tfidf_vectorizer.get_feature_names_out())
print(f"Total number of unique words in vocabulary: {vocab_size}") # 108987 in the recorded run

# prepare target variables (the 'label' column of each split)
y_train = ds_train_processed['label']
y_test = ds_test_processed['label']

# train Logistic Regression model
# max_iter=1000 sets the solver's iteration cap explicitly

classifier = LogisticRegression(max_iter=1000)
classifier.fit(X_train_tfidf, y_train)

# predict on test set
y_pred = classifier.predict(X_test_tfidf)

# evaluate the model: per-class precision / recall / f1 plus overall accuracy
print(classification_report(y_test, y_pred))



Number of documents in train dataset: 25000
Number of documents in test dataset: 25000
Total number of unique words in vocabulary: 108987
              precision    recall  f1-score   support

           0       0.94      0.93      0.93     12500
           1       0.93      0.94      0.93     12500

    accuracy                           0.93     25000
   macro avg       0.93      0.93      0.93     25000
weighted avg       0.93      0.93      0.93     25000



Reflection Questions: How well does this model perform? What are some of the limitations you can see from this model you have created?

In [None]:
# Class 0: Represents negative sentiment reviews.
# Class 1: Represents positive sentiment reviews.
# Support: The number of actual occurrences of each class (12,500 for both positive and negative reviews).
# Accuracy: 93% — which indicates the model correctly classifies 93% of the test reviews.

# limitations
# Lack of Deep Context Understanding: Logistic Regression with TF-IDF ignores word order and semantic context

# Vocabulary Size & Sparsity: The model has a large vocabulary (108,987 unique words), which leads to
# high-dimensional sparse feature vectors.

# No Handling of Out-of-Vocabulary Words: Any words in the test set that weren’t seen during training are ignored.

# Doesn't Leverage Pretrained Knowledge: Unlike transformer models, Logistic Regression doesn’t benefit from pretraining on large text corpora.
# This limits its ability to generalize nuanced language use or sarcasm.