# M3-L4-Screencasts

## M3-L4-SC1: TF-IDF Vectorization for Sentiment Data

### Step 1: Setting Up Your Workspace
Import necessary libraries and initialize the dataset.

In [None]:
# Import necessary libraries.
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
import torch # For dense embeddings
from transformers import AutoModel, AutoTokenizer
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns
# Optional Hugging Face authentication.
# The Colab secrets helper only exists inside Google Colab, and the secret may
# be missing or access to it may be denied. None of that should stop the
# notebook: the public `bert-base-uncased` checkpoint can be fetched without a token.
try:
    from google.colab import userdata
    HF_TOKEN = userdata.get('HF_TOKEN')
except Exception:  # ImportError outside Colab; secret lookup errors inside Colab
    HF_TOKEN = None

# Download NLTK resources (run once)
# nltk.download() reports failure by returning False rather than raising, so
# each return value is checked; otherwise a failed download would still be
# followed by the success message.
NLTK_RESOURCES = ('punkt', 'punkt_tab', 'stopwords', 'wordnet', 'omw-1.4')

print("Downloading NLTK resources...")
failed_nltk_resources = []
for resource_name in NLTK_RESOURCES:
    try:
        if not nltk.download(resource_name, quiet=True):
            failed_nltk_resources.append(resource_name)
    except Exception as e:
        print(f"Error downloading NLTK resources: {e}")
        failed_nltk_resources.append(resource_name)

if failed_nltk_resources:
    print(f"Could not download NLTK resources: {', '.join(failed_nltk_resources)}")
    print("Please ensure you have an internet connection or try running nltk.download() manually.")
else:
    print("NLTK resources downloaded successfully.")


# Define the data as (review text, sentiment label) pairs so that each label
# sits next to the sentence it describes.
# Simple sentiment labels (1: positive, 0: negative/neutral)
labelled_reviews = [
    ("This movie is fantastic and I love it.", 1),
    ("The acting was great but the plot was weak.", 0),
    ("A terrible film, completely boring.", 0),
    ("I hated every minute of this production.", 0),
    ("Wonderful cinematography and compelling story.", 1),
    ("Bad script and poor direction.", 0),
    ("An absolute masterpiece, highly recommend.", 1),
    ("Utterly disappointing experience.", 0),
    ("Enjoyed it thoroughly, a real gem.", 1),
    ("Skip this one, it's a waste of time.", 0),
    ("The food was delicious and the service was excellent.", 1),
    ("Didn't like the atmosphere, too noisy.", 0),
    ("A truly unique and enjoyable experience.", 1),
    ("The product broke after only a week.", 0),
    ("Highly satisfied with my purchase.", 1),
    ("The customer support was unhelpful.", 0),
    ("What a fantastic performance!", 1),
    ("Quite boring, nothing special.", 0),
    ("Loved the vibrant colors and design.", 1),
    ("The instructions were unclear and confusing.", 0),
    ("Would definitely visit again.", 1),
    ("Overpriced for what you get.", 0),
    ("A pleasant surprise, much better than expected.", 1),
    ("Poor quality materials used.", 0),
    ("Seamless transaction and fast delivery.", 1),
    ("The ending was very predictable.", 0),
    ("Outstanding craftsmanship.", 1),
    ("Left feeling disappointed.", 0),
    ("So happy with the results!", 1),
    ("Could have been much better.", 0),
]

# Later cells use the two parallel lists directly, so unpack the pairs.
texts = [review for review, _ in labelled_reviews]
labels = [sentiment for _, sentiment in labelled_reviews]

# Create a DataFrame
df = pd.DataFrame({'text': texts, 'label': labels})
print("Original Data:", df, sep="\n")
print(f"\nTotal number of examples: {len(df)}")

# Preprocessing resources, built once and shared by every call below.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Normalize one review into a space-separated string of lemmas.

    The text is lower-cased, every character that is not a-z or whitespace is
    dropped, the remainder is tokenized, English stop words are discarded and
    each surviving token is lemmatized.
    """
    letters_only = re.sub(r'[^a-z\s]', '', text.lower())  # Remove non-alphabetic characters
    lemmas = []
    for token in word_tokenize(letters_only):
        if token in stop_words:
            continue
        lemmas.append(lemmatizer.lemmatize(token))
    return ' '.join(lemmas)

# Keep the raw text column and store the cleaned version next to it.
df['processed_text'] = df['text'].map(preprocess_text)
print("\nProcessed Data (first 5 rows):", df.head(), sep="\n")

# Split data into training and testing sets: 30% held out, fixed seed, and
# stratified on the label so both splits keep a similar class balance.
split_result = train_test_split(
    df['processed_text'],
    df['label'],
    test_size=0.3,
    random_state=42,
    stratify=df['label'],
)
X_train, X_test, y_train, y_test = split_result

train_label_share = y_train.value_counts(normalize=True)
test_label_share = y_test.value_counts(normalize=True)

print(f"\nTrain set size: {len(X_train)}")
print(f"Test set size: {len(X_test)}")
print(f"Train set label distribution:\n{train_label_share}")
print(f"Test set label distribution:\n{test_label_share}") # Check if stratification worked

Downloading NLTK resources...
NLTK resources downloaded successfully.
Original Data:
                                                 text  label
0              This movie is fantastic and I love it.      1
1         The acting was great but the plot was weak.      0
2                 A terrible film, completely boring.      0
3            I hated every minute of this production.      0
4      Wonderful cinematography and compelling story.      1
5                      Bad script and poor direction.      0
6          An absolute masterpiece, highly recommend.      1
7                   Utterly disappointing experience.      0
8                  Enjoyed it thoroughly, a real gem.      1
9                Skip this one, it's a waste of time.      0
10  The food was delicious and the service was exc...      1
11             Didn't like the atmosphere, too noisy.      0
12           A truly unique and enjoyable experience.      1
13               The product broke after only a week.      0


### Step 2: Tokenizing the Text
Initialize the `TfidfVectorizer`, which automates tokenization and the conversion of text into TF-IDF-weighted feature vectors.

In [None]:
# **TF-IDF**
# Create the vectorizer with its default settings. Nothing is learned yet:
# it is fitted on the training text in the next step.
vectorizer_tfidf = TfidfVectorizer()


### Step 3: Fitting and Transforming Sentiment Data
Fit and transform the review data into a TF-IDF matrix.

In [None]:
# Fit the vectorizer on the training split only, then reuse that fitted
# vectorizer to transform the test split.
X_train_tfidf = vectorizer_tfidf.fit_transform(X_train)
X_test_tfidf = vectorizer_tfidf.transform(X_test)

print("\n--- TF-IDF ---")
vocabulary_size = len(vectorizer_tfidf.vocabulary_)
print("Vocabulary size:", vocabulary_size)
for split_name, tfidf_matrix in (("Train", X_train_tfidf), ("Test", X_test_tfidf)):
    print(f"{split_name} TF-IDF matrix shape:", tfidf_matrix.shape)

# Check the TF-IDF matrix
dense_train_matrix = X_train_tfidf.toarray()
print("TF-IDF Matrix:\n", dense_train_matrix)


--- TF-IDF ---
Vocabulary size: 66
Train TF-IDF matrix shape: (21, 66)
Test TF-IDF matrix shape: (9, 66)
TF-IDF Matrix:
 [[0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.55137204 ... 0.         0.         0.        ]]


### Step 4: Decoding the TF-IDF Features
Display the feature names and their respective weightings.


In [None]:
# Display the feature names
# The returned names are used below as the column labels of the TF-IDF matrix.
feature_names = vectorizer_tfidf.get_feature_names_out()
print("Feature Names:", feature_names)

# Display the TF-IDF vectors with feature names
# pandas is already imported as `pd` in the setup cell, so it is not
# re-imported here. Each row is one training review, each column one term.
tfidf_df = pd.DataFrame(X_train_tfidf.toarray(), columns=feature_names)
print("TF-IDF DataFrame:\n", tfidf_df)


Feature Names: ['absolute' 'acting' 'better' 'boring' 'color' 'completely' 'could'
 'craftsmanship' 'customer' 'definitely' 'delivery' 'design'
 'disappointed' 'disappointing' 'ending' 'enjoyed' 'every' 'expected'
 'experience' 'fantastic' 'fast' 'feeling' 'film' 'gem' 'get' 'great'
 'happy' 'hated' 'highly' 'left' 'love' 'loved' 'masterpiece' 'minute'
 'movie' 'much' 'nothing' 'one' 'outstanding' 'overpriced' 'pleasant'
 'plot' 'predictable' 'production' 'purchase' 'quite' 'real' 'recommend'
 'result' 'satisfied' 'seamless' 'skip' 'special' 'support' 'surprise'
 'terrible' 'thoroughly' 'time' 'transaction' 'unhelpful' 'utterly'
 'vibrant' 'visit' 'waste' 'weak' 'would']
TF-IDF DataFrame:
     absolute  acting    better    boring  color  completely     could  \
0   0.000000     0.0  0.000000  0.000000    0.0    0.000000  0.000000   
1   0.000000     0.0  0.000000  0.000000    0.0    0.000000  0.000000   
2   0.000000     0.0  0.000000  0.000000    0.0    0.000000  0.000000   
3   0.000

## M3-L4-SC2: Training and Evaluating a Sentiment Classifier

### Step 1: Training the Sentiment Classifier
Train the sentiment classifier using the training set.

In [None]:
# Initialize and train the classifier
# A Multinomial Naive Bayes model is fitted on the TF-IDF training matrix
# and the matching training labels from the earlier split.
classifier = MultinomialNB()
classifier.fit(X_train_tfidf, y_train)

### Step 2: Evaluating the Model
Predict and evaluate model performance using accuracy and classification metrics.

In [None]:
# Predict and evaluate the model on the held-out TF-IDF test matrix
y_pred = classifier.predict(X_test_tfidf)

# Compute the metrics first, then print accuracy and classification report
test_accuracy = accuracy_score(y_test, y_pred)
report_text = classification_report(y_test, y_pred)
print(f"Accuracy: {test_accuracy:.2f}")
print("Classification Report:\n", report_text)

Accuracy: 0.67
Classification Report:
               precision    recall  f1-score   support

           0       0.62      1.00      0.77         5
           1       1.00      0.25      0.40         4

    accuracy                           0.67         9
   macro avg       0.81      0.62      0.58         9
weighted avg       0.79      0.67      0.61         9



## M3-L4-SC3: Fine-Tuning BERT for Sentiment Analysis with Hugging Face Transformers

### Step 1: Setting Up the BERT Tokenizer
Load the pre-trained `bert-base-uncased` tokenizer.

In [None]:
# Load pre-trained BERT tokenizer
# 'bert-base-uncased' is the same checkpoint name the classification model is
# loaded from later in this notebook.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

### Step 2: Preparing Data for BERT
Tokenize the dataset for BERT compatibility.

In [None]:
# Encode the dataset for BERT
# The raw `texts` are tokenized here, not the NLTK-processed column used for TF-IDF.
# return_tensors='pt' requests PyTorch tensors; truncation and padding are enabled.
encodings = tokenizer(texts, truncation=True, padding=True, return_tensors='pt')
# Tensor copy of the labels.
# NOTE(review): not referenced again in the visible cells; the training dataset is built from the plain `labels` list.
labels_tensor = torch.tensor(labels)

### Step 3: Defining and Fine-Tuning BERT Model
Define and fine-tune a BERT model for sentiment classification.

In [None]:
# Load pre-trained BERT model with a two-class classification head
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2,
)

# Prepare Trainer and TrainingArguments
# The run settings are collected in one mapping so they can be read at a glance.
training_config = dict(
    output_dir='./results',
    num_train_epochs=2,
    per_device_train_batch_size=4,
)
training_args = TrainingArguments(**training_config)

# Create a custom dataset class
class SentimentDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with sentiment labels.

    Args:
        encodings: Mapping from feature name to a per-example indexable
            container. Values may be tensors (as produced with
            ``return_tensors='pt'``) or plain nested lists.
        labels: Indexable sequence of class labels, one per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    @staticmethod
    def _as_tensor(value):
        """Return ``value`` as an independent tensor.

        Calling ``torch.tensor`` on an existing tensor copy-constructs it and
        emits a UserWarning on every item access; ``clone().detach()`` is the
        recommended equivalent. Non-tensor values are converted as before.
        """
        if isinstance(value, torch.Tensor):
            return value.clone().detach()
        return torch.tensor(value)

    def __getitem__(self, idx):
        """Return a dict of feature tensors for example ``idx`` plus its ``labels`` entry."""
        item = {key: self._as_tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self._as_tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples, taken from the label sequence."""
        return len(self.labels)

# Create dataset object
# It wraps the encodings of every entry in `texts`; no examples are held out
# from fine-tuning in this cell.
train_dataset = SentimentDataset(encodings, labels)

# Only a training dataset is supplied; no evaluation dataset or metric function is passed.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset # Pass the dataset object
)

# Fine-tune BERT
# Kept as the cell's final expression so the returned value is displayed.
trainer.train()


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Step,Training Loss


TrainOutput(global_step=16, training_loss=0.6723314523696899, metrics={'train_runtime': 54.0552, 'train_samples_per_second': 1.11, 'train_steps_per_second': 0.296, 'total_flos': 431666575200.0, 'train_loss': 0.6723314523696899, 'epoch': 2.0})

### Step 4: Evaluating Model Performance
Run the fine-tuned model on two sample reviews and inspect the predicted sentiment labels (1: positive, 0: negative/neutral).

In [None]:
# Sample test data
test_reviews = [
    "I enjoyed every moment of this film!",
    "Not worth watching, I'd skip it."
]

# Trainer may have moved the model to an accelerator during training, so the
# encoded inputs are placed on the model's device to avoid a device-mismatch error.
test_encodings = tokenizer(
    test_reviews, truncation=True, padding=True, return_tensors='pt'
).to(model.device)

# Evaluate
# Switch to inference mode: training leaves dropout active, which would make
# the predictions vary between runs. Gradients are not needed for prediction.
model.eval()
with torch.no_grad():
    test_outputs = model(**test_encodings)

# The index of the largest logit per review is the predicted class.
predictions = torch.argmax(test_outputs.logits, dim=-1)
print(f"Predicted Sentiments: {predictions.tolist()}")


Predicted Sentiments: [1, 1]
