<a href="https://colab.research.google.com/github/dornercr/INFO371/blob/main/INFO371_Week6_Text_Representation_and_Spam_Classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# INFO 371 - Week 6: Text Representation and ML Classification
This notebook demonstrates the following:
- Dataset generation
- Bag of Words and TF-IDF
- Cosine similarity
- Text embeddings
- Spam detection using Naive Bayes

In [7]:
import pandas as pd
import random

# Base messages
ham = [
    "Hey, are we still meeting up later today?",
    "Don't forget about the meeting tomorrow morning.",
    "I'll be home around 6 PM, want to grab dinner?",
    "Can you send me the report by tonight?",
    "What time is the soccer game on Saturday?",
    "Sure, I'll call you once I'm done.",
    "Happy birthday! Hope you have a great one.",
    "Let me know if you need a ride to work.",
    "I’m stuck in traffic, might be a few minutes late.",
    "Do we still need to bring snacks for the event?"
]

spam = [
    "WIN a brand new car! Text WIN to 90022 now!",
    "Congratulations! You've won a $1000 gift card. Call now!",
    "Claim your free prize by visiting our website now!",
    "Get rich quick! Learn how with this limited offer.",
    "You've been selected for a free vacation to the Bahamas!",
    "Hurry up! Your free Bitcoin is waiting.",
    "Act now to secure your financial freedom!",
    "Click here to claim your free iPhone!",
    "You are the lucky winner of our weekly sweepstakes!",
    "This is not a scam. You've won big—check your inbox!"
]

# Expand to 100 rows (50 ham, 50 spam)
dataset = [["ham", msg] for _ in range(5) for msg in ham] + [["spam", msg] for _ in range(5) for msg in spam]
random.shuffle(dataset)

df = pd.DataFrame(dataset, columns=["label", "text"])
df.to_csv("large_text_spam_dataset.csv", index=False)
df.head()

Unnamed: 0,label,text
0,spam,Get rich quick! Learn how with this limited of...
1,ham,"Hey, are we still meeting up later today?"
2,ham,"I’m stuck in traffic, might be a few minutes l..."
3,spam,Claim your free prize by visiting our website ...
4,ham,"Hey, are we still meeting up later today?"


In [8]:
# Step 2: TF-IDF Vectorization
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(df['text'])
y = df['label']
print("Vocabulary:", vectorizer.vocabulary_)
X.toarray()

Vocabulary: {'rich': 51, 'quick': 49, 'learn': 36, 'limited': 38, 'offer': 46, 'hey': 27, 'meeting': 41, 'later': 35, 'today': 65, 'stuck': 60, 'traffic': 68, 'minutes': 42, 'late': 34, 'claim': 12, 'free': 20, 'prize': 48, 'visiting': 71, 'website': 74, 've': 70, 'selected': 56, 'vacation': 69, 'bahamas': 3, 'let': 37, 'know': 33, 'need': 44, 'ride': 52, 'work': 79, 'act': 2, 'secure': 55, 'financial': 18, 'freedom': 21, 'lucky': 40, 'winner': 77, 'weekly': 75, 'sweepstakes': 62, 'send': 57, 'report': 50, 'tonight': 67, 'win': 76, 'brand': 7, 'new': 45, 'car': 9, 'text': 63, '90022': 1, 'don': 16, 'forget': 19, 'tomorrow': 66, 'morning': 43, 'll': 39, 'home': 28, 'pm': 47, 'want': 73, 'grab': 24, 'dinner': 15, 'click': 13, 'iphone': 32, 'hurry': 30, 'bitcoin': 6, 'waiting': 72, 'time': 64, 'soccer': 59, 'game': 22, 'saturday': 53, 'bring': 8, 'snacks': 58, 'event': 17, 'sure': 61, 'scam': 54, 'won': 78, 'big': 4, 'check': 11, 'inbox': 31, 'happy': 26, 'birthday': 5, 'hope': 29, 'great

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.43598746, 0.        , 0.        , ..., 0.        , 0.36686827,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [9]:
# Step 3: Cosine Similarity Example
from sklearn.metrics.pairwise import cosine_similarity
similarity = cosine_similarity(X[0:1], X[1:2])
print("Cosine similarity between doc 0 and doc 1:", similarity[0][0])

Cosine similarity between doc 0 and doc 1: 0.0


In [10]:
# Step 4: Train/Test Split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [11]:
# Step 5: Train a Naive Bayes Classifier
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
clf.fit(X_train, y_train)

In [12]:
# Step 6: Evaluate the Classifier
from sklearn.metrics import accuracy_score, classification_report
y_pred = clf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 1.0

Classification Report:
               precision    recall  f1-score   support

         ham       1.00      1.00      1.00        13
        spam       1.00      1.00      1.00        17

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30

