In [1]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Fetch the NLTK data used below (tokenizer models and stopword lists)
for resource in ('punkt', 'stopwords'):
    nltk.download(resource)

# Read the SMS corpus, keeping only the label and text columns under clearer names
df = (
    pd.read_csv('spam.csv', encoding='latin-1')[['v1', 'v2']]
    .rename(columns={'v1': 'Label', 'v2': 'Message'})
)

# Preprocessing: lowercase, tokenize, remove stopwords, keep alpha only

# Stopword sets already loaded, keyed by language, so the corpus is read once
# rather than once per message.
_STOP_WORDS_BY_LANGUAGE = {}


def _get_stop_words(language='english'):
    """Return the NLTK stopword list for `language` as a cached frozenset."""
    cached = _STOP_WORDS_BY_LANGUAGE.get(language)
    if cached is None:
        cached = frozenset(stopwords.words(language))
        _STOP_WORDS_BY_LANGUAGE[language] = cached
    return cached


def preprocess_text(text):
    """Turn a raw message into lowercase, alphabetic, non-stopword tokens.

    Args:
        text: The raw message string. Missing values (`None` or a float NaN)
            are treated as an empty message.

    Returns:
        list[str]: Tokens produced by `word_tokenize` on the lowercased text,
        keeping only those that are purely alphabetic and are not English
        stopwords. An empty list for missing values.
    """
    # `text != text` is true only for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return []
    stop_words = _get_stop_words('english')
    tokens = word_tokenize(text.lower())
    return [word for word in tokens if word.isalpha() and word not in stop_words]

# Tokenize every message and store the result next to the raw text
df['Tokens'] = df['Message'].map(preprocess_text)

# Peek at the first rows to sanity-check the tokenization
token_preview = df[['Message', 'Tokens']].head()
print(token_preview)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


                                             Message  \
0  Go until jurong point, crazy.. Available only ...   
1                      Ok lar... Joking wif u oni...   
2  Free entry in 2 a wkly comp to win FA Cup fina...   
3  U dun say so early hor... U c already then say...   
4  Nah I don't think he goes to usf, he lives aro...   

                                              Tokens  
0  [go, jurong, point, crazy, available, bugis, n...  
1                     [ok, lar, joking, wif, u, oni]  
2  [free, entry, wkly, comp, win, fa, cup, final,...  
3      [u, dun, say, early, hor, u, c, already, say]  
4     [nah, think, goes, usf, lives, around, though]  


### Step 1: Preprocessing Messages

We begin by loading the dataset and preprocessing each SMS message. This includes:
- Lowercasing the text
- Tokenizing using NLTK
- Removing English stopwords
- Filtering out non-alphabetic tokens

The cleaned token list for each message is stored in a new `Tokens` column.


In [2]:
from gensim.models import KeyedVectors

# Location of the pre-trained Google News Word2Vec file (binary format)
w2v_path = 'GoogleNews-vectors-negative300.bin.gz'

# Read the vectors from the local file
w2v_model = KeyedVectors.load_word2vec_format(w2v_path, binary=True)

# Report the embedding dimensionality as a quick load check
print(f"Model loaded. Vector size: {w2v_model.vector_size}")


Model loaded. Vector size: 300


### (Python 3.11 Environment)
### Step 2: Load Pre-trained Word2Vec Model

We load the 300-dimensional Word2Vec model trained on Google News using `gensim`'s `KeyedVectors.load_word2vec_format`. This model contains over 3 million word and phrase vectors and allows us to convert tokens into dense embeddings.

Make sure `GoogleNews-vectors-negative300.bin.gz` is present in your working directory. Once loaded, we can access a word’s vector using `w2v_model[word]`; words outside the model's vocabulary raise a `KeyError`, so check `word in w2v_model` first.


In [3]:
import numpy as np

# Function: mean embedding over the tokens the model knows
def get_average_vector(tokens, model, vector_size):
    """Average the embeddings of the tokens that are present in `model`.

    Args:
        tokens: Iterable of tokens to look up.
        model: Object supporting `token in model` and `model[token]`.
        vector_size: Length of the zero vector returned when no token is found.

    Returns:
        The element-wise mean (over axis 0) of the found vectors, or
        `np.zeros(vector_size)` if none of the tokens are in `model`.
    """
    found = []
    for token in tokens:
        if token in model:
            found.append(model[token])

    if not found:
        # NOTE(review): np.zeros yields float64; if the model's vectors use a
        # different dtype, the fallback will not match it — confirm if it matters.
        return np.zeros(vector_size)
    return np.mean(found, axis=0)

# Embedding dimensionality, taken from the loaded model
vector_size = w2v_model.vector_size


def _embed_tokens(tokens):
    """Average embedding for one message's token list."""
    return get_average_vector(tokens, w2v_model, vector_size)


# One averaged embedding per message
df['Vector'] = df['Tokens'].apply(_embed_tokens)

# Inspect the first few rows
vector_preview = df[['Message', 'Vector']].head()
print(vector_preview)


                                             Message  \
0  Go until jurong point, crazy.. Available only ...   
1                      Ok lar... Joking wif u oni...   
2  Free entry in 2 a wkly comp to win FA Cup fina...   
3  U dun say so early hor... U c already then say...   
4  Nah I don't think he goes to usf, he lives aro...   

                                              Vector  
0  [-0.019805908, 0.05167062, 0.02709961, 0.21868...  
1  [-0.06323496, 0.0803833, 0.060943604, 0.102498...  
2  [-0.03242302, -0.0050720214, -0.06273012, 0.11...  
3  [-0.06568061, 0.0262146, 0.1081543, 0.0869751,...  
4  [0.032470703, 0.037462506, 0.047345843, 0.1572...  


### Step 3: Convert Messages to Word2Vec Vectors

We represent each message as the average of its Word2Vec token embeddings. Tokens that are not in the model's vocabulary are skipped; if none of a message's tokens are in the vocabulary, a zero vector is used instead. This results in a uniform 300-dimensional vector per message, ready for classification.


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
import numpy as np

# Map the string labels to integers
le = LabelEncoder()
le.fit(df['Label'])
df['LabelEncoded'] = le.transform(df['Label'])  # ham = 0, spam = 1

# Stack the per-message vectors into a 2-D feature matrix; labels as a flat array
X = np.vstack(df['Vector'].tolist())
y = df['LabelEncoded'].to_numpy()

# Hold out 20% of the messages for evaluation (fixed seed for repeatability)
# NOTE(review): the split is not stratified; consider stratify=y given the
# ham/spam imbalance.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the classifier on the training portion
clf = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Score the held-out messages and print per-class metrics
y_pred = clf.predict(X_test)
report = classification_report(y_test, y_pred, target_names=le.classes_)
print(report)


              precision    recall  f1-score   support

         ham       0.96      0.98      0.97       965
        spam       0.82      0.72      0.77       150

    accuracy                           0.94      1115
   macro avg       0.89      0.85      0.87      1115
weighted avg       0.94      0.94      0.94      1115



### Step 4: Classification with Logistic Regression

We encode labels, split the data, train a logistic regression model, and evaluate it using a classification report. This gives us metrics like precision, recall, and F1-score for both spam and ham messages.

#### Classification Report

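The logistic regression classifier achieved 94% overall accuracy on the held-out test set. Because the test set is imbalanced (965 ham vs. 150 spam messages), always predicting ham would already reach about 87% accuracy, so the per-class metrics are more informative than accuracy alone. While the model performs well on ham messages, the recall for spam messages (72%) could be improved with more advanced classifiers or oversampling techniques. Still, this demonstrates a strong baseline for spam detection using Word2Vec embeddings.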


In [5]:
def predict_message_class(msg):
    """Classify a single raw message with the trained classifier.

    The message is tokenized by `preprocess_text`, averaged into one vector by
    `get_average_vector`, reshaped to a single-row 2-D array, and passed to
    `clf.predict`.

    Args:
        msg: Raw message text.

    Returns:
        The first (only) element of `clf.predict`'s output for this message.
    """
    message_tokens = preprocess_text(msg)  # same preprocessing as for training
    embedding = get_average_vector(message_tokens, w2v_model, w2v_model.vector_size)
    # Shape the vector as one row (a single sample) before predicting.
    features = embedding.reshape(1, -1)
    predicted = clf.predict(features)
    return predicted[0]


In [6]:
# Smoke-test the helper on one spam-like and one ham-like message
spam_like = "Congratulations! You have won a free ticket to Bahamas."
ham_like = "Can we meet tomorrow at 10?"
print(predict_message_class(spam_like))  # likely 1 (spam)
print(predict_message_class(ham_like))  # likely 0 (ham)


1
0


### Step 5: Predict Message Class

We define `predict_message_class(msg)` which accepts a raw SMS message string. It applies the same preprocessing and vectorization steps used during training, then uses the trained logistic regression classifier to return:
- `0` for ham
- `1` for spam
