# SMS Classification using Word2vec Embedding

In [4]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [5]:
# NLTK resources
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\erfan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\erfan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### Load dataset

In [6]:
df = pd.read_csv("1 - Text preprocessing and Representation/spam.csv", encoding='latin-1')[['v1', 'v2']]
df.columns = ['label', 'message']
print(df.head())

  label                                            message
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


### Text preprocessing

##### Removing stopwords, numbers and (!@#$%...)
##### Tokenizing

In [9]:
stop_words = set(stopwords.words('english'))

def clean_text(text):
    text = text.lower()                        
    text = re.sub(r'[^a-z\s]', '', text)        # remove punctuation & numbers
    tokens = word_tokenize(text)
    tokens = [w for w in tokens if w not in stop_words]
    return tokens

df['tokens'] = df['message'].apply(clean_text)
print("\nSample tokens:\n", df.head())



Sample tokens:
   label                                            message  \
0   ham  Go until jurong point, crazy.. Available only ...   
1   ham                      Ok lar... Joking wif u oni...   
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...   
3   ham  U dun say so early hor... U c already then say...   
4   ham  Nah I don't think he goes to usf, he lives aro...   

                                              tokens  
0  [go, jurong, point, crazy, available, bugis, n...  
1                     [ok, lar, joking, wif, u, oni]  
2  [free, entry, wkly, comp, win, fa, cup, final,...  
3      [u, dun, say, early, hor, u, c, already, say]  
4  [nah, dont, think, goes, usf, lives, around, t...  


### Train Word2Vec Model

In [15]:
from gensim.models import Word2Vec

sentences = df['tokens'].tolist()

w2v_model = Word2Vec(
    sentences=sentences, 
    vector_size=100,     # Dimension of Embedding
    window=5,            # number of words on each side  
    min_count=2,         # least number of occurance for each word
    sg=1,            # 1 = skip-gram; 0 = CBOW
    workers=4,           # number of CPU cores
    epochs=20
)

# Explore relationships
print("\nSimilar to 'free':")
print(w2v_model.wv.most_similar('free', topn=5))



Similar to 'free':
[('delivered', 0.7376319766044617), ('deliveredtomorrow', 0.7173013091087341), ('oranges', 0.715730607509613), ('hardcore', 0.7134705781936646), ('increments', 0.7106471061706543)]


### Create Document Vectors

##### Converting each word in the SMS to a 100-d vector of embedding 
##### Then with use of np.mean average the whole words embeddings to get each SMS an embedding

In [16]:
import numpy as np

def get_vector(tokens):
    vectors = [w2v_model.wv[w] for w in tokens if w in w2v_model.wv]
    return np.mean(vectors, axis=0) if len(vectors) > 0 else np.zeros(w2v_model.vector_size)

X = np.array([get_vector(tokens) for tokens in df['tokens']])
y = df['label'].map({'ham': 0, 'spam': 1}).values

print("\nFeature matrix shape:", X.shape)


Feature matrix shape: (5572, 100)


### Split the Data & Train Classifier 

In [17]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.99      0.98       966
           1       0.91      0.85      0.88       149

    accuracy                           0.97      1115
   macro avg       0.94      0.92      0.93      1115
weighted avg       0.97      0.97      0.97      1115

