# SMS Classification using Glove.6b.100d Embedding

In [6]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import numpy as np

In [7]:
# NLTK resources
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\erfan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\erfan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### Load dataset

In [8]:
df = pd.read_csv("1 - Text preprocessing and Representation/spam.csv", encoding='latin-1')[['v1', 'v2']]
df.columns = ['label', 'message']
print(df.head())

  label                                            message
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


### Text preprocessing

##### Removing stopwords, numbers and (!@#$%...)
##### Tokenizing

In [9]:
stop_words = set(stopwords.words('english'))

def clean_text(text):
    text = text.lower()                        
    text = re.sub(r'[^a-z\s]', '', text)        # remove punctuation & numbers
    tokens = word_tokenize(text)
    tokens = [w for w in tokens if w not in stop_words]
    return tokens

df['tokens'] = df['message'].apply(clean_text)
print("\nSample tokens:\n", df.head())



Sample tokens:
   label                                            message  \
0   ham  Go until jurong point, crazy.. Available only ...   
1   ham                      Ok lar... Joking wif u oni...   
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...   
3   ham  U dun say so early hor... U c already then say...   
4   ham  Nah I don't think he goes to usf, he lives aro...   

                                              tokens  
0  [go, jurong, point, crazy, available, bugis, n...  
1                     [ok, lar, joking, wif, u, oni]  
2  [free, entry, wkly, comp, win, fa, cup, final,...  
3      [u, dun, say, early, hor, u, c, already, say]  
4  [nah, dont, think, goes, usf, lives, around, t...  


### Load Pretrained GloVe Embeddings
downloded from Stanford website (glove.6B.100d.txt)

In [None]:
embeddings_index = {}
with open('glove.6B.100d.txt', encoding='utf8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = vector

print(f"Loaded {len(embeddings_index)} word vectors.")


### Convert Each SMS to an Average GloVe Vector

converting each data sample to an embedding:
1. Get the embedding of that word from Glove
2. np.mean over the tokens in the data sample
##### to have just one embedding vector per data sample (SMS)

In [11]:
def get_glove_vector(tokens):
    vectors = [embeddings_index[w] for w in tokens if w in embeddings_index]
    if len(vectors) > 0:
        return np.mean(vectors, axis=0)
    else:
        return np.zeros(100)   # 100 dimensions in glove.6B.100d.txt

X = np.array([get_glove_vector(tokens) for tokens in df['tokens']])
y = df['label'].map({'ham': 0, 'spam': 1}).values

print("Feature matrix shape:", X.shape)


Feature matrix shape: (5572, 100)


In [15]:
df['tokens'][0]

['go',
 'jurong',
 'point',
 'crazy',
 'available',
 'bugis',
 'n',
 'great',
 'world',
 'la',
 'e',
 'buffet',
 'cine',
 'got',
 'amore',
 'wat']

In [16]:
X[0]

array([-0.05918936,  0.07337585,  0.25856537, -0.02353659, -0.15043531,
        0.11440406,  0.04923962,  0.24415669,  0.02678226, -0.12291642,
        0.24561794, -0.0471365 , -0.07250624,  0.02821594,  0.03220782,
       -0.16815124,  0.17508031, -0.01834675, -0.25474107,  0.32546961,
        0.47869146,  0.26115426,  0.00677612, -0.11002885,  0.25488085,
        0.1717145 ,  0.10902219, -0.06052207,  0.13065593, -0.23724186,
       -0.12778981,  0.27703524,  0.07325162,  0.16444564,  0.1883592 ,
        0.24226403,  0.04159176,  0.21951935,  0.13562986, -0.26649016,
        0.04803331, -0.03659412, -0.24343425, -0.3009907 ,  0.11508217,
        0.25914812, -0.10881699, -0.18706569,  0.09885563, -0.10801812,
       -0.27193511,  0.16061346, -0.00796499,  0.17332375, -0.50743407,
       -1.35893118, -0.1233657 ,  0.21353976,  0.84468377,  0.07358542,
       -0.36472583,  0.34791341, -0.37412387, -0.15056606,  0.30511537,
        0.0726422 ,  0.18922812, -0.00659894,  0.20287395,  0.01

### Split the Data & Train Classifier 

In [17]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.95      0.97      0.96       966
           1       0.78      0.68      0.73       149

    accuracy                           0.93      1115
   macro avg       0.87      0.82      0.84      1115
weighted avg       0.93      0.93      0.93      1115

