### NLP: Spam Classification : Bow | TF-IDF | Word2Vec: Build from Scratch
##### Saurabh Chatterjee

In [1]:
import pandas as pd
import numpy as np

In [2]:
# SMS spam data set: every row is "<label><TAB><message text>"
messages = pd.read_csv(
    'smsspamcollection/SMSSpamCollection',
    sep='\t',                       # label and input text are separated by a Tab ('\t')
    names=["label", "message"],     # column names are supplied here
)
messages

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


#### Data Cleaning and Preprocessing

In [3]:
import re       # regular expression
import nltk

# nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

# Word normalisers: stemmer (for Stemming) and lemmatizer (for Lemmatization)
stemmer, lemmatizer = PorterStemmer(), WordNetLemmatizer()

In [4]:
# Build the cleaned corpus: one normalised string per SMS message.
# The stop-word set is built ONCE here; rebuilding it inside the comprehension
# (i.e. for every single word of every message) is needlessly slow.
stop_words = set(stopwords.words('english'))

corpus = []
for message in messages['message']:
    # Replace every character OTHER THAN a-z / A-Z with a space, then lower the case
    letters_only = re.sub('[^a-zA-Z]', ' ', message).lower()

    # Split on whitespace, remove stop-words and LEMMATIZE what remains
    tokens = [lemmatizer.lemmatize(word) for word in letters_only.split() if word not in stop_words]

    corpus.append(' '.join(tokens))

In [5]:
# After cleaning (non-letters stripped, stop-words removed, lemmatized) some sentences in the corpus are turned to blanks. Those are: **
# (the loop variable is NOT called `len`, so the builtin is not shadowed)
[[n_chars, text] for n_chars, text in zip(map(len, corpus), messages['message']) if n_chars < 1]

[[0, 'What you doing?how are you?'],
 [0, 'Where @'],
 [0, '645'],
 [0, 'Can a not?'],
 [0, ':) '],
 [0, 'What you doing?how are you?'],
 [0, ':( but your not here....'],
 [0, ':-) :-)']]

In [6]:
# Drop the messages that became EMPTY strings during cleaning **
corpus_cleaned = [text for text in corpus if text]

# Drop the LABEL at the same position too, so texts and labels stay aligned **
labels_cleaned = [label for label, text in zip(messages['label'], corpus) if text]

In [7]:
# Storing Labels as Binary (y) Separately:
# get_dummies converts the categorical labels into indicator variables, one binary (One-Hot) column per class
y_onehot = pd.get_dummies(labels_cleaned)
print(y_onehot)

# Keep only the 'spam' indicator: True = spam, False = ham.
# The column is selected by NAME rather than by position, so the meaning of y is explicit.
y = y_onehot['spam'].values

        ham   spam
0      True  False
1      True  False
2     False   True
3      True  False
4      True  False
...     ...    ...
5559  False   True
5560   True  False
5561   True  False
5562   True  False
5563   True  False

[5564 rows x 2 columns]


In [8]:
corpus_cleaned[:10]

['go jurong point crazy available bugis n great world la e buffet cine got amore wat',
 'ok lar joking wif u oni',
 'free entry wkly comp win fa cup final tkts st may text fa receive entry question std txt rate c apply',
 'u dun say early hor u c already say',
 'nah think go usf life around though',
 'freemsg hey darling week word back like fun still tb ok xxx std chgs send rcv',
 'even brother like speak treat like aid patent',
 'per request melle melle oru minnaminunginte nurungu vettam set callertune caller press copy friend callertune',
 'winner valued network customer selected receivea prize reward claim call claim code kl valid hour',
 'mobile month u r entitled update latest colour mobile camera free call mobile update co free']

#### (1) Bag of Words

In [121]:
from sklearn.feature_extraction.text import CountVectorizer

# BINARY bag-of-words: each feature records Present / Not Present instead of a count
count_vec = CountVectorizer(
    binary=True,
    ngram_range=(1, 2),     # single words and bi-grams
    max_features=2500,
)
X_bow = count_vec.fit_transform(corpus_cleaned).toarray()

In [122]:
X_bow.shape

(5564, 2500)

In [123]:
# Train Test Split
from sklearn.model_selection import train_test_split

# 80/20 split with a fixed seed
X_bow_train, X_bow_test, y_train, y_test = train_test_split(
    X_bow, y,
    test_size=0.2,
    random_state=0,
)

In [124]:
from sklearn.naive_bayes import MultinomialNB       # Multinomial Naive Bayes

# Fit the classifier on the bag-of-words training features
spam_detector_model = MultinomialNB()
spam_detector_model.fit(X_bow_train, y_train)

# Predict the labels of the held-out messages
y_pred_bow = spam_detector_model.predict(X_bow_test)

In [125]:
# Score
from sklearn.metrics import accuracy_score, classification_report

# Overall accuracy, then the per-class precision / recall / f1 report
score_bow = accuracy_score(y_true=y_test, y_pred=y_pred_bow)
print("Accuracy Score: ", score_bow)

print(classification_report(y_true=y_test, y_pred=y_pred_bow))

Accuracy Score:  0.9847259658580413
              precision    recall  f1-score   support

       False       0.99      0.99      0.99       962
        True       0.95      0.93      0.94       151

    accuracy                           0.98      1113
   macro avg       0.97      0.96      0.97      1113
weighted avg       0.98      0.98      0.98      1113



#### (2) TF-IDF

In [126]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features over single words and BI-GRAMS
count_vec_ti = TfidfVectorizer(
    ngram_range=(1, 2),     # *ngram_range: (1 to 2)
    max_features=2500,
)
X_ti = count_vec_ti.fit_transform(corpus_cleaned).toarray()

In [127]:
# Train Test Split
# Same test size and seed as the bag-of-words split
X_ti_train, X_ti_test, y_train, y_test = train_test_split(
    X_ti, y,
    test_size=0.2,
    random_state=0,
)

In [128]:
from sklearn.naive_bayes import MultinomialNB       # Multinomial Naive Bayes

# Naive Bayes fitted on the TF-IDF training features
spam_detector_model_ti = MultinomialNB()
spam_detector_model_ti.fit(X_ti_train, y_train)

# Predict the labels of the held-out messages
y_pred_ti = spam_detector_model_ti.predict(X_ti_test)

In [129]:
from sklearn.metrics import accuracy_score, classification_report

# Accuracy, then the per-class precision / recall / f1 report
score_ti = accuracy_score(y_true=y_test, y_pred=y_pred_ti)
print(score_ti)

print(classification_report(y_true=y_test, y_pred=y_pred_ti))

0.9820305480682839
              precision    recall  f1-score   support

       False       0.98      1.00      0.99       962
        True       0.99      0.88      0.93       151

    accuracy                           0.98      1113
   macro avg       0.98      0.94      0.96      1113
weighted avg       0.98      0.98      0.98      1113



In [130]:
# Random Forest
from sklearn.ensemble import RandomForestClassifier

# random_state fixes the randomness inside the forest, so the score printed below
# is the same on every run (the train/test split above is already seeded).
spam_detector_model_ti_rf = RandomForestClassifier(random_state=0)
spam_detector_model_ti_rf.fit(X_ti_train, y_train)

# Prediction
y_pred_ti_rf = spam_detector_model_ti_rf.predict(X_ti_test)

# Accuracy, then the per-class precision / recall / f1 report
score_ti_rf = accuracy_score(y_test, y_pred_ti_rf)
print(score_ti_rf)

print(classification_report(y_test, y_pred_ti_rf))

0.9847259658580413
              precision    recall  f1-score   support

       False       0.98      1.00      0.99       962
        True       1.00      0.89      0.94       151

    accuracy                           0.98      1113
   macro avg       0.99      0.94      0.97      1113
weighted avg       0.98      0.98      0.98      1113



#### (3) Word2Vec: Building and Training from Scratch

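##### Word2Vec gives a high-dimensional vector for 'each word' (e.g. the Google-News model available through Gensim gives a 300-dimension vector for each word), so a whole sentence becomes a variable-length list of such vectors, which is too large to use directly as classifier input. To solve this problem:  'AVG Word2Vec'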
##### <b>AVG Word2Vec</b> : A single vector (300-dimension for the Google-News model) is created for the WHOLE SENTENCE by averaging, position by position, the vector values of each word in the sentence.

In [None]:
import gensim
import gensim.downloader as api

# Loading Word2Vec by Google PRE-TRAINED on Google News text data: Returns 300-Dimension Vector
# NOTE(review): in the cells shown here this model is only used for the 'king' lookup below;
# the classifier further down uses the Word2Vec model trained on our own data.
wordvec = api.load('word2vec-google-news-300')      # will download 1662.8MB file

# Can also Train Word2Vec through Gensim with Our Own Data

In [None]:
vec_king = wordvec['king']      # 300-dimension vector for the word 'king'
vec_king

array([ 1.25976562e-01,  2.97851562e-02,  8.60595703e-03,  1.39648438e-01,
       -2.56347656e-02, -3.61328125e-02,  1.11816406e-01, -1.98242188e-01,
        5.12695312e-02,  3.63281250e-01, -2.42187500e-01, -3.02734375e-01,
       -1.77734375e-01, -2.49023438e-02, -1.67968750e-01, -1.69921875e-01,
        3.46679688e-02,  5.21850586e-03,  4.63867188e-02,  1.28906250e-01,
        1.36718750e-01,  1.12792969e-01,  5.95703125e-02,  1.36718750e-01,
        1.01074219e-01, -1.76757812e-01, -2.51953125e-01,  5.98144531e-02,
        3.41796875e-01, -3.11279297e-02,  1.04492188e-01,  6.17675781e-02,
        1.24511719e-01,  4.00390625e-01, -3.22265625e-01,  8.39843750e-02,
        3.90625000e-02,  5.85937500e-03,  7.03125000e-02,  1.72851562e-01,
        1.38671875e-01, -2.31445312e-01,  2.83203125e-01,  1.42578125e-01,
        3.41796875e-01, -2.39257812e-02, -1.09863281e-01,  3.32031250e-02,
       -5.46875000e-02,  1.53198242e-02, -1.62109375e-01,  1.58203125e-01,
       -2.59765625e-01,  

In [10]:
from nltk import sent_tokenize                  # returns LIST of Sentences
from gensim.utils import simple_preprocess      # returns LIST of Words (after converting to lowercase) in the Sentence

In [63]:
corpus_cleaned[0]

'go jurong point crazy available bugis n great world la e buffet cine got amore wat'

In [11]:
# Sentence-tokenize the first cleaned message. The cleaned text has had every non-letter
# character replaced by a space, and the result here is a single sentence.
sent_token=sent_tokenize(corpus_cleaned[0])
sent_token

['go jurong point crazy available bugis n great world la e buffet cine got amore wat']

In [12]:
# List of List of Words: exactly ONE token list per document, in the same order as
# corpus_cleaned (and therefore as the labels y).
# The cleaned text contains no punctuation, so splitting it into sentences first changes
# nothing here; tokenizing each document directly guarantees that words[i] belongs to
# corpus_cleaned[i], which the later label lookup by position relies on.
# simple_preprocess returns the LIST of lowercase words of the text (very short tokens
# such as 'n' or 'u' are dropped, as the output below shows).
words = [simple_preprocess(document) for document in corpus_cleaned]

In [14]:
words[:2]       # first two sentences into word tokens

[['go',
  'jurong',
  'point',
  'crazy',
  'available',
  'bugis',
  'great',
  'world',
  'la',
  'buffet',
  'cine',
  'got',
  'amore',
  'wat'],
 ['ok', 'lar', 'joking', 'wif', 'oni']]

In [16]:
# Training Word2Vec from Scratch:
import gensim
from gensim.models import Word2Vec

# vector_size is not passed, so the library default applies (the sentence vectors built
# from this model further down are 100-dimensional).
# min_count=2 is set explicitly here: ignore words with freq < 2.
word2vec_model = Word2Vec(words, window=5, min_count=2)     # window=5, min_count=2 passed explicitly


In [19]:
# List of all Vocabulary
word2vec_model.wv.index_to_key[:20]     # printing 20 words from the vocabulary

['call',
 'get',
 'ur',
 'gt',
 'lt',
 'go',
 'ok',
 'day',
 'free',
 'know',
 'come',
 'like',
 'time',
 'good',
 'got',
 'love',
 'text',
 'want',
 'send',
 'need']

In [135]:
# Number of sentences (documents) the model was trained on - NOT the vocabulary size.
# (The vocabulary is word2vec_model.wv.index_to_key; its size is the length of that list.)
word2vec_model.corpus_count

5564

In [136]:
# Number of Epochs took to Train
word2vec_model.epochs

5

In [162]:
# Finding Similar Words with Similarity Scores
word2vec_model.wv.similar_by_word('prize')

[('claim', 0.9995064735412598),
 ('line', 0.9992116093635559),
 ('call', 0.9991533160209656),
 ('land', 0.9990938305854797),
 ('guaranteed', 0.9990529417991638),
 ('show', 0.9989519119262695),
 ('draw', 0.998933732509613),
 ('cash', 0.9989280700683594),
 ('hr', 0.9988723993301392),
 ('code', 0.9988066554069519)]

In [188]:
# AVGWord2Vec:

def avg_word2vec(doc, model=None):
    """Average the Word2Vec vectors of the words of one tokenized sentence.

    Parameters
    ----------
    doc : iterable of str
        Word tokens of a single sentence / document.
    model : optional
        Object exposing its word vectors as ``model.wv``, where ``wv`` supports
        ``word in wv`` and ``wv[word]``. Defaults to the notebook-level
        ``word2vec_model``.

    Returns
    -------
    numpy.ndarray or None
        Element-wise mean of the vectors of the words found in the vocabulary
        (100-dimension for the model trained in this notebook), or ``None`` (not NaN)
        when none of the words of ``doc`` is in the vocabulary, including an empty ``doc``.
    """
    word_vectors = (word2vec_model if model is None else model).wv

    # Test membership on the vectors object itself (a keyed lookup) instead of on the
    # index_to_key list, which would be scanned from the start for every single word.
    doc_vec = [word_vectors[word] for word in doc if word in word_vectors]

    if not doc_vec:
        return None     # nothing to average: the caller has to skip this sentence
    return np.mean(doc_vec, axis=0)

In [139]:
from tqdm import tqdm       # Progess Meter

In [189]:
sentence_vector = avg_word2vec(words[73])
sentence_vector

In [205]:
# Apply to all sentences, keeping the feature vectors and their labels aligned.
# words[i] is looked up against y[i] by position, so fail loudly if they ever differ in length.
assert len(words) == len(y), "words and y must contain one entry per document"

X_wv = []
y_wv = []
for tokens, label in tqdm(zip(words, y), total=len(words)):
    sentence_vector = avg_word2vec(tokens)
    if sentence_vector is not None:     # None => no word of the sentence is in the vocabulary ***
        X_wv.append(sentence_vector)    # 100-dimension vector for each sentence
        y_wv.append(label)

100%|██████████| 5564/5564 [00:00<00:00, 12214.72it/s]


In [206]:
np.shape(X_wv)

(5541, 100)

In [193]:
# Check if any vector has abnormal size (!= 100); the test now matches this description
# and also catches vectors that are too LONG, not only too short.
[[i, sent, sent.size] for (i, sent) in enumerate(X_wv) if sent.size != 100]

[]

In [155]:
X_wv[0].size      # 100-dimension vector for a sentence

100

In [207]:
# Train-Test Split
# random_state=0 makes this split reproducible, like the BoW and TF-IDF splits.
# Note: y_train / y_test are rebound here to the Word2Vec labels (y_wv).
X_wv_train, X_wv_test, y_train, y_test = train_test_split(X_wv, y_wv, test_size=0.2, random_state=0)

In [208]:
# Random Forest: Training using Vectors Generated by Our Own Trained Word2Vec (using AvgWord2Vec)
from sklearn.ensemble import RandomForestClassifier

# random_state fixes the randomness inside the forest, so that repeated runs on the
# same training data give the same model and the same score.
spam_detector_model_wv = RandomForestClassifier(random_state=0)
spam_detector_model_wv.fit(X_wv_train, y_train)

# Prediction
y_pred_wv = spam_detector_model_wv.predict(X_wv_test)

# Accuracy, then the per-class precision / recall / f1 report
score_wv = accuracy_score(y_test, y_pred_wv)
print(score_wv)

print(classification_report(y_test, y_pred_wv))

0.9594229035166817
              precision    recall  f1-score   support

       False       0.97      0.99      0.98       960
        True       0.91      0.78      0.84       149

    accuracy                           0.96      1109
   macro avg       0.94      0.88      0.91      1109
weighted avg       0.96      0.96      0.96      1109

