## Spam Classifier - Training of word2vec and average word2vec from scratch

In [87]:
import warnings 
warnings.filterwarnings("ignore")

In [None]:
import sys
!{sys.executable} -m pip install pandas nltk scikit-learn

#### 1. Label Data loading

In [89]:
import pandas as pd
messages = pd.read_csv('./Data/SMSSpamCollection.txt',sep='\t',names=["labels","message"])
msg = pd.read_csv('./Data/SMSSpamCollection.txt',sep='\t',names=["labels","message"]) #will use for lemmatization

In [90]:
messages.shape

(5572, 2)

In [91]:
messages

Unnamed: 0,labels,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


============================================================================

#### 2. Data Cleaning and Preprocessing
- Using regex to clean special char
- Using stopwords to clean insignificant words
- Lowering the case
- Stemming to reduce vocabulary and converting to root base word

In [None]:
import re
import nltk
nltk.download('stopwords')

In [93]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [94]:
corpus = []
for i in range(0, len(messages )):
    review = re.sub('[^a-zA-Z0-9]', ' ', messages['message'][i]) # removing special char other than a to z
    review = review.lower()  # lowering case
    review = review.split()  # Getting all words as list from sentences or document
    # stemming and reducing stop words adding stop word can reduce the entire sentence so very careful
    # review = [ps.stem(word) for word in review if not word in stopwords.words('english')] 
    review = [ps.stem(word) for word in review] 
    review = ' '.join(review)
    if(review): # if review is valid non empty then only add
        corpus.append(review)
    else:
        print(messages['message'][i])        
        messages.drop(i, inplace=True) # if any emogi or any unnecessary char present then drop it        
        print("------------")


:) 
------------
:-) :-)
------------


In [95]:
corpus[:20] #checking top 20

['go until jurong point crazi avail onli in bugi n great world la e buffet cine there got amor wat',
 'ok lar joke wif u oni',
 'free entri in 2 a wkli comp to win fa cup final tkt 21st may 2005 text fa to 87121 to receiv entri question std txt rate t c s appli 08452810075over18 s',
 'u dun say so earli hor u c alreadi then say',
 'nah i don t think he goe to usf he live around here though',
 'freemsg hey there darl it s been 3 week s now and no word back i d like some fun you up for it still tb ok xxx std chg to send 1 50 to rcv',
 'even my brother is not like to speak with me they treat me like aid patent',
 'as per your request mell mell oru minnaminungint nurungu vettam ha been set as your callertun for all caller press 9 to copi your friend callertun',
 'winner as a valu network custom you have been select to receivea 900 prize reward to claim call 09061701461 claim code kl341 valid 12 hour onli',
 'had your mobil 11 month or more u r entitl to updat to the latest colour mobil wit

Check if any empty list present or not

In [96]:
len(corpus)

5570

In [97]:
corpus = [sent for sent in corpus if sent] # removing empty list if any

In [98]:
len(corpus)

5570

============================================================================

### 3. Creating Bag of words

In [99]:
# Creating the Bag of Words model
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=2500,binary=True, ngram_range=(2,2)) # taking max 2500 occuring feature
X = cv.fit_transform(corpus).toarray()

In [100]:
X.shape

(5570, 2500)

In [101]:
X[1]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

============================================================================

### 4. One hot Label encoding for y feature like target like spam or Ham

In [102]:
y=pd.get_dummies(messages['labels'])
y.shape

(5570, 2)

In [103]:
y=y.iloc[:,1].values

In [104]:
y

array([False, False,  True, ..., False, False, False])

### 5. Train Test Split

In [105]:

# Train Test Split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

In [106]:
X_train,y_train

(array([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int64),
 array([False, False, False, ...,  True, False, False]))

=======================================================================================

### 6. Train the ML using Multinomial naive bayes

In [107]:
from sklearn.naive_bayes import MultinomialNB
spam_detect_model = MultinomialNB().fit(X_train, y_train)

=========================================================================================

### 7. Prediction and performance

In [108]:
y_pred=spam_detect_model.predict(X_test)

In [109]:
from sklearn.metrics import accuracy_score,classification_report

In [110]:
score=accuracy_score(y_test,y_pred)
print(score)

0.9712746858168761


In [111]:
from sklearn.metrics import classification_report
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

       False       0.99      0.97      0.98       974
        True       0.84      0.95      0.89       140

    accuracy                           0.97      1114
   macro avg       0.92      0.96      0.94      1114
weighted avg       0.97      0.97      0.97      1114



====================================================================================

### 8. Using TF-IDF Model

In [112]:
from sklearn.feature_extraction.text import TfidfVectorizer
tv = TfidfVectorizer(max_features=2500, ngram_range=(1,2))
X = tv.fit_transform(corpus).toarray()

In [113]:
# train and split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

In [114]:
from sklearn.naive_bayes import MultinomialNB
spam_detect_model = MultinomialNB().fit(X_train, y_train)

In [115]:
#prediction
y_pred=spam_detect_model.predict(X_test)

In [116]:
score=accuracy_score(y_test,y_pred)
print(score)

0.9820466786355476


In [117]:
from sklearn.metrics import classification_report
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

       False       1.00      0.98      0.99       976
        True       0.87      1.00      0.93       138

    accuracy                           0.98      1114
   macro avg       0.94      0.99      0.96      1114
weighted avg       0.98      0.98      0.98      1114



Using TF-IDF accuracy improved from 0.97 to 0.98 % we can also use other classifer like random forest

In [118]:
from sklearn.ensemble import RandomForestClassifier
classifier=RandomForestClassifier()
classifier.fit(X_train,y_train)

In [119]:
#prediction
y_pred=classifier.predict(X_test)

In [120]:
accuracy_score(y_pred,y_test)

0.9829443447037702

In [121]:
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

       False       1.00      0.98      0.99       975
        True       0.88      1.00      0.94       139

    accuracy                           0.98      1114
   macro avg       0.94      0.99      0.96      1114
weighted avg       0.98      0.98      0.98      1114



Better than Multinomial Naive Bayes

### 9. Word2vec Implementation

In [122]:
print(len(X))

5570


In [None]:
import sys
!{sys.executable} -m pip install gensim

- We can use gensim either for using pre trained model or train a model from scratch

In [124]:
import gensim

In [125]:
# we can use pretrained model but we will create from scratch

# import gensim.downloader as api

# wv = api.load('word2vec-google-news-300')

#### 9.1 Using lemmatizar

In [126]:
from nltk.stem import WordNetLemmatizer
lemmatizer=WordNetLemmatizer()

In [127]:
len(corpus)

5570

In [128]:
msg.shape

(5572, 2)

In [129]:
corpus_lem = []
for i in range(0, len(msg)):
    review = re.sub('[^a-zA-Z]', ' ', msg['message'][i])
    review = review.lower()
    review = review.split()    
    # review = [lemmatizer.lemmatize(word) for word in review if not word in stopwords.words('english')]
    review = [lemmatizer.lemmatize(word) for word in review]
    review = ' '.join(review)
    if(review):
        corpus_lem.append(review)
    else:
        print(msg['message'][i])
        msg.drop(i, inplace=True)

645
:) 
:-) :-)


In [130]:
corpus[:20]

['go until jurong point crazi avail onli in bugi n great world la e buffet cine there got amor wat',
 'ok lar joke wif u oni',
 'free entri in 2 a wkli comp to win fa cup final tkt 21st may 2005 text fa to 87121 to receiv entri question std txt rate t c s appli 08452810075over18 s',
 'u dun say so earli hor u c alreadi then say',
 'nah i don t think he goe to usf he live around here though',
 'freemsg hey there darl it s been 3 week s now and no word back i d like some fun you up for it still tb ok xxx std chg to send 1 50 to rcv',
 'even my brother is not like to speak with me they treat me like aid patent',
 'as per your request mell mell oru minnaminungint nurungu vettam ha been set as your callertun for all caller press 9 to copi your friend callertun',
 'winner as a valu network custom you have been select to receivea 900 prize reward to claim call 09061701461 claim code kl341 valid 12 hour onli',
 'had your mobil 11 month or more u r entitl to updat to the latest colour mobil wit

In [131]:
from nltk import sent_tokenize
from gensim.utils import simple_preprocess

In [132]:
corpus[0]

'go until jurong point crazi avail onli in bugi n great world la e buffet cine there got amor wat'

In [133]:
len(corpus)

5570

### 9.2 Lowering and tokenizing the sentences from corpus

In [134]:
words=[]
for index,sent in enumerate(corpus): 
    sent_token=sent_tokenize(sent)
    if sent_token:
        for sentence_token  in sent_token:
            words.append(simple_preprocess(sentence_token ))  # lowering each words
    else:
        print("sent_token",index, corpus[index])


In [135]:
len(words)

5570

All unique words of corpus

In [136]:
words[:20]

[['go',
  'until',
  'jurong',
  'point',
  'crazi',
  'avail',
  'onli',
  'in',
  'bugi',
  'great',
  'world',
  'la',
  'buffet',
  'cine',
  'there',
  'got',
  'amor',
  'wat'],
 ['ok', 'lar', 'joke', 'wif', 'oni'],
 ['free',
  'entri',
  'in',
  'wkli',
  'comp',
  'to',
  'win',
  'fa',
  'cup',
  'final',
  'tkt',
  'st',
  'may',
  'text',
  'fa',
  'to',
  'to',
  'receiv',
  'entri',
  'question',
  'std',
  'txt',
  'rate',
  'appli',
  'over'],
 ['dun', 'say', 'so', 'earli', 'hor', 'alreadi', 'then', 'say'],
 ['nah',
  'don',
  'think',
  'he',
  'goe',
  'to',
  'usf',
  'he',
  'live',
  'around',
  'here',
  'though'],
 ['freemsg',
  'hey',
  'there',
  'darl',
  'it',
  'been',
  'week',
  'now',
  'and',
  'no',
  'word',
  'back',
  'like',
  'some',
  'fun',
  'you',
  'up',
  'for',
  'it',
  'still',
  'tb',
  'ok',
  'xxx',
  'std',
  'chg',
  'to',
  'send',
  'to',
  'rcv'],
 ['even',
  'my',
  'brother',
  'is',
  'not',
  'like',
  'to',
  'speak',
  'with',

In [137]:
import gensim

#### 9.3 Train word2vec from scratch

In [138]:
# This will create the feature and output based on window size and train the model to represent each word as vector dimension
model =gensim.models.Word2Vec(words,window=5,min_count=2)


In [139]:
len(words)

5570

In [140]:
model.wv.index_to_key[:20] # all vocabulary

['to',
 'you',
 'the',
 'it',
 'and',
 'in',
 'is',
 'me',
 'my',
 'for',
 'your',
 'call',
 'that',
 'of',
 'have',
 'on',
 'now',
 'do',
 'are',
 'can']

In [141]:
model.corpus_count #Total vocabulary

5570

In [142]:
model.epochs 

5

In [143]:
model.wv.similar_by_word('prize')

[('guarante', 0.9986960887908936),
 ('claim', 0.9986200928688049),
 ('cash', 0.9973024725914001),
 ('won', 0.9966235160827637),
 ('call', 0.9964011311531067),
 ('award', 0.9962254166603088),
 ('mobil', 0.9961954951286316),
 ('txt', 0.9959759712219238),
 ('valid', 0.9959015846252441),
 ('free', 0.9956607222557068)]

In [144]:
model.wv.similar_by_word('prize')

[('guarante', 0.9986960887908936),
 ('claim', 0.9986200928688049),
 ('cash', 0.9973024725914001),
 ('won', 0.9966235160827637),
 ('call', 0.9964011311531067),
 ('award', 0.9962254166603088),
 ('mobil', 0.9961954951286316),
 ('txt', 0.9959759712219238),
 ('valid', 0.9959015846252441),
 ('free', 0.9956607222557068)]

By Default every word will have 100 dimension based on the algorithm irrespective of the size of the sentences

In [145]:
model.wv['kid'].shape

(100,)

In [146]:
model.wv['kid']

array([-0.02537386,  0.04020482,  0.03166584,  0.00650899,  0.02483618,
       -0.18237102,  0.05610189,  0.192596  , -0.08526697, -0.05595316,
       -0.0710955 , -0.1616685 , -0.06832775,  0.04895108,  0.02452326,
       -0.07114151,  0.02100999, -0.08807635, -0.02583958, -0.23801003,
        0.08337916,  0.0595209 ,  0.06847692, -0.0857102 , -0.04407695,
       -0.02177232, -0.0964246 , -0.03618133, -0.05033445, -0.02138399,
        0.12706777,  0.03418126,  0.02931222, -0.09569122, -0.02930906,
        0.07957319, -0.00145616, -0.06761494, -0.02563707, -0.14360695,
       -0.00593707, -0.10737401, -0.04574689, -0.00294601,  0.09299698,
       -0.01344958, -0.08719249, -0.0125577 ,  0.03386301,  0.04250006,
        0.04450629, -0.08727499, -0.02271529,  0.02027864, -0.06257565,
        0.06349179,  0.07239536, -0.00720087, -0.09593702,  0.0771193 ,
        0.02238348, -0.03031101, -0.00162736, -0.00869755, -0.09688896,
        0.09872153,  0.03876401,  0.1217334 , -0.14175056,  0.13

#### 9.4 Average word2vec

we need to calculate average as input would be a message which can have n no of word each represented by 100 dimension for us input dimension will be fixed that is 100 dimension so taking average

In [None]:
import sys
!{sys.executable} -m pip install numpy tqdm

In [148]:
import numpy as np
from tqdm import tqdm

In [150]:
words[0] # this is my first sentence

['go',
 'until',
 'jurong',
 'point',
 'crazi',
 'avail',
 'onli',
 'in',
 'bugi',
 'great',
 'world',
 'la',
 'buffet',
 'cine',
 'there',
 'got',
 'amor',
 'wat']

Apply average word2vec in entire sentences

tqdm is a Python library for adding dynamic progress bars to loops and iterable processes, providing visual feedback on the progress of operations. It enhances the user experience when dealing with tasks that may take some time to complete

In [149]:
def avg_word2vec(doc):
    return np.mean([model.wv[word] for word in doc if word in model.wv.index_to_key],axis=0)

In [159]:
X_avg=[]
y_filtered = []
for i in tqdm(range(len(words))):
    avg_word = avg_word2vec(words[i])
    if not np.any(np.isnan(avg_word)):
        X_avg.append(avg_word)
        y_filtered.append(y[i])
    else:       
       print(avg_word,i)
       print(words[i])

 25%|██▌       | 1394/5570 [00:00<00:00, 4818.34it/s]

nan 451
['hank', 'lotsli']
nan 783
['beerag']


 46%|████▌     | 2567/5570 [00:00<00:00, 5463.15it/s]

nan 1612
[]


 86%|████████▋ | 4810/5570 [00:00<00:00, 5407.35it/s]

nan 4292
[]
nan 4479
['erutupalam', 'thandiyachu']
nan 5171
[]


100%|██████████| 5570/5570 [00:01<00:00, 4366.04it/s]


In [160]:
words[0]

['go',
 'until',
 'jurong',
 'point',
 'crazi',
 'avail',
 'onli',
 'in',
 'bugi',
 'great',
 'world',
 'la',
 'buffet',
 'cine',
 'there',
 'got',
 'amor',
 'wat']

For the first sentence as above we will get 100 dimension as below after average word2vec which will be input feature

In [None]:
X_avg[0] #First sentence is having 100 dimension this is my input feature

array([-0.04366148,  0.11668544,  0.07114939,  0.01470136,  0.04185769,
       -0.5680695 ,  0.16200317,  0.63781804, -0.28876436, -0.14762105,
       -0.24379629, -0.42528337, -0.23411198,  0.1165799 ,  0.02059597,
       -0.20298767,  0.09633525, -0.26017863, -0.02152736, -0.76236796,
        0.27895218,  0.21326505,  0.20262623, -0.26957974, -0.13466471,
       -0.04359645, -0.28080478, -0.1077365 , -0.12348515,  0.00242672,
        0.3634307 ,  0.09953444,  0.10516998, -0.27315974, -0.07501835,
        0.26285535,  0.03683966, -0.24019669, -0.12278247, -0.4339307 ,
       -0.0358152 , -0.33958405, -0.07211695,  0.03604535,  0.26791713,
       -0.07560895, -0.29079822, -0.04241047,  0.13183649,  0.18005455,
        0.09787506, -0.27541685, -0.05381407,  0.05990165, -0.18866974,
        0.20182894,  0.1733481 , -0.04914084, -0.3553083 ,  0.19725378,
        0.10818958, -0.07198407,  0.003163  ,  0.0194209 , -0.29225093,
        0.30773202,  0.16519828,  0.37706205, -0.43621364,  0.42

In [None]:
X_avg[0].shape

(100,)

In [165]:
import numpy as np
X_avg = [arr for arr in X_avg if not np.any(np.isnan(arr))]  # remove any more nan

In [166]:
print(len(X_avg))
print(len(y_filtered))
print(len(X))
print(len(y))


5564
5564
5570
5570


### 9.5 Train Test Split

In [171]:
# Train Test Split
from sklearn.model_selection import train_test_split
X_avg_train, X_avg_test, y_avg_train, y_avg_test = train_test_split(X_avg, y_filtered, test_size = 0.20, random_state = 0)

In [172]:
from sklearn.ensemble import RandomForestClassifier
classifier=RandomForestClassifier()
classifier.fit(X_avg_train,y_avg_train)

In [173]:
#prediction
y_pred=classifier.predict(X_avg_test)

In [174]:
accuracy_score(y_pred,y_avg_test)

0.9694519317160827

In [175]:
print(classification_report(y_pred,y_avg_test))

              precision    recall  f1-score   support

       False       0.99      0.97      0.98       977
        True       0.82      0.96      0.88       136

    accuracy                           0.97      1113
   macro avg       0.91      0.96      0.93      1113
weighted avg       0.97      0.97      0.97      1113



In [179]:
new_word = []
new_email = "Hey you have won a lottery"
sent_token = sent_tokenize(new_email)

if sent_token:
    for sentence_token in sent_token:
        new_word.extend(simple_preprocess(sentence_token))  # extending the list instead of appending

print(new_word)


['hey', 'you', 'have', 'won', 'lottery']


In [181]:
# This will create the feature and output based on window size and train the model to represent each word as vector dimension
model1 =gensim.models.Word2Vec(new_word,window=5,min_count=2)

In [182]:
model1


<gensim.models.word2vec.Word2Vec at 0x23fc9a84e90>