## Spam Ham Classification Project using BOW and TF-IDF

    1. Data Preprocessing and cleaning
    2. Train Test Split
    3. BOW, TF-IDF model creation  (sentence ---> vector) (preventing data leakage)
    4. Train model

In [22]:
## read data
import pandas as pd

messages = pd.read_csv('sms-spam/spam.csv')

In [23]:
messages = messages.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'])

In [25]:
messages.columns = ['label', 'message']

In [26]:
messages

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will �_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [27]:
import nltk
import re
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Users\chandan
[nltk_data]     kumar/nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [28]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

lemitizer = WordNetLemmatizer()

In [29]:
corpus = []

for i in range(len(messages)):

    review = re.sub('[^a-zA-Z]', ' ', messages['message'][i])
    review = review.lower()
    review = review.split()
    review = [lemitizer.lemmatize(word) for word in review if word not in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)

In [30]:
corpus

['go jurong point crazy available bugis n great world la e buffet cine got amore wat',
 'ok lar joking wif u oni',
 'free entry wkly comp win fa cup final tkts st may text fa receive entry question std txt rate c apply',
 'u dun say early hor u c already say',
 'nah think go usf life around though',
 'freemsg hey darling week word back like fun still tb ok xxx std chgs send rcv',
 'even brother like speak treat like aid patent',
 'per request melle melle oru minnaminunginte nurungu vettam set callertune caller press copy friend callertune',
 'winner valued network customer selected receivea prize reward claim call claim code kl valid hour',
 'mobile month u r entitled update latest colour mobile camera free call mobile update co free',
 'gonna home soon want talk stuff anymore tonight k cried enough today',
 'six chance win cash pound txt csh send cost p day day tsandcs apply reply hl info',
 'urgent week free membership prize jackpot txt word claim c www dbuk net lccltd pobox ldnw rw'

In [31]:
## output feature

y = pd.get_dummies(messages['label'])
y = y.iloc[:, 0].values

In [32]:
y

array([ True,  True, False, ...,  True,  True,  True], shape=(5572,))

In [33]:
## split train test dataset
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(corpus, y, test_size=0.20)


In [34]:
x_train

['u find time bus coz need sort stuff',
 'spoon okay',
 'get cash together text jason',
 'ur cash balance currently pound maximize ur cash send collect p msg cc po box tcr w',
 'tf p',
 'lol u drunkard hair moment yeah still tonight wats plan',
 'ok lor buy wat',
 'ugh u apologize admit u wrong ask take u back',
 'ok ur typical reply',
 'hope great new semester wish best made greatness',
 'oh grand bit party mention cover charge probably first come first served',
 'becoz lt gt jan whn al post ofice holiday cn go fr post ofice got duffer',
 'hi customer loyalty offer new nokia mobile txtauction txt word start get ctxt tc p mtmsg',
 'k sent',
 'wen ur lovable bcums angry wid u dnt take seriously coz angry childish n true way showing deep affection care n luv kettoda manda nice day da',
 'today accept day u accept brother sister lover dear best clos lvblefrnd jstfrnd cutefrnd lifpartnr belovd swtheart bstfrnd rply mean enemy',
 'ummmmmaah many many happy return day dear sweet heart happy 

In [35]:
len(x_train), len(y_train)

(4457, 4457)

### Create Bag of Words

In [36]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(max_features=2500, ngram_range=(1,2))


In [37]:
x_train = cv.fit_transform(x_train).toarray()
x_test = cv.transform(x_test).toarray()

In [39]:
x_train

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], shape=(4457, 2500))

In [40]:
cv.vocabulary_

{'find': np.int64(703),
 'time': np.int64(2114),
 'bus': np.int64(225),
 'coz': np.int64(449),
 'need': np.int64(1404),
 'sort': np.int64(1924),
 'stuff': np.int64(1992),
 'okay': np.int64(1482),
 'get': np.int64(787),
 'cash': np.int64(299),
 'together': np.int64(2137),
 'text': np.int64(2068),
 'ur': np.int64(2230),
 'balance': np.int64(143),
 'currently': np.int64(465),
 'pound': np.int64(1616),
 'maximize': np.int64(1271),
 'send': np.int64(1811),
 'collect': np.int64(377),
 'msg': np.int64(1370),
 'cc': np.int64(309),
 'po': np.int64(1598),
 'box': np.int64(204),
 'ur cash': np.int64(2232),
 'cash balance': np.int64(301),
 'balance currently': np.int64(144),
 'currently pound': np.int64(466),
 'pound maximize': np.int64(1617),
 'maximize ur': np.int64(1272),
 'cash send': np.int64(304),
 'po box': np.int64(1599),
 'lol': np.int64(1192),
 'hair': np.int64(899),
 'moment': np.int64(1347),
 'yeah': np.int64(2467),
 'still': np.int64(1965),
 'tonight': np.int64(2147),
 'wats': np.int6

In [12]:
import numpy as np
np.set_printoptions(edgeitems=30, linewidth=100000, formatter=dict(float=lambda x: '%.3g'% x))

In [41]:
x_train

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], shape=(4457, 2500))

### Navie Bayes Classifier

In [42]:
from sklearn.naive_bayes import MultinomialNB

In [43]:
spam_detect_model = MultinomialNB().fit(x_train, y_train)

In [44]:
spam_detect_model

0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,True
,class_prior,


In [45]:
y_pred = spam_detect_model.predict(x_test)

In [46]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

print(accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.9820627802690582
[[141   7]
 [ 13 954]]
              precision    recall  f1-score   support

       False       0.92      0.95      0.93       148
        True       0.99      0.99      0.99       967

    accuracy                           0.98      1115
   macro avg       0.95      0.97      0.96      1115
weighted avg       0.98      0.98      0.98      1115



### TF-IDF model apply

In [47]:
## first train test split then apply model 
## train test split

from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(corpus, y, test_size=0.20)

In [49]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(max_features=2500, ngram_range=(1,2))

In [50]:
x_train = tfidf.fit_transform(x_train).toarray()
x_test = tfidf.transform(x_test).toarray()

In [51]:
x_train

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], shape=(4457, 2500))

In [52]:
tfidf.vocabulary_

{'thanks': np.int64(2105),
 'purchase': np.int64(1670),
 'video': np.int64(2290),
 'charged': np.int64(322),
 'think': np.int64(2113),
 'better': np.int64(172),
 'send': np.int64(1834),
 'freemsg': np.int64(743),
 'record': np.int64(1721),
 'may': np.int64(1284),
 'entitled': np.int64(613),
 'pound': np.int64(1635),
 'claim': np.int64(343),
 'free': np.int64(728),
 'reply': np.int64(1738),
 'yes': np.int64(2481),
 'msg': np.int64(1383),
 'opt': np.int64(1512),
 'text': np.int64(2089),
 'stop': np.int64(1991),
 'free reply': np.int64(740),
 'reply yes': np.int64(1743),
 'text stop': np.int64(2095),
 'co': np.int64(362),
 'daddy': np.int64(471),
 'time': np.int64(2133),
 'wat': np.int64(2337),
 'fetch': np.int64(682),
 'mah': np.int64(1252),
 'wat time': np.int64(2339),
 'prize': np.int64(1654),
 'go': np.int64(809),
 'another': np.int64(66),
 'customer': np.int64(463),
 'www': np.int64(2453),
 'min': np.int64(1315),
 'ltd': np.int64(1239),
 'suite': np.int64(2023),
 'london': np.int64(1

In [53]:
from sklearn.naive_bayes import MultinomialNB

spam_tfidf_model = MultinomialNB().fit(x_train, y_train)

In [54]:
spam_tfidf_model

0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,True
,class_prior,


In [55]:
y_pred = spam_tfidf_model.predict(x_test)

In [56]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

print(accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.9775784753363229
[[119  24]
 [  1 971]]
              precision    recall  f1-score   support

       False       0.99      0.83      0.90       143
        True       0.98      1.00      0.99       972

    accuracy                           0.98      1115
   macro avg       0.98      0.92      0.95      1115
weighted avg       0.98      0.98      0.98      1115

