# Bag Of Words (BoW) Model

In [44]:
import pandas as pd

messages = pd.read_csv('sms-spam\spam.csv', encoding='latin-1', names=['label', 'message', "hh", "hj", "uuu"])

In [21]:
messages.head()

Unnamed: 0,label,message,hh,hj,uuu
0,v1,v2,,,
1,ham,"Go until jurong point, crazy.. Available only ...",,,
2,ham,Ok lar... Joking wif u oni...,,,
3,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
4,ham,U dun say so early hor... U c already then say...,,,


In [45]:
messages = messages[['label', 'message']]
messages.head()

Unnamed: 0,label,message
0,v1,v2
1,ham,"Go until jurong point, crazy.. Available only ..."
2,ham,Ok lar... Joking wif u oni...
3,spam,Free entry in 2 a wkly comp to win FA Cup fina...
4,ham,U dun say so early hor... U c already then say...


In [46]:
## Data cleaning and processing

import nltk
import re

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Users\chandan
[nltk_data]     kumar/nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [24]:
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
ps = PorterStemmer()

In [28]:
corpus = []

for i in range(len(messages)):
    review = re.sub('[^a-zA-Z]', ' ', messages['message'][i])
    review = review.lower()
    review = review.split()
    review = [ps.stem(word) for word in review if word not in stopwords.words('english')]
    review = " ".join(review)
    corpus.append(review)

In [67]:
## for try to limitizer 
from nltk.stem import WordNetLemmatizer

lemitizer = WordNetLemmatizer()

corpus = []

for i in range(len(messages)):
    review = re.sub('[^a-zA-Z]', ' ', messages['message'][i])
    review = review.lower()
    review = review.split()
    review = [lemitizer.lemmatize(word, pos='v') for word in review if word not in stopwords.words('english')]
    review = " ".join(review)
    corpus.append(review)

In [68]:
corpus

['v',
 'go jurong point crazy available bugis n great world la e buffet cine get amore wat',
 'ok lar joke wif u oni',
 'free entry wkly comp win fa cup final tkts st may text fa receive entry question std txt rate c apply',
 'u dun say early hor u c already say',
 'nah think go usf live around though',
 'freemsg hey darling week word back like fun still tb ok xxx std chgs send rcv',
 'even brother like speak treat like aid patent',
 'per request melle melle oru minnaminunginte nurungu vettam set callertune callers press copy friends callertune',
 'winner value network customer select receivea prize reward claim call claim code kl valid hours',
 'mobile months u r entitle update latest colour mobiles camera free call mobile update co free',
 'gonna home soon want talk stuff anymore tonight k cry enough today',
 'six chance win cash pound txt csh send cost p day days tsandcs apply reply hl info',
 'urgent week free membership prize jackpot txt word claim c www dbuk net lccltd pobox ldnw

In [69]:
len(corpus)

5573

In [None]:
## create bag of words
from sklearn.feature_extraction.text import CountVectorizer


cv = CountVectorizer(max_features=3000)

In [78]:
## also apply binary = True 
cv = CountVectorizer(max_features=100, binary=True)

In [79]:
x = cv.fit_transform(corpus).toarray()

In [80]:
x

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], shape=(5573, 100))

In [81]:
x.shape

(5573, 100)

## N Gram

In [82]:
cv.vocabulary_

{'go': np.int64(24),
 'great': np.int64(26),
 'get': np.int64(22),
 'wat': np.int64(91),
 'ok': np.int64(58),
 'free': np.int64(21),
 'win': np.int64(95),
 'text': np.int64(78),
 'txt': np.int64(85),
 'say': np.int64(68),
 'already': np.int64(0),
 'think': np.int64(80),
 'hey': np.int64(29),
 'week': np.int64(93),
 'back': np.int64(4),
 'like': np.int64(40),
 'still': np.int64(74),
 'send': np.int64(70),
 'even': np.int64(18),
 'prize': np.int64(64),
 'claim': np.int64(9),
 'call': np.int64(6),
 'mobile': np.int64(49),
 'home': np.int64(31),
 'want': np.int64(90),
 'today': np.int64(82),
 'cash': np.int64(8),
 'day': np.int64(15),
 'reply': np.int64(66),
 'www': np.int64(97),
 'right': np.int64(67),
 'thank': np.int64(79),
 'take': np.int64(76),
 'time': np.int64(81),
 'use': np.int64(88),
 'message': np.int64(47),
 'com': np.int64(10),
 'oh': np.int64(57),
 'yes': np.int64(99),
 'make': np.int64(45),
 'way': np.int64(92),
 'feel': np.int64(19),
 'dont': np.int64(17),
 'miss': np.int64

In [97]:
## try n-gram with BOW model with different different scenario

## No change, it is only unigram
# cv = CountVectorizer(max_features=100, binary=True, ngram_range=(1, 1))  

## try unigram and bigram
# cv = CountVectorizer(max_features=100, ngram_range=(1, 2))
cv = CountVectorizer(max_features=200, ngram_range=(1, 2))  ## if not see any bigram then try to increase max feature

## try unigram, bigram and trigram
cv = CountVectorizer(max_features=500, ngram_range=(1, 3))

In [99]:
x = cv.fit_transform(corpus).toarray()

In [100]:
cv.vocabulary_

{'go': np.int64(161),
 'point': np.int64(332),
 'great': np.int64(168),
 'world': np.int64(483),
 'get': np.int64(157),
 'wat': np.int64(464),
 'ok': np.int64(298),
 'lar': np.int64(219),
 'wif': np.int64(473),
 'free': np.int64(148),
 'entry': np.int64(132),
 'win': np.int64(475),
 'st': np.int64(399),
 'may': np.int64(254),
 'text': np.int64(414),
 'receive': np.int64(353),
 'question': np.int64(345),
 'txt': np.int64(446),
 'rate': np.int64(347),
 'apply': np.int64(19),
 'dun': np.int64(120),
 'say': np.int64(365),
 'early': np.int64(122),
 'already': np.int64(10),
 'think': np.int64(422),
 'live': np.int64(235),
 'around': np.int64(21),
 'though': np.int64(424),
 'hey': np.int64(189),
 'week': np.int64(467),
 'word': np.int64(481),
 'back': np.int64(30),
 'like': np.int64(231),
 'fun': np.int64(155),
 'still': np.int64(402),
 'xxx': np.int64(490),
 'send': np.int64(373),
 'even': np.int64(134),
 'brother': np.int64(47),
 'speak': np.int64(396),
 'per': np.int64(314),
 'set': np.int

In [102]:
## try bigram and trigram
cv = CountVectorizer(max_features=500, ngram_range=(2, 3))
x = cv.fit_transform(corpus).toarray()
cv.vocabulary_

{'free entry': np.int64(130),
 'think go': np.int64(415),
 'claim call': np.int64(61),
 'call claim': np.int64(25),
 'claim code': np.int64(62),
 'call claim code': np.int64(26),
 'update latest': np.int64(441),
 'latest colour': np.int64(217),
 'free call': np.int64(127),
 'call mobile': np.int64(38),
 'chance win': np.int64(59),
 'win cash': np.int64(484),
 'chance win cash': np.int64(60),
 'txt word': np.int64(434),
 'dont miss': np.int64(107),
 'let know': np.int64(220),
 'feel like': np.int64(123),
 'yeah get': np.int64(497),
 'reply yes': np.int64(347),
 'go home': np.int64(159),
 'anything lor': np.int64(5),
 'call reply': np.int64(44),
 'nokia mobile': np.int64(286),
 'mobile free': np.int64(264),
 'free camcorder': np.int64(128),
 'please call': np.int64(311),
 'delivery tomorrow': np.int64(103),
 'lt gt': np.int64(236),
 'miss call': np.int64(260),
 'want go': np.int64(472),
 'first time': np.int64(126),
 'like lt': np.int64(222),
 'like lt gt': np.int64(223),
 'sms ac': np.i

In [104]:
## try only trigram
cv = CountVectorizer(max_features=100, ngram_range=(3, 3))
x = cv.fit_transform(corpus).toarray()
cv.vocabulary_

{'like lt gt': np.int64(43),
 'sorry call later': np.int64(81),
 'please call customer': np.int64(67),
 'call customer service': np.int64(7),
 'customer service representative': np.int64(22),
 'guarantee cash prize': np.int64(34),
 'draw show prize': np.int64(23),
 'show prize guarantee': np.int64(79),
 'prize guarantee call': np.int64(72),
 'specially select receive': np.int64(83),
 'speak live operator': np.int64(82),
 'live operator claim': np.int64(45),
 'private account statement': np.int64(70),
 'account statement show': np.int64(0),
 'call identifier code': np.int64(8),
 'identifier code expire': np.int64(40),
 'bonus caller prize': np.int64(4),
 'select receive award': np.int64(78),
 'match please call': np.int64(56),
 'urgent try contact': np.int64(97),
 'lt decimal gt': np.int64(47),
 'secret admirer look': np.int64(77),
 'admirer look make': np.int64(1),
 'look make contact': np.int64(46),
 'make contact find': np.int64(55),
 'contact find reveal': np.int64(20),
 'find revea