In [26]:
import pandas as pd
import numpy as np
messages=pd.read_csv('smsspamcollection/SMSSpamCollection',
                    sep='\t',names=["label","message"])
print(messages.shape)
messages.head()



(5572, 2)


Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [27]:
## data cleaninig and pre-processing
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
sbs=SnowballStemmer('english')
stopwords_eng=stopwords.words('english')

In [28]:
from nltk.tokenize import word_tokenize
import re
def process_text(msg):
    msg=msg.lower()
    msg = re.sub(r'[^a-z0-9\s]', '', msg)
    words=word_tokenize(msg)
    words=[sbs.stem(word) for word in words if word not in stopwords_eng]
    return " ".join(words)

In [29]:
### using lemmatization instead of stemming
from nltk.stem import WordNetLemmatizer as wnl
wnl=wnl()
def process_text_lemmatization(msg):
    msg=msg.lower()
    msg = re.sub(r'[^a-z0-9\s]', '', msg)
    words=word_tokenize(msg)
    words=[wnl.lemmatize(word) for word in words if word not in stopwords_eng]
    return " ".join(words)

In [30]:
corpus=[]
for msg in messages['message']:
    sent=process_text(msg)
    corpus.append(sent)

In [31]:
corpus

['go jurong point crazi avail bugi n great world la e buffet cine got amor wat',
 'ok lar joke wif u oni',
 'free entri 2 wkli comp win fa cup final tkts 21st may 2005 text fa 87121 receiv entri questionstd txt ratetc appli 08452810075over18',
 'u dun say earli hor u c alreadi say',
 'nah dont think goe usf live around though',
 'freemsg hey darl 3 week word back id like fun still tb ok xxx std chgs send 150 rcv',
 'even brother like speak treat like aid patent',
 'per request mell mell oru minnaminungint nurungu vettam set callertun caller press 9 copi friend callertun',
 'winner valu network custom select receivea 900 prize reward claim call 09061701461 claim code kl341 valid 12 hour',
 'mobil 11 month u r entitl updat latest colour mobil camera free call mobil updat co free 08002986030',
 'im gon na home soon dont want talk stuff anymor tonight k ive cri enough today',
 'six chanc win cash 100 20000 pound txt csh11 send 87575 cost 150pday 6day 16 tsandc appli repli hl 4 info',
 'urg

In [32]:
corpus_lemmatized=[]
for msg in messages['message']:
    sent=process_text_lemmatization(msg)
    corpus_lemmatized.append(sent)

In [33]:
corpus_lemmatized

['go jurong point crazy available bugis n great world la e buffet cine got amore wat',
 'ok lar joking wif u oni',
 'free entry 2 wkly comp win fa cup final tkts 21st may 2005 text fa 87121 receive entry questionstd txt ratetcs apply 08452810075over18s',
 'u dun say early hor u c already say',
 'nah dont think go usf life around though',
 'freemsg hey darling 3 week word back id like fun still tb ok xxx std chgs send 150 rcv',
 'even brother like speak treat like aid patent',
 'per request melle melle oru minnaminunginte nurungu vettam set callertune caller press 9 copy friend callertune',
 'winner valued network customer selected receivea 900 prize reward claim call 09061701461 claim code kl341 valid 12 hour',
 'mobile 11 month u r entitled update latest colour mobile camera free call mobile update co free 08002986030',
 'im gon na home soon dont want talk stuff anymore tonight k ive cried enough today',
 'six chance win cash 100 20000 pound txt csh11 send 87575 cost 150pday 6days 16 

### Bag of words

In [62]:
### create bag of words model
from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer(binary=True,max_features=100)

In [63]:
X=cv.fit_transform(corpus_lemmatized).toarray()
X.shape

(5572, 100)

### N-grams

In [65]:
sorted(cv.vocabulary_.items(), key=lambda item: item[1],reverse=True)

[('yes', np.int64(99)),
 ('year', np.int64(98)),
 ('yeah', np.int64(97)),
 ('work', np.int64(96)),
 ('win', np.int64(95)),
 ('well', np.int64(94)),
 ('week', np.int64(93)),
 ('way', np.int64(92)),
 ('wat', np.int64(91)),
 ('want', np.int64(90)),
 ('wan', np.int64(89)),
 ('ur', np.int64(88)),
 ('txt', np.int64(87)),
 ('tomorrow', np.int64(86)),
 ('today', np.int64(85)),
 ('time', np.int64(84)),
 ('think', np.int64(83)),
 ('thing', np.int64(82)),
 ('thats', np.int64(81)),
 ('thanks', np.int64(80)),
 ('text', np.int64(79)),
 ('tell', np.int64(78)),
 ('take', np.int64(77)),
 ('stop', np.int64(76)),
 ('still', np.int64(75)),
 ('sorry', np.int64(74)),
 ('service', np.int64(73)),
 ('send', np.int64(72)),
 ('see', np.int64(71)),
 ('say', np.int64(70)),
 ('said', np.int64(69)),
 ('right', np.int64(68)),
 ('reply', np.int64(67)),
 ('really', np.int64(66)),
 ('prize', np.int64(65)),
 ('pls', np.int64(64)),
 ('please', np.int64(63)),
 ('pick', np.int64(62)),
 ('phone', np.int64(61)),
 ('one', np.i

In [66]:
#using only bigram and trigram
cv1=CountVectorizer(binary=True,max_features=100,ngram_range=(2,3))
X=cv1.fit_transform(corpus_lemmatized).toarray()

In [68]:
sorted(cv1.vocabulary_.items(), key=lambda item: item[1],reverse=True)

[('week txt', np.int64(99)),
 ('wat time', np.int64(98)),
 ('want go', np.int64(97)),
 ('want come', np.int64(96)),
 ('wan na', np.int64(95)),
 ('valid 12hrs', np.int64(94)),
 ('urgent mobile', np.int64(93)),
 ('ur mob', np.int64(92)),
 ('ur mate', np.int64(91)),
 ('ur friend', np.int64(90)),
 ('ur awarded', np.int64(89)),
 ('txt stop', np.int64(88)),
 ('txt nokia', np.int64(87)),
 ('trying contact', np.int64(86)),
 ('tell ur', np.int64(85)),
 ('take care', np.int64(84)),
 ('sorry ill call', np.int64(83)),
 ('sorry ill', np.int64(82)),
 ('show 800', np.int64(81)),
 ('send stop', np.int64(80)),
 ('send message', np.int64(79)),
 ('selected receive', np.int64(78)),
 ('sae tc', np.int64(77)),
 ('reply call 08000930705', np.int64(76)),
 ('reply call', np.int64(75)),
 ('prize guaranteed call', np.int64(74)),
 ('prize guaranteed', np.int64(73)),
 ('prize claim', np.int64(72)),
 ('private 2003 account', np.int64(71)),
 ('private 2003', np.int64(70)),
 ('po box', np.int64(69)),
 ('pls send mess

In [71]:
cv2=CountVectorizer(binary=True,max_features=500,ngram_range=(1,3))
X=cv2.fit_transform(corpus_lemmatized).toarray()

In [73]:
cv2.vocabulary_

{'go': np.int64(164),
 'point': np.int64(336),
 'great': np.int64(174),
 'world': np.int64(484),
 'got': np.int64(173),
 'wat': np.int64(462),
 'ok': np.int64(305),
 'lar': np.int64(225),
 'wif': np.int64(474),
 'free': np.int64(151),
 'entry': np.int64(133),
 'win': np.int64(476),
 'may': np.int64(263),
 'text': np.int64(415),
 'receive': np.int64(355),
 'txt': np.int64(446),
 'apply': np.int64(34),
 'dun': np.int64(123),
 'say': np.int64(368),
 'early': np.int64(125),
 'already': np.int64(25),
 'dont': np.int64(115),
 'think': np.int64(422),
 'life': np.int64(239),
 'around': np.int64(36),
 'though': np.int64(424),
 'hey': np.int64(195),
 'week': np.int64(466),
 'word': np.int64(481),
 'back': np.int64(46),
 'id': np.int64(206),
 'like': np.int64(240),
 'fun': np.int64(157),
 'still': np.int64(403),
 'xxx': np.int64(489),
 'send': np.int64(374),
 '150': np.int64(4),
 'even': np.int64(135),
 'brother': np.int64(61),
 'speak': np.int64(399),
 'per': np.int64(319),
 'set': np.int64(378)