In [8]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
import re
import numpy as np
from nltk.stem import WordNetLemmatizer

In [3]:
# Read the spam/ham CSV, keep only the two columns used downstream, and
# rename them from the file's v1/v2 headers to descriptive names.
messages = (
    pd.read_csv('../data/spam.csv', sep=',', encoding='latin-1')
    .loc[:, ['v1', 'v2']]
    .rename(columns={'v1': 'label', 'v2': 'messages'})
)
messages

Unnamed: 0,label,messages
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [6]:
# Lemmatization: build `lemma_corpus`, one cleaned, space-joined string per message.

# Anything that is not an ASCII letter gets replaced by a space; compile the pattern once.
non_letter_pattern = re.compile('[^a-zA-Z]')

# The stop-word list does not change between words, so fetch it once instead of
# calling stopwords.words('english') for every token; a set makes the membership
# test constant-time.
english_stopwords = set(stopwords.words('english'))

# create a lemmatizer object
lemmatizer = WordNetLemmatizer()
lemma_corpus = []

# Iterate over the column values directly. The previous messages['messages'][i]
# form is a label lookup, which only works while the index is exactly 0..n-1.
for message in messages['messages']:
    # strip non-letters, convert to lower case, split into words
    words = non_letter_pattern.sub(' ', message).lower().split()

    # drop stop words, then lemmatize the remaining words as verbs (pos='v')
    lemmas = [
        lemmatizer.lemmatize(word, pos='v')
        for word in words
        if word not in english_stopwords
    ]

    lemma_corpus.append(' '.join(lemmas))

In [None]:
# Bag of words over the lemmatized corpus - unigrams only.
# binary=True records presence/absence of a term rather than its count;
# max_features=2500 caps the vocabulary size.
lemma_cv = CountVectorizer(ngram_range=(1, 1), binary=True, max_features=2500)
unigram_matrix = lemma_cv.fit_transform(lemma_corpus)
X = unigram_matrix.toarray()  # dense array of the document-term matrix
lemma_cv.vocabulary_

{'go': np.int64(870),
 'point': np.int64(1574),
 'crazy': np.int64(493),
 'available': np.int64(186),
 'bugis': np.int64(319),
 'great': np.int64(898),
 'world': np.int64(2427),
 'la': np.int64(1137),
 'cine': np.int64(413),
 'get': np.int64(852),
 'wat': np.int64(2334),
 'ok': np.int64(1468),
 'lar': np.int64(1145),
 'joke': np.int64(1103),
 'wif': np.int64(2385),
 'oni': np.int64(1477),
 'free': np.int64(781),
 'entry': np.int64(656),
 'wkly': np.int64(2413),
 'comp': np.int64(446),
 'win': np.int64(2391),
 'cup': np.int64(506),
 'final': np.int64(739),
 'tkts': np.int64(2116),
 'st': np.int64(1936),
 'may': np.int64(1292),
 'text': np.int64(2053),
 'receive': np.int64(1681),
 'question': np.int64(1644),
 'std': np.int64(1947),
 'txt': np.int64(2199),
 'rate': np.int64(1662),
 'apply': np.int64(132),
 'dun': np.int64(618),
 'say': np.int64(1772),
 'early': np.int64(622),
 'already': np.int64(85),
 'nah': np.int64(1402),
 'think': np.int64(2078),
 'usf': np.int64(2262),
 'live': np.in

In [None]:
# Bag of words over the lemmatized corpus - BIGRAMS ONLY.
# ngram_range=(2, 2) keeps two-word sequences and excludes unigrams;
# use ngram_range=(1, 2) instead to get unigrams and bigrams together.
# binary=True records presence/absence of a term rather than its count;
# max_features=2500 caps the vocabulary size.
lemma_cv = CountVectorizer(max_features=2500, binary=True, ngram_range=(2, 2))
X = lemma_cv.fit_transform(lemma_corpus).toarray()
lemma_cv.vocabulary_

{'ok lar': np.int64(1425),
 'free entry': np.int64(612),
 'entry wkly': np.int64(539),
 'wkly comp': np.int64(2417),
 'cup final': np.int64(399),
 'tkts st': np.int64(2119),
 'std txt': np.int64(1957),
 'txt rate': np.int64(2196),
 'rate apply': np.int64(1702),
 'already say': np.int64(27),
 'think go': np.int64(2077),
 'freemsg hey': np.int64(633),
 'like fun': np.int64(1086),
 'per request': np.int64(1479),
 'request melle': np.int64(1757),
 'melle melle': np.int64(1225),
 'melle oru': np.int64(1226),
 'oru minnaminunginte': np.int64(1457),
 'minnaminunginte nurungu': np.int64(1248),
 'nurungu vettam': np.int64(1404),
 'vettam set': np.int64(2285),
 'set callertune': np.int64(1851),
 'callertune callers': np.int64(234),
 'callers press': np.int64(233),
 'press copy': np.int64(1659),
 'copy friends': np.int64(368),
 'friends callertune': np.int64(639),
 'winner value': np.int64(2405),
 'value network': np.int64(2283),
 'network customer': np.int64(1354),
 'prize reward': np.int64(1680

In [17]:
# Bag of words over the lemmatized corpus - TRIGRAMS ONLY.
# ngram_range=(3, 3) keeps three-word sequences and excludes both unigrams
# and bigrams; use ngram_range=(1, 3) instead to include all three sizes.
# binary=True records presence/absence of a term rather than its count;
# max_features=2500 caps the vocabulary size.
lemma_cv = CountVectorizer(max_features=2500, binary=True, ngram_range=(3, 3))
X = lemma_cv.fit_transform(lemma_corpus).toarray()
lemma_cv.vocabulary_

{'free entry wkly': np.int64(496),
 'may text fa': np.int64(998),
 'receive entry question': np.int64(1676),
 'question std txt': np.int64(1629),
 'std txt rate': np.int64(1990),
 'txt rate apply': np.int64(2180),
 'per request melle': np.int64(1439),
 'request melle melle': np.int64(1731),
 'melle melle oru': np.int64(1015),
 'melle oru minnaminunginte': np.int64(1016),
 'oru minnaminunginte nurungu': np.int64(1392),
 'minnaminunginte nurungu vettam': np.int64(1054),
 'nurungu vettam set': np.int64(1330),
 'vettam set callertune': np.int64(2335),
 'set callertune callers': np.int64(1899),
 'callertune callers press': np.int64(245),
 'callers press copy': np.int64(244),
 'press copy friends': np.int64(1585),
 'copy friends callertune': np.int64(353),
 'winner value network': np.int64(2440),
 'value network customer': np.int64(2327),
 'network customer select': np.int64(1240),
 'select receivea prize': np.int64(1844),
 'receivea prize reward': np.int64(1680),
 'prize reward claim': np.i