In [40]:
import os
import pandas as pd
import numpy as np
import nltk, re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report

In [2]:
df = pd.read_csv(r"E:\NLP Project\SMS Spam Collection Dataset\spam.csv", encoding = 'ISO-8859-1')
df["v2"]

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                Will Ì_ b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: v2, Length: 5572, dtype: object

In [3]:
df.shape

(5572, 5)

In [4]:
df["v2"]

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                Will Ì_ b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: v2, Length: 5572, dtype: object

### Remove Punctuation

In [5]:
translator = str.maketrans("", "", string.punctuation)

In [6]:
df["v2"] = df["v2"].map(lambda text : text.translate(translator) if isinstance(text, str) else text)

In [7]:
df['v2']

0       Go until jurong point crazy Available only in ...
1                                 Ok lar Joking wif u oni
2       Free entry in 2 a wkly comp to win FA Cup fina...
3             U dun say so early hor U c already then say
4       Nah I dont think he goes to usf he lives aroun...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                  Will Ì b going to esplanade fr home
5569    Pity  was in mood for that Soany other suggest...
5570    The guy did some bitching but I acted like id ...
5571                            Rofl Its true to its name
Name: v2, Length: 5572, dtype: object

### tokenixe string/create list of string including punctuation and make lowercase

### For example:
###    import nltk
### nltk.download('punkt')

### Define a piece of text
 text = "The quick brown fox jumps over the lazy  dog"

### Tokenize the text
 tokens = nltk.word_tokenize(text)

### Print the tokens
 print(tokens)
### Output: ['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']

In [8]:
df["v2"] = df["v2"].map(lambda x : x.lower()).apply(lambda text : word_tokenize(text) if isinstance(text, str) else text)

In [9]:
df["v2"]

0       [go, until, jurong, point, crazy, available, o...
1                          [ok, lar, joking, wif, u, oni]
2       [free, entry, in, 2, a, wkly, comp, to, win, f...
3       [u, dun, say, so, early, hor, u, c, already, t...
4       [nah, i, dont, think, he, goes, to, usf, he, l...
                              ...                        
5567    [this, is, the, 2nd, time, we, have, tried, 2,...
5568         [will, ì, b, going, to, esplanade, fr, home]
5569    [pity, was, in, mood, for, that, soany, other,...
5570    [the, guy, did, some, bitching, but, i, acted,...
5571                     [rofl, its, true, to, its, name]
Name: v2, Length: 5572, dtype: object

### Remove stop words

In [10]:
## creating stopword object
stop_words = set(stopwords.words("english"))

In [11]:
df["v2"] = df["v2"].map(lambda text : [word for word in text if word.lower() not in stop_words] if isinstance(text, list) else text)

In [12]:
df["v2"]

0       [go, jurong, point, crazy, available, bugis, n...
1                          [ok, lar, joking, wif, u, oni]
2       [free, entry, 2, wkly, comp, win, fa, cup, fin...
3           [u, dun, say, early, hor, u, c, already, say]
4       [nah, dont, think, goes, usf, lives, around, t...
                              ...                        
5567    [2nd, time, tried, 2, contact, u, u, å£750, po...
5568                   [ì, b, going, esplanade, fr, home]
5569                     [pity, mood, soany, suggestions]
5570    [guy, bitching, acted, like, id, interested, b...
5571                                   [rofl, true, name]
Name: v2, Length: 5572, dtype: object

In [13]:
### join tokenize words

### The Porter stemming algorithm is a widely used algorithm for stemming in natural language processing (NLP). The algorithm reduces words to their base or root form, by removing suffixes from the words. The nltk library in Python provides an implementation of the Porter stemming algorithm through the PorterStemmer class.

In [14]:
ps = PorterStemmer()

In [15]:
df["v2"] = df["v2"].apply(lambda text: " ".join([ps.stem(word) for word in text]))

In [16]:
df["v2"]

0       go jurong point crazi avail bugi n great world...
1                                   ok lar joke wif u oni
2       free entri 2 wkli comp win fa cup final tkt 21...
3                     u dun say earli hor u c alreadi say
4               nah dont think goe usf live around though
                              ...                        
5567    2nd time tri 2 contact u u å£750 pound prize 2...
5568                              ì b go esplanad fr home
5569                              piti mood soani suggest
5570    guy bitch act like id interest buy someth els ...
5571                                       rofl true name
Name: v2, Length: 5572, dtype: object

### TF-IDF stands for Term Frequency-Inverse Document Frequency. It is a widely used technique in natural language processing (NLP) for converting textual data into numerical form.

The basic idea behind TF-IDF is to give weight to words that are more important in a given document and less important in the corpus as a whole. This is done by calculating two measures: term frequency (TF) and inverse document frequency (IDF).

In [17]:
tfidf = TfidfVectorizer()

In [18]:
x_data = tfidf.fit_transform(df["v2"].values).toarray()

In [19]:
x_data

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [20]:
y_data = df.v1

In [21]:
y_data

0        ham
1        ham
2       spam
3        ham
4        ham
        ... 
5567    spam
5568     ham
5569     ham
5570     ham
5571     ham
Name: v1, Length: 5572, dtype: object

In [23]:
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size = 0.2)

In [26]:
model = MultinomialNB(alpha = 1.0, fit_prior = True)

In [27]:
model.fit(x_train, y_train)

MultinomialNB()

In [28]:
y_pred = model.predict(x_test)

In [31]:
accuracy_score(y_pred, y_test)

0.9641255605381166

In [42]:
print(classification_report(y_pred, y_test))

              precision    recall  f1-score   support

         ham       1.00      0.96      0.98      1011
        spam       0.72      1.00      0.84       104

    accuracy                           0.96      1115
   macro avg       0.86      0.98      0.91      1115
weighted avg       0.97      0.96      0.97      1115



In [36]:
model_2 = GaussianNB()

In [37]:
model_2.fit(x_train, y_train)

GaussianNB()

In [38]:
y_pred_2 = model_2.predict(x_test)

In [39]:
accuracy_score(y_pred_2, y_test)

0.9040358744394619

In [41]:
print(classification_report(y_test, y_pred_2))

              precision    recall  f1-score   support

         ham       0.98      0.91      0.94       971
        spam       0.58      0.90      0.71       144

    accuracy                           0.90      1115
   macro avg       0.78      0.90      0.82      1115
weighted avg       0.93      0.90      0.91      1115

