In [120]:
# Imports 

import pandas as pd
import nltk
import seaborn as sns
import matplotlib.pyplot as plt
import re
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk import FreqDist
from nltk import word_tokenize,pos_tag
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB,MultinomialNB,BernoulliNB
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

In [121]:
# Import data
df = pd.read_csv('Datasets/spam.csv', encoding='Latin')
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [122]:
df.drop(df.columns[[2,3,4]], axis=1, inplace=True)
df.columns = ['target', 'message']
df.head()

Unnamed: 0,target,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [123]:
# Check for missing values

df.isnull().sum()

target     0
message    0
dtype: int64

In [124]:
# Check for duplicates

df.duplicated().sum()

403

In [125]:
# Drop duplicates

df = df.drop_duplicates(keep= 'first')
df.duplicated().sum()

0

In [126]:
df.shape

(5169, 2)

In [127]:
# Preprocessing

stopwords_en = set(stopwords.words('english'))
ps = PorterStemmer()

def preprocess(text):
    text = text.lower()
    text = word_tokenize(text)
    
    lst = []
    for w in text:
        if w.isalnum():
            lst.append(w)
    
    words = lst[:]
    lst.clear()
    
    for w in text:
        if w not in stopwords_en:
            lst.append(w)
            
    words = lst[:]
    lst.clear()
    
    for x in text:
        lst.append(ps.stem(x))
        
    return ' '.join(lst)
    

In [128]:
# test the function
preprocess('I would like to see if my function is working.')

'i would like to see if my function is work .'

In [129]:
# process the text data

df['processed_text'] = df.message.apply(preprocess)

In [130]:
df.head()

Unnamed: 0,target,message,processed_text
0,ham,"Go until jurong point, crazy.. Available only ...","go until jurong point , crazi .. avail onli in..."
1,ham,Ok lar... Joking wif u oni...,ok lar ... joke wif u oni ...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entri in 2 a wkli comp to win fa cup fina...
3,ham,U dun say so early hor... U c already then say...,u dun say so earli hor ... u c alreadi then sa...
4,ham,"Nah I don't think he goes to usf, he lives aro...","nah i do n't think he goe to usf , he live aro..."


In [209]:
# Train and test the model

tfidf = TfidfVectorizer(max_features=4000)

In [210]:
X = tfidf.fit_transform(df.processed_text).toarray()

In [211]:
X.shape

(5169, 4000)

In [200]:
y = df.target.values

In [201]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=0)

In [202]:
gnb = GaussianNB()
mnb = MultinomialNB()
bnb = BernoulliNB()

In [203]:
# Gaussian 

gnb.fit(X_train,y_train)
y_pred1 = gnb.predict(X_test)
print(accuracy_score(y_test, y_pred1))

0.8878143133462283


In [204]:
mnb.fit(X_train, y_train)
y_pred2 = mnb.predict(X_test)
print(accuracy_score(y_test, y_pred2))

0.9593810444874274


In [205]:
bnb.fit(X_train, y_train)
y_pred3 = bnb.predict(X_test)
print(accuracy_score(y_test, y_pred3))

0.9845261121856866
