In [1]:
#importing the libraries required for the classification
import pandas as pd
import re
import nltk
import numpy as np
from nltk.corpus import stopwords, LazyCorpusLoader
from nltk import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score

In [2]:
#reading the csv file into a dataframe
#the path is a raw string so that the backslashes of the Windows path are taken
#literally instead of being parsed as (invalid) escape sequences such as \d and \s
df = pd.read_csv(r'D:\data science\spam.csv', encoding = 'cp1252')

In [3]:
#X is the messages and y is the label to X, basically the target variable
#.copy() detaches both Series from df, so any later in-place edit of X or y
#neither writes through to df nor raises pandas' SettingWithCopyWarning
X = df['v2'].copy()
y = df['v1'].copy()
X.head()

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
Name: v2, dtype: object

Now that we have split the data into messages and target variables, we have to convert them into an appropriate format so that our model can interpret them.

In [4]:
lemmatizer = WordNetLemmatizer()                   #creating the lemmatizer object
#the NLTK stop-word file is named 'english' (lowercase); the capitalised name only
#resolves on case-insensitive file systems
stop_words = stopwords.words('english')            #list of stop words to remove
stop_word_set = set(stop_words)                    #built once: fast membership tests


def clean_message(text):
    """Normalise one message for vectorisation.

    Every character that is not an ASCII letter or digit is replaced by a
    space, the text is lower-cased and tokenised, stop words are dropped and
    the remaining tokens are lemmatised.

    Parameters
    ----------
    text : str
        The raw message.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces.
    """
    text = re.sub('[^A-Za-z0-9]', ' ', text).lower()
    tokens = nltk.word_tokenize(text)
    return ' '.join(
        lemmatizer.lemmatize(token) for token in tokens if token not in stop_word_set
    )


#cleaning the data and lemmatizing the words
#apply() builds a new Series, so it works for any index and does not assign
#into X element by element
X = X.apply(clean_message)

In [5]:
#converting the lemmatized messages into TF-IDF format and converting the target variable to numerical form
tfidf = TfidfVectorizer()
#fit_transform returns a sparse matrix; it is kept sparse because TF-IDF features are
#mostly zeros and train_test_split, MultinomialNB and cross_val_score all accept sparse input
#NOTE(review): the vectorizer is fitted on all messages before the train/test split, so the
#IDF weights also see the test messages - consider fitting on the training split only
X = tfidf.fit_transform(X)
#get_dummies creates one indicator column per distinct label; the second column is kept
#as the binary target
y = pd.get_dummies(y).iloc[:,1]

In [6]:
#hold out 30% of the samples for testing and train on the remaining 70%;
#the fixed random_state makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [7]:
#create the multinomial Naive Bayes classifier and train it in one step
#(fit() returns the estimator itself)
naive_bayes = MultinomialNB().fit(X_train, y_train)
#predicted labels for the held-out messages
y_predict = naive_bayes.predict(X_test)
#mean accuracy on the test split, displayed as the cell output
naive_bayes.score(X_test, y_test)

0.9665071770334929

In [8]:
#using the confusion matrix in order to evaluate the model
#scikit-learn expects (y_true, y_pred): rows are the actual labels, columns the predicted labels
confusion_matrix(y_test, y_predict)

array([[1452,    1],
       [  55,  164]], dtype=int64)

Thus, we can see from the confusion matrix that 56 messages have been misclassified out of the 1672 test data points.

In [9]:
#10-fold cross-validation of a fresh, unfitted Naive Bayes model on the full data set
naive_bayes = MultinomialNB()
cross_valid_NB = cross_val_score(
    naive_bayes,
    X,
    y,
    cv=10,
)
#average accuracy over the ten folds
print(cross_valid_NB.mean())

0.9727189951287942
