In [16]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
import nltk as n
from sklearn.feature_extraction.text import CountVectorizer
import shutil
from nltk.tokenize import word_tokenize
import re
import string
from sklearn.metrics import classification_report, confusion_matrix

Imports the libraries used in this notebook.

In [17]:
# Read the raw SMS dataset; the file is decoded as latin-1.
spam_data = pd.read_csv(
    '/content/spam.csv',
    encoding='latin-1',
)
# Show the first rows as a quick sanity check of the import.
spam_data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


Imported the spam spreadsheet (a CSV file), reading it with latin-1 encoding, and displayed the first rows to check that it was imported correctly.

In [18]:
# Count the missing (NaN) entries in every column.
missing_per_column = spam_data.isna().sum()
print(missing_per_column)

v1               0
v2               0
Unnamed: 2    5522
Unnamed: 3    5560
Unnamed: 4    5566
dtype: int64


Checked how many null values each column contains.

In [19]:
# Drop the three trailing "Unnamed" columns, which are almost entirely null.
# errors='ignore' keeps this cell safe to re-run: once the columns are gone,
# a second execution is a no-op instead of raising KeyError.
spam_data = spam_data.drop(
    columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'],
    errors='ignore',
)
spam_data.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


Dropped the last three columns (`Unnamed: 2`, `Unnamed: 3`, `Unnamed: 4`), which are almost entirely null.

In [20]:
# Download NLTK's stopword corpus (`n` is the notebook's alias for nltk).
n.download('stopwords')
# Load the English stopword list; it is handed to the CountVectorizer in a later cell.
stopword = n.corpus.stopwords.words('english')
# Print the full list for inspection.
print(stopword)

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Downloaded NLTK's English stopword list so that the stopwords can be removed later, when the text is vectorized.

In [21]:
def clean_text(text):
    """Normalize a single message for bag-of-words vectorization.

    The text is lowercased, every run of digits is deleted, all characters
    in ``string.punctuation`` are deleted (not replaced by spaces), and
    leading/trailing whitespace is stripped.

    Args:
        text: The raw message as a ``str``.

    Returns:
        The cleaned message as a ``str``.
    """
    without_digits = re.sub(r'\d+', '', text.lower())
    # Translation table whose third argument lists the characters to delete.
    punctuation_table = str.maketrans('', '', string.punctuation)
    return without_digits.translate(punctuation_table).strip()

Defined a function that cleans a message from the v2 column: it lowercases the text, removes numbers and punctuation, and strips leading and trailing whitespace.

In [22]:
# Run clean_text over every raw message, then store the result as a new column.
cleaned_messages = spam_data['v2'].apply(clean_text)
spam_data['clean_spam'] = cleaned_messages
spam_data.head()

Unnamed: 0,v1,v2,clean_spam
0,ham,"Go until jurong point, crazy.. Available only ...",go until jurong point crazy available only in ...
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry in a wkly comp to win fa cup final...
3,ham,U dun say so early hor... U c already then say...,u dun say so early hor u c already then say
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah i dont think he goes to usf he lives aroun...


Makes a new column, clean_spam, holding the cleaned version of each v2 message, produced by passing the clean_text function to the column's apply method.

In [23]:
# Encode the label column as integers: 1 where v1 is 'spam', 0 otherwise ('ham').
# A direct comparison yields a Series, so the column assignment no longer relies
# on pandas coercing the one-column DataFrame returned by pd.get_dummies (which
# would stop working if v1 ever contained a third label).
spam_data['numeric_spam'] = (spam_data['v1'] == 'spam').astype(int)
spam_data.head()

Unnamed: 0,v1,v2,clean_spam,numeric_spam
0,ham,"Go until jurong point, crazy.. Available only ...",go until jurong point crazy available only in ...,0
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry in a wkly comp to win fa cup final...,1
3,ham,U dun say so early hor... U c already then say...,u dun say so early hor u c already then say,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah i dont think he goes to usf he lives aroun...,0


Makes another new column, numeric_spam, that represents the v1 labels as integers (0 for ham, 1 for spam).

In [24]:
# Target vector: the integer-encoded spam/ham label.
Y = spam_data['numeric_spam']

# Feature matrix: word counts of the cleaned messages, with the stopword list excluded.
# NOTE(review): the vocabulary is fitted on the full dataset before the
# train/test split, so it also contains words that occur only in test rows.
vectorizer = CountVectorizer(stop_words=stopword)
X = vectorizer.fit_transform(spam_data['clean_spam'])

Makes a CountVectorizer configured with the stopword list, then fits it to the clean_spam column to turn each message into word counts with the stopwords removed (stored in X), and puts the numeric_spam labels into the Y variable.

In [25]:
# Split features and labels into train and test partitions with a fixed seed.
# NOTE(review): test_size=0.8 holds out 80% of the rows for testing, leaving
# only 20% for training -- confirm this is intended (0.2 is the more common choice).
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.8,
    random_state=1,
)


Splits the data into training and testing sets with a test size of 0.8, so 80% of the rows are held out for testing and the remaining 20% are used for training.

In [26]:
# Fit a multinomial naive Bayes classifier on the training partition
# (fit returns the estimator itself, so construction and fitting are chained).
bayes = MultinomialNB().fit(X_train, Y_train)

# Predict the held-out partition and report per-class metrics and raw counts.
bayes_pred = bayes.predict(X_test)
print(classification_report(Y_test, bayes_pred))
print(confusion_matrix(Y_test, bayes_pred))

              precision    recall  f1-score   support

           0       0.98      0.99      0.98      3852
           1       0.91      0.85      0.88       606

    accuracy                           0.97      4458
   macro avg       0.94      0.92      0.93      4458
weighted avg       0.97      0.97      0.97      4458

[[3802   50]
 [  88  518]]


Trains a multinomial naive Bayes model and prints a classification report and a confusion matrix for it.

In [27]:
# Train a linear support vector classifier on the training partition.
# random_state is fixed (same seed as the train/test split) because LinearSVC's
# solver can draw random numbers, so an unseeded model may give slightly
# different results from run to run.
SVM = LinearSVC(random_state=1)
SVM = SVM.fit(X_train, Y_train)

# Predict the held-out partition and report per-class precision/recall/F1.
SVM_pred = SVM.predict(X_test)
print(classification_report(Y_test, SVM_pred))

              precision    recall  f1-score   support

           0       0.96      1.00      0.98      3852
           1       0.98      0.75      0.85       606

    accuracy                           0.96      4458
   macro avg       0.97      0.87      0.92      4458
weighted avg       0.97      0.96      0.96      4458



trains a linear support vector machine and prints a classification report for it

In [28]:
# Confusion matrix for the SVM: rows are the true labels, columns the predicted labels.
svm_confusion = confusion_matrix(Y_test, SVM_pred)
print(svm_confusion)

[[3845    7]
 [ 151  455]]


Prints a confusion matrix for the support vector machine model.