In [1]:
import numpy as np
import pandas as pd
import nltk

In [None]:
nltk.download_shell()

nltk.download('punkt')

In [2]:
spam=pd.read_csv(r"Datasets\spam.csv", encoding='latin-1')
spam.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [3]:
drop_cols=spam.columns[2:]
spam.drop(columns=drop_cols,inplace=True)

In [4]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from nltk.stem import PorterStemmer

In [5]:
def tokenize_and_remove_stopwords(ds, remove_stopwords=True):
    if remove_stopwords:
        return ds.apply(lambda msg: [word for word in word_tokenize(msg) if word not in stopwords.words('english')])
    else:
        return ds.apply(word_tokenize)

def stem_msgs(ds):
    stemmer=PorterStemmer()
    return ds.apply(lambda x: [stemmer.stem(word) for word in x])

def rejoin_tokenized_msgs(ds, sep=' '):
    return ds.apply(lambda tokens: sep.join(tokens))

def preproc_pipeline(ds, remove_stopwords=True, sep=' '):
    tokenized=tokenize_and_remove_stopwords(ds, remove_stopwords=remove_stopwords)
    stemmed=stem_msgs(tokenized)
    rejoined=rejoin_tokenized_msgs(stemmed, sep=sep)
    return rejoined

spam['PreprocessedMsgs']=preproc_pipeline(spam.v2)

In [6]:
spam.head()

Unnamed: 0,v1,v2,PreprocessedMsgs
0,ham,"Go until jurong point, crazy.. Available only ...","go jurong point , crazi .. avail bugi n great ..."
1,ham,Ok lar... Joking wif u oni...,ok lar ... joke wif u oni ...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entri 2 wkli comp win fa cup final tkt 21...
3,ham,U dun say so early hor... U c already then say...,u dun say earli hor ... u c alreadi say ...
4,ham,"Nah I don't think he goes to usf, he lives aro...","nah i n't think goe usf , live around though"


In [7]:
def convert_feature(text):
    if text=='ham':
        return 0
    elif text=='spam':
        return 1
    else:
        return None

In [8]:
y=np.array(spam.v1.apply(convert_feature))

In [10]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

cv=CountVectorizer()
vectorized_msgs=cv.fit_transform(spam.PreprocessedMsgs)

In [11]:
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report

In [12]:
X_train, X_test, y_train, y_test = train_test_split(vectorized_msgs, y, test_size=0.2)

rfc= RandomForestClassifier()
rfc.fit(X_train,y_train)

y_pred= rfc.predict(X_test)

print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))

[[941   0]
 [ 27 147]]
              precision    recall  f1-score   support

           0       0.97      1.00      0.99       941
           1       1.00      0.84      0.92       174

    accuracy                           0.98      1115
   macro avg       0.99      0.92      0.95      1115
weighted avg       0.98      0.98      0.97      1115



In [13]:
spam_val=pd.read_csv(r"Datasets\spam_validation.csv")
spam_val.head()

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0


In [18]:
X_val=cv.transform(preproc_pipeline(spam_val.text))
y_val=spam_val.label.apply(convert_feature)

y_val_pred=rfc.predict(X_val)

print(confusion_matrix(y_val,y_val_pred))
print(classification_report(y_val,y_val_pred))

[[2799  873]
 [ 941  558]]
              precision    recall  f1-score   support

           0       0.75      0.76      0.76      3672
           1       0.39      0.37      0.38      1499

    accuracy                           0.65      5171
   macro avg       0.57      0.57      0.57      5171
weighted avg       0.64      0.65      0.65      5171



In [26]:
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression

nbc=GaussianNB()
nbc.fit(X_train.toarray(),y_train)
y_pred=nbc.predict(X_test.toarray())

print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))

log_reg=LogisticRegression()
log_reg.fit(X_train,y_train)
y_pred=log_reg.predict(X_test)

print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))

[[837 104]
 [ 15 159]]
              precision    recall  f1-score   support

           0       0.98      0.89      0.93       941
           1       0.60      0.91      0.73       174

    accuracy                           0.89      1115
   macro avg       0.79      0.90      0.83      1115
weighted avg       0.92      0.89      0.90      1115

[[941   0]
 [ 19 155]]
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       941
           1       1.00      0.89      0.94       174

    accuracy                           0.98      1115
   macro avg       0.99      0.95      0.97      1115
weighted avg       0.98      0.98      0.98      1115



In [28]:
y_val_pred=nbc.predict(X_val.toarray())

print(confusion_matrix(y_val,y_val_pred))
print(classification_report(y_val,y_val_pred))

y_val_pred=log_reg.predict(X_val)

print(confusion_matrix(y_val,y_val_pred))
print(classification_report(y_val,y_val_pred))

[[2459 1213]
 [1135  364]]
              precision    recall  f1-score   support

           0       0.68      0.67      0.68      3672
           1       0.23      0.24      0.24      1499

    accuracy                           0.55      5171
   macro avg       0.46      0.46      0.46      5171
weighted avg       0.55      0.55      0.55      5171

[[2085 1587]
 [ 898  601]]
              precision    recall  f1-score   support

           0       0.70      0.57      0.63      3672
           1       0.27      0.40      0.33      1499

    accuracy                           0.52      5171
   macro avg       0.49      0.48      0.48      5171
weighted avg       0.58      0.52      0.54      5171

