# Spam Detection Using NLP

In [1]:
# importing the Dataset

import pandas as pd

# Read spam.csv from the working directory; the file is decoded as latin-1
# (explicitly overriding pandas' default UTF-8 decoding).
messages = pd.read_csv('spam.csv',encoding='latin-1')

In [7]:
# Preview the first five rows of the raw frame to inspect its columns.
messages.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [9]:
# Drop, in place, every column that contains at least one missing value.
messages.dropna(axis='columns', inplace=True)

In [22]:
# Give the two remaining columns descriptive names (in place).
messages.rename(columns={'v1':"label", 'v2':"message"}, inplace=True)

In [23]:
# Re-check the frame after cleaning: only 'label' and 'message' should remain.
messages.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [54]:
#Data cleaning and preprocessing
import re
import nltk
# nltk.download('stopwords')  # uncomment on first run; WordNetLemmatizer also needs nltk.download('wordnet')

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

### Stemming

In [25]:
# Build the stemmed corpus: one cleaned, space-joined string per message.
ps = PorterStemmer()

# Fetch the stop-word list once and keep it as a set. The original called
# stopwords.words('english') for every single token and searched the returned
# list linearly, which is loop-invariant work repeated for each word.
english_stopwords = set(stopwords.words('english'))

corpus = []
# Iterate over the column values directly instead of messages['message'][i],
# which is a label lookup and only works while the index is the default 0..n-1.
for text in messages['message']:
    # Replace everything that is not an ASCII letter with a space, then
    # lowercase and split on whitespace.
    tokens = re.sub('[^a-zA-Z]', ' ', text).lower().split()
    # Drop stop words and reduce the remaining tokens to their Porter stems.
    stems = [ps.stem(word) for word in tokens if word not in english_stopwords]
    corpus.append(' '.join(stems))

In [26]:
# Creating the Bag of Words model
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words features, with the vocabulary capped at 2500 terms.
# NOTE(review): the vectorizer is fitted on the full corpus before the
# train/test split below, so the vocabulary is learned from test messages too.
cv = CountVectorizer(max_features=2500)
term_counts = cv.fit_transform(corpus)
X = term_counts.toarray()

# Target vector: the second dummy column of 'label'.
# Presumably this is the 'spam' indicator (the preview shows only ham/spam,
# and get_dummies orders columns by sorted category) -- confirm there are no
# other label values.
y = pd.get_dummies(messages['label']).iloc[:, 1].values

In [31]:
# Train Test Split

from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,
)

In [46]:
# Training two classifiers: Logistic Regression and Multinomial Naive Bayes

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
# Fit one classifier of each kind on the same training split.
spam_detect_model = LogisticRegression()
spam_detect_model.fit(X_train, y_train)

spam_detect_model2 = MultinomialNB()
spam_detect_model2.fit(X_train, y_train)

# Predict the held-out split with both models.
y_pred, y_pred2 = spam_detect_model.predict(X_test), spam_detect_model2.predict(X_test)



In [47]:
from sklearn import metrics 
# , auc, confusion_matrix

In [49]:
# F1 score of each model on the held-out split.
# f1_score's signature is (y_true, y_pred); the original passed the arguments
# the other way round. For the binary F1 the value is unchanged (the swap only
# exchanges false positives and false negatives), but the documented order
# avoids surprises if another metric is substituted here.
# The variable names are kept because the following cell displays them; note
# that they hold F1 scores, not accuracy.
accuracy = metrics.f1_score(y_test, y_pred)
accuracy2 = metrics.f1_score(y_test, y_pred2)

In [53]:
# Despite the names these are F1 scores: (LogisticRegression, MultinomialNB)
# trained on the stemmed corpus.
(accuracy,accuracy2)

(0.922077922077922, 0.9544072948328267)

### Lemmatization

In [58]:
# Build the lemmatized corpus: one cleaned, space-joined string per message.
lemma = WordNetLemmatizer()

# Fetch the stop-word list once and keep it as a set. The original called
# stopwords.words('english') for every single token and searched the returned
# list linearly, which is loop-invariant work repeated for each word.
english_stopwords = set(stopwords.words('english'))

corpus = []
# Iterate over the column values directly instead of messages['message'][i],
# which is a label lookup and only works while the index is the default 0..n-1.
for text in messages['message']:
    # Replace everything that is not an ASCII letter with a space, then
    # lowercase and split on whitespace.
    tokens = re.sub('[^a-zA-Z]', ' ', text).lower().split()
    # Drop stop words and lemmatize the remaining tokens.
    lemmas = [lemma.lemmatize(word) for word in tokens if word not in english_stopwords]
    corpus.append(' '.join(lemmas))

    
# Creating the Bag of Words model
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words features for the lemmatized corpus, vocabulary capped at 2500 terms.
# NOTE(review): the vectorizer is fitted on the full corpus before the
# train/test split below, so the vocabulary is learned from test messages too.
cv = CountVectorizer(max_features=2500)
term_counts = cv.fit_transform(corpus)
X = term_counts.toarray()

# Target vector: the second dummy column of 'label'.
# Presumably this is the 'spam' indicator (get_dummies orders columns by
# sorted category) -- confirm there are no label values besides ham/spam.
y = pd.get_dummies(messages['label']).iloc[:, 1].values


# Train Test Split

from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,
)


# Training two classifiers: Logistic Regression and Multinomial Naive Bayes

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
# Fit one classifier of each kind on the same training split.
spam_detect_model = LogisticRegression()
spam_detect_model.fit(X_train, y_train)

spam_detect_model2 = MultinomialNB()
spam_detect_model2.fit(X_train, y_train)

# Predict the held-out split with both models.
y_pred, y_pred2 = spam_detect_model.predict(X_test), spam_detect_model2.predict(X_test)



In [59]:
# F1 score of each model on the held-out split.
# f1_score's signature is (y_true, y_pred); the original passed the arguments
# the other way round. For the binary F1 the value is unchanged (the swap only
# exchanges false positives and false negatives), but the documented order
# avoids surprises if another metric is substituted here.
# The variable names are kept because the following cell displays them; note
# that they hold F1 scores, not accuracy.
accuracy = metrics.f1_score(y_test, y_pred)
accuracy2 = metrics.f1_score(y_test, y_pred2)

In [60]:
# Despite the names these are F1 scores: (LogisticRegression, MultinomialNB)
# trained on the lemmatized corpus.
(accuracy,accuracy2)

(0.922077922077922, 0.9486404833836858)