In [201]:
import numpy as np
import pandas as pd 

In [202]:
df = pd.read_csv('../input/sms-data-labelled-spam-and-non-spam/SMSSpamCollection',sep='\t',names=['label','text'])
df.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [203]:
#importing required libraries and initializing
import re
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
ps = PorterStemmer()

In [204]:
#implementing the cleaning, stemming to the text
corpus = []
for i in range(len(df)):
    review = re.sub('[^a-zA-Z]',' ',df['text'][i])    #removing the letters other than alphabets
    review = review.lower()                           #lowering the alphabets
    review = review.split()                           #splitting into words in a list
    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]   #removing stopwords
    review = ' '.join(review)         #combine every word to a sentence (to combine list of words as a sentence)
    corpus.append(review)             #then add this sentence to the corpus list

### 1. Using PorterStemmer and Bag of Words

In [205]:
#Bag of words using CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=2500)     # max features is used to limit the no. of words/columns used in it 
X = cv.fit_transform(corpus).toarray()

In [206]:
y=pd.get_dummies(df['label'])
y=y.iloc[:,1].values

In [207]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=0)

In [208]:
#naive baiyes model is best for NLP classification
from sklearn.naive_bayes import MultinomialNB
spam_detect_model = MultinomialNB().fit(X_train, y_train)
y_pred=spam_detect_model.predict(X_test)

In [209]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

0.9838565022421525

In [210]:
from sklearn.metrics import classification_report
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       953
           1       0.95      0.94      0.94       162

    accuracy                           0.98      1115
   macro avg       0.97      0.96      0.97      1115
weighted avg       0.98      0.98      0.98      1115



### 2. Using PorterStemmer and TF-IDF

In [211]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfvec = TfidfVectorizer(max_features=2500)
X_tf = tfvec.fit_transform(corpus).toarray()
#same 'y' used before

In [212]:
from sklearn.model_selection import train_test_split
X_tf_train, X_tf_test, y_train, y_test = train_test_split(X_tf,y,test_size=0.2,random_state=0)

In [213]:
from sklearn.naive_bayes import MultinomialNB
spam_detect_model = MultinomialNB().fit(X_tf_train, y_train)
y_tf_pred=spam_detect_model.predict(X_tf_test)

In [214]:
print(accuracy_score(y_tf_pred,y_test))

0.979372197309417


In [215]:
print(classification_report(y_tf_pred,y_test))

              precision    recall  f1-score   support

           0       1.00      0.98      0.99       978
           1       0.86      1.00      0.92       137

    accuracy                           0.98      1115
   macro avg       0.93      0.99      0.96      1115
weighted avg       0.98      0.98      0.98      1115

