## SMS Spam Classification

In [1]:
import pandas as pd

In [2]:
# The data set is tab-separated, so '\t' is passed as the delimiter.
# The columns are named 'label' and 'message' via `names`.
messages = pd.read_csv(
    'smsspamcollection/SMSSpamCollection',
    sep='\t',
    names=['label', 'message'],
)

In [3]:
# Display the loaded frame (the rendered output elides the middle rows).
messages

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [4]:
#cleaning the messages using data-preprocessing techniques
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

# Holds the cleaned message strings produced by the preprocessing step.
corpus = []
# WordNet lemmatizer applied to each token during preprocessing.
wordNet = WordNetLemmatizer()
# Porter stemmer instance; not referenced by the cells visible in this section.
stemmer = PorterStemmer()

In [5]:
#converting messages into their base (dictionary) form using lemmatization

# Build the stop-word set once. The previous loop rebuilt it for every single
# word of every message, which is loop-invariant work.
english_stopwords = set(stopwords.words('english'))


def clean_message(text):
    """Normalize one raw SMS message for vectorization.

    Non-letter characters are replaced by spaces, the text is lower-cased and
    split on whitespace, English stop words are dropped, and each remaining
    token is lemmatized with the WordNet lemmatizer.

    Parameters
    ----------
    text : str
        Raw message text.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces.
    """
    tokens = re.sub('[^a-zA-Z]', ' ', text).lower().split()
    return ' '.join(
        wordNet.lemmatize(token)
        for token in tokens
        if token not in english_stopwords
    )


# Rebinding `corpus` (instead of appending to a list created in another cell)
# keeps this cell idempotent: re-running it no longer duplicates every message.
# Iterating the column directly also avoids assuming a 0..n-1 integer index.
corpus = [clean_message(text) for text in messages['message']]

In [6]:
# TF-IDF (term frequency - inverse document frequency) features via scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer

cv = TfidfVectorizer()

# Independent data: fit the vectorizer on the cleaned corpus, then densify.
# NOTE(review): the vectorizer is fitted on the full corpus before the
# train/test split performed later, so the IDF weights also see test messages.
tfidf_sparse = cv.fit_transform(corpus)
X = tfidf_sparse.toarray()

In [20]:
#Dimension of the independent features (rows = messages, columns = TF-IDF terms)
X.shape

(5572, 7098)

In [8]:
#dependent data
# Encode the target explicitly: 1 for 'spam', 0 for every other label ('ham').
# The previous get_dummies(...).iloc[:, 1] depended on 'spam' being the second
# dummy column, and on pandas >= 2.0 it yields a bool array instead of 0/1
# integers. Comparing against the label keeps a uint8 0/1 vector on every
# pandas version.
Y = (messages['label'] == 'spam').astype('uint8').values

In [9]:
# Shape of the target vector (one entry per message)
Y.shape

(5572,)

In [10]:
# Split features and labels into training and test sets with scikit-learn.
# 20% of the samples are held out; the fixed random_state makes the split repeatable.
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.20,
    random_state=0,
)

In [18]:
# Shape of the held-out test feature matrix
X_test.shape

(1115, 7098)

In [12]:
from sklearn.naive_bayes import MultinomialNB

# Train a multinomial Naive Bayes classifier on the training split.
# fit() returns the estimator itself, so `model` is the fitted classifier.
classifier = MultinomialNB()
model = classifier.fit(X_train, Y_train)

In [13]:
# Predict a label for every row of the held-out test features
prediction = model.predict(X_test)

In [14]:
# Show the array of predicted labels
prediction

array([0, 0, 0, ..., 0, 1, 0], dtype=uint8)

In [15]:
# Confusion matrix comparing the true test labels with the predictions
# (rows: actual class, columns: predicted class)
from sklearn.metrics import confusion_matrix

conf_m = confusion_matrix(y_true=Y_test, y_pred=prediction)

In [16]:
# Show the confusion matrix
conf_m

array([[955,   0],
       [ 31, 129]], dtype=int64)

In [17]:
# Accuracy score: fraction of test messages whose predicted label matches the true label
from sklearn.metrics import accuracy_score

acc = accuracy_score(y_true=Y_test, y_pred=prediction)
print("Accuracy : ",acc)

Accuracy :  0.9721973094170404
