# Step 1 : Import Data

In [2]:
import pandas as pd
from google.colab import files
uploaded = files.upload()

Saving sms-spam.csv to sms-spam (1).csv


In [3]:
df=pd.read_csv('sms-spam.csv')
df.head()

Unnamed: 0,Label,SMSText
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


# Step 2 : Data Cleaning and Preprocessing

In [4]:
# for cleaning data using regular expression
import re

In [5]:
#Natural Language Toolkit (NLTK) is text preprocessing library
import nltk
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
# here we will use lemmatizer instead of stemming
# stopwords are the unnecessary word required to be removed from the data
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

In [7]:
# creating lemmatizer object
lemmatizer = WordNetLemmatizer()

In [8]:
# creating an empty list
corpus = []

In [9]:
# cleaning out data
for i,j in df.iterrows():
    #1. removing all puntuation by taking only letters
    review = re.sub('[^a-zA-Z]', ' ', str(j['SMSText']))

    #2. converting all letters to lowercase
    review = review.lower()

    #3 split() method splits a string into a list removing spaces
    review = review.split()
    
    #4. lemmatizing it and removing stopwords and again joining with spaces
    review = [lemmatizer.lemmatize(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)

# Step 3 : NLP

In [10]:
# Creating the Bag of Words model
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=3250)
X = cv.fit_transform(corpus).toarray()

# converting labels to dummies
y=pd.get_dummies(df['Label'])
y=y.iloc[:,1].values

In [11]:
# Train Test Split
X = X[:5574]
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)


# Step 4 : Modelling

In [12]:
# Training model using Naive bayes classifier

from sklearn.naive_bayes import MultinomialNB
spam_detect_model = MultinomialNB().fit(X_train, y_train)

y_pred=spam_detect_model.predict(X_test)

In [13]:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_pred, y_test))

0.9775784753363229


In [16]:
# trying svm grid search
from sklearn import svm
from sklearn.model_selection import GridSearchCV
# passing tuning parameters
tuned_parameters = {'kernel':['linear', 'rbf'], 'gamma':[1e-3, 1e-4], 'C':[1,10,100,1000]}

model = GridSearchCV(svm.SVC(), tuned_parameters)

model.fit(X_train, y_train)

GridSearchCV(cv=None, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001],
                         'kernel': ['linear', 'rbf']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [15]:
y_pred=model.predict(X_test)
print(accuracy_score(y_pred, y_test))

0.8789237668161435
