# Step 1 : Import Data

In [None]:
import pandas as pd
from google.colab import files
uploaded = files.upload()

In [None]:
df=pd.read_csv('sms-spam.csv')
df.head()

Unnamed: 0,Label,SMSText
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


# Step 2 : Data Cleaning and Preprocessing

In [None]:
# for cleaning data using regular expression
import re

In [None]:
#Natural Language Toolkit (NLTK) is text preprocessing library
import nltk
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# here we will use lemmatizer instead of stemming
# stopwords are the unnecessary word required to be removed from the data
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

In [None]:
# creating lemmatizer object
lemmatizer = WordNetLemmatizer()

In [None]:
# creating an empty list
corpus = []

In [None]:
# cleaning out data
for i,j in df.iterrows():
    #1. removing all puntuation by taking only letters
    review = re.sub('[^a-zA-Z]', ' ', str(j['SMSText']))

    #2. converting all letters to lowercase
    review = review.lower()

    #3 split() method splits a string into a list removing spaces
    review = review.split()
    
    #4. lemmatizing it and removing stopwords and again joining with spaces
    review = [lemmatizer.lemmatize(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)

# Step 3 : NLP

In [None]:
# Creating the TF-IDF model
from sklearn.feature_extraction.text import TfidfVectorizer
cv = TfidfVectorizer()
X = cv.fit_transform(corpus).toarray()

# converting labels to dummies
y=pd.get_dummies(df['Label'])
y=y.iloc[:,1].values

In [None]:
# Train Test Split
X = X[:5574]
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)


# Step 4 : Modelling

In [None]:
# Training model using Naive bayes classifier

from sklearn.naive_bayes import MultinomialNB
spam_detect_model = MultinomialNB().fit(X_train, y_train)

y_pred=spam_detect_model.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_pred, y_test))

0.9650224215246637
