### Step 1: Load the data into the environment

In [1]:
import numpy as np
import pandas as pd
# Loading the data into the environment using pandas
# Note: Please use appropriate filename and path
sms_data = pd.read_csv("spam.csv", encoding='latin-1')
# Review the loaded data
print(sms_data.head())
cols = sms_data.columns[:2]
data = sms_data[cols]
print(data.shape)
data = data.rename(columns={"v1":"Value","v2":"Text"})
print(data.head())
print(data.Value.value_counts())

     v1                                                 v2 Unnamed: 2  \
0   ham  Go until jurong point, crazy.. Available only ...        NaN   
1   ham                      Ok lar... Joking wif u oni...        NaN   
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...        NaN   
3   ham  U dun say so early hor... U c already then say...        NaN   
4   ham  Nah I don't think he goes to usf, he lives aro...        NaN   

  Unnamed: 3 Unnamed: 4  
0        NaN        NaN  
1        NaN        NaN  
2        NaN        NaN  
3        NaN        NaN  
4        NaN        NaN  
(5572, 2)
  Value                                               Text
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...
ham     4825
spam     747
Name: Value, dt

### Step 2: Feature Engineering

In [7]:
nltk.download('words')

[nltk_data] Downloading package words to
[nltk_data]     C:\Users\91758\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [2]:
from string import punctuation
import re
import nltk
from nltk import word_tokenize
punctuation = list(punctuation)
# Creating a new feature called Punctuations. 
# This feature counts the number of punctuation characters in the sms message 
data["Punctuations"] = data["Text"].apply(lambda x: len(re.findall(r"[^\w+&&^\s]",x)))
# Creating a new feature called Phonenumbers. 
# This feature indicates if the sms text contains a phonenumber or not
data["Phonenumbers"] = data["Text"].apply(lambda x: len(re.findall(r"[0-9]{10}",x)))
# Creating a new feature called Links.
# This feature indicates if the sms text contains a URL or not 
is_link = lambda x: 1 if re.search(r"https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+",x)!=None else 0
data["Links"] = data["Text"].apply(is_link)
# Creating a new feature called Uppercase.
# This feature indicates how many words in the the sms text are in upper case
count_upper = lambda x : list(map(str.isupper,x.split())).count(True) 
upper_case = lambda y,n : n+1 if y.isupper() else n
data["Uppercase"] = data["Text"].apply(count_upper)
# Identifying and counting how many unusual words are there in the sms text
def find_unusual_words(text):
    text_vocab_set = set(w.lower() for w in text if w.isalpha())
    english_vocab_set = set(w.lower() for w in nltk.corpus.words.words())
    unusual_set = text_vocab_set - english_vocab_set
    return len(sorted(unusual_set))
data["unusualwords"] = data["Text"].apply(lambda x: find_unusual_words(word_tokenize(x)))
# View a few records of the data after creating these features
print(data[14:25])

  


   Value                                               Text  Punctuations  \
14   ham                I HAVE A DATE ON SUNDAY WITH WILL!!             2   
15  spam  XXXMobileMovieClub: To use your credit, click ...            11   
16   ham                         Oh k...i'm watching here:)             6   
17   ham  Eh u remember how 2 spell his name... Yes i di...             5   
18   ham  Fine if thatÃ¥Ãs the way u feel. ThatÃ¥Ãs th...             5   
19  spam  England v Macedonia - dont miss the goals/team...             8   
20   ham          Is that seriously how you spell his name?             1   
21   ham  Iâ°ÃÃ·m going to try for 2 months ha ha only...             4   
22   ham  So Ã_ pay first lar... Then when is da stock ...             7   
23   ham  Aft i finish my lunch then i go str down lor. ...             3   
24   ham  Ffffffffff. Alright no way I can meet up with ...             2   

    Phonenumbers  Links  Uppercase  unusualwords  
14             0      0 

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
tf_idf= TfidfVectorizer(stop_words="english",strip_accents='ascii',max_features=300)
tf_idf_matrix = tf_idf.fit_transform(data["Text"])

In [4]:
data_extra_features = pd.concat([data,pd.DataFrame(tf_idf_matrix.toarray(),columns=tf_idf.get_feature_names())],axis=1)

### Step 3: Machine Learning

In [7]:
from sklearn.model_selection import train_test_split
X=data_extra_features
features = X.columns.drop(["Value","Text"])
target = ["Value"]
X_train,X_test,y_train,y_test = train_test_split(X[features],X[target])

In [8]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
dt = DecisionTreeClassifier(min_samples_split=40)
dt.fit(X_train,y_train)
pred = dt.predict(X_test)
print(accuracy_score(y_train, dt.predict(X_train)))
print(accuracy_score(y_test, pred))

0.9822924144532185
0.9720028715003589


In [9]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
# Building a Naive Bayes Model
mnb = MultinomialNB()
mnb.fit(X_train,y_train)
pred_mnb = mnb.predict(X_test)
print(accuracy_score(y_test, pred_mnb))
# Building a Logistic Regression Model
lr = LogisticRegression()
lr.fit(X_train,y_train)
pred_lr = lr.predict(X_test)
print(accuracy_score(y_test, pred_lr))

  y = column_or_1d(y, warn=True)


0.9641062455132807
0.9777458722182341


  y = column_or_1d(y, warn=True)
