# 1. Importing the dataset

In [1]:
import pandas as pd

# Load the tab-separated SMS corpus. The file has no header row, so the two
# column names are supplied explicitly.
# NOTE(review): pandas' default quoting treats '"' as a quote character, which
# may merge rows in free-text data — verify the row count against the dataset
# description (quoting=csv.QUOTE_NONE disables quote handling).
messages = pd.read_csv(
    'smsspamcollection/SMSSpamCollection',
    sep='\t',
    names=["label", "message"],
)
messages.shape

(5572, 2)

In [2]:
# Preview the first rows to check that the label/message columns parsed as expected.
messages.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


# 2. Data cleaning and preprocessing

In [3]:
import re
import nltk
# Fetch the NLTK stop-word lists used by the preprocessing step below.
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\guedri\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
ps = PorterStemmer()

# Build the stop-word lookup once. Previously stopwords.words('english') was
# evaluated for every single token and searched as a list; a set built up front
# gives the same filtering with constant-time membership tests.
english_stopwords = set(stopwords.words('english'))

# corpus[i] is the cleaned, stemmed, space-joined version of the i-th message.
corpus = []
for message in messages['message']:
    # Replace every non-letter with a space, lowercase, and split on whitespace.
    tokens = re.sub('[^a-zA-Z]', ' ', message).lower().split()
    # Drop stop words, then reduce the remaining words to their Porter stems.
    stems = [ps.stem(word) for word in tokens if word not in english_stopwords]
    corpus.append(' '.join(stems))

In [5]:
# Sanity check: one cleaned document per input message.
len(corpus)

5572

# 3. Creating the Bag of Words model

In [6]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words features: fit the vocabulary on the cleaned corpus and convert
# the resulting sparse count matrix to a dense array.
# NOTE(review): the vocabulary is learned from the whole corpus, i.e. before the
# train/test split — consider fitting the vectorizer on the training part only.
cv = CountVectorizer()
X = cv.fit_transform(corpus).toarray()

# Binary target: 1 for "spam", 0 for "ham". Comparing against the label value
# (instead of taking the second dummy column by position) does not depend on
# the column ordering or on the pandas version's default dummy dtype.
y = (messages['label'] == 'spam').astype(int).values

In [7]:
# (number of messages, vocabulary size)
X.shape

(5572, 6296)

# 4. Training and validating models

Since we want to minimize the number of false positives (legitimate "ham" messages predicted as "spam"), we will validate the models using the precision metric.

### 4.1. Train Test Split

In [8]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the samples for evaluation; the fixed seed keeps the split
# identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

### 4.2. Naive Bayes Classifier

In [9]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes on the raw term counts; fit() returns the estimator,
# which is then displayed as the cell output.
clf_nb = MultinomialNB().fit(X_train, y_train)
clf_nb

MultinomialNB()

In [10]:
# Predict labels for the held-out messages with the Naive Bayes model.
y_pred = clf_nb.predict(X_test)

In [11]:
from sklearn.metrics import precision_score
# Precision of the current predictions in y_pred against the held-out labels.
precision_score(y_test, y_pred)

0.9082969432314411

### 4.3. Decision Tree Classifier

In [12]:
from sklearn.tree import DecisionTreeClassifier

# Seed the tree so the fitted model and its precision are reproducible across
# runs, consistent with the seeded LogisticRegression and RandomForest cells.
clf_decision_tree = DecisionTreeClassifier(random_state=0)
clf_decision_tree.fit(X_train, y_train)

DecisionTreeClassifier()

In [13]:
# Score the decision tree on the held-out set.
y_pred = clf_decision_tree.predict(X_test)
precision_score(y_true=y_test, y_pred=y_pred)

0.9365853658536586

### 4.4. SVC

In [14]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# Standardize the count features, then fit an SVC on the scaled values.
# Both steps live in one pipeline, so the scaler is applied at predict time too.
svc_steps = (StandardScaler(), SVC(gamma='auto'))
clf_svc = make_pipeline(*svc_steps).fit(X_train, y_train)
clf_svc

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('svc', SVC(gamma='auto'))])

In [15]:
# Score the SVC pipeline on the held-out set.
y_pred = clf_svc.predict(X_test)
precision_score(y_true=y_test, y_pred=y_pred)

1.0

### 4.5. Logistic Regression

In [16]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with a fixed seed; fit() returns the fitted estimator.
clf_LR = LogisticRegression(random_state=0)
clf_LR = clf_LR.fit(X_train, y_train)

In [17]:
# Score the logistic-regression model on the held-out set.
y_pred = clf_LR.predict(X_test)
precision_score(y_true=y_test, y_pred=y_pred)

0.9847715736040609

### 4.6. Random Forest

In [18]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with a fixed seed; fit() returns the fitted estimator.
clf_RF = RandomForestClassifier(random_state=0)
clf_RF = clf_RF.fit(X_train, y_train)

In [19]:
# Score the random forest on the held-out set.
y_pred = clf_RF.predict(X_test)
precision_score(y_true=y_test, y_pred=y_pred)

0.9946808510638298

### 4.7. KNN

In [20]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier using the 3 closest training samples.
clf_knn = KNeighborsClassifier(n_neighbors=3)
clf_knn = clf_knn.fit(X_train, y_train)

In [21]:
# Score the KNN model on the held-out set.
y_pred = clf_knn.predict(X_test)
precision_score(y_true=y_test, y_pred=y_pred)

1.0

# 5. Save the model

I choose the SVC model: it reaches a precision of 1.0 on the test set (tied with KNN), the metric we selected above.

In [22]:
import pickle 
# Persist the fitted SVC pipeline. The context manager flushes and closes the
# file even if pickling raises; the original left the handle open.
# NOTE(review): the CountVectorizer (`cv`) that produced the features is not
# saved here — it will be needed to transform new messages before predicting.
with open('./model_svc', 'wb') as model_svc_pickle:
    pickle.dump(clf_svc, model_svc_pickle)