In [1]:
# core libraries
import pandas as pd
import numpy as np

In [2]:
# extra libraries
import spacy
import joblib
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

<br>
<br>
<br>

### Data Collection

In [3]:
# load the labelled messages (Category / Message columns) from the data folder;
# the path is relative to the notebook's working directory
df = pd.read_csv("../data/data.csv")
# preview the first rows
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
# drop fully duplicated rows (original index labels are kept),
# rebinding df to the result rather than mutating it in place
df = df.drop_duplicates()
df.shape

(5157, 2)

<br>
<br>
<br>

### Data Preparation

In [5]:
# split the frame into the message text (features) and the category labels (target)
X = df["Message"]
y = df["Category"]
# both shapes should report the same number of rows
print(X.shape, y.shape, sep="\n")

(5157,)
(5157,)


In [6]:
# create training and test set (80/20), stratified on the label so both sets
# keep the same ham/spam ratio; random_state pins the shuffle so the split --
# and every score computed from it -- is reproducible across kernel restarts
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, stratify=y, random_state=42)

In [7]:
# sanity-check the split sizes: features and labels must line up per subset
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape, sep="\n")

(4125,)
(1032,)
(4125,)
(1032,)


<br>

### Data Preprocessing

#### text preprocessing

In [9]:
# load spaCy's small English pipeline (en_core_web_sm); it supplies the
# tokenizer, stop-word / punctuation flags and lemmas used by remove_junk
nlp = spacy.load("en_core_web_sm")

In [10]:
# remove junk from raw text
def remove_junk(doc) -> str:
    """Return the lemmas of the meaningful tokens in ``doc`` as one string.

    Tokens flagged as whitespace, punctuation or stop words are dropped; the
    lemma of every remaining token is kept in its original order.

    Parameters
    ----------
    doc : iterable of tokens
        Any iterable whose items expose ``is_space``, ``is_punct``,
        ``is_stop`` and ``lemma_`` (e.g. the object returned by ``nlp(text)``).

    Returns
    -------
    str
        The kept lemmas separated by single spaces, with no leading or
        trailing space. Empty string when nothing is kept.
    """
    # str.join builds the result in one pass and avoids the stray leading
    # space that prefixing every lemma with ' ' would produce
    return " ".join(
        token.lemma_
        for token in doc
        if not (token.is_space or token.is_punct or token.is_stop)
    )

In [11]:
# cleaning x_train
# nlp.pipe streams the messages through spaCy in batches instead of calling
# nlp() once per row; the result is rebuilt as a Series carrying the original
# index and name so it stays aligned with y_train
x_train_cleaned = pd.Series(
    [remove_junk(doc) for doc in nlp.pipe(x_train)],
    index=x_train.index,
    name=x_train.name,
)
x_train_cleaned.head()

4086     Orange bring ringtone time Chart Heroes free ...
3501     Dorothy@kiefer.com Bank Granite issue Strong ...
1367                       Bbq sit 6ish ur welcome 2 come
924                go attend round today did't reach home
5259         help u swoop pick u ur bird r meeting u want
Name: Message, dtype: object

In [12]:
# cleaning x_test
# nlp.pipe streams the messages through spaCy in batches instead of calling
# nlp() once per row; the result is rebuilt as a Series carrying the original
# index and name so it stays aligned with y_test
x_test_cleaned = pd.Series(
    [remove_junk(doc) for doc in nlp.pipe(x_test)],
    index=x_test.index,
    name=x_test.name,
)
x_test_cleaned.head()

3830                                        sure come bit
3129     u fuckin believe not know thur pre book cance...
4385     thank honey hear leave bit long 2 crowd try l...
1728                                    go project centre
2952                                             hey free
Name: Message, dtype: object

<br>

In [13]:
# binary-encode the training labels: 'spam' -> 1, anything else -> 0
y_train_cleaned = y_train.apply(lambda label: int(label == 'spam'))
y_train_cleaned.head()

4086    1
3501    1
1367    0
924     0
5259    0
Name: Category, dtype: int64

In [14]:
# binary-encode the test labels: 'spam' -> 1, anything else -> 0
y_test_cleaned = y_test.apply(lambda label: int(label == 'spam'))
y_test_cleaned.head()

3830    0
3129    0
4385    0
1728    0
2952    0
Name: Category, dtype: int64

<br>
<br>
<br>

#### text vectorization (TF-IDF)

In [15]:
# TF-IDF vectorizer turning each cleaned message into a sparse feature row;
# max_features caps the vocabulary at 3000 terms
tfidf_vect = TfidfVectorizer(max_features=3000)

In [16]:
# vectorizing x_train: fit the vectorizer on the training text only,
# then transform that same text into the feature matrix
x_train_matrix = tfidf_vect.fit_transform(x_train_cleaned)
x_train_matrix.shape

(4125, 3000)

In [17]:
# vectorizing x_test: transform only (no fit), so the test set is projected
# onto what the vectorizer learned from the training text
x_test_matrix = tfidf_vect.transform(x_test_cleaned)
x_test_matrix.shape

(1032, 3000)

<br>
<br>
<br>

### Modelling

In [18]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

<br>

#### logistic regression

In [19]:
# classifier: logistic regression with the library's default hyperparameters
logistic_regression = LogisticRegression()

In [20]:
# cross validation score: 10-fold accuracy on the training set only.
# The configured logistic_regression is passed directly so the CV estimate and
# the final fit share one hyperparameter definition; cross_val_score works on
# clones of the estimator, so the passed instance is not fitted by this cell
scores = cross_val_score(logistic_regression, x_train_matrix, y_train_cleaned, scoring='accuracy', cv=10, n_jobs=-1)
print(scores)
scores.mean()

[0.95883777 0.96125908 0.96125908 0.96125908 0.96368039 0.9538835
 0.9684466  0.95873786 0.94660194 0.9538835 ]


0.958784879757399

In [21]:
# training the model on the full training matrix and its encoded labels
logistic_regression.fit(x_train_matrix, y_train_cleaned)

In [22]:
# accuracy of the fitted logistic regression on the data it was trained on
predictions = logistic_regression.predict(X=x_train_matrix)
accuracy_score(y_true=y_train_cleaned, y_pred=predictions)

0.9726060606060606

In [23]:
# accuracy of the fitted logistic regression on the held-out test set
predictions = logistic_regression.predict(X=x_test_matrix)
accuracy_score(y_true=y_test_cleaned, y_pred=predictions)

0.9631782945736435

<br>

#### random forest

In [24]:
# classifier: random forest with trees limited to depth 27;
# random_state pins the bootstrap sampling and feature selection so the
# fitted model and its scores are reproducible across runs
forest = RandomForestClassifier(max_depth=27, random_state=42)

In [25]:
# cross validation score: 10-fold accuracy on the training set only.
# The configured forest is passed directly so the CV estimate and the final
# fit share one hyperparameter definition; cross_val_score works on clones of
# the estimator, so the passed instance is not fitted by this cell
scores = cross_val_score(forest, x_train_matrix, y_train_cleaned, scoring='accuracy', cv=10, n_jobs=-1)
print(scores)
scores.mean()

[0.96610169 0.97094431 0.96125908 0.97578692 0.96368039 0.97087379
 0.97815534 0.97087379 0.95145631 0.95631068]


0.9665442300007052

In [26]:
# training the model on the full training matrix and its encoded labels
forest.fit(x_train_matrix, y_train_cleaned)

In [27]:
# accuracy of the fitted random forest on the data it was trained on
predictions = forest.predict(X=x_train_matrix)
accuracy_score(y_true=y_train_cleaned, y_pred=predictions)

0.9801212121212122

In [28]:
# accuracy of the fitted random forest on the held-out test set
predictions = forest.predict(X=x_test_matrix)
accuracy_score(y_true=y_test_cleaned, y_pred=predictions)

0.9612403100775194

<br>
<br>
<br>

### Saving Models

In [29]:
# saving the fitted TF-IDF vectorizer; new text must go through this same
# object before it is passed to either saved model
# NOTE(review): joblib files are pickle-based -- only load them from trusted sources
joblib.dump(tfidf_vect, '../models/tfidf_vectorizer.pkl')

['../models/tfidf_vectorizer.pkl']

In [30]:
# saving the fitted logistic regression model
joblib.dump(logistic_regression, '../models/logistic_regression.pkl')

['../models/logistic_regression.pkl']

In [31]:
# saving the fitted random forest model
joblib.dump(forest, '../models/random_forest.pkl')

['../models/random_forest.pkl']