In [1]:
# core libraries
import pandas as pd
import numpy as np

In [45]:
# extra libraries
import spacy
import joblib
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

<br>
<br>
<br>

### Data Collection

In [3]:
# Load the labelled messages from the CSV file; the path is relative to the
# notebook's working directory.
df = pd.read_csv("../data/data.csv")
# Preview the first rows as the cell output.
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
# Remove duplicate rows.
# Rebind the result instead of using inplace=True: the outcome is the same,
# but the frame object shown by the previous cell is not mutated behind the
# reader's back. The original index labels are kept (no reset).
df = df.drop_duplicates()
df.shape

(5157, 2)

<br>
<br>
<br>

### Data Preparation

In [5]:
# Separate the message text (features) from the category label (target),
# then confirm both have the same length.
X, y = df["Message"], df["Category"]
print(X.shape, y.shape, sep="\n")

(5157,)
(5157,)


In [6]:
# Create the training and test sets (80% / 20%).
# stratify=y keeps the label proportions equal in both splits; random_state
# pins the shuffle so the split, and every score computed from it, is
# reproducible after a kernel restart.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    stratify=y,
    random_state=42,
)

In [7]:
# Sanity-check the split sizes: features and labels must match within each split.
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape, sep="\n")

(4125,)
(1032,)
(4125,)
(1032,)


<br>

### Data Preprocessing

#### text preprocessing

In [9]:
# Load spaCy's large English pipeline ("en_core_web_lg"); the cleaning cells
# below call it on every message before filtering tokens in remove_junk.
nlp = spacy.load("en_core_web_lg")

In [10]:
# remove junk from raw text
def remove_junk(doc) -> str:
    """Return the lemmas of the informative tokens in ``doc`` as one string.

    Whitespace, punctuation and stop-word tokens are dropped; the lemma of
    every remaining token is kept, in its original order.

    Parameters
    ----------
    doc : iterable of tokens
        Each token must expose ``is_space``, ``is_punct``, ``is_stop`` and
        ``lemma_`` (for example a processed spaCy document).

    Returns
    -------
    str
        The kept lemmas separated by single spaces, with no leading or
        trailing space. Empty string when no token survives the filter.
    """
    # str.join builds the result in one pass and avoids the stray leading
    # space that prefixing every lemma with ' ' used to produce.
    return " ".join(
        token.lemma_
        for token in doc
        if not (token.is_space or token.is_punct or token.is_stop)
    )

In [11]:
# Clean the training messages: run each raw message through the spaCy
# pipeline, then reduce it to the lemmas that remove_junk keeps.
x_train_cleaned = x_train.map(lambda message: remove_junk(nlp(message)))
x_train_cleaned.head()

5267                     lar ü go home 4 dinner
1381                          dnt wnt tlk wid u
1543                        press conference da
651      s cool slow gentle sonetime rough hard
177                                   u go mall
Name: Message, dtype: object

In [12]:
# Clean the test messages with exactly the same steps as the training
# messages: spaCy pipeline first, then remove_junk.
x_test_cleaned = x_test.map(lambda message: remove_junk(nlp(message)))
x_test_cleaned.head()

846      shit suite xavier decide lt;#&gt second warn ...
909                              WHITE FUDGE OREOS STORES
2101         oh Howda gud gud Mathe en samachara chikku:-
2890                                     battery low babe
4013                                    discuss mother ah
Name: Message, dtype: object

<br>

In [13]:
# Encode the training labels as a binary target:
# 1 marks 'spam', every other label becomes 0.
y_train_cleaned = (y_train == 'spam').astype('int64')
y_train_cleaned.head()

5267    0
1381    0
1543    0
651     0
177     0
Name: Category, dtype: int64

In [14]:
# Encode the test labels with the same rule as the training labels:
# 1 marks 'spam', every other label becomes 0.
y_test_cleaned = (y_test == 'spam').astype('int64')
y_test_cleaned.head()

846     0
909     0
2101    0
2890    0
4013    0
Name: Category, dtype: int64

<br>
<br>
<br>

#### text vectorization (TF-IDF)

In [16]:
# TF-IDF vectorizer: turns each cleaned message into a weighted bag-of-words
# vector (not a learned word embedding). max_features=3000 caps the
# vocabulary, which is why the matrices below have 3000 columns.
tfidf_vect = TfidfVectorizer(max_features=3000)

In [17]:
# vectorizing x_train
# fit_transform learns the vocabulary and IDF weights from the training text
# only, then encodes that same text.
x_train_matrix = tfidf_vect.fit_transform(x_train_cleaned)
x_train_matrix.shape

(4125, 3000)

In [19]:
# vectorizing x_test
# transform only (no refit): the test text is encoded with the vectorizer
# that was already fitted on the training text.
x_test_matrix = tfidf_vect.transform(x_test_cleaned)
x_test_matrix.shape

(1032, 3000)

<br>
<br>
<br>

### Modelling

In [20]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

<br>

#### logistic regression

In [21]:
# classifier: logistic regression created with no arguments, i.e. the
# library's default hyperparameters
logistic_regression = LogisticRegression()

In [22]:
# 10-fold cross-validated accuracy on the training split.
# Score the very estimator object that is fitted in the next cell, so the CV
# estimate cannot drift from the trained configuration; cross_val_score fits
# clones and leaves `logistic_regression` itself untouched.
# NOTE(review): x_train_matrix comes from a TF-IDF vectorizer fitted on the
# whole training split, so each validation fold has already influenced the
# vocabulary and IDF weights; the estimate may be slightly optimistic.
scores = cross_val_score(
    logistic_regression,
    x_train_matrix,
    y_train_cleaned,
    scoring='accuracy',
    cv=10,
    n_jobs=-1,
)
print(scores)
scores.mean()

[0.95399516 0.95399516 0.95157385 0.94915254 0.95641646 0.96359223
 0.95873786 0.96116505 0.94660194 0.95873786]


0.9553968123369143

In [23]:
# training the model on the full training split (TF-IDF matrix and 0/1 labels)
logistic_regression.fit(x_train_matrix, y_train_cleaned)

In [24]:
# predictions on train set
# Accuracy on the data the model was fitted on; compare it with the test-set
# accuracy in the next cell to see how much the model overfits.
predictions = logistic_regression.predict(x_train_matrix)
accuracy_score(y_train_cleaned, predictions)

0.9706666666666667

In [25]:
# predictions on test set
# Accuracy on the held-out split. `predictions` is overwritten here, so the
# train-set predictions from the previous cell are no longer available.
predictions = logistic_regression.predict(x_test_matrix)
accuracy_score(y_test_cleaned, predictions)

0.9660852713178295

<br>

#### random forest

In [40]:
# classifier: random forest with tree depth capped at 27.
# random_state pins the forest's internal randomness (bootstrap samples and
# per-split feature sampling) so training and scores are reproducible.
forest = RandomForestClassifier(max_depth=27, random_state=42)

In [44]:
# 10-fold cross-validated accuracy on the training split.
# Reuse `forest` instead of constructing a second RandomForestClassifier, so
# the hyperparameters are written in one place only; cross_val_score fits
# clones and leaves `forest` itself untouched.
# NOTE(review): x_train_matrix comes from a TF-IDF vectorizer fitted on the
# whole training split, so each validation fold has already influenced the
# vocabulary and IDF weights; the estimate may be slightly optimistic.
scores = cross_val_score(
    forest,
    x_train_matrix,
    y_train_cleaned,
    scoring='accuracy',
    cv=10,
    n_jobs=-1,
)
print(scores)
scores.mean()

[0.97094431 0.95157385 0.96125908 0.95641646 0.95883777 0.97572816
 0.96601942 0.97815534 0.9538835  0.97815534]


0.9650973224570395

In [41]:
# training the model on the full training split (TF-IDF matrix and 0/1 labels)
forest.fit(x_train_matrix, y_train_cleaned)

In [42]:
# predictions on train set
# Accuracy on the data the forest was fitted on; compare it with the test-set
# accuracy in the next cell to see how much the model overfits.
predictions = forest.predict(x_train_matrix)
accuracy_score(y_train_cleaned, predictions)

0.9798787878787879

In [43]:
# predictions on test set
# Accuracy on the held-out split. `predictions` is overwritten here, so the
# train-set predictions from the previous cell are no longer available.
predictions = forest.predict(x_test_matrix)
accuracy_score(y_test_cleaned, predictions)

0.9651162790697675

<br>
<br>
<br>

### Saving Models

In [49]:
# saving the fitted TF-IDF vectorizer, so new text can later be encoded with
# the same vocabulary the saved models were trained on
joblib.dump(tfidf_vect, '../models/tfidf_vectorizer.pkl')

['../models/tfidf_vectorizer.pkl']

In [47]:
# saving the fitted logistic regression model
joblib.dump(logistic_regression, '../models/logistic_regression.pkl')

['../models/logistic_regression.pkl']

In [48]:
# saving the fitted random forest model
joblib.dump(forest, '../models/random_forest.pkl')

['../models/random_forest.pkl']