In [1]:
import pandas as pd
import numpy as np

In [2]:
import os

os.listdir("../Spam Classifier /smsspamcollection")

['SMSSpamCollection', '.ipynb_checkpoints']

### Link for dataset : https://archive.ics.uci.edu/ml/machine-learning-databases/00228/

In [3]:
# Dataset is taken from UCI Machine Learning Repository.
data = pd.read_table("../Spam Classifier /smsspamcollection/SMSSpamCollection",header=None)

data.head()

Unnamed: 0,0,1
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
data.columns = ["label", "message"]
data.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
data.describe()

Unnamed: 0,label,message
count,5572,5572
unique,2,5169
top,ham,"Sorry, I'll call later"
freq,4825,30


In [6]:
data.label.value_counts()

ham     4825
spam     747
Name: label, dtype: int64

In [7]:
label_dict = {
    "ham" : 0,
    "spam" : 1
}

data.label = data.label.map(label_dict)
data.head()

Unnamed: 0,label,message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


<hr>

## Generate BERT Embeddings

<hr>

<hr>

#### Installation command of BERT Server and client:

pip install -U bert-serving-server bert-serving-client

#### Command to start BERT server:

bert-serving-start -model_dir="Path to BERT Directory" -num_worker=1

<hr>

In [9]:
from bert_serving.client import BertClient

bc = BertClient(check_version=False)

In [10]:
vectors_dict = dict()

terms = data["message"].unique()

vectors = bc.encode(list(terms))

for t, v in zip(terms, vectors):
    vectors_dict[t] = v

here is what you can do:
- or, start a new server with a larger "max_seq_len"


In [11]:
data["embeddings"] = data["message"].map(vectors_dict)
data.head()

Unnamed: 0,label,message,embeddings
0,0,"Go until jurong point, crazy.. Available only ...","[0.010752785, -0.77529496, 0.5881701, 0.091800..."
1,0,Ok lar... Joking wif u oni...,"[0.15376072, -0.41323993, 0.48208877, -0.69190..."
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,"[-0.5485422, -0.44419515, 0.84415144, -0.26627..."
3,0,U dun say so early hor... U c already then say...,"[-0.22559564, 0.113234006, 0.8456758, -0.59861..."
4,0,"Nah I don't think he goes to usf, he lives aro...","[0.28874722, 0.44699675, -0.12181223, -0.08935..."


<hr>

## Splitting data

<hr>

In [15]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(data["embeddings"].tolist(), data["label"], random_state=0, test_size=0.2)

In [16]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()

model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [17]:
y_pred = model.predict(X_test)

In [18]:
from sklearn.metrics import accuracy_score, precision_score, confusion_matrix, precision_recall_fscore_support, classification_report

print("Accuracy Score : ", accuracy_score(y_test, y_pred))
print("Precision Score : ", precision_score(y_test, y_pred))
print("Precision, Recall, Fscore, Support : \n", precision_recall_fscore_support(y_test, y_pred))
print("Classification Report : \n", classification_report(y_test, y_pred))
print("Confusion Matrix : \n", confusion_matrix(y_test, y_pred))

Accuracy Score :  0.9883408071748879
Precision Score :  0.9622641509433962
Precision, Recall, Fscore, Support : 
 (array([0.99267782, 0.96226415]), array([0.99371728, 0.95625   ]), array([0.99319728, 0.95924765]), array([955, 160]))
Classification Report : 
               precision    recall  f1-score   support

           0       0.99      0.99      0.99       955
           1       0.96      0.96      0.96       160

    accuracy                           0.99      1115
   macro avg       0.98      0.97      0.98      1115
weighted avg       0.99      0.99      0.99      1115

Confusion Matrix : 
 [[949   6]
 [  7 153]]
