In [42]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB

from sklearn.metrics import accuracy_score, classification_report

def train_model(model_name, model, X_train, X_test, y_train, y_test):
    """Fit ``model``, print its test-set metrics, and return its predictions.

    The feature matrices are taken from the arguments, so callers must pass
    features that ``model.fit`` / ``model.predict`` can consume directly
    (here: the TF-IDF matrices, not the raw message text).

    Args:
        model_name: Human-readable name; printed upper-cased in the report.
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        X_train: Training features passed to ``model.fit``.
        X_test: Test features passed to ``model.predict``.
        y_train: Training labels.
        y_test: True test labels used for accuracy and the class report.

    Returns:
        The predictions produced by ``model.predict(X_test)``.
    """
    label = model_name.upper()
    print(f'BEGIN. {label}......')
    # Use the arguments rather than the module-level X_train_vect/X_test_vect,
    # so the function evaluates exactly the data it was handed.
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f'{label}: \t\t{accuracy_score(y_test, y_pred) * 100:.2f}%')
    print(classification_report(y_test, y_pred))
    print(f'END. {label}')
    print('======================================================')
    return y_pred


# Tab-separated file without a header row: column 0 is the label,
# column 1 is the message text.
data = pd.read_csv('data/sms.tsv', delimiter='\t', names=['Label', 'Message'])

print('=============Splitting the data=============')
X = data.Message
y = data.Label
print(f'Data shape: {data.shape}')

# 70/30 split; the fixed seed keeps the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
print(f'X_Train shape: {X_train.shape}, y_train shape: {y_train.shape}')
print(f'X_Test shape: {X_test.shape}, y_test shape: {y_test.shape}')

print('\n============Message Preprocessing============')
# Fit the vocabulary/IDF weights on the training messages only, then reuse
# the fitted vectorizer on the test messages so no test data leaks into it.
vectorizer = TfidfVectorizer()
X_train_vect = vectorizer.fit_transform(X_train)
X_test_vect = vectorizer.transform(X_test)

print('Training and testing data shape after pre-processing:')
print(f'X_Train shape: {X_train_vect.shape}, y_train shape: {y_train.shape}')
print(f'X_Test shape: {X_test_vect.shape}, y_test shape: {y_test.shape}')

print('\n=============Model Building==================')
# Every model is trained and scored on the TF-IDF matrices; the raw text
# Series cannot be fed to these estimators directly.
lr_model = LogisticRegression()
lr_y_pred = train_model('Logistic Regression', lr_model, X_train_vect, X_test_vect, y_train, y_test)

svm_model = LinearSVC()
svm_y_pred = train_model('Support Vector Machine', svm_model, X_train_vect, X_test_vect, y_train, y_test)

nb_model = MultinomialNB()
nb_y_pred = train_model('Naive Bayes', nb_model, X_train_vect, X_test_vect, y_train, y_test)

Data shape: (5572, 2)
X_Train shape: (3900,), y_train shape: (3900,)
X_Test shape: (1672,), y_test shape: (1672,)

Training and testing data shape after pre-processing:
X_Train shape: (3900, 7263), y_train shape: (3900,)
X_Test shape: (1672, 7263), y_test shape: (1672,)

BEGIN. LOGISTIC REGRESSION......
LOGISTIC REGRESSION: 		97.13%
              precision    recall  f1-score   support

         ham       0.97      1.00      0.98      1448
        spam       0.99      0.79      0.88       224

    accuracy                           0.97      1672
   macro avg       0.98      0.90      0.93      1672
weighted avg       0.97      0.97      0.97      1672

END. LOGISTIC REGRESSION
BEGIN. SUPPORT VECTOR MACHINE......
SUPPORT VECTOR MACHINE: 		99.22%
              precision    recall  f1-score   support

         ham       0.99      1.00      1.00      1448
        spam       0.99      0.96      0.97       224

    accuracy                           0.99      1672
   macro avg       0.99   