In [22]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB

In [24]:
# Load the tab-separated SMS dataset, split it into train/test sets, turn the
# message text into TF-IDF features and fit three classifiers on those features.
# Later cells reference names defined here (e.g. X_train, y), so every
# variable name below is kept stable.

# One seed shared by every stochastic step so a fresh "Run All" reproduces the
# same split and the same fitted models.
RANDOM_STATE = 42

# Column names are supplied explicitly via `names`; the file is read as
# tab-separated.
data = pd.read_csv('data/sms.tsv', sep='\t', names=['Label', 'Message'])

print('=============Splitting the data=============')
X = data['Message']
y = data['Label']
print(f'Data shape: {data.shape}')

# Hold out 30% of the rows for testing.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=RANDOM_STATE
)
print(f'X_Train shape: {X_train.shape}, y_train shape: {y_train.shape}')
print(f'X_Test shape: {X_test.shape}, y_test shape: {y_test.shape}')

print('\n============Message Preprocessing============')
# The vectorizer is fitted on the training messages only; the test messages are
# transformed with that already-fitted vectorizer.
vectorizer = TfidfVectorizer()
X_train_vect = vectorizer.fit_transform(X_train)
X_test_vect = vectorizer.transform(X_test)

print('Training and testing data shape after pre-processing:')
print(f'X_Train shape: {X_train_vect.shape}, y_train shape: {y_train.shape}')
print(f'X_Test shape: {X_test_vect.shape}, y_test shape: {y_test.shape}')

print('\n=============Model Building==================')
print('BEGIN. Logistic Regression......')
lr_model = LogisticRegression()
lr_model.fit(X_train_vect, y_train)
lr_y_pred = lr_model.predict(X_test_vect)
print('END. Logistic Regression\n')

print('BEGIN. Support Vector Machine.........')
# LinearSVC's solver can shuffle the data using a pseudo-random generator;
# without a seed the fitted model (and its predictions) may differ between runs.
svm_model = LinearSVC(random_state=RANDOM_STATE)
svm_model.fit(X_train_vect, y_train)
svm_y_pred = svm_model.predict(X_test_vect)
print('END. Support Vector Machine\n')

print('BEGIN. Naive Bayes..........')
nb_model = MultinomialNB()
nb_model.fit(X_train_vect, y_train)
nb_y_pred = nb_model.predict(X_test_vect)
print('END. Naive Bayes\n')

Data shape: (5572, 2)
X_Train shape: (3900,), y_train shape: (3900,)
X_Test shape: (1672,), y_test shape: (1672,)

Training and testing data shape after pre-processing:
X_Train shape: (3900, 7263), y_train shape: (3900,)
X_Test shape: (1672, 7263), y_test shape: (1672,)

BEGIN. Logistic Regression......
END. Logistic Regression

BEGIN. Support Vector Machine.........
END. Support Vector Machine

BEGIN. Naive Bayes..........
END. Naive Bayes



In [17]:
# Inspect the training-split messages (pandas abbreviates the long Series in the output).
# NOTE(review): this cell's execution count (17) is lower than that of the cell
# that defines X_train (24) — re-run the notebook top to bottom to confirm the
# displayed output is current.
X_train

708     Quite late lar... Ard 12 anyway i wun b drivin...
4338                        on a Tuesday night r u 4 real
5029    Go chase after her and run her over while she'...
4921     G says you never answer your texts, confirm/deny
2592         Still work going on:)it is very small house.
                              ...                        
3772    Hi, wlcome back, did wonder if you got eaten b...
5191                               Sorry, I'll call later
5226        Prabha..i'm soryda..realy..frm heart i'm sory
5390                           Nt joking seriously i told
860               Did he just say somebody is named tampa
Name: Message, Length: 3900, dtype: object

In [8]:
# Number of labels in the full, unsplit dataset (y is the Label column of `data`).
y.shape

(5572,)