In [2]:
import numpy as np
import pandas as pd

In [3]:
# Read the TSV file into a DataFrame object.
# Tip: pressing Tab inside the quotes lets you check you are in the correct
# folder and browse to the TSV file.
# The sep argument indicates that this file is separated by tabs.
dataframe = pd.read_csv("SMSSpamCollection.tsv", sep="\t")

In [4]:
# Preview the first rows to confirm the label and message columns loaded correctly
dataframe.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
# Count the missing values in each column (all zeros means nothing to clean up)
dataframe.isnull().sum()

label      0
message    0
dtype: int64

In [6]:
# Class balance: the counts show far more ham than spam, so judge the model
# on per-class precision/recall rather than on accuracy alone
dataframe["label"].value_counts()

ham     4825
spam     747
Name: label, dtype: int64

In [7]:
# Following scikit-learn convention, X holds the input data (here the raw
# message text as a pandas Series; it only becomes a matrix once vectorized)
# and y holds the label data
X = dataframe["message"]
y = dataframe["label"]

In [8]:
# X is a Series: the row index alongside the message text
X.head()

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
Name: message, dtype: object

In [9]:
# y is a Series: the same row index alongside the message label
y.head()

0     ham
1     ham
2    spam
3     ham
4     ham
Name: label, dtype: object

In [10]:
from sklearn.model_selection import train_test_split

# test_size=0.3 holds out 30% of the rows for testing and keeps 70% for training.
# random_state fixes the seed of the shuffle so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=1)

In [11]:
# Import count vectorizer
from sklearn.feature_extraction.text import CountVectorizer

In [12]:
# Create an instance of CountVectorizer with its default settings
count_vectorizer = CountVectorizer()

In [14]:
# fit_transform learns the vocabulary from X_train and, in the same call,
# converts each message into a row of token counts, stored in X_train_counts
X_train_counts = count_vectorizer.fit_transform(X_train)

In [15]:
# The matrix contains 3900 rows, one per training message. These are 70% of
# the original text messages (5572 rows). Each column is one token of the
# vocabulary learned from the training text.
X_train_counts

<3900x7155 sparse matrix of type '<class 'numpy.int64'>'
	with 51338 stored elements in Compressed Sparse Row format>

In [16]:
# Sanity check: X_train has the same number of rows as X_train_counts
X_train.shape

(3900,)

In [17]:
# Load the TfidfTransformer
from sklearn.feature_extraction.text import TfidfTransformer

# Create an instance of TfidfTransformer
tfidf_transformer = TfidfTransformer()

# Perform a tf-idf fit and transform on the X_train_counts
# sparse matrix. The result is stored in X_train_transform
X_train_transform = tfidf_transformer.fit_transform(X_train_counts)

# The shape is the same as the count matrix from the count vectorizer,
# although the values are now term frequencies multiplied by the
# inverse document frequency
X_train_transform.shape

(3900, 7155)

## Alternatively: count and tf-idf weight in one step with TfidfVectorizer

In [18]:
# Load the vectorizer class
from sklearn.feature_extraction.text import TfidfVectorizer
# Create an instance of the TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer()
# TfidfVectorizer does the counting and the tf-idf weighting in one step,
# so it is fitted directly on the raw X_train text
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
# Examine the shape of the result: same rows and columns as the two-step version
X_train_tfidf.shape

(3900, 7155)

In [19]:
from sklearn.svm import LinearSVC

In [20]:
# Contents of X_test: the held-out messages, still raw text at this point
X_test.head()

1078                         Yep, by the pretty sculpture
4028        Yes, princess. Are you going to make me moan?
958                            Welp apparently he retired
4642                                              Havent.
4674    I forgot 2 ask ü all smth.. There's a card on ...
Name: message, dtype: object

In [21]:
# Create an instance of the LinearSVC classifier with default hyperparameters
classifier = LinearSVC()

# fit(X, y) expects:
#   X : {array-like, sparse matrix} - here the tf-idf training matrix
#   y : array-like, shape = [n_samples] - the target label for each row of X
classifier.fit(X_train_tfidf, y_train)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [22]:
# Vectorize the raw test messages with the vectorizer already fitted on the
# training data. Use transform() only, not fit_transform(): refitting here
# would rebuild the vocabulary and idf weights from the test set, so the
# columns would no longer line up with what the classifier was trained on
X_test_transform = tfidf_vectorizer.transform(X_test)

# Predict the message type of each test message with the LinearSVC classifier
predictions = classifier.predict(X_test_transform)

In [23]:
# predictions holds one predicted label per message in the test data
predictions.shape

(1672,)

In [24]:
# Show a confusion matrix of the results.
# Rows are the true labels and columns are the predicted labels, both in
# sorted label order (ham, spam)
from sklearn import metrics
print(metrics.confusion_matrix(y_test, predictions))

[[1437    5]
 [  20  210]]


In [25]:
# Print a classification report: per-class precision, recall and F1,
# followed by the averaged scores
print(metrics.classification_report(y_test,predictions))

              precision    recall  f1-score   support

         ham       0.99      1.00      0.99      1442
        spam       0.98      0.91      0.94       230

   micro avg       0.99      0.99      0.99      1672
   macro avg       0.98      0.95      0.97      1672
weighted avg       0.98      0.99      0.98      1672



In [26]:
# Typical ham text message.
# transform() expects an iterable of documents, hence the one-element list
sample_text_message = ["I'm going to go to work soon"]
transformed_text = tfidf_vectorizer.transform(sample_text_message)

# Classify the vectorized message with the trained classifier
model_output = classifier.predict(transformed_text)
print(model_output)

['ham']


In [27]:
# Typical spam text message.
# transform() expects an iterable of documents, hence the one-element list
sample_text_message = ["You can win a holiday! Text 23455 to take this offer up."]
transformed_text = tfidf_vectorizer.transform(sample_text_message)

# Classify the vectorized message with the trained classifier
model_output = classifier.predict(transformed_text)
print(model_output)

['spam']


In [28]:
def predict_message_type(text_message, vectorizer=None, model=None):
    """Predict the label of a single text message.

    Parameters
    ----------
    text_message : str
        The raw text of one message.
    vectorizer : object with a ``transform`` method, optional
        Fitted vectorizer used to turn the text into features. Defaults to
        the notebook-level ``tfidf_vectorizer``.
    model : object with a ``predict`` method, optional
        Fitted classifier applied to the vectorized message. Defaults to
        the notebook-level ``classifier``.

    Returns
    -------
    The value returned by ``model.predict`` for the one-message batch; with
    the notebook's classifier this is a one-element array such as
    ``array(['spam'], dtype=object)``.
    """
    # Fall back to the objects fitted earlier in the notebook when the caller
    # does not supply their own.
    if vectorizer is None:
        vectorizer = tfidf_vectorizer
    if model is None:
        model = classifier

    # transform() expects an iterable of documents; wrapping the string in a
    # list stops it from being treated as a sequence of characters.
    transformed_message = vectorizer.transform([text_message])

    # Predict the label from the vectorized message.
    return model.predict(transformed_message)

In [29]:
# Typical spam message: an unsolicited prompt to click a link
predict_message_type("Your invoice is attached to this text. Click this link to download it.")

array(['spam'], dtype=object)

In [31]:
# Typical ham message: an ordinary conversational text
predict_message_type("Hi there. Hows things with you today? Are you heading out for some food?")

array(['ham'], dtype=object)

## Pipeline

In [33]:
from sklearn.pipeline import Pipeline

# TfidfVectorizer and LinearSVC were imported in earlier cells. When running
# this section on its own, uncomment the two imports below first
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.svm import LinearSVC

# Chain the vectorizer and the classifier so one object handles both steps
text_classifier = Pipeline([('tfidf_vect', TfidfVectorizer()),
                     ('LinearSVC_classifier', LinearSVC())])

# Feed the raw training text through the pipeline
text_classifier.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('tfidf_vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=...ax_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))])

In [34]:
# Form a prediction set from the raw test messages; the pipeline vectorizes
# them itself. Note: this rebinds `predictions`, replacing the predictions
# made earlier by the standalone classifier
predictions = text_classifier.predict(X_test)

In [35]:
# Report the confusion matrix for the pipeline's predictions
# (rows are true labels, columns are predicted labels)
print(metrics.confusion_matrix(y_test,predictions))

[[1437    5]
 [  20  210]]


In [36]:
# Print a classification report for the pipeline's predictions
print(metrics.classification_report(y_test,predictions))

              precision    recall  f1-score   support

         ham       0.99      1.00      0.99      1442
        spam       0.98      0.91      0.94       230

   micro avg       0.99      0.99      0.99      1672
   macro avg       0.98      0.95      0.97      1672
weighted avg       0.98      0.99      0.98      1672



In [None]:
# Overall fraction of test messages whose predicted label matches y_test
print(metrics.accuracy_score(y_test,predictions))