## HAM and SPAM text message classification model

In [21]:
import spacy
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics


In [2]:
nlp = spacy.load('en_core_web_sm')

In [3]:
 sms_df = pd.read_csv("../sample_files/smsspamcollection.tsv",sep="\t")

In [4]:
sms_df.head()

Unnamed: 0,label,message,length,punct
0,ham,"Go until jurong point, crazy.. Available only ...",111,9
1,ham,Ok lar... Joking wif u oni...,29,6
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155,6
3,ham,U dun say so early hor... U c already then say...,49,6
4,ham,"Nah I don't think he goes to usf, he lives aro...",61,2


In [5]:
# checking is there any null values in the data
sms_df.isnull().sum()

label      0
message    0
length     0
punct      0
dtype: int64

In [6]:
len(sms_df)

5572

In [7]:
# number of ham and spam messages
a = sms_df[sms_df["label"] == "spam"]
print(len(a)) 

747


In [8]:
sms_df["label"].unique()

array(['ham', 'spam'], dtype=object)

In [9]:
sms_df["label"].value_counts()

ham     4825
spam     747
Name: label, dtype: int64

In [10]:
# For now we are trying to build a simple matchine learning model that tries to 
# check if a message is ham or spam based on the length and punchuation. Later we use text message.

In [11]:
X = sms_df[["length","punct"]]
Y = sms_df["label"]
x_train, x_test, y_train, y_test = train_test_split(X,Y,test_size=0.3, random_state=42)

In [12]:
print(x_train.shape)

(3900, 2)


In [13]:
lr_model = LogisticRegression()

In [14]:
lr_model.fit(x_train,y_train)

LogisticRegression()

In [15]:
# checking the accuracy of the model
prediction = lr_model.predict(x_test)

In [19]:
prediction

array(['ham', 'ham', 'ham', ..., 'ham', 'ham', 'ham'], dtype=object)

In [23]:
# now checking this prediction with y_test to check the accuracy
metrics.confusion_matrix(y_test, prediction)

array([[1404,   44],
       [ 219,    5]], dtype=int64)

In [26]:
print(metrics.classification_report(y_test, prediction))

              precision    recall  f1-score   support

         ham       0.87      0.97      0.91      1448
        spam       0.10      0.02      0.04       224

    accuracy                           0.84      1672
   macro avg       0.48      0.50      0.48      1672
weighted avg       0.76      0.84      0.80      1672



In [29]:
# printing accuracy of the prediction
print(metrics.accuracy_score(y_test, prediction))

0.8427033492822966


In [30]:
# you can also learn different models to check if their accuracy score is better

In [31]:
from sklearn.naive_bayes import MultinomialNB
nb_model = MultinomialNB()
nb_model.fit(x_train, y_train)
nb_prediction = nb_model.predict(x_test)

In [33]:
print(metrics.accuracy_score(y_test, nb_prediction))

0.8600478468899522


In [35]:
print(metrics.confusion_matrix(y_test,nb_prediction))

[[1438   10]
 [ 224    0]]


## Text feature extration
- to use message text in the matchine learning model

- Matchine learning models can't take text in algorithm. so we do feature extraction on that text
- some are - Count vectorization or Term-frequency, Inverse Document frequency
### Term Frequency and Inverse Document frequency
- raw count of a term in a Document
- term frequency isn't enough because it gives priority to most common occuring words
- so we use inverse document frequency there
- obtained by :- diving total number of documnet by document containing the word and then log the quotient
- tf_idf allow us to understand the context of word accross the entire set of documents

In [41]:
X = sms_df["message"]
y = sms_df["label"]
x_train,x_test, y_train, y_test = train_test_split(X,y,random_state=42, test_size=0.3)

In [43]:
# vectorising text messages
from sklearn.feature_extraction.text import CountVectorizer

In [44]:
count_vect = CountVectorizer()

In [46]:
# step1 - fit the data to vectorizer
# step2 - transform the data to vector
# there is one method to perform both steps
x_train_vect = count_vect.fit_transform(x_train)


In [48]:
# giving the words which have less frequency more weight with if_idf

In [49]:
from sklearn.feature_extraction.text import TfidfTransformer

In [50]:
tf_idf = TfidfTransformer()
x_train_tf_idf = tf_idf.fit_transform(x_train_vect)

In [52]:
# here we have done these two steps to get vector and then tfidf
# scikit learn provides us one fuction that does both these steps
# tfidf vectorisation
from sklearn.feature_extraction.text import TfidfVectorizer

In [53]:
tf_idf_vect = TfidfVectorizer()
x_train_tfidf_vect = tf_idf_vect.fit_transform(x_train)

In [54]:
from sklearn.svm import LinearSVC

In [55]:
lsvc = LinearSVC()
lsvc.fit(x_train_tfidf_vect, y_train)

LinearSVC()

In [58]:
# we can add all of the above steps in the pipeline so we don't need to repeat these steps while predicting

In [60]:
from sklearn.pipeline import Pipeline

In [61]:
text_clf = Pipeline([("tfid",TfidfVectorizer()),("lsvc",LinearSVC())])

In [62]:
text_clf.fit(x_train,y_train)

Pipeline(steps=[('tfid', TfidfVectorizer()), ('lsvc', LinearSVC())])

In [63]:
prediction = text_clf.predict(x_test)

In [66]:
metrics.confusion_matrix(prediction,y_test)

array([[1445,   10],
       [   3,  214]], dtype=int64)

In [68]:
metrics.accuracy_score(prediction,y_test)

0.9922248803827751

In [77]:
text_clf.predict(["hi how are you! i have a great offer for you. just need you to click below and register and provide your card details."])

array(['ham'], dtype=object)

In [80]:
text_clf.predict(["congratulations! you have been selected as a winner"])

array(['spam'], dtype=object)