In [1]:
#Use-case: SMS Spam Classification
#Goal: You need to create a model that can predict whether the given sms is a spam or ham sms

In [2]:
import pandas as pd
import numpy as np

In [3]:
data = pd.read_csv('SMSSpamCollection' , sep='\t', names=['label','message'])

In [4]:
data.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   label    5572 non-null   object
 1   message  5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [6]:
data.label.value_counts()

ham     4825
spam     747
Name: label, dtype: int64

In [7]:
#Unbalanced Dataset
# 1. Test for Generalization : Accuracy
# 2. Quality Check (Comparing CL): F1 Score

In [8]:
# Seperate data as features and label
features = data.iloc[:,[1]].values
label = data.iloc[:,0].values

In [10]:
# Perform Text Preprocessing
# Your creativity can be applied considering it resonates with the dataset.
#
# We will create a text preprocessing function that can perform the following:
# 1. Remove Punctuation
# 2. Extract words out of the sentences
# 3. Normalize data in lowercase
# 4. Remove stopwords
# ...

In [15]:
#Punctuation Removal Logic
import string
text = "Welcome to Simplilearn! You are learning NLP !"
processedtext = ''.join([ char for char in text if char not in string.punctuation ])

In [23]:
#Seperate words from string and normalize it

wordsInLowerCase = [word.lower() for word in processedtext.split(" ")]

In [19]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [22]:
#Remove Stopwords
import nltk
from nltk.corpus import stopwords

[word for word in wordsInLowerCase if word not in stopwords.words('english')]

['welcome', 'simplilearn', 'learning', 'nlp', '']

In [27]:
import string
from nltk.corpus import stopwords
def textPreprocessing(document):
  #Remove Punctuations
  processedtext = ''.join([ char for char in document if char not in string.punctuation ])
  ##Seperate words from string and normalize it
  wordsInLowerCase = [word.lower() for word in processedtext.split(" ")]
  # Generate Vocab
  return [word for word in wordsInLowerCase if word not in stopwords.words('english')]

In [28]:
textPreprocessing("Welcome to Simplilearn! You are learning NLP !")

['welcome', 'simplilearn', 'learning', 'nlp', '']

In [29]:
#Create BOW using Sklearn

from sklearn.feature_extraction.text import CountVectorizer
wordVector = CountVectorizer(analyzer=textPreprocessing)

#Build the Vocabulary

finalWordVectorVocab = wordVector.fit(features)

In [33]:
len(finalWordVectorVocab.vocabulary_)

13431

In [32]:
# THe above shows that punctuation still exists
# Figure out how to remove those using Sklearn countvectorizer method !!!

In [None]:
#fit_transform()

In [34]:
#To create BOW object
bagOfWords = finalWordVectorVocab.transform(features)

In [35]:
bagOfWords

<5572x13431 sparse matrix of type '<class 'numpy.int64'>'
	with 53461 stored elements in Compressed Sparse Row format>

In [36]:
#Apply TF IDF algo on BOW to create featureColumn

from sklearn.feature_extraction.text import TfidfTransformer
tfIdfObject = TfidfTransformer().fit(bagOfWords) #Calc IDF values

In [37]:
processedFeaturesCol = tfIdfObject.transform(bagOfWords)

In [66]:
# Create Train Test Split
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(processedFeaturesCol,
                                                 label,
                                                 test_size=0.2,
                                                 random_state=6)


In [67]:
# Build the model using LogisticRegression

from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(X_train,y_train)

LogisticRegression()

In [68]:

# Check the Quality of the Model
# SL received from the client is 0.35



In [69]:
#1. Test For Generalization

print("Training Score: ",model.score(X_train,y_train))
print("Testing Score: ",model.score(X_test,y_test))

Training Score:  0.9584922593672874
Testing Score:  0.9587443946188341


In [70]:
# Check the Quality of the Model
# SL received from the client is 0.35
# CL = 0.65

#Use F1 Score

from sklearn.metrics import classification_report
print(classification_report(label, model.predict(processedFeaturesCol)))

              precision    recall  f1-score   support

         ham       0.95      1.00      0.98      4825
        spam       1.00      0.69      0.82       747

    accuracy                           0.96      5572
   macro avg       0.98      0.85      0.90      5572
weighted avg       0.96      0.96      0.96      5572



In [None]:
# Take Avg of F1 Score and compare it with CL
# (0.98 + 0.82) / 2 = 0.90 > CL
# Model Accepted !!!

In [75]:
# Deploy the model
#
# Input
smsInput = input("Enter SMS: ")
# Preprocess the Input

preprocessedFeature = textPreprocessing(smsInput)

# BOW
bowFeature = finalWordVectorVocab.transform(preprocessedFeature)

#TFIDF

actualFeature = tfIdfObject.transform(bowFeature)

#Predict

predLabel = model.predict(actualFeature)[0] #display whatever is max(voting)

print("Given SMS is a {} sms".format(predLabel))

Enter SMS: Win Lottery Guaranteed !
Given SMS is a spam sms


In [None]:
# Deployment
# finalWordVectorVocab
# tfIdfObject
# export textPreprocessing()