# SPAM Identification on SMS Data

* **Problem Statement:** Identify SPAMS on SMS data using Naive Bayes' classifier

* **Dataset:** https://archive.ics.uci.edu/ml/datasets/sms+spam+collection

* **Feature Engineering:** Perform TFIDF

* **Model:** Build a Naive Bayes model using TFIDF score of text data for SPAM identification. Saved as a .pkl file

### __`Model File`__

In [8]:
#  saved model file
saved_model_file_path ='spam_identification_naive_bayes_model.pkl'

### __`All imports`__

In [9]:
#Libraries
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble
import pandas , numpy, string
import pickle

### __`Load Model`__

In [10]:
spam_identification_naive_bayes_model = pickle.load(open(saved_model_file_path, 'rb'))

In [76]:
#Read Data
data = pandas.read_excel('D:\\Amazon Sagemaker\\SMSSpamCollection Data.xlsx',dtype=str)
data.shape

(5572, 2)

In [77]:
data.head(10)
data.columns

Index(['label', 'text'], dtype='object')

In [78]:
data['text'].dtype
#type(data)

dtype('O')

In [79]:
data.label.value_counts()

ham     4825
spam     747
Name: label, dtype: int64

In [80]:
# split the dataset into training and validation datasets 
numpy.random.seed(24)
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(data[['text']], data['label'])

In [81]:
# label encode the target variable 
encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
valid_y = encoder.fit_transform(valid_y)

In [82]:
#train_x.shape
unique_elements, counts_elements = numpy.unique(train_y, return_counts=True)
print(numpy.asarray((unique_elements, counts_elements)))

[[   0    1]
 [3609  570]]


In [83]:
#valid_x.shape
unique_elements, counts_elements = numpy.unique(valid_y, return_counts=True)
print(numpy.asarray((unique_elements, counts_elements)))

[[   0    1]
 [1216  177]]


In [84]:
# TFIDF
tfidf_vect = TfidfVectorizer(analyzer='word', stop_words='english',  max_features=100, binary= False)
#corpus_title = data['text'].tolist()
#tfidf_vect = TfidfVectorizer(min_df = 1,lowercase = False, ngram_range = (1,1), use_idf = True, stop_words='english')
tfidf_vect.fit(data['text'])
xtrain_tfidf =  tfidf_vect.transform(train_x['text'])
xvalid_tfidf =  tfidf_vect.transform(valid_x['text'])

In [85]:
#Naive Bayes Model
classifier= naive_bayes.MultinomialNB()
classifier.fit(xtrain_tfidf, train_y)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [88]:
#Prediction on train data
train_predictions = classifier.predict(xtrain_tfidf)
conf_mat_train= metrics.confusion_matrix(train_y, train_predictions)
accuracy_train= metrics.accuracy_score(train_predictions, train_y)
print("confusion_matrix_train: \n", conf_mat_train)
print("accuracy_train: \n", accuracy_train)

confusion_matrix_train: 
 [[3574   35]
 [ 144  426]]
accuracy_train: 
 0.957166786312515


In [89]:
#Prediction on test/ validation data
valid_predictions = classifier.predict(xvalid_tfidf)
conf_mat_test= metrics.confusion_matrix(valid_y, valid_predictions)
accuracy_test= metrics.accuracy_score(valid_predictions, valid_y)
print("confusion_matrix_test: \n", conf_mat_test)
print("accuracy_test: \n", accuracy_test)

confusion_matrix_test: 
 [[1205   11]
 [  44  133]]
accuracy_test: 
 0.9605168700646087


In [91]:
import pickle

In [92]:
# save the model 
filename = 'D:\\Amazon Sagemaker\\final_model.sav'
pickle.dump(classifier, open(filename, 'wb'))

In [93]:
# Load Model and make predictions
loaded_model = pickle.load(open(filename, 'rb'))
result = loaded_model.score(xvalid_tfidf, valid_y)
print(result)

0.9605168700646087
