<a href="https://colab.research.google.com/github/fulasho/Predictive-Analytics/blob/main/NPL_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Let's import the necessary libraries**

In [None]:
import numpy as np
import pandas as pd
import string
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Load the SMS spam collection with explicit column names.
# NOTE(review): because `names` is given without `header=0`, the file's own
# header line is presumably read as the first data row — confirm.
email_data_df = pd.read_csv(
    'https://raw.githubusercontent.com/glopez21/ML-Data/main/SMSSpamCollection.csv',
    names=['class', 'message'],
)

In [None]:
# Drop the first row.
# NOTE: this cell is not idempotent — every re-run removes one more row, so
# execute it exactly once after loading the data.
email_data_df = email_data_df.iloc[1:]

# Let's display the head

In [None]:
# Preview the first five rows of the loaded frame.
email_data_df.head(n=5)

Unnamed: 0,class,message
1,0.0,Go until jurong point
2,0.0,Ok lar... Joking wif u oni...
3,1.0,Free entry in 2 a wkly comp to win FA Cup fina...
4,0.0,U dun say so early hor... U c already then say...
5,0.0,Nah I dont think he goes to usf


In [None]:
# Look up the message stored under index label 1 (label-based, not positional).
email_data_df.loc[1, 'message']

'Go until jurong point'

In [None]:
# Look up the message stored under index label 3 (label-based, not positional).
email_data_df.loc[3, 'message']

'Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry questionstd txt rateTCs apply 08452810075over18s'

In [None]:
# Character count of the message stored under index label 3.
len(email_data_df.loc[3, 'message'])

150

In [None]:
# Discard rows with a missing class or message. Reassigning the result
# (rather than using inplace=True) avoids mutating a frame in place.
email_data_df = email_data_df.dropna()

# Let's remove punctuation and stopwords

In [None]:
def message_text_pre_process(text_message, stop_words=None):
  """Tokenize a message, dropping punctuation characters and stopwords.

  Args:
    text_message: The raw message string.
    stop_words: Optional collection of lowercase words to discard. When
      omitted, NLTK's English stopword list is used.

  Returns:
    A list of the remaining whitespace-separated words, in their original
    order and with their original casing.
  """
  if stop_words is None:
    # Build the lookup set once per call rather than inside the comprehension,
    # where the NLTK list would be reloaded and scanned for every word.
    stop_words = set(stopwords.words('english'))
  # Strip punctuation character by character, then split on whitespace.
  without_punct = ''.join(char for char in text_message if char not in string.punctuation)
  # Stopword matching is case-insensitive; surviving words keep their casing.
  return [word for word in without_punct.split() if word.lower() not in stop_words]

In [None]:
# Sanity-check the tokenizer on the first ten messages only.
email_data_df['message'].iloc[:10].apply(message_text_pre_process)

1                                   [Go, jurong, point]
2                        [Ok, lar, Joking, wif, u, oni]
3     [Free, entry, 2, wkly, comp, win, FA, Cup, fin...
4         [U, dun, say, early, hor, U, c, already, say]
5                         [Nah, dont, think, goes, usf]
6     [FreeMsg, Hey, darling, 3, weeks, word, back, ...
7     [Even, brother, like, speak, treat, like, aids...
8     [per, request, Melle, Melle, Oru, Minnaminungi...
9     [WINNER, valued, network, customer, selected, ...
10    [mobile, 11, months, U, R, entitled, Update, l...
Name: message, dtype: object

In [None]:
# The preview above did not modify the frame; show the first five rows again.
email_data_df.head(n=5)

Unnamed: 0,class,message
1,0.0,Go until jurong point
2,0.0,Ok lar... Joking wif u oni...
3,1.0,Free entry in 2 a wkly comp to win FA Cup fina...
4,0.0,U dun say so early hor... U c already then say...
5,0.0,Nah I dont think he goes to usf


In [None]:
# Map the raw class codes to readable labels: 1.0 -> 'spam', anything else -> 'ham'.
# Comparing on str(x) accepts both the text '1.0' and a numeric 1.0, and
# keeping 'spam' in the accepted set makes the cell idempotent: re-running it
# no longer relabels every row as 'ham'.
email_data_df['class'] = email_data_df['class'].apply(
    lambda x: 'spam' if str(x) in ('1.0', 'spam') else 'ham'
)

In [None]:
# Learn the vocabulary from all messages, using the custom tokenizer as the
# analyzer.
bag_of_words = (
    CountVectorizer(analyzer=message_text_pre_process)
    .fit(email_data_df['message'])
)

In [None]:
bag_of_words_trf = bag_of_words.transform(email_data_df['message'])
# CountVectorizer.transform encodes each message as a row of token counts over
# the vocabulary learned by fit, returning a sparse document-term matrix.

In [None]:
# Learn the inverse-document-frequency weights from the count matrix.
tfidf_fit = (
    TfidfTransformer()
    .fit(bag_of_words_trf)
)

In [None]:
# Convert the raw token counts into TF-IDF weights using the fitted transformer.
tfidf_trf = tfidf_fit.transform(bag_of_words_trf)

In [None]:
# Train a multinomial Naive Bayes classifier on the TF-IDF features of every
# message, with the 'spam'/'ham' labels as targets.
spam_detector_model = (
    MultinomialNB()
    .fit(tfidf_trf, email_data_df['class'])
)

In [None]:
# Pick the message stored under index label 3 as a single test example.
test_message = email_data_df.loc[3, 'message']

In [None]:
# transform expects an iterable of documents, so the single message is wrapped
# in a list.
bag_of_words_test_message = bag_of_words.transform([test_message])

In [None]:
# Apply the already-fitted TF-IDF weighting to the test message's counts.
# (The variable name's spelling is relied on by the following cell.)
tfidf_test_messsge = tfidf_fit.transform(bag_of_words_test_message)

In [None]:
# predict returns one label per input row; [0] extracts the label of the
# single test message.
spam_detector_model.predict(tfidf_test_messsge)[0]

'spam'

In [None]:
# Ground-truth label of the same message (index label 3), for comparison with
# the prediction.
email_data_df.loc[3, 'class']

'spam'

In [None]:
# Check the class balance: number of messages per label.
email_data_df['class'].value_counts()

ham     4823
spam     747
Name: class, dtype: int64

In [None]:
# Predict on the very rows the model was fitted on; metrics derived from this
# measure how well the model fits its training data, not how it generalizes.
prediction_for_all_messages = spam_detector_model.predict(tfidf_trf)

In [None]:
# Show the predicted labels (long arrays are abbreviated with an ellipsis).
print(prediction_for_all_messages)

['ham' 'ham' 'spam' ... 'ham' 'ham' 'ham']


In [None]:
from sklearn.metrics import classification_report

In [None]:
# Training-set report: ground-truth labels first, predictions second.
print(
    classification_report(
        y_true=email_data_df['class'],
        y_pred=prediction_for_all_messages,
    )
)

              precision    recall  f1-score   support

         ham       0.97      1.00      0.99      4823
        spam       1.00      0.81      0.90       747

    accuracy                           0.98      5570
   macro avg       0.99      0.91      0.94      5570
weighted avg       0.98      0.98      0.97      5570



## Pipeline Building

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out part of the data for evaluation. A fixed random_state makes the
# split — and every metric computed from it — reproducible across kernel
# restarts.
msg_train, msg_test, class_train, class_test = train_test_split(
    email_data_df['message'],
    email_data_df['class'],
    random_state=42,
)

In [None]:
# Confirm the split sizes: messages and labels must line up within each subset.
print(
    msg_train.shape,
    msg_test.shape,
    class_train.shape,
    class_test.shape,
    sep='\n',
)

(4177,)
(1393,)
(4177,)
(1393,)


In [None]:
from sklearn.pipeline import Pipeline

In [None]:
# Chain vectorizing, TF-IDF weighting and Naive Bayes into a single estimator
# so the same preprocessing is applied at both fit and predict time.
text_pipeline = Pipeline(steps=[
    ('bag_of_words', CountVectorizer(analyzer=message_text_pre_process)),
    ('tfidf', TfidfTransformer()),
    ('classifier', MultinomialNB()),
])

In [None]:
# Fit every pipeline step on the training split only.
text_pipeline.fit(X=msg_train, y=class_train)

Pipeline(steps=[('bag_of_words',
                 CountVectorizer(analyzer=<function message_text_pre_process at 0x7f8fb528f9e0>)),
                ('tfidf', TfidfTransformer()),
                ('classifier', MultinomialNB())])

In [None]:
# Predict labels for the held-out messages; the pipeline runs its fitted
# vectorizer and TF-IDF steps before the classifier.
text_pred = text_pipeline.predict(msg_test)

In [None]:
# classification_report expects (y_true, y_pred): ground-truth labels first.
# Passing them in the opposite order swaps precision with recall and reports
# support for the predicted classes instead of the true ones.
print(classification_report(class_test, text_pred))

              precision    recall  f1-score   support

         ham       1.00      0.95      0.97      1282
        spam       0.62      1.00      0.77       111

    accuracy                           0.95      1393
   macro avg       0.81      0.97      0.87      1393
weighted avg       0.97      0.95      0.96      1393



In [None]:
# Peek at the first held-out message by position (iloc), independent of the
# index labels carried over from the original frame.
msg_test.iloc[0]

'Dear are you angry i was busy dear'