# NLP process 
- Text analysis  50%
- Text Transformation 40%
- Model development 5%
- ui application 5%

# text analysis

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
print(plt.style.available)

['Solarize_Light2', '_classic_test_patch', '_mpl-gallery', '_mpl-gallery-nogrid', 'bmh', 'classic', 'dark_background', 'fast', 'fivethirtyeight', 'ggplot', 'grayscale', 'seaborn', 'seaborn-bright', 'seaborn-colorblind', 'seaborn-dark', 'seaborn-dark-palette', 'seaborn-darkgrid', 'seaborn-deep', 'seaborn-muted', 'seaborn-notebook', 'seaborn-paper', 'seaborn-pastel', 'seaborn-poster', 'seaborn-talk', 'seaborn-ticks', 'seaborn-white', 'seaborn-whitegrid', 'tableau-colorblind10']


In [3]:
# plt.style.available    --- use to see available themes
plt.style.use('ggplot')

In [4]:
sns.set_style('whitegrid')

```
pip install nltk
```

In [5]:
import nltk

In [6]:
nltk.download('all')

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_ru is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package basque_grammars to
[nltk_data]    |     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]    | 

True

In [7]:
from nltk.corpus import stopwords
import string
from nltk.tokenize import word_tokenize

In [8]:
messages = pd.read_csv('../datasets/spam_ham_dataset.csv',index_col=0)
messages.rename({'label':'category'},axis=1,inplace=True)
messages

Unnamed: 0,category,text,label_num
605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
4685,spam,"Subject: photoshop , windows , office . cheap ...",1
2030,ham,Subject: re : indian springs\r\nthis deal is t...,0
...,...,...,...
1518,ham,Subject: put the 10 on the ft\r\nthe transport...,0
404,ham,Subject: 3 / 4 / 2000 and following noms\r\nhp...,0
2933,ham,Subject: calpine daily gas nomination\r\n>\r\n...,0
1409,ham,Subject: industrial worksheets for august 2000...,0


# text tranformation

In [9]:
from nltk.stem import SnowballStemmer

In [10]:
def cleanText(text):
    text = text.lower()
    text = text.translate(str.maketrans('','',string.punctuation))
    words = word_tokenize(text)
    # print(" ".join(words),len(words))
    words = [w for w in words if w not in stopwords.words('english')]
    stemmer = SnowballStemmer('english')
    words = [stemmer.stem(w) for w in words]
    # print(" ".join(words),len(words))
    return " ".join(words)

In [11]:
messages.text = messages.text.apply(cleanText)

In [12]:
messages.head()

Unnamed: 0,category,text,label_num
605,ham,subject enron methanol meter 988291 follow not...,0
2349,ham,subject hpl nom januari 9 2001 see attach file...,0
3624,ham,subject neon retreat ho ho ho around wonder ti...,0
4685,spam,subject photoshop window offic cheap main tren...,1
2030,ham,subject indian spring deal book teco pvr reven...,0


In [13]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [36]:
cv = CountVectorizer(max_features=1000)
X = cv.fit_transform(messages.text).toarray()
X.shape

(5171, 1000)

In [30]:
tf = TfidfTransformer()
tX = tf.fit_transform(X).toarray()
tX.shape

(5171, 1000)

Train the model

In [31]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix, classification_report

In [39]:
xtrain,xtest,ytrain,ytest = train_test_split(tX,messages.label_num,test_size=0.2,random_state=123)

In [40]:
clf = GaussianNB()
clf.fit(xtrain,ytrain)
ypred = clf.predict(xtest)

In [41]:
print(confusion_matrix(ytest,ypred))

[[719  28]
 [ 13 275]]


In [42]:
print(classification_report(ytest,ypred))

              precision    recall  f1-score   support

           0       0.98      0.96      0.97       747
           1       0.91      0.95      0.93       288

    accuracy                           0.96      1035
   macro avg       0.94      0.96      0.95      1035
weighted avg       0.96      0.96      0.96      1035



In [44]:
email = messages.text[4685]
clean_email = cleanText(email)
X = tf.transform(cv.transform([clean_email]).toarray()).toarray()
p = clf.predict(X)
if p[0] == 1:
    print("Spam")
else:
    print("Ham")

Spam


In [37]:
from joblib import dump

In [45]:
clf_dict = {
    'clf':clf,
    'count_vec':cv,
    'tfidf_vec':tf,
    'title':'spam detector'
}

dump(clf_dict, "spam_predictor.pkl")

['spam_predictor.pkl']