In [1]:
#!pyppeteer-install

In [2]:
#!pip install nltk

In [3]:
import numpy as np
import pandas as pd
import pickle

In [4]:
import pprint
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

## Data Collection 

In [5]:
data=pd.read_csv("/kaggle/input/dataset3/spam.csv")

In [6]:
df=pd.DataFrame(data)
print(df)

     Category                                            Message
0         ham  Go until jurong point, crazy.. Available only ...
1         ham                      Ok lar... Joking wif u oni...
2        spam  Free entry in 2 a wkly comp to win FA Cup fina...
3         ham  U dun say so early hor... U c already then say...
4         ham  Nah I don't think he goes to usf, he lives aro...
...       ...                                                ...
5567     spam  This is the 2nd time we have tried 2 contact u...
5568      ham               Will ü b going to esplanade fr home?
5569      ham  Pity, * was in mood for that. So...any other s...
5570      ham  The guy did some bitching but I acted like i'd...
5571      ham                         Rofl. Its true to its name

[5572 rows x 2 columns]


## Data Analysis and Cleaning

In [7]:
df.columns

Index(['Category', 'Message'], dtype='object')

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [9]:
df.duplicated().sum()
print(df[df.duplicated()])

     Category                                            Message
103       ham  As per your request 'Melle Melle (Oru Minnamin...
154       ham  As per your request 'Melle Melle (Oru Minnamin...
207       ham  As I entered my cabin my PA said, '' Happy B'd...
223       ham                             Sorry, I'll call later
326       ham                   No calls..messages..missed calls
...       ...                                                ...
5524     spam  You are awarded a SiPix Digital Camera! call 0...
5535      ham  I know you are thinkin malaria. But relax, chi...
5539      ham                         Just sleeping..and surfing
5553      ham                        Hahaha..use your brain dear
5558      ham                             Sorry, I'll call later

[415 rows x 2 columns]


In [10]:
df.drop_duplicates(inplace=True)

In [11]:
df

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [12]:
y=df['Category']

In [13]:
X=df['Message']

In [14]:
y

0        ham
1        ham
2       spam
3        ham
4        ham
        ... 
5567    spam
5568     ham
5569     ham
5570     ham
5571     ham
Name: Category, Length: 5157, dtype: object

In [15]:
X[11:15]

11    SIX chances to win CASH! From 100 to 20,000 po...
12    URGENT! You have won a 1 week FREE membership ...
13    I've been searching for the right words to tha...
14                  I HAVE A DATE ON SUNDAY WITH WILL!!
Name: Message, dtype: object

In [16]:
def text_cleaning(corpus):

    #lower case the corpus
    corpus = str(corpus).lower()
    #print(corpus)

    #removing digits in the corpus
    import re
    corpus = re.sub(r'\d+','', corpus)
    #print("\n",corpus)

    #removing punctuations
    import string
    corpus = corpus.translate(str.maketrans('', '', string.punctuation))
    #print("\n",corpus)

    #removing trailing whitespaces
    corpus = ' '.join([token for token in corpus.split()])
    #print("\n",corpus)

    tokenized_corpus_nltk = word_tokenize(corpus)
    #print("\nNLTK\nTokenized corpus\n",tokenized_corpus_nltk)

    stop_words_nltk = set(stopwords.words('english'))
    #pprint.pprint(stop_words_nltk)

    tokenized_corpus_without_stopwords = [i for i in tokenized_corpus_nltk if i not in stop_words_nltk]
    #print("Tokenized corpus without stopwords:"," ".join(tokenized_corpus_without_stopwords))
    return " ".join(tokenized_corpus_without_stopwords)

In [17]:
X=X.apply(text_cleaning)

In [18]:
X[2]

'free entry wkly comp win fa cup final tkts st may text fa receive entry questionstd txt ratetcs apply overs'

## Features Extraction

In [19]:
from sklearn.preprocessing import LabelEncoder
Encoder=LabelEncoder()

In [20]:
y=Encoder.fit_transform(y)
print(Encoder.classes_)
print(Encoder.transform(Encoder.classes_))
label_mapping = dict(zip(Encoder.classes_, Encoder.transform(Encoder.classes_)))
print("Label Mapping:",label_mapping)
print(y)

['ham' 'spam']
[0 1]
Label Mapping: {'ham': 0, 'spam': 1}
[0 0 1 ... 0 0 0]


### Train-Test-split

In [21]:
from sklearn.model_selection import train_test_split

In [22]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

In [23]:
print(len(X_train))

4125


In [24]:
print(len(y_train))

4125


## Bag of Words

In [25]:
from sklearn.feature_extraction.text import CountVectorizer

In [26]:
cv=CountVectorizer()

In [27]:
X_train_bow=cv.fit_transform(X_train)
print(X_train_bow.shape)

(4125, 7367)


In [28]:
X_test_bow=cv.transform(X_test)
print(X_test_bow.shape)

(1032, 7367)


In [29]:
dictionary=cv.vocabulary_
#print(dictionary)
inverse_dict_values={u:v for v,u in dictionary.items()}
#print(inverse_dict_values)

In [30]:
#dir(cv)

In [31]:
X_train_bow=cv.fit_transform(X_train).toarray()
X_test_bow=cv.transform(X_test).toarray()
print(X_test_bow.shape)

(1032, 7367)


In [32]:
#print(cv.vocabulary_)

In [33]:
with open('BOWvectorizer.pkl', 'wb') as file:
    pickle.dump(cv, file)

## Bag of N-Grames

In [34]:
cvn=CountVectorizer(ngram_range=(1,2))

In [35]:

X_train_bown=cvn.fit_transform(X_train).toarray()
print(X_train_bown.shape)

(4125, 32507)


In [36]:
X_test_bown=cvn.transform(X_test).toarray()
print(X_test_bown.shape)

(1032, 32507)


In [37]:
#print(cvn.vocabulary_)

In [38]:
with open('BOWNvectorizer.pkl', 'wb') as file:
    pickle.dump(cvn, file)

# TF-IDF

In [39]:
test_text=X_train[0].split()
print(test_text)

['go', 'jurong', 'point', 'crazy', 'available', 'bugis', 'n', 'great', 'world', 'la', 'e', 'buffet', 'cine', 'got', 'amore', 'wat']


In [40]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf=TfidfVectorizer()

In [41]:
X_train_tfidf=tfidf.fit_transform(X_train).toarray()
print(X_train_tfidf)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [42]:
X_test_tfidf=tfidf.transform(X_test).toarray()

In [43]:
count=0
list1=[]
for i in X_train_tfidf[0]:
    if i==0.:
        count=count+1
        continue
    else:
        print(i)
        list1.append(count)
        count=count+1
print(list1)

0.3802703024435071
0.37367249513124434
0.3802703024435071
0.39901305453678027
0.28044746411394905
0.3411036924718259
0.2402759164253372
0.39901305453678027
[2365, 2544, 2893, 2955, 3619, 3988, 5392, 6521]


In [44]:
print(X_train_tfidf[2])

[0. 0. 0. ... 0. 0. 0.]


In [45]:
print(test_text)

['go', 'jurong', 'point', 'crazy', 'available', 'bugis', 'n', 'great', 'world', 'la', 'e', 'buffet', 'cine', 'got', 'amore', 'wat']


In [46]:
with open('TFIDFvectorizer.pkl', 'wb') as file:
    pickle.dump(tfidf, file)

## Find the Index values of the words:

In [47]:
voc=tfidf.vocabulary_
list2=[]
for i in test_text:
    index = voc.get(i, -1)
    list2.append(index)
print(list2)

[2496, 3247, 4746, 1372, 428, 834, -1, 2578, 7148, 3377, -1, 832, 1124, 2544, 222, 6936]


## Togather the index values and it value:

In [48]:
dictionary=tfidf.vocabulary_
#print(dictionary)
inverse_dict_values={u:v for v,u in dictionary.items()}
#print(inverse_dict_values)

In [49]:
# Sort the dictionary values and maintain corresponding keys
sorted_keys_values = [(key, value) for key, value in sorted(inverse_dict_values.items(), key=lambda item: item[1])]
#print(sorted_keys_values)
# Print the sorted keys and values
for i in list2:
    print(sorted_keys_values[i])

(2496, 'go')
(3247, 'jurong')
(4746, 'point')
(1372, 'crazy')
(428, 'available')
(834, 'bugis')
(7366, '〨ud')
(2578, 'great')
(7148, 'world')
(3377, 'la')
(7366, '〨ud')
(832, 'buffet')
(1124, 'cine')
(2544, 'got')
(222, 'amore')
(6936, 'wat')


## Find the TF-IDF values by using its Index value:

In [50]:
row=0
print(X_train_tfidf[0])
dicts={}
for i in list2:
    count=0
    for j in X_train_tfidf[0]:
        if count==i:
            dicts[count]=X_train_tfidf[row][count]
            break
        else:
            count=count+1
print(dicts)
             
            
        
    

[0. 0. 0. ... 0. 0. 0.]
{2496: 0.0, 3247: 0.0, 4746: 0.0, 1372: 0.0, 428: 0.0, 834: 0.0, 2578: 0.0, 7148: 0.0, 3377: 0.0, 832: 0.0, 1124: 0.0, 2544: 0.37367249513124434, 222: 0.0, 6936: 0.0}


## Creating Model 

## Naive Bayes:

#### a) fit the Bog of word train dataset

In [51]:
from sklearn.metrics import accuracy_score,confusion_matrix,precision_score,recall_score,f1_score


In [52]:
from sklearn.naive_bayes import GaussianNB
bow_gnb= GaussianNB()

In [53]:
bow_gnb.fit(X_train_bow,y_train)
y_pred_bow=bow_gnb.predict(X_test_bow)
print("Precision :",precision_score(y_test,y_pred_bow))
print("Recall :",recall_score(y_test,y_pred_bow))
print("Accuracy :",accuracy_score(y_test,y_pred_bow))
print("F1 Score :",f1_score(y_test,y_pred_bow))
print("Confusion Matrix :\n",confusion_matrix(y_test,y_pred_bow))


Precision : 0.5381165919282511
Recall : 0.8823529411764706
Accuracy : 0.8846899224806202
F1 Score : 0.6685236768802229
Confusion Matrix :
 [[793 103]
 [ 16 120]]


In [54]:
with open('BOW_gnb.pkl', 'wb') as file:
    pickle.dump(bow_gnb, file)

#### b) fit the Bog of n-grams words train dataset

In [55]:
bown_gnb= GaussianNB()

In [56]:
bown_gnb.fit(X_train_bown,y_train)
y_pred_bown=bown_gnb.predict(X_test_bown)
print("Precision :",precision_score(y_test,y_pred_bown))
print("Recall :",recall_score(y_test,y_pred_bown))
print("Accuracy :",accuracy_score(y_test,y_pred_bown))
print("F1 Score :",f1_score(y_test,y_pred_bown))
print("Confusion Matrix :\n",confusion_matrix(y_test,y_pred_bown))

Precision : 0.625
Recall : 0.9191176470588235
Accuracy : 0.9166666666666666
F1 Score : 0.7440476190476192
Confusion Matrix :
 [[821  75]
 [ 11 125]]


In [57]:
with open('BOWN_gnb.pkl', 'wb') as file:
    pickle.dump(bown_gnb, file)

#### c) fit the TF-IDF train dataset

In [58]:
tfidf_gnb= GaussianNB()

In [59]:
tfidf_gnb.fit(X_train_tfidf,y_train)
y_pred_tfidf=tfidf_gnb.predict(X_test_tfidf)
print("Precision :",precision_score(y_test,y_pred_tfidf))
print("Recall :",recall_score(y_test,y_pred_tfidf))
print("Accuracy :",accuracy_score(y_test,y_pred_tfidf))
print("F1 Score :",f1_score(y_test,y_pred_tfidf))
print("Confusion Matrix :\n",confusion_matrix(y_test,y_pred_tfidf))

Precision : 0.5302325581395348
Recall : 0.8382352941176471
Accuracy : 0.8808139534883721
F1 Score : 0.6495726495726495
Confusion Matrix :
 [[795 101]
 [ 22 114]]


In [60]:
with open('tfidf_gnb.pkl', 'wb') as file:
    pickle.dump(tfidf_gnb, file)

## Random forest:

In [61]:
from sklearn.ensemble import RandomForestClassifier
RF_bow=RandomForestClassifier()

In [62]:
RF_bow.fit(X_train_bow,y_train)
y_pred_bow=RF_bow.predict(X_test_bow)
print("Precision :",precision_score(y_test,y_pred_bow))
print("Recall :",recall_score(y_test,y_pred_bow))
print("Accuracy :",accuracy_score(y_test,y_pred_bow))
print("F1 Score :",f1_score(y_test,y_pred_bow))
print("Confusion Matrix :\n",confusion_matrix(y_test,y_pred_bow))

Precision : 0.990909090909091
Recall : 0.8014705882352942
Accuracy : 0.9728682170542635
F1 Score : 0.8861788617886179
Confusion Matrix :
 [[895   1]
 [ 27 109]]


In [63]:
with open('BOW_RF.pkl', 'wb') as file:
    pickle.dump(RF_bow, file)

In [64]:
RF_bown=RandomForestClassifier()

In [65]:
RF_bown.fit(X_train_bown,y_train)
y_pred_bown=RF_bown.predict(X_test_bown)
print("Precision :",precision_score(y_test,y_pred_bown))
print("Recall :",recall_score(y_test,y_pred_bown))
print("Accuracy :",accuracy_score(y_test,y_pred_bown))
print("F1 Score :",f1_score(y_test,y_pred_bown))
print("Confusion Matrix :\n",confusion_matrix(y_test,y_pred_bown))

Precision : 0.9901960784313726
Recall : 0.7426470588235294
Accuracy : 0.9651162790697675
F1 Score : 0.8487394957983194
Confusion Matrix :
 [[895   1]
 [ 35 101]]


In [66]:
with open('BOWN_RF.pkl', 'wb') as file:
    pickle.dump(RF_bown, file)

In [67]:
RF_tfidf=RandomForestClassifier()

In [68]:
RF_tfidf.fit(X_train_tfidf,y_train)
y_pred_tfidf=RF_tfidf.predict(X_test_tfidf)
print("Precision :",precision_score(y_test,y_pred_tfidf))
print("Recall :",recall_score(y_test,y_pred_tfidf))
print("Accuracy :",accuracy_score(y_test,y_pred_tfidf))
print("F1 Score :",f1_score(y_test,y_pred_tfidf))
print("Confusion Matrix :\n",confusion_matrix(y_test,y_pred_tfidf))

Precision : 0.9911504424778761
Recall : 0.8235294117647058
Accuracy : 0.9757751937984496
F1 Score : 0.8995983935742972
Confusion Matrix :
 [[895   1]
 [ 24 112]]


In [69]:
with open('tfidf_RF.pkl', 'wb') as file:
    pickle.dump(RF_tfidf, file)

## Logistic Regression

In [70]:
from sklearn.linear_model import LogisticRegression
LR_bow=LogisticRegression()

In [71]:
LR_bow.fit(X_train_bow,y_train)
y_pred_bow=LR_bow.predict(X_test_bow)
print("Precision :",precision_score(y_test,y_pred_bow))
print("Recall :",recall_score(y_test,y_pred_bow))
print("Accuracy :",accuracy_score(y_test,y_pred_bow))
print("F1 Score :",f1_score(y_test,y_pred_bow))
print("Confusion Matrix :\n",confusion_matrix(y_test,y_pred_bow))

Precision : 0.9752066115702479
Recall : 0.8676470588235294
Accuracy : 0.9796511627906976
F1 Score : 0.9182879377431907
Confusion Matrix :
 [[893   3]
 [ 18 118]]


In [72]:
with open('BOW_LR.pkl', 'wb') as file:
    pickle.dump(LR_bow, file)

In [73]:
LR_bown=LogisticRegression()

In [74]:
LR_bown.fit(X_train_bown,y_train)
y_pred_bown=LR_bown.predict(X_test_bown)
print("Precision :",precision_score(y_test,y_pred_bown))
print("Recall :",recall_score(y_test,y_pred_bown))
print("Accuracy :",accuracy_score(y_test,y_pred_bown))
print("F1 Score :",f1_score(y_test,y_pred_bown))
print("Confusion Matrix :\n",confusion_matrix(y_test,y_pred_bown))

Precision : 0.9913793103448276
Recall : 0.8455882352941176
Accuracy : 0.9786821705426356
F1 Score : 0.9126984126984128
Confusion Matrix :
 [[895   1]
 [ 21 115]]


In [75]:
with open('BOWN_LR.pkl', 'wb') as file:
    pickle.dump(LR_bown, file)

In [76]:
LR_tfidf=LogisticRegression()

In [77]:
LR_tfidf.fit(X_train_bown,y_train)
y_pred_tfidf=LR_tfidf.predict(X_test_bown)
print("Precision :",precision_score(y_test,y_pred_tfidf))
print("Recall :",recall_score(y_test,y_pred_tfidf))
print("Accuracy :",accuracy_score(y_test,y_pred_tfidf))
print("F1 Score :",f1_score(y_test,y_pred_tfidf))
print("Confusion Matrix :\n",confusion_matrix(y_test,y_pred_tfidf))

Precision : 0.9913793103448276
Recall : 0.8455882352941176
Accuracy : 0.9786821705426356
F1 Score : 0.9126984126984128
Confusion Matrix :
 [[895   1]
 [ 21 115]]


In [78]:
with open('tfidf_LR.pkl', 'wb') as file:
    pickle.dump(LR_tfidf, file)

## Decision Tree Classifier:

In [79]:
from sklearn.tree import DecisionTreeClassifier
DT_bow=DecisionTreeClassifier()

In [80]:
DT_bow.fit(X_train_bow,y_train)
y_pred_bow=DT_bow.predict(X_test_bow)
print("Precision :",precision_score(y_test,y_pred_bow))
print("Recall :",recall_score(y_test,y_pred_bow))
print("Accuracy :",accuracy_score(y_test,y_pred_bow))
print("F1 Score :",f1_score(y_test,y_pred_bow))
print("Confusion Matrix :\n",confusion_matrix(y_test,y_pred_bow))

Precision : 0.8837209302325582
Recall : 0.8382352941176471
Accuracy : 0.9641472868217055
F1 Score : 0.8603773584905661
Confusion Matrix :
 [[881  15]
 [ 22 114]]


In [81]:
with open('BOW_DT.pkl', 'wb') as file:
    pickle.dump(DT_bow, file)

In [82]:
DT_bown=DecisionTreeClassifier()

In [83]:
DT_bown.fit(X_train_bown,y_train)
y_pred_bown=DT_bown.predict(X_test_bown)
print("Precision :",precision_score(y_test,y_pred_bown))
print("Recall :",recall_score(y_test,y_pred_bown))
print("Accuracy :",accuracy_score(y_test,y_pred_bown))
print("F1 Score :",f1_score(y_test,y_pred_bown))
print("Confusion Matrix :\n",confusion_matrix(y_test,y_pred_bown))

Precision : 0.896
Recall : 0.8235294117647058
Accuracy : 0.9641472868217055
F1 Score : 0.8582375478927202
Confusion Matrix :
 [[883  13]
 [ 24 112]]


In [84]:
with open('BOWN_DT.pkl', 'wb') as file:
    pickle.dump(DT_bown, file)

In [85]:
DT_tfidf=DecisionTreeClassifier()

In [86]:
DT_tfidf.fit(X_train_tfidf,y_train)
y_pred_tfidf=DT_tfidf.predict(X_test_tfidf)
print("Precision :",precision_score(y_test,y_pred_tfidf))
print("Recall :",recall_score(y_test,y_pred_tfidf))
print("Accuracy :",accuracy_score(y_test,y_pred_tfidf))
print("F1 Score :",f1_score(y_test,y_pred_tfidf))
print("Confusion Matrix :\n",confusion_matrix(y_test,y_pred_tfidf))

Precision : 0.835820895522388
Recall : 0.8235294117647058
Accuracy : 0.9554263565891473
F1 Score : 0.8296296296296296
Confusion Matrix :
 [[874  22]
 [ 24 112]]


In [87]:
with open('tfidf_DT.pkl', 'wb') as file:
    pickle.dump(DT_tfidf, file)

## Support Vector Machine

In [88]:
from sklearn.svm import SVC

In [89]:
svm_bow=SVC()

In [90]:
svm_bow.fit(X_train_bow,y_train)
y_pred_bow=svm_bow.predict(X_test_bow)
print("Precision :",precision_score(y_test,y_pred_bow))
print("Recall :",recall_score(y_test,y_pred_bow))
print("Accuracy :",accuracy_score(y_test,y_pred_bow))
print("F1 Score :",f1_score(y_test,y_pred_bow))
print("Confusion Matrix :\n",confusion_matrix(y_test,y_pred_bow))

Precision : 0.9745762711864406
Recall : 0.8455882352941176
Accuracy : 0.9767441860465116
F1 Score : 0.905511811023622
Confusion Matrix :
 [[893   3]
 [ 21 115]]


In [91]:
with open('BOW_SVM.pkl', 'wb') as file:
    pickle.dump(svm_bow, file)

In [92]:
svm_bown=SVC()

In [93]:
svm_bown.fit(X_train_bown,y_train)
y_pred_bown=svm_bown.predict(X_test_bown)
print("Precision :",precision_score(y_test,y_pred_bown))
print("Recall :",recall_score(y_test,y_pred_bown))
print("Accuracy :",accuracy_score(y_test,y_pred_bown))
print("F1 Score :",f1_score(y_test,y_pred_bown))
print("Confusion Matrix :\n",confusion_matrix(y_test,y_pred_bown))

Precision : 0.9826086956521739
Recall : 0.8308823529411765
Accuracy : 0.9757751937984496
F1 Score : 0.900398406374502
Confusion Matrix :
 [[894   2]
 [ 23 113]]


In [94]:
with open('BOWN_SVM.pkl', 'wb') as file:
    pickle.dump(svm_bown, file)

In [95]:
svm_tfidf=SVC()

In [96]:
svm_tfidf.fit(X_train_tfidf,y_train)
y_pred_tfidf=svm_tfidf.predict(X_test_tfidf)
print("Precision :",precision_score(y_test,y_pred_tfidf))
print("Recall :",recall_score(y_test,y_pred_tfidf))
print("Accuracy :",accuracy_score(y_test,y_pred_tfidf))
print("F1 Score :",f1_score(y_test,y_pred_tfidf))
print("Confusion Matrix :\n",confusion_matrix(y_test,y_pred_tfidf))

Precision : 0.9745762711864406
Recall : 0.8455882352941176
Accuracy : 0.9767441860465116
F1 Score : 0.905511811023622
Confusion Matrix :
 [[893   3]
 [ 21 115]]


In [97]:
with open('tfidf_SVM.pkl', 'wb') as file:
    pickle.dump(svm_tfidf, file)

## K-nearest Neighbors:

In [98]:
from sklearn.neighbors import KNeighborsClassifier
KNN_bow=KNeighborsClassifier()

In [99]:
KNN_bow.fit(X_train_bow,y_train)
y_pred_bow=KNN_bow.predict(X_test_bow)
print("Precision :",precision_score(y_test,y_pred_bow))
print("Recall :",recall_score(y_test,y_pred_bow))
print("Accuracy :",accuracy_score(y_test,y_pred_bow))
print("F1 Score :",f1_score(y_test,y_pred_bow))
print("Confusion Matrix :\n",confusion_matrix(y_test,y_pred_bow))

Precision : 1.0
Recall : 0.36764705882352944
Accuracy : 0.9166666666666666
F1 Score : 0.5376344086021506
Confusion Matrix :
 [[896   0]
 [ 86  50]]


In [100]:
with open('BOW_KNN.pkl', 'wb') as file:
    pickle.dump(KNN_bow, file)

In [101]:
KNN_bown=KNeighborsClassifier()

In [102]:
KNN_bown.fit(X_train_bown,y_train)
y_pred_bown=KNN_bown.predict(X_test_bown)
print("Precision :",precision_score(y_test,y_pred_bown))
print("Recall :",recall_score(y_test,y_pred_bown))
print("Accuracy :",accuracy_score(y_test,y_pred_bown))
print("F1 Score :",f1_score(y_test,y_pred_bown))
print("Confusion Matrix :\n",confusion_matrix(y_test,y_pred_bown))

Precision : 1.0
Recall : 0.25
Accuracy : 0.9011627906976745
F1 Score : 0.4
Confusion Matrix :
 [[896   0]
 [102  34]]


In [103]:
with open('BOWN_KNN.pkl', 'wb') as file:
    pickle.dump(KNN_bown, file)

In [104]:
KNN_tfidf=KNeighborsClassifier()

In [105]:
KNN_tfidf.fit(X_train_tfidf,y_train)
y_pred_tfidf=KNN_tfidf.predict(X_test_tfidf)
print("Precision :",precision_score(y_test,y_pred_tfidf))
print("Recall :",recall_score(y_test,y_pred_tfidf))
print("Accuracy :",accuracy_score(y_test,y_pred_tfidf))
print("F1 Score :",f1_score(y_test,y_pred_tfidf))
print("Confusion Matrix :\n",confusion_matrix(y_test,y_pred_tfidf))

Precision : 1.0
Recall : 0.34558823529411764
Accuracy : 0.9137596899224806
F1 Score : 0.5136612021857923
Confusion Matrix :
 [[896   0]
 [ 89  47]]


In [106]:
with open('tfidf_KNN.pkl', 'wb') as file:
    pickle.dump(KNN_tfidf, file)

## test the model by using one Example

In [107]:
userinput="I'm gonna be home soon and i don't want to talk about this stuff anymore tonight, k? I've cried enough today."

In [108]:
ui=text_cleaning(userinput)

In [109]:
print(ui)

im gon na home soon dont want talk stuff anymore tonight k ive cried enough today


In [110]:
dd=cv.vocabulary_
df=ui.split()
print(df)

['im', 'gon', 'na', 'home', 'soon', 'dont', 'want', 'talk', 'stuff', 'anymore', 'tonight', 'k', 'ive', 'cried', 'enough', 'today']


In [111]:
list1=[]
for i in df:
    if i in dd.keys():
        list1.append(i)
    else:
        pass
print(len(list1))

15


In [112]:
ui

'im gon na home soon dont want talk stuff anymore tonight k ive cried enough today'

In [113]:
d=cv.transform([ui])
print(d.shape)
print(d)

(1, 7367)
  (0, 270)	1
  (0, 1387)	1
  (0, 1736)	1
  (0, 1935)	1
  (0, 2517)	1
  (0, 2832)	1
  (0, 2988)	1
  (0, 3146)	1
  (0, 4119)	1
  (0, 5802)	1
  (0, 6045)	1
  (0, 6202)	1
  (0, 6458)	1
  (0, 6492)	1
  (0, 6918)	1


In [114]:
d=cv.transform([ui]).toarray()
print(d.shape)
print(d)

(1, 7367)
[[0 0 0 ... 0 0 0]]


In [115]:
result=bow_gnb.predict(d)

In [116]:
print(result[0])

0
