#Using Bag of Words Frequency (unigram)

In [193]:
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder

In [194]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [195]:
stopWords = set(stopwords.words('english'))

In [196]:
datasets = pd.read_csv('spam.csv')
datasets.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [197]:
datasets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   v1      5572 non-null   object
 1   v2      5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [198]:
vectorizer = CountVectorizer(stop_words=stopWords)

In [199]:
X = vectorizer.fit_transform(datasets.v2)
y = ['ham' if a=='ham' else 'spam' for a in datasets.v1]

In [200]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state=35)

In [201]:
print("X_train: \n",X_train.toarray())
print("X_test: \n",X_test.toarray())

X_train: 
 [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 1 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
X_test: 
 [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [202]:
vectorizer.get_feature_names()



['00',
 '000',
 '000pes',
 '008704050406',
 '0089',
 '0121',
 '01223585236',
 '01223585334',
 '0125698789',
 '02',
 '0207',
 '02072069400',
 '02073162414',
 '02085076972',
 '021',
 '03',
 '04',
 '0430',
 '05',
 '050703',
 '0578',
 '06',
 '07',
 '07008009200',
 '07046744435',
 '07090201529',
 '07090298926',
 '07099833605',
 '07123456789',
 '0721072',
 '07732584351',
 '07734396839',
 '07742676969',
 '07753741225',
 '0776xxxxxxx',
 '07781482378',
 '07786200117',
 '077xxx',
 '078',
 '07801543489',
 '07808',
 '07808247860',
 '07808726822',
 '07815296484',
 '07821230901',
 '078498',
 '07880867867',
 '0789xxxxxxx',
 '07946746291',
 '0796xxxxxx',
 '07973788240',
 '07xxxxxxxxx',
 '08',
 '0800',
 '08000407165',
 '08000776320',
 '08000839402',
 '08000930705',
 '08000938767',
 '08001950382',
 '08002888812',
 '08002986030',
 '08002986906',
 '08002988890',
 '08006344447',
 '0808',
 '08081263000',
 '08081560665',
 '0825',
 '083',
 '0844',
 '08448350055',
 '08448714184',
 '0845',
 '08450542832',
 '084

##Naive Bayes

In [203]:
from sklearn.naive_bayes import MultinomialNB

In [204]:
clf = MultinomialNB(alpha = 0.1)
clf.fit(X_train,y_train)

MultinomialNB(alpha=0.1)

In [205]:
y_pred = clf.predict(X_test)

In [206]:
from sklearn.metrics import precision_score,recall_score

In [207]:
print("Precision is: ",precision_score(y_test,y_pred,average="weighted")*100)
print("Recall is: ",recall_score(y_test,y_pred,average="weighted")*100)

Precision is:  97.67997715637993
Recall is:  97.54784688995215


##Decision Tree

In [208]:
from sklearn.tree import DecisionTreeClassifier
DT = DecisionTreeClassifier(criterion="gini",max_leaf_nodes=35)

In [209]:
DT.fit(X_train,y_train)

DecisionTreeClassifier(max_leaf_nodes=35)

In [210]:
y_pred = DT.predict(X_test)

In [211]:
DT.score(X_test,y_test)*100

96.65071770334929

In [212]:
print("Precision is: ",precision_score(y_test,y_pred,average="weighted")*100)
print("Recall is: ",recall_score(y_test,y_pred,average="weighted")*100)

Precision is:  96.5743169955213
Recall is:  96.65071770334929


#Using Bag of Words Frequency (biagram)

In [213]:
datasets2 = pd.read_csv('spam.csv')
datasets2.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [214]:
vectorizer2 = CountVectorizer(stop_words=stopWords,ngram_range=(2, 2))

In [215]:
X2 = vectorizer2.fit_transform(datasets2.v2)
y2 = ['ham' if a=='ham' else 'spam' for a in datasets2.v1]

In [216]:
X_train2, X_test2, y_train2, y_test2 = train_test_split(X2,y2,test_size=0.3,random_state=35)

In [217]:
vectorizer2.get_feature_names()



['00 easter',
 '00 per',
 '00 sub',
 '00 subs',
 '000 bonus',
 '000 cash',
 '000 homeowners',
 '000 pounds',
 '000 price',
 '000 prize',
 '000 xmas',
 '000pes around',
 '008704050406 sp',
 '0089 last',
 '0121 2025050',
 '01223585236 xx',
 '01223585334 cum',
 '0125698789 ring',
 '02 06',
 '02 09',
 '02 claimcode',
 '02 user',
 '0207 083',
 '0207 153',
 '02072069400 bx',
 '02073162414 costs',
 '02085076972 reply',
 '021 3680',
 '03 05',
 '03 2nd',
 '03 final',
 '03 marsms',
 '04 call',
 '0430 jul',
 '05 05',
 '05 prize',
 '050703 csbcm4235wc1n3xx',
 '06 03',
 '06 05',
 '06 11',
 '06 good',
 '07 11',
 '07046744435 arrange',
 '07090298926 schedule',
 '07099833605 schedule',
 '07123456789 87077',
 '0721072 find',
 '07732584351 rodger',
 '07734396839 ibh',
 '07742676969 shows',
 '07753741225 shows',
 '0776xxxxxxx invited',
 '07781482378 com',
 '077xxx 000',
 '07801543489 guaranteed',
 '07808 xxxxxx',
 '07808247860 shows',
 '07808726822 awarded',
 '07815296484 shows',
 '078498 shows',
 '0789x

##Naive Bayes

In [218]:
from sklearn.naive_bayes import MultinomialNB

In [219]:
clf = MultinomialNB(alpha = 0.1)
clf.fit(X_train2,y_train2)

MultinomialNB(alpha=0.1)

In [220]:
y_pred2 = clf.predict(X_test2)

In [221]:
from sklearn.metrics import precision_score,recall_score

In [222]:
print("Precision is: ",precision_score(y_test2,y_pred2,average="weighted")*100)
print("Recall is: ",recall_score(y_test2,y_pred2,average="weighted")*100)

Precision is:  90.15379011197872
Recall is:  63.93540669856459


##Decision Tree

In [223]:
from sklearn.tree import DecisionTreeClassifier
DT2 = DecisionTreeClassifier(criterion="gini",max_leaf_nodes=35)

In [224]:
DT2.fit(X_train2,y_train2)

DecisionTreeClassifier(max_leaf_nodes=35)

In [225]:
y_pred2 = DT2.predict(X_test2)

In [226]:
DT2.score(X_test2,y_test2)*100

94.07894736842105

In [227]:
print("Precision is: ",precision_score(y_test2,y_pred2,average="weighted")*100)
print("Recall is: ",recall_score(y_test2,y_pred2,average="weighted")*100)

Precision is:  94.29687376927804
Recall is:  94.07894736842105


#Using TFIDF (unigram)

In [228]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [229]:
datasets4 = pd.read_csv('spam.csv')
datasets4.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [230]:
vectorizer4 = TfidfVectorizer(stop_words=stopWords)

In [231]:
X4 = vectorizer4.fit_transform(datasets4.v2)
y4 = ['ham' if a=='ham' else 'spam' for a in datasets4.v1]

In [232]:
X_train4, X_test4, y_train4, y_test4 = train_test_split(X4,y4,test_size=0.3,random_state=35)

In [233]:
vectorizer4.get_feature_names()



['00',
 '000',
 '000pes',
 '008704050406',
 '0089',
 '0121',
 '01223585236',
 '01223585334',
 '0125698789',
 '02',
 '0207',
 '02072069400',
 '02073162414',
 '02085076972',
 '021',
 '03',
 '04',
 '0430',
 '05',
 '050703',
 '0578',
 '06',
 '07',
 '07008009200',
 '07046744435',
 '07090201529',
 '07090298926',
 '07099833605',
 '07123456789',
 '0721072',
 '07732584351',
 '07734396839',
 '07742676969',
 '07753741225',
 '0776xxxxxxx',
 '07781482378',
 '07786200117',
 '077xxx',
 '078',
 '07801543489',
 '07808',
 '07808247860',
 '07808726822',
 '07815296484',
 '07821230901',
 '078498',
 '07880867867',
 '0789xxxxxxx',
 '07946746291',
 '0796xxxxxx',
 '07973788240',
 '07xxxxxxxxx',
 '08',
 '0800',
 '08000407165',
 '08000776320',
 '08000839402',
 '08000930705',
 '08000938767',
 '08001950382',
 '08002888812',
 '08002986030',
 '08002986906',
 '08002988890',
 '08006344447',
 '0808',
 '08081263000',
 '08081560665',
 '0825',
 '083',
 '0844',
 '08448350055',
 '08448714184',
 '0845',
 '08450542832',
 '084

##Naive Bayes

In [234]:
from sklearn.naive_bayes import MultinomialNB

In [235]:
clf = MultinomialNB(alpha = 0.1)
clf.fit(X_train4,y_train4)

MultinomialNB(alpha=0.1)

In [236]:
y_pred4 = clf.predict(X_test4)

In [237]:
print("Precision is: ",precision_score(y_test4,y_pred4,average="weighted")*100)
print("Recall is: ",recall_score(y_test4,y_pred4,average="weighted")*100)

Precision is:  97.8008503000072
Recall is:  97.78708133971293


##Decision Tree

In [238]:
from sklearn.tree import DecisionTreeClassifier
DT4 = DecisionTreeClassifier(criterion="gini",max_leaf_nodes=35)

In [239]:
DT4.fit(X_train4,y_train4)

DecisionTreeClassifier(max_leaf_nodes=35)

In [240]:
y_pred4 = DT4.predict(X_test4)

In [241]:
DT4.score(X_test4,y_test4)*100

95.75358851674642

In [242]:
print("Precision is: ",precision_score(y_test4,y_pred4,average="weighted")*100)
print("Recall is: ",recall_score(y_test4,y_pred4,average="weighted")*100)

Precision is:  95.66855400221733
Recall is:  95.75358851674642


#Using TFIDF (biagram)

In [243]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [244]:
datasets3 = pd.read_csv('spam.csv')
datasets3.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [245]:
vectorizer3 = TfidfVectorizer(stop_words=stopWords,ngram_range=(2, 2))

In [246]:
X3 = vectorizer3.fit_transform(datasets3.v2)
y3 = ['ham' if a=='ham' else 'spam' for a in datasets3.v1]

In [247]:
X_train3, X_test3, y_train3, y_test3 = train_test_split(X3,y3,test_size=0.3,random_state=35)

In [248]:
vectorizer3.get_feature_names()



['00 easter',
 '00 per',
 '00 sub',
 '00 subs',
 '000 bonus',
 '000 cash',
 '000 homeowners',
 '000 pounds',
 '000 price',
 '000 prize',
 '000 xmas',
 '000pes around',
 '008704050406 sp',
 '0089 last',
 '0121 2025050',
 '01223585236 xx',
 '01223585334 cum',
 '0125698789 ring',
 '02 06',
 '02 09',
 '02 claimcode',
 '02 user',
 '0207 083',
 '0207 153',
 '02072069400 bx',
 '02073162414 costs',
 '02085076972 reply',
 '021 3680',
 '03 05',
 '03 2nd',
 '03 final',
 '03 marsms',
 '04 call',
 '0430 jul',
 '05 05',
 '05 prize',
 '050703 csbcm4235wc1n3xx',
 '06 03',
 '06 05',
 '06 11',
 '06 good',
 '07 11',
 '07046744435 arrange',
 '07090298926 schedule',
 '07099833605 schedule',
 '07123456789 87077',
 '0721072 find',
 '07732584351 rodger',
 '07734396839 ibh',
 '07742676969 shows',
 '07753741225 shows',
 '0776xxxxxxx invited',
 '07781482378 com',
 '077xxx 000',
 '07801543489 guaranteed',
 '07808 xxxxxx',
 '07808247860 shows',
 '07808726822 awarded',
 '07815296484 shows',
 '078498 shows',
 '0789x

##Naive Bayes

In [249]:
from sklearn.naive_bayes import MultinomialNB

In [250]:
clf = MultinomialNB(alpha = 0.1)
clf.fit(X_train3,y_train3)

MultinomialNB(alpha=0.1)

In [251]:
y_pred3 = clf.predict(X_test3)

In [252]:
print("Precision is: ",precision_score(y_test3,y_pred3,average="weighted")*100)
print("Recall is: ",recall_score(y_test3,y_pred3,average="weighted")*100)

Precision is:  92.97471347501947
Recall is:  86.9019138755981


##Decision Tree

In [253]:
from sklearn.tree import DecisionTreeClassifier
DT3 = DecisionTreeClassifier(criterion="gini",max_leaf_nodes=35)

In [254]:
DT3.fit(X_train3,y_train3)

DecisionTreeClassifier(max_leaf_nodes=35)

In [255]:
y_pred3 = DT3.predict(X_test3)

In [256]:
DT3.score(X_test3,y_test3)*100

93.95933014354067

In [257]:
print("Precision is: ",precision_score(y_test3,y_pred3,average="weighted")*100)
print("Recall is: ",recall_score(y_test3,y_pred3,average="weighted")*100)

Precision is:  94.18679606735132
Recall is:  93.95933014354067
