In [1]:
import pandas as pd

In [2]:
data = pd.read_csv('SMSSpamCollection', sep = '\t', names = ['label','message'])

In [3]:
data.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
data.shape

(5572, 2)

In [5]:
text = data['message']
class_label = data['label']

In [6]:
import numpy as np
classes_list = ["ham","spam"]
label_index = class_label.apply(classes_list.index)
label = np.asarray(label_index)

In [7]:
label

array([0, 0, 1, ..., 0, 0, 0], dtype=int64)

In [8]:
text.shape

(5572,)

In [9]:
import numpy as np
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(text, label, test_size=0.33, random_state=42)

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(ngram_range = (1,3))
x_train = vectorizer.fit_transform(X_train)
x_test = vectorizer.transform(X_test)


In [11]:
print(vectorizer.get_feature_names())



In [12]:
x_train.shape

(3733, 76722)

In [13]:
x_test.shape

(1839, 76722)

In [14]:
label

array([0, 0, 1, ..., 0, 0, 0], dtype=int64)

In [15]:
from sklearn.metrics import accuracy_score
from sklearn import metrics

In [16]:
from sklearn.svm import SVC
model_SVM = SVC()
model_SVM.fit(x_train, y_train)
y_pred_SVM = model_SVM.predict(x_test)
print("SVM")
print("Accuracy score =", accuracy_score(y_test, y_pred_SVM))
print(metrics.classification_report(y_test, y_pred_SVM))



SVM
Accuracy score = 0.866231647634584
              precision    recall  f1-score   support

           0       0.87      1.00      0.93      1593
           1       0.00      0.00      0.00       246

    accuracy                           0.87      1839
   macro avg       0.43      0.50      0.46      1839
weighted avg       0.75      0.87      0.80      1839



  'precision', 'predicted', average, warn_for)


In [17]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=100,max_depth=None,min_samples_split=2, random_state=0)
rf.fit(x_train,y_train)
y_pred_rf = rf.predict(x_test)
print("random")
print("Accuracy score =", accuracy_score(y_test, y_pred_rf))
print(metrics.classification_report(y_test, y_pred_rf))

random
Accuracy score = 0.9695486677542142
              precision    recall  f1-score   support

           0       0.97      1.00      0.98      1593
           1       1.00      0.77      0.87       246

    accuracy                           0.97      1839
   macro avg       0.98      0.89      0.93      1839
weighted avg       0.97      0.97      0.97      1839



In [18]:
from sklearn.linear_model import LogisticRegression
LR = LogisticRegression()
LR.fit(x_train,y_train)
y_pred_LR = LR.predict(x_test)
print("Logistic Regression")
print("Accuracy score =", accuracy_score(y_test, y_pred_LR))
print(metrics.classification_report(y_test, y_pred_LR ))

Logistic Regression
Accuracy score = 0.9494290375203915
              precision    recall  f1-score   support

           0       0.94      1.00      0.97      1593
           1       1.00      0.62      0.77       246

    accuracy                           0.95      1839
   macro avg       0.97      0.81      0.87      1839
weighted avg       0.95      0.95      0.94      1839





In [19]:
from sklearn.neighbors import KNeighborsClassifier
neigh = KNeighborsClassifier(n_neighbors = 5)
neigh.fit(x_train,y_train)
y_pred_KNN = neigh.predict(x_test)
print("KNN")
print("Accuracy score =", accuracy_score(y_test, y_pred_KNN))
print(metrics.classification_report(y_test, y_pred_KNN ))


KNN
Accuracy score = 0.8912452419793366
              precision    recall  f1-score   support

           0       0.89      1.00      0.94      1593
           1       1.00      0.19      0.32       246

    accuracy                           0.89      1839
   macro avg       0.94      0.59      0.63      1839
weighted avg       0.90      0.89      0.86      1839



In [20]:
from sklearn.naive_bayes import GaussianNB
naive = GaussianNB()
naive.fit(x_train.toarray(),y_train)
y_pred_naive = naive.predict(x_test.toarray())
print("Naive Bayes")
print("Accuracy score =", accuracy_score(y_test, y_pred_naive))
print(metrics.classification_report(y_test, y_pred_naive ))


Naive Bayes
Accuracy score = 0.9668297988036977
              precision    recall  f1-score   support

           0       0.99      0.97      0.98      1593
           1       0.85      0.91      0.88       246

    accuracy                           0.97      1839
   macro avg       0.92      0.94      0.93      1839
weighted avg       0.97      0.97      0.97      1839



In [21]:
from sklearn.ensemble import GradientBoostingClassifier
gradient = GradientBoostingClassifier(n_estimators=100,max_depth=None,min_samples_split=2, random_state=0)
gradient.fit(x_train,y_train)
y_pred_gradient = gradient.predict(x_test)
print("Gradient Boosting")
print("Accuracy score =", accuracy_score(y_test, y_pred_gradient))
print(metrics.classification_report(y_test, y_pred_gradient ))


Gradient Boosting
Accuracy score = 0.9728113104948342
              precision    recall  f1-score   support

           0       0.97      0.99      0.98      1593
           1       0.96      0.83      0.89       246

    accuracy                           0.97      1839
   macro avg       0.97      0.91      0.94      1839
weighted avg       0.97      0.97      0.97      1839



In [22]:
from sklearn.tree import DecisionTreeClassifier
decision = DecisionTreeClassifier()
decision.fit(x_train,y_train)
y_pred_decision = decision.predict(x_test)
print("Decision Tree")
print("Accuracy score =", accuracy_score(y_test, y_pred_decision))
print(metrics.classification_report(y_test, y_pred_decision ))
    

Decision Tree
Accuracy score = 0.9684611201740077
              precision    recall  f1-score   support

           0       0.97      0.99      0.98      1593
           1       0.94      0.82      0.87       246

    accuracy                           0.97      1839
   macro avg       0.95      0.91      0.93      1839
weighted avg       0.97      0.97      0.97      1839

