In [1]:
import numpy as np
import pandas as pd
from sklearn.svm import SVC
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score,f1_score,classification_report,confusion_matrix

In [2]:
data=pd.read_csv(r"C:\Users\Anna\PycharmProjects\NLP\spam.csv",encoding='ISO-8859-1')

In [3]:
data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [4]:
data.shape

(5572, 5)

In [5]:
data['Unnamed: 2'].notna().sum()

50

In [6]:
data['Unnamed: 3'].notna().sum()

12

In [7]:
data['Unnamed: 4'].notna().sum()

6

In [8]:
data=data.iloc[:,:2]

In [9]:
data.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [10]:
data.columns = ['label', 'message']

In [11]:
data.isna().sum()

label      0
message    0
dtype: int64

In [12]:
data['label'].value_counts()

ham     4825
spam     747
Name: label, dtype: int64

# Training a SVM and Random Forest models on TF-IDF vectorization of balanced dataset, accuracy is 0.962.

In [13]:
spam=data[data['label']=='spam']

In [14]:
ham=data[data['label']=='ham']

In [15]:
ham=ham.sample(spam.shape[0])

In [16]:
ham.shape,spam.shape

((747, 2), (747, 2))

In [17]:
balanced_dataset=spam.append(ham,ignore_index=True)

In [18]:
X_train, X_test, y_train, y_test = train_test_split(balanced_dataset['message'], balanced_dataset['label'], test_size=0.33, random_state=42)

In [19]:
classifier = Pipeline([('tfidf', TfidfVectorizer()), ('classifier', SVC())])
classifier.fit(X_train, y_train)
y_pred=classifier.predict(X_test)
print("Accuracy score  with SVM model is ",accuracy_score(y_test, y_pred).round(3),"for balanced dataset")

Accuracy score  with SVM model is  0.96 for balanced dataset


In [20]:
classifier_forest = Pipeline([('tfidf', TfidfVectorizer()), ('classifier', RandomForestClassifier(random_state=0))])
classifier_forest.fit(X_train, y_train)
y_pred=classifier_forest.predict(X_test)
print("Accuracy score  with Random Forest model is ",accuracy_score(y_test, y_pred).round(3),"for balanced dataset")

Accuracy score  with Random Forest model is  0.955 for balanced dataset


# Training a SVM and Random Forest models on TF-IDF vectorization of imbalanced dataset, f1 score is 0.99.

In [21]:
X_train, X_test, y_train, y_test = train_test_split(data['message'], data['label'], test_size=0.33, random_state=42)

In [22]:
classifier_forest = Pipeline([('tfidf', TfidfVectorizer()), ('classifier', RandomForestClassifier(random_state=0))])
classifier_forest.fit(X_train, y_train)
y_pred=classifier_forest.predict(X_test)
print("f1 score with Random Forest model is ",f1_score(y_test, y_pred,pos_label='ham').round(3),"for imbalanced dataset")

f1 score with Random Forest model is  0.987 for imbalanced dataset


In [23]:
classifier = Pipeline([('tfidf', TfidfVectorizer()), ('classifier', SVC(C=10, gamma=0.1))])
classifier.fit(X_train, y_train)
y_pred=classifier.predict(X_test)
print("f1 score  with SVM model is ",f1_score(y_test, y_pred,pos_label='ham').round(3),"for imbalanced dataset")
print("It's a best result.")
print('\n',confusion_matrix(y_test, y_pred))
print('\n',classification_report(y_test, y_pred))

f1 score  with SVM model is  0.991 for imbalanced dataset
It's a best result.

 [[1582    5]
 [  25  227]]

               precision    recall  f1-score   support

         ham       0.98      1.00      0.99      1587
        spam       0.98      0.90      0.94       252

    accuracy                           0.98      1839
   macro avg       0.98      0.95      0.96      1839
weighted avg       0.98      0.98      0.98      1839



# Find best parameters for SVM model

In [24]:
tf_vectorizer=TfidfVectorizer(min_df=4,max_df=0.3,ngram_range=(1,3))
X_train_tfidf=tf_vectorizer.fit_transform(X_train)
X_test_tfidf=tf_vectorizer.transform(X_test)

In [25]:
param_grid = {'C': [0.1, 1, 10, 100, 1000], 
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['rbf']} 
classifier = SVC()  
grid = GridSearchCV(classifier, param_grid, refit = True, verbose = 3)
grid.fit(X_train_tfidf, y_train)
print(grid.best_params_)
print(grid.best_estimator_)
grid_predictions = grid.predict(X_test_tfidf)
print(classification_report(y_test, grid_predictions))

Fitting 5 folds for each of 25 candidates, totalling 125 fits
[CV 1/5] END .....................C=0.1, gamma=1, kernel=rbf; total time=   1.9s
[CV 2/5] END .....................C=0.1, gamma=1, kernel=rbf; total time=   1.9s
[CV 3/5] END .....................C=0.1, gamma=1, kernel=rbf; total time=   1.9s
[CV 4/5] END .....................C=0.1, gamma=1, kernel=rbf; total time=   1.9s
[CV 5/5] END .....................C=0.1, gamma=1, kernel=rbf; total time=   1.9s
[CV 1/5] END ...................C=0.1, gamma=0.1, kernel=rbf; total time=   1.2s
[CV 2/5] END ...................C=0.1, gamma=0.1, kernel=rbf; total time=   1.2s
[CV 3/5] END ...................C=0.1, gamma=0.1, kernel=rbf; total time=   1.2s
[CV 4/5] END ...................C=0.1, gamma=0.1, kernel=rbf; total time=   1.2s
[CV 5/5] END ...................C=0.1, gamma=0.1, kernel=rbf; total time=   1.2s
[CV 1/5] END ..................C=0.1, gamma=0.01, kernel=rbf; total time=   1.1s
[CV 2/5] END ..................C=0.1, gamma=0.0

[CV 2/5] END ....................C=1000, gamma=1, kernel=rbf; total time=   2.6s
[CV 3/5] END ....................C=1000, gamma=1, kernel=rbf; total time=   2.5s
[CV 4/5] END ....................C=1000, gamma=1, kernel=rbf; total time=   2.7s
[CV 5/5] END ....................C=1000, gamma=1, kernel=rbf; total time=   2.8s
[CV 1/5] END ..................C=1000, gamma=0.1, kernel=rbf; total time=   1.1s
[CV 2/5] END ..................C=1000, gamma=0.1, kernel=rbf; total time=   0.9s
[CV 3/5] END ..................C=1000, gamma=0.1, kernel=rbf; total time=   1.2s
[CV 4/5] END ..................C=1000, gamma=0.1, kernel=rbf; total time=   0.9s
[CV 5/5] END ..................C=1000, gamma=0.1, kernel=rbf; total time=   0.8s
[CV 1/5] END .................C=1000, gamma=0.01, kernel=rbf; total time=   0.8s
[CV 2/5] END .................C=1000, gamma=0.01, kernel=rbf; total time=   0.9s
[CV 3/5] END .................C=1000, gamma=0.01, kernel=rbf; total time=   0.8s
[CV 4/5] END ...............