In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [2]:
df = pd.read_csv('spam.csv', encoding = 'ISO-8859-1')
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [3]:
df.shape

(5572, 5)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


In [5]:
# Finding null values 
df.isnull().sum()

v1               0
v2               0
Unnamed: 2    5522
Unnamed: 3    5560
Unnamed: 4    5566
dtype: int64

In [6]:
# Drop features with large amount of missing values
df.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis=1, inplace=True)

In [7]:
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [8]:
df.v1.value_counts(dropna=False)

ham     4825
spam     747
Name: v1, dtype: int64

In [9]:
# One Hot Encode the target variable 
y = pd.get_dummies(df['v1'], drop_first = True)
df = pd.concat([df, y], axis=1) 
df.drop('v1', axis=1, inplace=True)

In [10]:
df.head()

Unnamed: 0,v2,spam
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0


In [11]:
processed = df.v2.str.replace(r'^.+@[^\.].*\.[a-z]{2,}$', 'email')

# Replace URLs with 'webaddress'
processed = processed.str.replace(r'^http\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(/\S*)?$', 'webaddress')

# Replace money symbols with 'moneysymb' (£ can by typed with ALT key + 156)
processed = processed.str.replace(r'£|\$', 'moneysymb')
    
# Replace 10 digit phone numbers (formats include paranthesis, spaces, no spaces, dashes) with 'phonenumber'
processed = processed.str.replace(r'^\(?[\d]{3}\)?[\s-]?[\d]{3}[\s-]?[\d]{4}$', 'phonenumbr')
    
# Replace numbers with 'numbr'
processed = processed.str.replace(r'\d+(\.\d+)?', 'numbr')

# Remove punctuation
processed = processed.str.replace(r'[^\w\d\s]', ' ')

# Replace whitespace between terms with a single space
processed = processed.str.replace(r'\s+', ' ')

# Remove leading and trailing whitespace
processed = processed.str.replace(r'^\s+|\s+?$', '')

# change words to lower case - Free, FREE, free are all the same word
processed = processed.str.lower()

# Now let's see our data 
print(processed)

  processed = df.v2.str.replace(r'^.+@[^\.].*\.[a-z]{2,}$', 'email')
  processed = processed.str.replace(r'^http\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(/\S*)?$', 'webaddress')
  processed = processed.str.replace(r'£|\$', 'moneysymb')
  processed = processed.str.replace(r'^\(?[\d]{3}\)?[\s-]?[\d]{3}[\s-]?[\d]{4}$', 'phonenumbr')
  processed = processed.str.replace(r'\d+(\.\d+)?', 'numbr')
  processed = processed.str.replace(r'[^\w\d\s]', ' ')
  processed = processed.str.replace(r'\s+', ' ')


0       go until jurong point crazy available only in ...
1                                 ok lar joking wif u oni
2       free entry in numbr a wkly comp to win fa cup ...
3             u dun say so early hor u c already then say
4       nah i don t think he goes to usf he lives arou...
                              ...                        
5567    this is the numbrnd time we have tried numbr c...
5568                 will ì_ b going to esplanade fr home
5569    pity was in mood for that so any other suggest...
5570    the guy did some bitching but i acted like i d...
5571                            rofl its true to its name
Name: v2, Length: 5572, dtype: object


  processed = processed.str.replace(r'^\s+|\s+?$', '')


In [15]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(max_features=1500) # Extracting top 1500 text only as features 
X = cv.fit_transform(processed).toarray() 

In [14]:
from nltk.stem import PorterStemmer

# Remove word stems using a Porter stemmer
ps = PorterStemmer()
processed = processed.apply(lambda x: ' '.join(ps.stem(term) for term in x.split()))

In [16]:
X.shape

(5572, 1500)

In [17]:
from sklearn.model_selection import train_test_split

# Divide our data into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [18]:
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression()
logreg.fit(X_train, y_train.values.ravel())
predictions = logreg.predict(X_test)

In [19]:
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

print('Accuracy: {}% \n'.format(accuracy_score(y_test, predictions) * 100))
print('*'*100)
print(confusion_matrix(y_test, predictions))
print(classification_report(y_test, predictions))

Accuracy: 98.47533632286995% 

****************************************************************************************************
[[963   2]
 [ 15 135]]
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       965
           1       0.99      0.90      0.94       150

    accuracy                           0.98      1115
   macro avg       0.99      0.95      0.97      1115
weighted avg       0.98      0.98      0.98      1115



In [20]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors = 1)

knn.fit(X_train, y_train.values.ravel())

pred = knn.predict(X_test)

print('Accuracy: {}% \n'.format(accuracy_score(y_test, pred) * 100))
print('*'*100)
print(confusion_matrix(y_test, pred))
print(classification_report(y_test, pred))

Accuracy: 97.21973094170404% 

****************************************************************************************************
[[961   4]
 [ 27 123]]
              precision    recall  f1-score   support

           0       0.97      1.00      0.98       965
           1       0.97      0.82      0.89       150

    accuracy                           0.97      1115
   macro avg       0.97      0.91      0.94      1115
weighted avg       0.97      0.97      0.97      1115



In [21]:
from sklearn.tree import DecisionTreeClassifier

dtree = DecisionTreeClassifier()

dtree.fit(X_train, y_train.values.ravel())

pred = dtree.predict(X_test)


print('Accuracy: {}% \n'.format(accuracy_score(y_test, pred) * 100))
print('*'*100)
print(confusion_matrix(y_test, pred))
print(classification_report(y_test, pred))

Accuracy: 97.04035874439462% 

****************************************************************************************************
[[946  19]
 [ 14 136]]
              precision    recall  f1-score   support

           0       0.99      0.98      0.98       965
           1       0.88      0.91      0.89       150

    accuracy                           0.97      1115
   macro avg       0.93      0.94      0.94      1115
weighted avg       0.97      0.97      0.97      1115



In [22]:
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier()

rfc.fit(X_train, y_train.values.ravel())
rfc_pred = rfc.predict(X_test)

print('Accuracy: {}% \n'.format(accuracy_score(y_test, pred) * 100))
print('*'*100)
print(confusion_matrix(y_test, rfc_pred))
print(classification_report(y_test, rfc_pred))

Accuracy: 97.04035874439462% 

****************************************************************************************************
[[964   1]
 [ 18 132]]
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       965
           1       0.99      0.88      0.93       150

    accuracy                           0.98      1115
   macro avg       0.99      0.94      0.96      1115
weighted avg       0.98      0.98      0.98      1115



In [23]:
from sklearn.naive_bayes import MultinomialNB

mnb = MultinomialNB()
mnb.fit(X_train, y_train.values.ravel())

pred = mnb.predict(X_test)

print('Accuracy: {}% \n'.format(accuracy_score(y_test, pred)* 100))
print('*'*100)
print(confusion_matrix(y_test, pred))
print(classification_report(y_test, pred))

Accuracy: 98.47533632286995% 

****************************************************************************************************
[[959   6]
 [ 11 139]]
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       965
           1       0.96      0.93      0.94       150

    accuracy                           0.98      1115
   macro avg       0.97      0.96      0.97      1115
weighted avg       0.98      0.98      0.98      1115

