# Importing Libraries

In [12]:
import pandas as pd
import numpy as np
import nltk
import re
from nltk.corpus import stopwords

# Reading the Dataset

In [2]:
# Load the SMS dataset; the file is decoded as ISO-8859-1 (Latin-1).
df = pd.read_csv(
    "spam.csv",
    encoding="ISO-8859-1",
)

In [3]:
# Preview the first rows of the raw frame to see which columns the CSV provides.
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [4]:
# Keep only the message text (v2) and its label (v1), under descriptive names.
# rename() returns a new frame here rather than mutating the column subset
# in place: in-place edits on a selection can trigger pandas'
# SettingWithCopyWarning and cannot be chained.
df = df[['v2', 'v1']].rename(columns={'v2': 'Messages', 'v1': 'Label'})

In [5]:
# Confirm that only the renamed Messages / Label columns remain.
df.head()

Unnamed: 0,Messages,Label
0,"Go until jurong point, crazy.. Available only ...",ham
1,Ok lar... Joking wif u oni...,ham
2,Free entry in 2 a wkly comp to win FA Cup fina...,spam
3,U dun say so early hor... U c already then say...,ham
4,"Nah I don't think he goes to usf, he lives aro...",ham


# Data Preprocessing

In [6]:
# English stopwords, stored as a set for fast membership tests in clean_text.
# On a fresh environment the NLTK stopwords corpus may not be installed yet;
# stopwords.words() then raises LookupError, so download it once and retry.
try:
    SW = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords', quiet=True)
    SW = set(stopwords.words('english'))

def clean_text(text, stop_words=None):
    """Normalize a raw message into space-separated, stopword-free tokens.

    The text is lower-cased, every character that is not an ASCII letter or
    digit is replaced by a space, and the remaining tokens that are not
    stopwords are joined with single spaces.

    Parameters
    ----------
    text : str
        Raw message. ``None`` and float NaN (pandas' missing-value marker)
        are treated as an empty message.
    stop_words : collection of str, optional
        Lower-case words to drop. Defaults to the module-level ``SW`` set.

    Returns
    -------
    str
        The cleaned message; empty if nothing survives the filtering.
    """
    # Missing value: None, or float NaN (the only float not equal to itself).
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if stop_words is None:
        stop_words = SW

    # STEP 1 : Lower case
    text = text.lower()

    # STEP 2 : Replace special characters with a space
    text = re.sub(r'[^0-9a-zA-Z]', ' ', text)

    # STEP 3 : Drop stopwords. split() already discards runs of whitespace and
    # join() re-inserts single spaces, so no separate space-collapsing pass
    # is needed.
    return " ".join(word for word in text.split() if word not in stop_words)
    
    

In [7]:
# Run every raw message through clean_text and store the result beside it.
cleaned_messages = df['Messages'].apply(clean_text)
df['Clean_Text'] = cleaned_messages

In [8]:
# Compare a few raw messages with their cleaned counterparts.
df.head()

Unnamed: 0,Messages,Label,Clean_Text
0,"Go until jurong point, crazy.. Available only ...",ham,go jurong point crazy available bugis n great ...
1,Ok lar... Joking wif u oni...,ham,ok lar joking wif u oni
2,Free entry in 2 a wkly comp to win FA Cup fina...,spam,free entry 2 wkly comp win fa cup final tkts 2...
3,U dun say so early hor... U c already then say...,ham,u dun say early hor u c already say
4,"Nah I don't think he goes to usf, he lives aro...",ham,nah think goes usf lives around though


# Split Data into Features and Labels

In [9]:
# Features are the cleaned messages; targets are the Label column.
x, y = df['Clean_Text'], df['Label']

# Model Training

Libraries

In [18]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split , cross_val_score
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer

In [19]:
def classify(model, x, y, test_size=0.25, random_state=42):
    """Train `model` on TF-IDF features of `x` and print hold-out metrics.

    The data is split into stratified train/test parts, a
    CountVectorizer -> TfidfTransformer -> model pipeline is fitted on the
    training part, and the test accuracy (as a percentage) plus a
    classification report are printed.

    Parameters
    ----------
    model : estimator
        Classifier used as the final ('clf') step of the pipeline.
    x : sequence of str
        Text documents fed to the CountVectorizer.
    y : sequence
        Class labels aligned with `x`; also used to stratify the split.
    test_size : float, default 0.25
        Forwarded to train_test_split.
    random_state : int, default 42
        Seed for the shuffled split, so repeated runs use the same partition.

    Returns
    -------
    None
        Metrics are printed, not returned.
    """
    x_train, x_test, y_train, y_test = train_test_split(
        x,
        y,
        test_size=test_size,
        random_state=random_state,
        shuffle=True,
        stratify=y,
    )

    pipeline_model = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', model),
    ])
    pipeline_model.fit(x_train, y_train)

    # Predict once and reuse the predictions for both metrics; calling
    # pipeline_model.score() would run the whole test set through the
    # pipeline a second time.
    y_pred = pipeline_model.predict(x_test)
    accuracy = float(np.mean(y_pred == np.asarray(y_test)))
    print("accuracy: ", accuracy*100)

    print(classification_report(y_test, y_pred))
    

In [20]:
from sklearn.linear_model import LogisticRegression

# Baseline: logistic regression with scikit-learn's default settings.
model = LogisticRegression()
classify(model=model, x=x, y=y)

accuracy:  96.8413496051687
              precision    recall  f1-score   support

         ham       0.97      1.00      0.98      1206
        spam       0.99      0.77      0.87       187

    accuracy                           0.97      1393
   macro avg       0.98      0.88      0.92      1393
weighted avg       0.97      0.97      0.97      1393



# Naive Bayes (MultinomialNB)

In [21]:
from sklearn.naive_bayes import MultinomialNB
# Multinomial Naive Bayes with default settings.
model = MultinomialNB()
classify(model=model, x=x, y=y)

accuracy:  96.69777458722182
              precision    recall  f1-score   support

         ham       0.96      1.00      0.98      1206
        spam       1.00      0.75      0.86       187

    accuracy                           0.97      1393
   macro avg       0.98      0.88      0.92      1393
weighted avg       0.97      0.97      0.96      1393



# Support Vector Classifier (SVC)

In [24]:
from sklearn.svm import SVC

# Support vector classifier with regularization parameter C set to 5.
model = SVC(C=5)
classify(model=model, x=x, y=y)

accuracy:  98.27709978463747
              precision    recall  f1-score   support

         ham       0.98      1.00      0.99      1206
        spam       1.00      0.87      0.93       187

    accuracy                           0.98      1393
   macro avg       0.99      0.94      0.96      1393
weighted avg       0.98      0.98      0.98      1393



# Random Forest

In [23]:
from sklearn.ensemble import RandomForestClassifier
# Random forests are stochastic (bootstrap samples and random feature
# subsets), so fix the seed to make the reported metrics reproducible
# across Restart & Run All. 42 matches the seed used for the split.
model = RandomForestClassifier(random_state=42)
classify(model, x, y)

accuracy:  97.12849964106246
              precision    recall  f1-score   support

         ham       0.97      1.00      0.98      1206
        spam       1.00      0.79      0.88       187

    accuracy                           0.97      1393
   macro avg       0.98      0.89      0.93      1393
weighted avg       0.97      0.97      0.97      1393

