In [1]:
import pandas as pd
import numpy as np

In [2]:
messages=pd.read_csv('smsspamcollection/SMSSpamCollection',
                    sep='\t',names=["label","message"])
print(messages.shape)
messages.head()

(5572, 2)


Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
from nltk.stem import WordNetLemmatizer as wnl
from nltk.tokenize import word_tokenize
import re
from nltk.corpus import stopwords
stopwords_eng=stopwords.words('english')
wnl=wnl()
def process_text_lemmatization(msg):
    msg=msg.lower()
    msg = re.sub(r'[^a-z0-9\s]', '', msg)
    words=word_tokenize(msg)
    words=[wnl.lemmatize(word) for word in words if word not in stopwords_eng]
    return " ".join(words)

In [4]:
X=[]
for msg in messages['message']:
    sent=process_text_lemmatization(msg)
    X.append(sent)

In [5]:
X

['go jurong point crazy available bugis n great world la e buffet cine got amore wat',
 'ok lar joking wif u oni',
 'free entry 2 wkly comp win fa cup final tkts 21st may 2005 text fa 87121 receive entry questionstd txt ratetcs apply 08452810075over18s',
 'u dun say early hor u c already say',
 'nah dont think go usf life around though',
 'freemsg hey darling 3 week word back id like fun still tb ok xxx std chgs send 150 rcv',
 'even brother like speak treat like aid patent',
 'per request melle melle oru minnaminunginte nurungu vettam set callertune caller press 9 copy friend callertune',
 'winner valued network customer selected receivea 900 prize reward claim call 09061701461 claim code kl341 valid 12 hour',
 'mobile 11 month u r entitled update latest colour mobile camera free call mobile update co free 08002986030',
 'im gon na home soon dont want talk stuff anymore tonight k ive cried enough today',
 'six chance win cash 100 20000 pound txt csh11 send 87575 cost 150pday 6days 16 

In [6]:
y = np.array(messages['label'].replace({'ham': 1, 'spam': 0}))
len(y)

  y = np.array(messages['label'].replace({'ham': 1, 'spam': 0}))


5572

In [7]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=42, test_size=0.2)

In [8]:
len(X_train),len(y_train)

(4457, 4457)

In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report,precision_score, recall_score, f1_score, roc_auc_score,roc_curve,confusion_matrix

In [10]:
models={
    "Decision Tree":DecisionTreeClassifier(),
    "Random Forest":RandomForestClassifier(),
    "Gradient Boost":GradientBoostingClassifier(),
    "Logistic Regression":LogisticRegression(),
    "Knaive Bayes":MultinomialNB(),
}

In [11]:
def train_models(models,X_train,X_test,y_train,y_test):
    for model_name in models.keys():
        model=models[model_name]
        model.fit(X_train,y_train)

        # Make prediction
        y_train_pred=model.predict(X_train)
        y_test_pred=model.predict(X_test)

        # Training performance
        model_train_accuracy=accuracy_score(y_train,y_train_pred)
        model_train_f1=f1_score(y_train,y_train_pred)
        model_train_precision=precision_score(y_train,y_train_pred)
        model_train_recall=recall_score(y_train,y_train_pred)
        model_train_rocauc_score=roc_auc_score(y_train,y_train_pred)
        model_train_confusion_matrix=confusion_matrix(y_train,y_train_pred)

        # Testing performance
        model_test_accuracy=accuracy_score(y_test,y_test_pred)
        model_test_f1=f1_score(y_test,y_test_pred)
        model_test_precision=precision_score(y_test,y_test_pred)
        model_test_recall=recall_score(y_test,y_test_pred)
        model_test_rocauc_score=roc_auc_score(y_test,y_test_pred)
        model_test_confusion_matrix=confusion_matrix(y_test,y_test_pred)

        print('\n=============================================')
        print(model_name)
        print('=============================================\n')

        print('Model Performance for Training Data\n')
        print('- Accuracy:{:.4f}'.format(model_train_accuracy))
        print('- F1 Score:{:.4f}'.format(model_train_f1))
        print('- Precision:{:.4f}'.format(model_train_precision))
        print('- Recall:{:.4f}'.format(model_train_recall))
        print('- ROC AUC Score:{:.4f}'.format(model_train_rocauc_score))
        print('- Confusion Matrix:\n',model_train_confusion_matrix)

        print('\nModel Performance for Test Data\n')
        print('- Accuracy:{:.4f}'.format(model_test_accuracy))
        print('- F1 Score:{:.4f}'.format(model_test_f1))
        print('- Precision:{:.4f}'.format(model_test_precision))
        print('- Recall:{:.4f}'.format(model_test_recall))
        print('- ROC AUC Score:{:.4f}'.format(model_test_rocauc_score))
        print('- Confusion Matrix:\n',model_test_confusion_matrix)

### create BOW

In [12]:
from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer(max_features=500,ngram_range=(1,2))

In [13]:
X_train_transformed=cv.fit_transform(X_train).toarray()
X_test_transformed=cv.transform(X_test)

In [14]:
train_models(models=models,X_train=X_train_transformed,X_test=X_test_transformed,y_train=y_train,y_test=y_test)


Decision Tree

Model Performance for Training Data

- Accuracy:0.9984
- F1 Score:0.9991
- Precision:0.9990
- Recall:0.9992
- ROC AUC Score:0.9963
- Confusion Matrix:
 [[ 594    4]
 [   3 3856]]

Model Performance for Test Data

- Accuracy:0.9525
- F1 Score:0.9724
- Precision:0.9770
- Recall:0.9679
- ROC AUC Score:0.9101
- Confusion Matrix:
 [[127  22]
 [ 31 935]]

Random Forest

Model Performance for Training Data

- Accuracy:0.9984
- F1 Score:0.9991
- Precision:0.9982
- Recall:1.0000
- ROC AUC Score:0.9941
- Confusion Matrix:
 [[ 591    7]
 [   0 3859]]

Model Performance for Test Data

- Accuracy:0.9767
- F1 Score:0.9866
- Precision:0.9806
- Recall:0.9928
- ROC AUC Score:0.9326
- Confusion Matrix:
 [[130  19]
 [  7 959]]

Gradient Boost

Model Performance for Training Data

- Accuracy:0.9762
- F1 Score:0.9865
- Precision:0.9733
- Recall:1.0000
- ROC AUC Score:0.9114
- Confusion Matrix:
 [[ 492  106]
 [   0 3859]]

Model Performance for Test Data

- Accuracy:0.9704
- F1 Score:0.9832


### using tf-idf

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
tf_idf=TfidfVectorizer(max_features=500,ngram_range=(1,2))

In [16]:
X_train_trans=tf_idf.fit_transform(X_train)
X_test_trans=tf_idf.transform(X_test)

In [17]:
train_models(models=models,X_train=X_train_trans,X_test=X_test_trans,y_train=y_train,y_test=y_test)


Decision Tree

Model Performance for Training Data

- Accuracy:0.9984
- F1 Score:0.9991
- Precision:0.9990
- Recall:0.9992
- ROC AUC Score:0.9963
- Confusion Matrix:
 [[ 594    4]
 [   3 3856]]

Model Performance for Test Data

- Accuracy:0.9587
- F1 Score:0.9764
- Precision:0.9694
- Recall:0.9834
- ROC AUC Score:0.8910
- Confusion Matrix:
 [[119  30]
 [ 16 950]]

Random Forest

Model Performance for Training Data

- Accuracy:0.9982
- F1 Score:0.9990
- Precision:0.9979
- Recall:1.0000
- ROC AUC Score:0.9933
- Confusion Matrix:
 [[ 590    8]
 [   0 3859]]

Model Performance for Test Data

- Accuracy:0.9785
- F1 Score:0.9877
- Precision:0.9806
- Recall:0.9948
- ROC AUC Score:0.9337
- Confusion Matrix:
 [[130  19]
 [  5 961]]

Gradient Boost

Model Performance for Training Data

- Accuracy:0.9782
- F1 Score:0.9876
- Precision:0.9760
- Recall:0.9995
- ROC AUC Score:0.9203
- Confusion Matrix:
 [[ 503   95]
 [   2 3857]]

Model Performance for Test Data

- Accuracy:0.9713
- F1 Score:0.9836
