In [8]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, KFold
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from time import time

In [9]:
data = pd.read_csv("SMSSpamCollection.tsv", names=['label', 'message'], sep='\t')
display (data.head())
data['label'] = data.label.map({'ham': 0, 'spam': 1})

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [12]:
RFC = RandomForestClassifier(criterion='entropy', n_estimators=10, random_state=111)
ABC = AdaBoostClassifier(n_estimators=10, random_state=1111)
DT = DecisionTreeClassifier(random_state=1111)
BC = BaggingClassifier(n_estimators=10, random_state=1111)

In [15]:
vectorizer = CountVectorizer(stop_words='english')
spam_vector = vectorizer.fit_transform(data["message"])
spam_features = vectorizer.get_feature_names()
X_train, X_test, y_train, y_test = train_test_split(spam_vector.toarray(), data['label'],
                                                   test_size=0.2)#, random_state=111)
X_train.shape, X_test.shape

((4457, 8444), (1115, 8444))

In [17]:
kfold = KFold(n_splits=4, random_state=111)

In [26]:
RFC_score = []
for z, (trainIdx, testIdx) in enumerate(kfold.split(data['message'])):
    print (f'K-fold {z}')
    start = time()
    X_train = spam_vector.toarray()[trainIdx]
    X_test = spam_vector.toarray()[testIdx]
    y_train = data['label'].values[trainIdx]
    y_test = data['label'].values[testIdx]
    RFC.fit(X_train, y_train)
    RFC_score.append(RFC.score(X_test, y_test))
    print (f"Time: {time()-start:.3f}s")

print ("RFC Scores:", RFC_score)

K-fold 0
Time: 6.730s
K-fold 1
Time: 6.154s
K-fold 2
Time: 6.366s
K-fold 3
Time: 6.509s
RFC Scores: [0.9712849964106246, 0.9676956209619526, 0.9669777458722182, 0.968413496051687]


In [28]:
ABC_score = []
for z, (trainIdx, testIdx) in enumerate(kfold.split(data['message'])):
    print (f'K-fold {z}')
    start = time()
    X_train = spam_vector.toarray()[trainIdx]
    X_test = spam_vector.toarray()[testIdx]
    y_train = data['label'].values[trainIdx]
    y_test = data['label'].values[testIdx]
    ABC.fit(X_train, y_train)
    ABC_score.append(ABC.score(X_test, y_test))
    print (f"Time: {time()-start:.3f}s")
    
print ("ABC Scores:", ABC_score)

K-fold 0
Time: 15.093s
K-fold 1
Time: 15.010s
K-fold 2
Time: 14.851s
K-fold 3
Time: 14.957s
ABC Scores: [0.9361091170136396, 0.9447236180904522, 0.9353912419239052, 0.9418521177315147]


In [29]:
BC_score = []
for z, (trainIdx, testIdx) in enumerate(kfold.split(data['message'])):
    print (f'K-fold {z}')
    start = time()
    X_train = spam_vector.toarray()[trainIdx]
    X_test = spam_vector.toarray()[testIdx]
    y_train = data['label'].values[trainIdx]
    y_test = data['label'].values[testIdx]
    BC.fit(X_train, y_train)
    BC_score.append(BC.score(X_test, y_test))
    print (f"Time: {time()-start:.3f}s")
    
print ("BC Scores:", BC_score)

K-fold 0
Time: 35.497s
K-fold 1
Time: 37.131s
K-fold 2
Time: 33.696s
K-fold 3
Time: 37.207s
BC Scores: [0.9720028715003589, 0.9741564967695621, 0.9676956209619526, 0.968413496051687]
