In [168]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import Perceptron
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from scipy import stats
import numpy as np
import csv

In [169]:
# Load the spam dataset. Each CSV row holds the feature values followed by
# the class label in the last column; the first row is the column header.
# The `with` block guarantees the file handle is closed, and newline='' is
# what the csv module asks for so that embedded newlines are parsed correctly.
with open("ds-spam.csv", 'r', newline='') as file:
    csv_reader = csv.reader(file)

    # Consume the header so it is not parsed as a data row.
    header = next(csv_reader)

    X, y = [], []
    for features in csv_reader:
        # csv.reader yields [] for blank lines (e.g. a trailing newline);
        # skip them instead of failing on pop().
        if not features:
            continue
        y.append(features.pop(-1))  # last column is the label
        X.append(features)          # remaining columns are the features

X = np.array(X, dtype=float)
y = np.array(y, dtype=float)

# Cell output: shapes of the feature matrix and the label vector.
X.shape, y.shape

((4601, 57), (4601,))

In [170]:
# Hold out a test set with scikit-learn's default split proportions.
# A fixed random_state makes the split identical on every Restart & Run All,
# so the models in the cells below are all compared on the same partition.
# NOTE: the outputs recorded in this notebook were produced with an unseeded
# split, so re-running will give slightly different numbers once.
X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=42)

In [171]:
# Baseline 1: k-nearest neighbours with the library's default settings.
# fit() returns the fitted estimator, so construction and training are chained.
modelo = KNeighborsClassifier().fit(X_tr, y_tr)
knn_pr = modelo.predict(X_te)
# Boolean mask: True where the prediction matches the true test label.
knnhits = np.equal(knn_pr, y_te)

In [172]:
# Baseline 2: Gaussian naive Bayes with default settings.
modelo = GaussianNB().fit(X_tr, y_tr)
gnb_pr = modelo.predict(X_te)
# Boolean mask: True where the prediction matches the true test label.
gnbhits = np.equal(gnb_pr, y_te)

In [173]:
# Baseline 3: linear perceptron with default settings.
modelo = Perceptron().fit(X_tr, y_tr)
per_pr = modelo.predict(X_te)
# Boolean mask: True where the prediction matches the true test label.
perhits = np.equal(per_pr, y_te)

In [174]:
# One row per baseline model (KNN, naive Bayes, perceptron), one column per
# test sample: the three 1-D hit masks laid out as a 2-D array.
hits = np.vstack([knnhits, gnbhits, perhits])

In [175]:
# Collect the three baseline prediction vectors as rows of a single array
# (row order: KNN, naive Bayes, perceptron).
y_pr = np.stack([knn_pr, gnb_pr, per_pr], axis=0)

In [176]:
# Manual majority vote: for every test sample take the most frequent label
# among the three baseline predictions.
#
# The vote is recomputed from the per-model predictions instead of reading
# and then overwriting `y_pr`, so re-running this cell always gives the same
# result. np.ravel() normalises the shape of the mode array: older SciPy
# returns it as (1, n) while SciPy >= 1.11 returns (n,). The previous
# `[0][0]` indexing collapsed the newer result to a single scalar.
y_pr = np.ravel(stats.mode(np.stack((knn_pr, gnb_pr, per_pr)), axis=0)[0])
# Boolean mask: True where the voted label matches the true test label.
vohits = y_pr == y_te

In [177]:
# Same three baselines as above, now combined by scikit-learn's own
# voting ensemble instead of the manual vote.
modelo = VotingClassifier(
    estimators=[
        ('knn', KNeighborsClassifier()),
        ('naivebayes', GaussianNB()),
        ('perceptron', Perceptron()),
    ]
).fit(X_tr, y_tr)
vo_pr = modelo.predict(X_te)
vohits = np.equal(vo_pr, y_te)
# Cell output: per-sample hit mask and test accuracy (fraction of hits).
vohits, vohits.mean()

(array([ True, False,  True, ...,  True, False,  True]), 0.8140747176368376)

In [178]:
# Voting ensemble made only of KNN models that differ in neighbourhood size.
modelo = VotingClassifier(
    estimators=[
        ('knn1', KNeighborsClassifier(n_neighbors=3)),
        ('knn2', KNeighborsClassifier(n_neighbors=7)),
        ('knn3', KNeighborsClassifier(n_neighbors=11)),
    ]
).fit(X_tr, y_tr)
vo_pr = modelo.predict(X_te)
vohits = np.equal(vo_pr, y_te)
# Cell output: per-sample hit mask and test accuracy (fraction of hits).
vohits, vohits.mean()

(array([ True, False,  True, ...,  True,  True,  True]), 0.7966985230234579)

In [179]:
# Voting ensemble where a decision tree takes the perceptron's place.
modelo = VotingClassifier(
    estimators=[
        ('knn', KNeighborsClassifier()),
        ('naivebayes', GaussianNB()),
        ('arvore', DecisionTreeClassifier()),
    ]
).fit(X_tr, y_tr)
vo_pr = modelo.predict(X_te)
vohits = np.equal(vo_pr, y_te)
# Cell output: per-sample hit mask and test accuracy (fraction of hits).
vohits, vohits.mean()

(array([ True,  True,  True, ...,  True,  True,  True]), 0.9218071242397915)

In [180]:
# Single decision tree with default settings, as a reference point for the
# tree-based ensembles in the following cells.
modelo = DecisionTreeClassifier().fit(X_tr, y_tr)
dt_pr = modelo.predict(X_te)
dthits = np.equal(dt_pr, y_te)
# Cell output: per-sample hit mask and test accuracy (fraction of hits).
dthits, dthits.mean()

(array([ True,  True,  True, ...,  True,  True, False]), 0.9165942658557776)

In [181]:
# Bagging ensemble with every option left at its library default.
modelo = BaggingClassifier().fit(X_tr, y_tr)
bag_pr = modelo.predict(X_te)
baghits = np.equal(bag_pr, y_te)
# Cell output: per-sample hit mask and test accuracy (fraction of hits).
baghits, baghits.mean()

(array([ True,  True,  True, ...,  True,  True,  True]), 0.9400521285838401)

In [182]:
# Bagging with the base learner spelled out explicitly as a decision tree.
# The base estimator is passed positionally on purpose: its keyword name
# differs between scikit-learn releases.
modelo = BaggingClassifier(
    DecisionTreeClassifier(),
).fit(X_tr, y_tr)
bag_pr = modelo.predict(X_te)
baghits = np.equal(bag_pr, y_te)
# Cell output: per-sample hit mask and test accuracy (fraction of hits).
baghits, baghits.mean()

(array([ True,  True,  True, ...,  True,  True,  True]), 0.9400521285838401)

In [183]:
# Bagging over trees built with splitter='random' instead of the default
# splitter, to add diversity between the bagged trees.
modelo = BaggingClassifier(
    DecisionTreeClassifier(splitter='random'),
).fit(X_tr, y_tr)
bag_pr = modelo.predict(X_te)
baghits = np.equal(bag_pr, y_te)
# Cell output: per-sample hit mask and test accuracy (fraction of hits).
baghits, baghits.mean()

(array([ True,  True,  True, ...,  True,  True,  True]), 0.9409209383145091)

In [184]:
# Same random-splitter bagging as the previous experiment, scaled up to
# 200 base trees.
modelo = BaggingClassifier(
    DecisionTreeClassifier(splitter='random'),
    n_estimators=200,
).fit(X_tr, y_tr)
bag_pr = modelo.predict(X_te)
baghits = np.equal(bag_pr, y_te)
# Cell output: per-sample hit mask and test accuracy (fraction of hits).
baghits, baghits.mean()

(array([ True,  True,  True, ...,  True,  True,  True]), 0.9539530842745438)

In [185]:
# Random forest with default settings, for comparison with the hand-built
# bagging-of-trees ensembles.
modelo = RandomForestClassifier().fit(X_tr, y_tr)
rf_pr = modelo.predict(X_te)
rfhits = np.equal(rf_pr, y_te)
# Cell output: per-sample hit mask and test accuracy (fraction of hits).
rfhits, rfhits.mean()

(array([ True,  True,  True, ...,  True,  True,  True]), 0.9548218940052129)

In [186]:
# 200 random-splitter trees, with max_features=0.2 additionally restricting
# the features made available to each base estimator.
modelo = BaggingClassifier(
    DecisionTreeClassifier(splitter='random'),
    n_estimators=200,
    max_features=0.2,
).fit(X_tr, y_tr)
bag_pr = modelo.predict(X_te)
baghits = np.equal(bag_pr, y_te)
# Cell output: per-sample hit mask and test accuracy (fraction of hits).
baghits, baghits.mean()

(array([ True,  True,  True, ...,  True,  True,  True]), 0.9278887923544744)

In [187]:
# Extremely randomised trees ensemble with default settings.
modelo = ExtraTreesClassifier().fit(X_tr, y_tr)
et_pr = modelo.predict(X_te)
ethits = np.equal(et_pr, y_te)
# Cell output: per-sample hit mask and test accuracy (fraction of hits).
ethits, ethits.mean()

(array([ True,  True,  True, ...,  True,  True,  True]), 0.9496090356211989)

In [188]:
# AdaBoost with every option left at its library default.
modelo = AdaBoostClassifier().fit(X_tr, y_tr)
ab_pr = modelo.predict(X_te)
abhits = np.equal(ab_pr, y_te)
# Cell output: per-sample hit mask and test accuracy (fraction of hits).
abhits, abhits.mean()

(array([ True,  True,  True, ...,  True,  True,  True]), 0.9348392701998263)

In [189]:
# AdaBoost over deep (max_depth=38) random-splitter trees with a reduced
# learning rate. The base estimator is passed positionally on purpose: its
# keyword name differs between scikit-learn releases.
modelo = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=38, splitter='random'),
    learning_rate=0.2,
).fit(X_tr, y_tr)
ab_pr = modelo.predict(X_te)
abhits = np.equal(ab_pr, y_te)
# Cell output: per-sample hit mask and test accuracy (fraction of hits).
abhits, abhits.mean()

(array([ True,  True,  True, ...,  True,  True,  True]), 0.946133796698523)

In [190]:
# Gradient-boosted trees via XGBoost, with the label-encoder option
# switched off and everything else left at its default.
modelo = XGBClassifier(use_label_encoder=False).fit(X_tr, y_tr)
xgb_pr = modelo.predict(X_te)
xgbhits = np.equal(xgb_pr, y_te)
# Cell output: per-sample hit mask and test accuracy (fraction of hits).
xgbhits, xgbhits.mean()

(array([False,  True,  True, ...,  True,  True,  True]), 0.945264986967854)

In [191]:
# XGBoost with heavy per-node column subsampling (colsample_bynode=0.02)
# and an explicit learning rate of 0.2.
modelo = XGBClassifier(
    colsample_bynode=0.02,
    learning_rate=0.2,
).fit(X_tr, y_tr)
xgb_pr = modelo.predict(X_te)
xgbhits = np.equal(xgb_pr, y_te)
# Cell output: per-sample hit mask and test accuracy (fraction of hits).
xgbhits, xgbhits.mean()

(array([False,  True,  True, ...,  True,  True,  True]), 0.9296264118158123)

In [192]:
# First-level learner 1: the voting ensemble of the three simple baselines.
voting = VotingClassifier(
    estimators=[
        ('knn', KNeighborsClassifier()),
        ('naivebayes', GaussianNB()),
        ('perceptron', Perceptron()),
    ]
)

# Stack the voting ensemble with two tree ensembles. cv=3 sets the
# cross-validation used to produce the meta-learner's training inputs;
# passthrough=True also hands the original features to the meta-learner.
modelo = StackingClassifier(
    estimators=[
        ('voting', voting),
        ('extratrees', ExtraTreesClassifier()),
        ('randomforest', RandomForestClassifier()),
    ],
    cv=3,
    passthrough=True,
).fit(X_tr, y_tr)

sc_pr = modelo.predict(X_te)
schits = np.equal(sc_pr, y_te)
# Cell output: per-sample hit mask and test accuracy (fraction of hits).
schits, schits.mean()

(array([ True,  True,  True, ...,  True,  True,  True]), 0.9426585577758471)

In [193]:
# Stacking of three independently trained random forests, using 7-fold
# cross-validation and passing the original features through to the
# meta-learner.
modelo = StackingClassifier(
    estimators=[
        ('R1', RandomForestClassifier()),
        ('R2', RandomForestClassifier()),
        ('R3', RandomForestClassifier()),
    ],
    cv=7,
    passthrough=True,
).fit(X_tr, y_tr)

sc_pr = modelo.predict(X_te)
schits = np.equal(sc_pr, y_te)
# Cell output: per-sample hit mask and test accuracy (fraction of hits).
schits, schits.mean()

(array([ True,  True,  True, ...,  True,  True,  True]), 0.9496090356211989)