# Spam dataset with ensemble methods

In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree

# Some functions to plot our points and draw the lines
def plot_points(features, labels, fix_margins=True):
    """Scatter-plot a two-feature dataset with one marker style per class.

    Rows labelled 1 are drawn as cyan triangles and rows labelled 0 as red
    squares. Feature column 0 is the x coordinate ('Lottery') and feature
    column 1 is the y coordinate ('Sale').

    Args:
        features: array-like converted with ``np.array``; columns 0 and 1
            are used as coordinates.
        labels: array-like of class labels aligned with ``features``.
        fix_margins: when true, both axes are pinned to the range 0..11.
    """
    points = np.array(features)
    classes = np.array(labels)

    if fix_margins:
        plt.xlim(0, 11)
        plt.ylim(0, 11)

    # Class 1 is drawn before class 0 so the artists match the legend order.
    class_styles = ((1, 'cyan', '^'), (0, 'red', 's'))
    for class_value, colour, shape in class_styles:
        members = points[np.flatnonzero(classes == class_value)]
        plt.scatter([row[0] for row in members],
                    [row[1] for row in members],
                    s=25,
                    color=colour,
                    edgecolor='k',
                    marker=shape)

    plt.xlabel('Lottery')
    plt.ylabel('Sale')
    plt.legend(['Spam', 'Ham'])

def plot_model(X, y, model, fix_margins=True):
    """Shade a classifier's decision regions and overlay the data points.

    The model is evaluated on a regular grid (step 0.2); the predictions are
    drawn as filled contours (red for the lower level, blue for the upper
    one) with black boundary lines, then the points are added on top with
    ``plot_points`` and the figure is shown.

    Args:
        X: array-like of features; columns 0 and 1 are the plot coordinates.
        y: array-like of class labels aligned with ``X``.
        model: fitted estimator exposing ``predict`` for two-column input.
        fix_margins: when true the grid spans 0..12 on both axes; otherwise
            it spans the data range padded by 1 on each side. The flag is
            forwarded to ``plot_points`` so the axis limits follow the same
            choice.
    """
    X = np.array(X)
    y = np.array(y)
    plot_step = 0.2

    if fix_margins:
        x_min, x_max = 0, 12
        y_min, y_max = 0, 12
    else:
        x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
        y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1

    xx, yy = np.meshgrid(np.arange(x_min, x_max, plot_step),
                         np.arange(y_min, y_max, plot_step))
    # Predict every grid node at once, then fold back into the grid shape.
    Z = model.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)

    plt.contourf(xx, yy, Z, colors=['red', 'blue'], alpha=0.2, levels=range(-1, 2))
    plt.contour(xx, yy, Z, colors='k', linewidths=1)

    # Previously the flag was dropped here, so fix_margins=False still pinned
    # the axes to 0..11 and could hide points outside that window.
    plot_points(X, y, fix_margins=fix_margins)
    plt.show()

def plot_trees(model):
    """Draw the first tree of every entry in ``model.estimators_``.

    Each tree is rendered with ``tree.plot_tree`` and shown in its own
    figure.

    Args:
        model: fitted ensemble whose ``estimators_`` entries can be indexed
            with ``[0]`` to obtain a tree. This presumably targets
            ``GradientBoostingClassifier``; confirm before passing other
            ensemble types.
    """
    # Use the argument rather than the notebook-level gradient_boosting_model
    # so the helper works for whichever model the caller passes.
    for stage in model.estimators_:
        tree.plot_tree(stage[0])
        plt.show()

In [None]:
# Toy dataset: one row per email as (Lottery, Sale, Spam), where Spam is
# 1 for spam ('yes') and 0 for ham ('no').
spam_dataset = pd.DataFrame(
    [
        (7, 1, 0),
        (3, 2, 0),
        (9, 3, 0),
        (1, 3, 0),
        (2, 6, 0),
        (4, 7, 0),
        (1, 9, 1),
        (3, 10, 1),
        (6, 5, 1),
        (7, 8, 1),
        (8, 4, 1),
        (9, 6, 1),
    ],
    columns=['Lottery', 'Sale', 'Spam'],
)

# Last expression of the cell: displays the table in the notebook.
spam_dataset

In [None]:
# Split the table into the two feature columns and the label column,
# then scatter-plot the points by class.
X = spam_dataset[['Lottery', 'Sale']]
y = spam_dataset['Spam']

plot_points(X, y)

In [None]:
# Fit a decision tree on the original dataset and report its accuracy on
# the same data it was trained on.
X, y = spam_dataset[['Lottery', 'Sale']], spam_dataset['Spam']
spam_decision_tree = DecisionTreeClassifier(random_state=0).fit(X, y)
spam_decision_tree.score(X, y)

In [None]:
# Render the fitted tree as a PNG through Graphviz.
# StringIO comes from the standard library: the vendored
# sklearn.externals.six module is no longer shipped with current
# scikit-learn releases, so importing it from there fails.
from io import StringIO
from IPython.display import Image
from sklearn.tree import export_graphviz
import pydotplus

# export_graphviz writes the DOT description into the in-memory buffer.
dot_data = StringIO()
export_graphviz(spam_decision_tree, out_file=dot_data,
                filled=True, rounded=True,
                special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
# Last expression of the cell: displays the rendered image.
Image(graph.create_png())

In [None]:
# Show the decision regions of the tree fitted on the original dataset.
plot_model(X, y, spam_decision_tree)

# Now with a slightly modified dataset

In [None]:
# Modified dataset: the twelve original rows plus two extra ones,
# (8, 5) labelled ham and (2, 2) labelled spam.
# Each row is (Lottery, Sale, Spam) with Spam = 1 for spam and 0 for ham.
new_spam_dataset = pd.DataFrame(
    [
        (7, 1, 0),
        (3, 2, 0),
        (9, 3, 0),
        (1, 3, 0),
        (2, 6, 0),
        (4, 7, 0),
        (1, 9, 1),
        (3, 10, 1),
        (6, 5, 1),
        (7, 8, 1),
        (8, 4, 1),
        (9, 6, 1),
        (8, 5, 0),
        (2, 2, 1),
    ],
    columns=['Lottery', 'Sale', 'Spam'],
)

# Alternative variants of the modified dataset, kept disabled:
#new_spam_dataset = pd.DataFrame({
#    'Lottery':[7,3,9,1,2,4,6,1,3,6,7,8,9,3],
#    'Sale':[1,2,3,3,6,7,8,9,10,5,8,4,6,4],
#    'Spam': [0,0,0,0,0,0,0,1,1,1,1,1,1,1]})

# Last expression of the cell: displays the table in the notebook.
new_spam_dataset

#new_spam_dataset = pd.DataFrame({
#    'Lottery':[7,3,9,1,2,4,8,1,3,6,7,8,9,2],
#    'Sale':[1,2,3,3,6,7,6,9,10,5,8,4,6,2],
#    'Spam': ['no','no','no','no','no','no','no','yes','yes','yes','yes','yes','yes','yes']})
#new_spam_dataset

In [None]:
# Features and labels of the modified dataset, followed by a scatter plot.
new_X = new_spam_dataset[['Lottery', 'Sale']]
new_y = new_spam_dataset['Spam']
plot_points(new_X, new_y)

In [None]:
# Fit a decision tree on the modified dataset and report its accuracy on
# that same training data.
new_spam_decision_tree = DecisionTreeClassifier(random_state=0).fit(new_X, new_y)
new_spam_decision_tree.score(new_X, new_y)

In [None]:
# Draw the structure of the tree fitted on the modified dataset.
# (`tree` is also imported in the first cell of the notebook.)
from sklearn import tree
tree.plot_tree(new_spam_decision_tree)

In [None]:
# Render the tree fitted on the modified dataset as a PNG through Graphviz.
# Relies on StringIO, export_graphviz, pydotplus and Image imported in an
# earlier cell.
dot_data = StringIO()
export_graphviz(new_spam_decision_tree, out_file=dot_data,  
                filled=True, rounded=True,
                special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())  
# Last expression of the cell: displays the rendered image.
Image(graph.create_png())

In [None]:
# Show the decision regions of the tree fitted on the modified dataset.
plot_model(new_X, new_y, new_spam_decision_tree)

# Random Forests

In [None]:
# Split the modified dataset by hand into three disjoint batches, selected
# by row label. Every batch contains examples of both classes.

# Batch 1: rows 0, 1, 2 (ham) and 7, 8, 9 (spam).
first_batch = new_spam_dataset.loc[[0,1,2,7,8,9]]
X1 = first_batch[['Lottery', 'Sale']]
y1 = first_batch['Spam']
plot_points(X1, y1)
plt.show()

# Batch 2: rows 3, 4 (ham) and 10, 11 (spam).
second_batch = new_spam_dataset.loc[[3,4,10,11]]
X2 = second_batch[['Lottery', 'Sale']]
y2 = second_batch['Spam']
plot_points(X2, y2)
plt.show()

# Batch 3: rows 5, 12 (ham) and 6, 13 (spam).
third_batch = new_spam_dataset.loc[[5,6,12,13]]
X3 = third_batch[['Lottery', 'Sale']]
y3 = third_batch['Spam']
plot_points(X3, y3)

In [None]:
# Fit one decision tree per batch and print each tree's accuracy on the
# batch it was trained on.
dt1 = DecisionTreeClassifier(random_state=0)
dt1.fit(X1,y1)
print(dt1.score(X1,y1))

dt2 = DecisionTreeClassifier(random_state=0)
dt2.fit(X2,y2)
print(dt2.score(X2,y2))

dt3 = DecisionTreeClassifier(random_state=0)
dt3.fit(X3,y3)
print(dt3.score(X3,y3))

# Decision regions of each tree over its own batch. plot_model already calls
# plt.show() itself, so the extra plt.show() calls have nothing left to draw.
plot_model(X1, y1, dt1)
plt.show()
plot_model(X2, y2, dt2)
plt.show()
plot_model(X3, y3, dt3)

In [None]:
# Train a random forest of five trees on the full modified dataset and
# report its accuracy on that same training data.
from sklearn.ensemble import RandomForestClassifier
random_forest_model = RandomForestClassifier(random_state=0, n_estimators=5)
random_forest_model.fit(new_X,new_y)
random_forest_model.score(new_X,new_y)

In [None]:
# Draw the structure of the three hand-built batch trees, one figure each.
tree.plot_tree(dt1)
plt.show()
tree.plot_tree(dt2)
plt.show()
tree.plot_tree(dt3)

In [None]:
# Decision regions of the whole random forest.
plot_model(new_X, new_y, random_forest_model)

In [None]:
# Draw the structure of every individual tree in the forest, one figure each.
for dt in random_forest_model.estimators_:
    tree.plot_tree(dt)
    plt.show()

In [None]:
# Decision regions of every individual tree in the forest, drawn over the
# full modified dataset. plot_model already calls plt.show() itself.
for dt in random_forest_model.estimators_:
    plot_model(new_X, new_y, dt)
    plt.show()

# AdaBoost

In [None]:
# Train an AdaBoost ensemble with n_estimators=6 on the modified dataset and
# report its accuracy on that same training data.
from sklearn.ensemble import AdaBoostClassifier
adaboost_model = AdaBoostClassifier(random_state=0, n_estimators=6)
adaboost_model.fit(new_X, new_y)
adaboost_model.score(new_X, new_y)

In [None]:
# Decision regions of the combined AdaBoost model.
plot_model(new_X, new_y, adaboost_model)

In [None]:
# Decision regions of each fitted AdaBoost estimator taken on its own.
# plot_model already calls plt.show() itself.
estimators = adaboost_model.estimators_
for estimator in estimators:
    plot_model(new_X, new_y, estimator)
    plt.show()

In [None]:
# Display the per-estimator weights stored on the fitted AdaBoost model.
adaboost_model.estimator_weights_

# Gradient boosting

In [None]:
# Train a gradient boosting classifier with n_estimators=5 on the modified
# dataset and report its accuracy on that same training data.
from sklearn.ensemble import GradientBoostingClassifier
gradient_boosting_model = GradientBoostingClassifier(random_state=0, n_estimators=5)
gradient_boosting_model.fit(new_X, new_y)
gradient_boosting_model.score(new_X, new_y)

In [None]:
# Decision regions of the combined gradient boosting model.
plot_model(new_X, new_y, gradient_boosting_model)

In [None]:
# Draw the first tree of every entry in the gradient boosting model's
# estimators_, one figure per tree.
estimators = gradient_boosting_model.estimators_
for stage in estimators:
    tree.plot_tree(stage[0])
    # Show each figure before drawing the next tree; otherwise every tree is
    # rendered onto the same axes and they cannot be inspected individually.
    plt.show()

In [None]:
# Draw the structure of the tree fitted on the original dataset.
# (`tree` is also imported in the first cell of the notebook.)
from sklearn import tree
tree.plot_tree(spam_decision_tree)

In [None]:
def draw_line(slope, y_intercept, color='grey', linewidth=0.7, starting=0, ending=4):
    """Plot the straight line ``y = y_intercept + slope * x`` on the current axes.

    The line is sampled at 1000 evenly spaced x values between 0 and 8 and
    drawn as a solid stroke.

    Args:
        slope: slope of the line.
        y_intercept: value of the line at x = 0.
        color: line colour passed to matplotlib.
        linewidth: line width passed to matplotlib.
        starting: accepted but not used; the x range is fixed at 0..8.
        ending: accepted but not used; the x range is fixed at 0..8.
    """
    xs = np.linspace(0, 8, 1000)
    ys = y_intercept + slope*xs
    plt.plot(xs, ys, linestyle='-', color=color, linewidth=linewidth)