Using R or Python, read in spam dataset, and split into training and test sets. Increasing the number of features used for ananlysis to the first 10 features, implement all the classifiers shown above. Compare the accuracy and MSE of each method. (Note that MNL in this case is just the logit classifier.) You should do this as a separate R file or Jupyter notebook.

In [30]:
# Load the libraries.  Everyone should pip install tabulate.  
# Anaconda automatically installs sklearn.

%matplotlib inline
import warnings
import numpy as np
import pandas as pd
from numpy import random
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
import Quandl
import statsmodels as sm
import statsmodels.formula.api as smf
import statsmodels.api as sma
import patsy
from statsmodels.graphics.api import abline_plot
import numpy.linalg as linalg
from mpl_toolkits.mplot3d import Axes3D
from matplotlib.colors import ListedColormap
from sklearn import neighbors, datasets
from tabulate import tabulate
warnings.simplefilter('ignore')
sns.set(context='notebook', style='whitegrid', palette='deep', font='sans-serif', font_scale=1, rc=None)
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_squared_error
from __future__ import division

In [2]:
target_url = ("https://archive.ics.uci.edu/ml/machine-learning-databases/"
              "spambase/spambase.data")

# Read spam data from UCI repository.
spam = pd.read_csv(target_url, header=None, prefix="v")

# Note that R indexes from 1, while Python indexes from 0.
# Spam indicator is v57.  Rename it.
spam.rename(columns={'v57':'spam'}, inplace=True)

In [3]:
spam.describe()

Unnamed: 0,v0,v1,v2,v3,v4,v5,v6,v7,v8,v9,...,v48,v49,v50,v51,v52,v53,v54,v55,v56,spam
count,4601.0,4601.0,4601.0,4601.0,4601.0,4601.0,4601.0,4601.0,4601.0,4601.0,...,4601.0,4601.0,4601.0,4601.0,4601.0,4601.0,4601.0,4601.0,4601.0,4601.0
mean,0.104553,0.213015,0.280656,0.065425,0.312223,0.095901,0.114208,0.105295,0.090067,0.239413,...,0.038575,0.13903,0.016976,0.269071,0.075811,0.044238,5.191515,52.172789,283.289285,0.394045
std,0.305358,1.290575,0.504143,1.395151,0.672513,0.273824,0.391441,0.401071,0.278616,0.644755,...,0.243471,0.270355,0.109394,0.815672,0.245882,0.429342,31.729449,194.89131,606.347851,0.488698
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.588,6.0,35.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.065,0.0,0.0,0.0,0.0,2.276,15.0,95.0,0.0
75%,0.0,0.0,0.42,0.0,0.38,0.0,0.0,0.0,0.0,0.16,...,0.0,0.188,0.0,0.315,0.052,0.0,3.706,43.0,266.0,1.0
max,4.54,14.28,5.1,42.81,10.0,5.88,7.27,11.11,5.26,18.18,...,4.385,9.752,4.081,32.478,6.003,19.829,1102.5,9989.0,15841.0,1.0


In [6]:
def randomPartition(dataframe, partitions):
    groups = dataframe.groupby(np.random.randint(0,partitions,size=len(dataframe)))
    return [groups.get_group(i) for i in range(partitions)]

In [7]:
test, train = randomPartition(spam, 2)

In [11]:
features = ['v' + str(i) for i in range(10)]

In [35]:
train_x = train.as_matrix(features)
train_y = train['spam'].values
knn = neighbors.KNeighborsClassifier(np.sqrt(len(spam)), 'distance')
knn.fit(train_x, train_y)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_neighbors=67.830671528446487, p=2,
           weights='distance')

In [36]:
def results(ytest, ypred):
    cm = confusion_matrix(ytest, ypred)
    print cm
    KNNcm = cm
    MSE = mean_squared_error(ytest, ypred)

    ACC = (cm[0][0] + cm[1][1]) / (len(ypred))

    table = [[" ","0", "1"],
             ["0", cm[0][0], cm[0][1]], 
             ["1", cm[1][0], cm[1][1]]]
    print("The confusion matrix is:")
    print(tabulate(table, tablefmt="fancy_grid", numalign = "center"))
    print("The Accuracy Rate is", ACC)
    print("The Mean Squared Error is", MSE)

In [37]:
test_y = test['spam'].values
test_x = test.as_matrix(features)
test_pred = knn.predict(test_x)

In [38]:
results(test_y, test_pred)

[[613 722]
 [216 691]]
The confusion matrix is:
╒═══╤═════╤═════╕
│   │  0  │  1  │
├───┼─────┼─────┤
│ 0 │ 613 │ 722 │
├───┼─────┼─────┤
│ 1 │ 216 │ 691 │
╘═══╧═════╧═════╛
('The Accuracy Rate is', 0.58162355040142732)
('The Mean Squared Error is', 0.41837644959857273)


In [39]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators = 30)
rf.fit(train_x, train_y)
test_pred = rf.predict(test_x)
results(test_y, test_pred)

[[1226  109]
 [ 241  666]]
The confusion matrix is:
╒═══╤══════╤═════╕
│   │  0   │  1  │
├───┼──────┼─────┤
│ 0 │ 1226 │ 109 │
├───┼──────┼─────┤
│ 1 │ 241  │ 666 │
╘═══╧══════╧═════╛
('The Accuracy Rate is', 0.84388938447814454)
('The Mean Squared Error is', 0.15611061552185548)


In [40]:
from sklearn.lda import LDA

lda = LDA()
lda.fit(train_x, train_y)
test_y = lda.predict(test_x)
results(test_y, test_pred)

[[1371  305]
 [  96  470]]
The confusion matrix is:
╒═══╤══════╤═════╕
│   │  0   │  1  │
├───┼──────┼─────┤
│ 0 │ 1371 │ 305 │
├───┼──────┼─────┤
│ 1 │  96  │ 470 │
╘═══╧══════╧═════╛
('The Accuracy Rate is', 0.82114183764495985)
('The Mean Squared Error is', 0.17885816235504015)
