In [1]:
"""
This example uses the fast gradient algorithm to compute the beta
coefficients of a regularized logistic regression on the spam dataset,
compares them to the coefficients found by scikit-learn's
LogisticRegression, and reports the misclassification error on the test set.
"""


import src.logistic_reg as lreg
import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


# Load the spam dataset and its companion train/test indicator file.
# Column 57 of the data holds the label; every other column is a feature.
spam_raw = pd.read_table('https://statweb.stanford.edu/~tibs/ElemStatLearn/'
                         'datasets/spam.data', sep=' ', header=None)
indicator_raw = pd.read_table('https://statweb.stanford.edu/~tibs/'
                              'ElemStatLearn/datasets/spam.traintest',
                              sep=' ', header=None)

# Flatten the indicator to 1-D so it can be used as a per-row mask, and fail
# early with a clear message if it does not describe the rows one-to-one.
test_indicator = np.ravel(np.asarray(indicator_raw))
if test_indicator.shape[0] != spam_raw.shape[0]:
    raise ValueError(
        'train/test indicator has %d entries but the dataset has %d rows'
        % (test_indicator.shape[0], spam_raw.shape[0]))

# Drop rows containing NAs. The indicator is filtered with the SAME row mask:
# dropping rows from the data alone would leave the indicator longer than the
# data and break (or silently shift) the train/test split below.
complete_rows = np.asarray(spam_raw.notna().all(axis=1))
spam = spam_raw.loc[complete_rows]
test_indicator = test_indicator[complete_rows]

# declare data and labels
x_data = np.asarray(spam.drop(57, axis=1))
# rescale the labels as 2*label - 1 (maps {0, 1} to {-1, +1}, assuming the
# raw labels are coded 0/1 -- confirm against the dataset description)
y_data = np.asarray(spam[57])*2 - 1

# define the split between train and test data
# (indicator value 0 -> training row, 1 -> test row)
is_train = test_indicator == 0
is_test = test_indicator == 1
x_train = x_data[is_train, :]
x_test = x_data[is_test, :]
y_train = y_data[is_train]
y_test = y_data[is_test]

# Standardize the features. The scaler is fitted on the training rows only and
# then applied unchanged to both the training and the test rows.
x_scaler = StandardScaler()
x_scaler.fit(x_train)
x_train, x_test = (x_scaler.transform(split) for split in (x_train, x_test))

# n = number of training observations, d = number of features
n, d = x_train.shape

# Starting point for the optimizer: beta and theta both begin at the origin.
beta_init, theta_init = np.zeros(d), np.zeros(d)

# Regularization strength shared by both solvers. It is defined once so the
# two fits below cannot drift apart, which would invalidate the comparison.
lamb = 0.1

# run the fast gradient algorithm to find the beta coefficients
# ([-1] keeps the last element of what fastgradalgo returns -- presumably the
# final iterate; confirm against src.logistic_reg)
fastgrad_betas = lreg.fastgradalgo(beta_init=beta_init,
                                   theta_init=theta_init,
                                   lamb=lamb,
                                   x=x_train,
                                   y=y_train,
                                   max_iter=1000)[-1]

# run scikit-learn's LogisticRegression() to find the beta coefficients
# scikit-learn takes an inverse regularization strength C, so lamb is converted
# as C = 1/(2*n*lamb).
# NOTE(review): this conversion assumes fastgradalgo averages the loss over the
# n observations and penalizes lamb*||beta||^2 -- confirm in src.logistic_reg.
logit = LogisticRegression(C=1/(2*n*lamb),
                           fit_intercept=False,
                           tol=1e-8).fit(x_train, y_train)

# Show both coefficient vectors one after the other for a visual comparison.
print("\nFast Gradient Algorithm Coefficients:\n", fastgrad_betas)
print("\nSci-kit Learn's LogisticRegression() Coefficients:\n", logit.coef_)

# Classify the test rows with the fast gradient coefficients: a positive
# linear score is labelled +1, anything else -1.
test_scores = np.dot(x_test, fastgrad_betas)
y_predict = (test_scores > 0)*2 - 1

# Percentage of test rows whose predicted label differs from the true label.
misclassified_pct = np.mean(y_predict != y_test)*100
print("\nMisclassification Error: %.2f%%" % misclassified_pct)



Fast Gradient Algorithm Coefficients:
 [ 0.02117345 -0.03877587  0.09773771  0.05503818  0.15256401  0.13534365
  0.28352681  0.15114461  0.11280873  0.06183422  0.10502022 -0.04219962
  0.03472887  0.02859174  0.10917878  0.27363777  0.17373109  0.12517347
  0.1290986   0.12243222  0.22405219  0.10833277  0.24549208  0.16655949
 -0.14754037 -0.10493049 -0.1149898  -0.06087818 -0.04741795 -0.06626914
 -0.02882511 -0.0151532  -0.07621369 -0.01594251 -0.04245255 -0.01357162
 -0.07700499 -0.03415877 -0.07954034  0.01733619 -0.04772963 -0.09082758
 -0.0600434  -0.06969745 -0.11271605 -0.11263913 -0.0321344  -0.06206727
 -0.05727232 -0.0425365  -0.02823825  0.15484338  0.26205746  0.05893875
  0.068719    0.12509357  0.14818851]

Sci-kit Learn's LogisticRegression() Coefficients:
 [[ 0.02117345 -0.03877587  0.09773771  0.05503819  0.15256401  0.13534366
   0.28352681  0.15114461  0.11280873  0.06183421  0.10502022 -0.04219962
   0.03472887  0.02859174  0.10917878  0.27363777  0.17373109  0