### Using regularized logistic regression to classify email

In [2]:
import scipy.io
import utils
import numpy as np
from sklearn import linear_model

# No modifications in this script
# complete the functions in utils.py; then run the script

# Load the spam data: train/test feature matrices and their label vectors.
# NOTE(review): array shapes and label encoding are defined by
# utils.load_spam_data -- confirm there.

Xtrain,Xtest,ytrain,ytest = utils.load_spam_data()

# Preprocess the data: derive three alternative feature representations
# ("std", "logt", "bin") from the same raw features.
# NOTE(review): presumably standardized / log-transformed / binarized; the
# exact transforms live in utils -- confirm there.

# std_features also returns mu and sigma so the test set can be transformed
# with the training-set statistics below.
Xtrain_std,mu,sigma = utils.std_features(Xtrain)
Xtrain_logt = utils.log_features(Xtrain)
Xtrain_bin = utils.bin_features(Xtrain)

# The test set reuses mu/sigma returned for the training set rather than
# calling std_features on Xtest.
# NOTE(review): assumes sigma has no zero entries -- confirm.
Xtest_std = (Xtest - mu)/sigma
Xtest_logt = utils.log_features(Xtest)
Xtest_bin = utils.bin_features(Xtest)

# find good lambda by cross validation for these three sets

def run_dataset(X,ytrain,Xt,ytest,type,penalty):
    """Pick lambda by cross-validation, fit logistic regression, report accuracy.

    Calls utils.select_lambda_crossval on the training data, fits a
    scikit-learn LogisticRegression with C = 1 / best_lambda on (X, ytrain),
    and prints the chosen lambda, the fitted intercept and coefficients, and
    the fraction of predictions on Xt that equal ytest.

    Parameters:
        X:       training feature matrix.
        ytrain:  training labels.
        Xt:      test feature matrix, preprocessed the same way as X.
        ytest:   test labels.
        type:    label for the feature representation; used only in the
                 printed accuracy line. (The name shadows the builtin but is
                 kept so existing callers are unaffected.)
        penalty: "l2" selects the 'lbfgs' solver; any other value (in this
                 script, "l1") selects 'liblinear'.

    Returns:
        None. Results are printed to standard output.
    """
    # The arguments 0.1, 5.1, 0.5 are passed straight to the helper.
    # NOTE(review): presumably the lambda grid start/stop/step -- confirm in utils.
    best_lambda = utils.select_lambda_crossval(X,ytrain,0.1,5.1,0.5,penalty)

    # Single-argument print(...) with %-formatting is valid on both Python 2
    # and Python 3; the doubled spaces reproduce the text that the former
    # Python 2 `print a, b` statements emitted.
    print("best_lambda =  %s" % (best_lambda,))

    # Train a classifier with the selected regularization strength and run it.
    solver = 'lbfgs' if penalty == "l2" else 'liblinear'
    lreg = linear_model.LogisticRegression(penalty=penalty,C=1.0/best_lambda, solver=solver,fit_intercept=True)
    lreg.fit(X,ytrain)
    print("Coefficients =  %s %s" % (lreg.intercept_, lreg.coef_))

    predy = lreg.predict(Xt)
    accuracy = np.mean(predy==ytest)
    print("Accuracy on set aside test set for  %s  =  %s" % (type, accuracy))

# Run all three feature representations under each penalty.
# print(...) with a single string argument emits identical text on Python 2
# and Python 3, whereas the bare print statement is a SyntaxError on Python 3.

print("L2 Penalty experiments -----------")
run_dataset(Xtrain_std,ytrain,Xtest_std,ytest,"std","l2")
run_dataset(Xtrain_logt,ytrain,Xtest_logt,ytest,"logt","l2")
run_dataset(Xtrain_bin,ytrain,Xtest_bin,ytest,"bin","l2")

print("L1 Penalty experiments -----------")
run_dataset(Xtrain_std,ytrain,Xtest_std,ytest,"std","l1")
run_dataset(Xtrain_logt,ytrain,Xtest_logt,ytest,"logt","l1")
run_dataset(Xtrain_bin,ytrain,Xtest_bin,ytest,"bin","l1")

L2 Penalty experiments -----------
best_lambda =  0.1
Coefficients =  [-4.86311364] [[-2.74146453e-02 -2.25297590e-01  1.21840937e-01  2.29362873e+00
   2.70425714e-01  2.32851165e-01  9.28595395e-01  2.95200239e-01
   1.62205937e-01  6.78260459e-02 -8.32604430e-02 -1.60373355e-01
  -4.72247658e-02  1.07677122e-02  1.87903329e-01  8.19771813e-01
   5.09528969e-01  3.98711552e-02  2.67729697e-01  3.47047585e-01
   2.60498922e-01  3.64605177e-01  7.25019558e-01  1.96728251e-01
  -3.15395700e+00 -4.03133784e-01 -1.25451045e+01 -6.16581305e-02
  -1.56114612e+00 -5.51429725e-02 -3.00815305e-02  4.07263522e-01
  -3.68156440e-01 -1.43611777e+00 -5.87180486e-01  4.44294911e-01
   4.23159437e-02 -1.56897094e-01 -4.55330850e-01 -1.02250295e-01
  -3.54273293e+00 -1.72944491e+00 -4.37529284e-01 -1.05999941e+00
  -9.18599334e-01 -1.75490331e+00 -1.67475860e-01 -9.56875228e-01
  -3.65653126e-01 -1.36535504e-01 -6.58692477e-02  2.06714026e-01
   1.70694383e+00  1.21460315e+00 -3.35269845e-01  1.56141