In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn import linear_model
from __future__ import division

In [2]:
df = pd.read_csv('hw2_college.csv')
df.rename(columns={'Unnamed: 0':'School'}, inplace=True)
df = df[['Apps','Private',  'Accept', 'Enroll', 'Top10perc', 'Top25perc', 'F.Undergrad', 'P.Undergrad', 'Outstate', 'Room.Board', 'Books', 'Personal', 'PhD', 'Terminal', 'S.F.Ratio', 'perc.alumni', 'Expend', 'Grad.Rate']]
df = df.replace(['Yes', 'No'], [1, 0], )
keys = ['Private',  'Accept', 'Enroll', 'Top10perc', 'Top25perc', 'F.Undergrad', 'P.Undergrad', 'Outstate', 'Room.Board', 'Books', 'Personal', 'PhD', 'Terminal', 'S.F.Ratio', 'perc.alumni', 'Expend', 'Grad.Rate']
df[keys] = (df[keys] - df[keys].mean()) / (df[keys].max() - df[keys].min())
#df = (df - df.mean()) / (df.max() - df.min())
msk = np.random.rand(len(df)) < 0.8
dtrain = df[msk]
dtest = df[~msk]
ytrain = np.array(dtrain['Apps'])
xtrain = np.array(dtrain.drop(['Apps'], axis=1))
ytest = np.array(dtest['Apps'])
xtest = np.array(dtest.drop(['Apps'], axis=1))
valid = 10

In [3]:
def linear_reg(xtrain, ytrain, xtest, ytest):
    #Linear regression 
    
    lstsq = linear_model.LinearRegression().fit(xtrain,ytrain)
    preds = np.dot(xtest,lstsq.coef_)
    err = mean_squared_error(ytest, preds)
    print 'mean squared error:', err
    print 'score:',lstsq.score(xtest,ytest)
    print '\n'

In [4]:
#ridge regression
def ridge_reg(xtrain, ytrain, xtest, ytest, valid,alphas):
    score=np.zeros((valid,len(alphas)))
    i = j = 0
    for a in alphas:
        kf = KFold(n_splits=valid)
        for train, test in kf.split(xtrain):
            kxtrain = xtrain[train]
            kytrain = ytrain[train]
            kxtest = xtrain[test]
            kytest = ytrain[test]
            clf = Ridge(alpha=a)
            clf.fit(kxtrain, kytrain)
            score[j][i] = clf.score(kxtest, kytest)
            j+=1
        i+=1
        j=0
    amax = alphas[ np.argmax(np.mean(score, axis=0))]

    clf = Ridge(alpha=amax)
    clf.fit(xtrain, ytrain)
    preds = clf.predict(xtest)
    err  = mean_squared_error(ytest, preds)
    print 'mean squared error', err , '\n'
    finalscore= clf.score(xtest, ytest)
    print 'cross validation scores:', '\n', score , '\n'
    print 'final score:', finalscore , '\n'
    print 'lambda:', amax, '\n'
    print 'coefficients:', clf.coef_
    print '\n'


In [5]:
def lasso_reg(xtrain, ytrain, xtest, ytest, valid,alphas):#lasso regression
    score=np.zeros((valid,len(alphas)))
    i = j = 0
    for a in alphas:
        kf = KFold(n_splits=valid)
        for train, test in kf.split(xtrain):
            kxtrain = xtrain[train]
            kytrain = ytrain[train]
            kxtest = xtrain[test]
            kytest = ytrain[test]
            clf = Lasso(alpha=a)
            clf.fit(kxtrain, kytrain)
            score[j][i] = clf.score(kxtest, kytest)
            j+=1
        i+=1
        j=0
    amax = alphas[ np.argmax(np.mean(score, axis=0))]
    clf = Lasso(alpha=amax)
    clf.fit(xtrain, ytrain)
    preds = clf.predict(xtest)
    err  = mean_squared_error(ytest, preds)
    print 'mean squared error', err , '\n'
    finalscore= clf.score(xtest, ytest)
    print 'cross validation scores:', '\n', score , '\n'
    print 'final score:', finalscore , '\n'
    print 'lambda:', amax, '\n'
    print 'coefficients:', clf.coef_
    print '\n'
    return clf.coef_



In [6]:
alphas = [0.00001,0.005,0.333]
print 'Linear Regression: \n'
linear_reg(xtrain, ytrain, xtest, ytest)
print 'Ridge Regression: \n'
ridge_reg(xtrain, ytrain, xtest, ytest, valid,alphas)
alphas = [5,8,10]
print 'Lasso Regression: \n'
lasso_reg(xtrain, ytrain, xtest, ytest, valid,alphas)

Linear Regression: 

mean squared error: 10089522.6243
score: 0.813739595358


Ridge Regression: 

mean squared error 1270343.46741 

cross validation scores: 
[[ 0.91813855  0.91827209  0.91071545]
 [ 0.87101665  0.87220819  0.90858833]
 [ 0.93631776  0.93685907  0.94496346]
 [ 0.89057278  0.89136794  0.9051158 ]
 [ 0.92905985  0.92873536  0.91220836]
 [ 0.90542681  0.90561364  0.91237731]
 [ 0.91173516  0.90924559  0.79718915]
 [ 0.93241621  0.93256529  0.91455448]
 [ 0.9046983   0.90573583  0.9274031 ]
 [ 0.91153628  0.91212639  0.92808486]] 

final score: 0.814664352464 

lambda: 0.005 

coefficients: [ -5.68631824e+02   4.19413403e+04  -7.54131181e+03   4.32574798e+03
  -1.20874877e+03   3.18131615e+03   1.33736800e+03  -1.81012841e+03
   6.21487775e+02  -1.76536043e+02  -9.16551852e+01  -8.71656765e+02
  -4.59034088e+02   1.05124022e+03  -1.31981355e+01   6.75291053e+03
   1.36483334e+03]


Lasso Regression: 

mean squared error 1051886.31593 

cross validation scores: 
[[ 0.9186

array([  -592.33976821,  36021.34372305,    -67.57958828,   2737.77281983,
           -0.        ,     -0.        ,      0.        ,   -753.58588806,
          592.39148324,      0.        ,      0.        ,   -388.65892283,
         -477.76955501,      0.        ,   -119.24508849,   4552.7477751 ,
          603.0093137 ])

In [7]:
#bootstrap lasso regression
B = 20
matrix = []
alphas = [5,8,10]
#random sample from df with replacement
for i in range(B):
    rows = np.random.choice(len(dtrain), size=len(dtrain), replace=True, p=None) <0.8*(len(dtrain))
    dtrainB = dtrain[rows]
    ytrain = np.array(dtrainB['Apps'])
    xtrain = np.array(dtrainB.drop(['Apps'], axis=1))
    ytest = np.array(dtest['Apps'])
    xtest = np.array(dtest.drop(['Apps'], axis=1))
    valid = 10
    matrix.append(np.array(lasso_reg(xtrain, ytrain, xtest, ytest, valid,alphas)))


mean squared error 987803.183767 

cross validation scores: 
[[ 0.91392866  0.91449271  0.91473819]
 [ 0.88638472  0.88986743  0.88762958]
 [ 0.94995994  0.95065852  0.94989306]
 [ 0.93098883  0.92574977  0.9234795 ]
 [ 0.87330687  0.87124653  0.86777697]
 [ 0.93351338  0.93354499  0.93313844]
 [ 0.89400692  0.88900572  0.88415618]
 [ 0.92856918  0.9299023   0.93077335]
 [ 0.89277473  0.89135296  0.88891383]
 [ 0.92361299  0.91157652  0.90487321]] 

final score: 0.85588531968 

lambda: 5 

coefficients: [  -544.46597708  37301.13844106   -226.41548551   2711.86978091     -0.
     -0.              0.           -633.22744668    479.37164045      0.
     -0.           -744.43911625     -0.             -0.           -321.4103577
   3384.85702524    770.60215579]


mean squared error 996174.075054 

cross validation scores: 
[[ 0.91867335  0.91741658  0.91695905]
 [ 0.8952561   0.90442041  0.90456951]
 [ 0.95809377  0.96128991  0.96101268]
 [ 0.78647099  0.77551817  0.76679213]
 [ 0.9254992

In [8]:
matrix = pd.DataFrame(matrix)
non_zeros = matrix[matrix!=0].count(axis=0)
non_zeros = pd.DataFrame(non_zeros)
labels = pd.Series(df[keys].columns)
non_zeros = pd.concat((non_zeros, labels), axis=1)
print non_zeros

     0            0
0   20      Private
1   20       Accept
2   10       Enroll
3   20    Top10perc
4    6    Top25perc
5    4  F.Undergrad
6    2  P.Undergrad
7   18     Outstate
8   20   Room.Board
9    0        Books
10   1     Personal
11  20          PhD
12  19     Terminal
13   3    S.F.Ratio
14  13  perc.alumni
15  20       Expend
16  20    Grad.Rate
