In [6]:
import pandas as pd
import numpy as np

import urllib2

import matplotlib.pyplot as plt
import matplotlib as mpl
%matplotlib inline

from sklearn import cross_validation
from sklearn.cross_validation import KFold
from sklearn.grid_search import GridSearchCV
from sklearn import preprocessing
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from sklearn.cross_validation import train_test_split
from itertools import combinations
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.svm import SVR

---
Revisiting the Baseball Dataset with Support Vector Machines
=====
***

##### A whole bunch of preprocessing we have already seen in the Baseball Homework

In [7]:
# Load the Hitters data and discard, in place, every row that contains a missing value.
# NOTE(review): absolute, machine-specific path - confirm the location before running elsewhere.
df = pd.read_csv('/Users/mrgholt/GADS-22-NYC/Datasets/Hitters.csv')
df.dropna(inplace=True)

# Swap each of the three categorical columns for the integer codes returned by pd.factorize.
df['League'] = pd.factorize(df['League'])[0]
df['Division'] = pd.factorize(df['Division'])[0]
df['NewLeague'] = pd.factorize(df['NewLeague'])[0]

# Every column except the target, 'Salary', is a candidate predictor.
predictors = df.columns.values.tolist()
predictors.remove('Salary')

# Standardise every column (the target included) and wrap the result in a DataFrame
# that carries the original column names.
scaler = preprocessing.StandardScaler(copy=True, with_mean=True, with_std=True)
scaler.fit(df)
df_scaled = pd.DataFrame(scaler.transform(df), columns=df.columns)

---
## A brute force approach
### But limit the number of features for which we produce combinations!
---

In [8]:
#Introduce a new parameter 'max_num_features_to_try' to limit the number of features this function
#passes as an argument to combinations

def brute_force(data, target_variable, predictors, model, alpha_list = [1.0], degree_list = [3],
                max_num_features_to_try = 3):
    '''Exhaustively search predictor subsets, polynomial degrees and regularization values.

    For every combination of 1 to max_num_features_to_try predictors, every degree in
    degree_list and every alpha in alpha_list, a PolynomialFeatures(degree) + model(alpha=...)
    pipeline is fitted on the training half of a fixed 50/50 split (random_state=42) and
    scored by mean squared error on the other half. The lowest-MSE configuration wins.

    Parameters
    ----------
    data : object indexable by a list of column names and by target_variable
    target_variable : name of the column to predict
    predictors : iterable of candidate predictor column names
    model : estimator class; it is instantiated as model(alpha=a)
    alpha_list : regularization values to try
    degree_list : polynomial degrees to try
    max_num_features_to_try : largest predictor-subset size to enumerate

    Returns
    -------
    tuple
        (min_mse, min_clf, min_predictors, min_degree, min_alpha,
         min_X_train, min_y_train, min_X_test, min_y_test, input_dimension)
        where input_dimension is the number of columns PolynomialFeatures produces
        for the winning predictors and degree.

    Raises
    ------
    ValueError
        If no candidate produced an MSE below the initial sentinel (for example when
        predictors, alpha_list or degree_list is empty, or max_num_features_to_try < 1).
    '''
    import sys  # local import: only needed for the version-neutral progress indicator

    min_mse = 1e99
    test_size_split = 0.5
    best = None

    #search over every combination of the predictors - using the itertools functionality
    for subset_size in range(1, max_num_features_to_try + 1):
        #progress indicator: one dot per subset size (written directly so that the same
        #code is valid on both Python 2 and Python 3)
        sys.stdout.write('. ')
        sys.stdout.flush()

        #build and test a model for each combination of predictors
        for combo in combinations(predictors, subset_size):
            test_predictors = list(combo)

            #the split depends only on the chosen predictors, so do it once per combination
            X_train, X_test, y_train, y_test = train_test_split(
                data[test_predictors], data[target_variable],
                test_size=test_size_split, random_state=42)

            for degree in degree_list:
                for a in alpha_list:
                    #all parameters are determined using the training data only
                    clf = make_pipeline(PolynomialFeatures(degree), model(alpha = a))
                    clf.fit(X_train, y_train)

                    #score on the held-out half
                    mse = mean_squared_error(y_test, clf.predict(X_test))

                    #remember ALL information for the minimum
                    if mse < min_mse:
                        min_mse = mse
                        #the expanded feature count is only needed for the winner, so it is
                        #computed here rather than for every alpha
                        input_dimension = PolynomialFeatures(degree).fit_transform(X_train).shape[1]
                        best = (mse, clf, test_predictors, degree, a,
                                X_train, y_train, X_test, y_test, input_dimension)

    if best is None:
        raise ValueError("brute_force evaluated no model: check predictors, alpha_list, "
                         "degree_list and max_num_features_to_try")

    #return a tuple for the minimum model and parameters
    return best

def print_essential_results(results):
    '''Print the headline fields of a results tuple, one per line.

    Reads results[0] (MSE), results[2] (best predictors), results[9] (input dimension),
    results[3] (reported as the optimal polynomial degree) and results[4] (reported as the
    optimal regularization value).

    Each line is passed to print as one pre-built string, so the statement is valid and
    produces the same text on both Python 2 and Python 3.
    '''
    print("MSE = {:5.7f}".format(results[0]))
    print("Best predictors =  " + str(results[2]))
    print("Input dimension =  " + str(results[9]))
    print("Optimal degree polynomial =  " + str(results[3]))
    print("Optimal regularization value =  " + str(results[4]))

def get_degree_v_mse(results, model, degree_list=[3], ylog=1.0, ylim_low = 0.0, ylim_high=1.0):
    '''Plot training and validation MSE against polynomial degree.

    For each degree in degree_list a PolynomialFeatures(degree) + model(alpha=results[4])
    pipeline is fitted on the training data stored in results (results[5], results[6]) and
    its MSE is measured on that training data and on the test data (results[7], results[8]).

    Parameters
    ----------
    results : tuple laid out as (mse, clf, predictors, degree, alpha,
              X_train, y_train, X_test, y_test, input_dimension)
    model : estimator class; it is instantiated as model(alpha=results[4])
    degree_list : polynomial degrees to evaluate
    ylog : if truthy, use a logarithmic y axis
    ylim_low, ylim_high : y-axis limits passed to set_ylim

    Returns
    -------
    None. A new 5x5 figure is created as a side effect.
    '''
    #use the regularization parameter found in the brute force routine
    model_alpha = results[4]
    X_train, y_train, X_test, y_test = results[5], results[6], results[7], results[8]

    #accumulate the MSE's
    training_error = []
    testing_error = []

    for degree in degree_list:
        #same kind of pipeline as the search, with only the degree varying
        clf = make_pipeline(PolynomialFeatures(degree), model(alpha=model_alpha))
        clf.fit(X_train, y_train)

        training_error.append(mean_squared_error(y_train, clf.predict(X_train)))
        testing_error.append(mean_squared_error(y_test, clf.predict(X_test)))

    #Plot the results
    _, ax = plt.subplots(figsize=(5, 5))
    degrees = np.array(degree_list)
    ax.plot(degrees, np.array(training_error), color = 'green', marker = 'o', label='Training Error')
    ax.plot(degrees, np.array(testing_error), color = 'red', marker = 'o', label='Validation Error')
    ax.set_title("Plot of MSE vs Polynomial Degree")
    if ylog:
        #scale names are lowercase
        ax.set_yscale('log')
        ax.set_ylabel('Log(MSE)')
    else:
        ax.set_ylabel('MSE')

    # NOTE(review): with the default ylim_low=0.0 the lower limit is non-positive, which a
    # log axis cannot show - confirm how the installed matplotlib handles it.
    ax.set_ylim(ylim_low, ylim_high)
    ax.set_xlabel('Degree')
    ax.legend(loc='best')
    
def get_alpha_v_mse(results, model, alpha_list=[0.1], ylog=1.0, ylim_low = 0.0,ylim_high=1.0):
    '''Plot training and validation MSE against the regularization value.

    For each alpha in alpha_list a PolynomialFeatures(results[3]) + model(alpha=alpha)
    pipeline is fitted on the training data stored in results (results[5], results[6]) and
    its MSE is measured on that training data and on the test data (results[7], results[8]).

    Parameters
    ----------
    results : tuple laid out as (mse, clf, predictors, degree, alpha,
              X_train, y_train, X_test, y_test, input_dimension)
    model : estimator class; it is instantiated as model(alpha=a)
    alpha_list : regularization values to evaluate (shown on a logarithmic x axis)
    ylog : if truthy, use a logarithmic y axis
    ylim_low, ylim_high : y-axis limits passed to set_ylim

    Returns
    -------
    None. A new 5x5 figure is created as a side effect.
    '''
    #use the polynomial degree found in the brute force routine
    best_degree = results[3]
    X_train, y_train, X_test, y_test = results[5], results[6], results[7], results[8]

    #accumulate the MSE's
    training_error = []
    testing_error = []

    for a in alpha_list:
        #same kind of pipeline as the search, with only alpha varying
        clf = make_pipeline(PolynomialFeatures(best_degree), model(alpha=a))
        clf.fit(X_train, y_train)

        training_error.append(mean_squared_error(y_train, clf.predict(X_train)))
        testing_error.append(mean_squared_error(y_test, clf.predict(X_test)))

    #plot the results
    _, ax = plt.subplots(figsize=(5, 5))
    alphas = np.array(alpha_list)
    ax.plot(alphas, np.array(training_error), color = 'green', marker = 'o', label='Training Error')
    ax.plot(alphas, np.array(testing_error), color = 'red', marker = 'o', label='Validation Error')
    #the figure shows MSE as a function of alpha, so the title says so
    ax.set_title("Plot of MSE vs Regularization")
    if ylog:
        #scale names are lowercase
        ax.set_yscale('log')
        ax.set_ylabel('Log(MSE)')
    else:
        ax.set_ylabel('MSE')

    # NOTE(review): with the default ylim_low=0.0 the lower limit is non-positive, which a
    # log axis cannot show - confirm how the installed matplotlib handles it.
    ax.set_ylim(ylim_low, ylim_high)
    ax.set_xscale('log')
    ax.set_xlabel('Log Alpha')
    ax.legend(loc='best')
    
def plot_results(results):
    '''Plot the test-set predictions (red) against the actual, known values (blue).

    Three stacked panels are drawn in one 30x15 figure:
      1. the first half of the test points,
      2. the second half of the test points,
      3. the first 100 test points, with a grey band of +/- sqrt(results[0])
         (the root of the stored MSE) around the predictions.

    Parameters
    ----------
    results : tuple in which results[0] is the MSE, results[1] a fitted estimator with a
              predict method, results[7] the test inputs and results[8] the test targets.

    Returns
    -------
    None. Switches the global matplotlib style to 'ggplot' and creates a new figure.
    '''
    mpl.style.use('ggplot')

    #Convert both series to plain arrays so that they are sliced and plotted by position;
    #the actual values may arrive with an index of their own, the predictions do not.
    y_actual = np.asarray(results[8])
    #predict once and reuse the result in every panel
    y_predicted = np.asarray(results[1].predict(results[7]))

    n_points = len(y_actual)
    #floor division keeps the slice bound an integer on Python 3 as well as Python 2
    half_points = n_points // 2

    plt.figure(figsize=(30, 15))
    ax = plt.subplot(311)
    ax.plot(y_actual[:half_points], color = 'blue', marker='o')
    ax.plot(y_predicted[:half_points], color="red", marker='o')

    ax = plt.subplot(312)
    ax.plot(y_actual[half_points:], color = 'blue', marker='o')
    ax.plot(y_predicted[half_points:], color="red", marker='o')

    max_points_to_display = 100
    #band of one root-mean-squared-error either side of the prediction
    rmse = np.sqrt(results[0])
    thigh = y_predicted + rmse
    tlow = y_predicted - rmse
    x_plot = np.arange(0, n_points)

    ax = plt.subplot(313)
    ax.plot(y_actual[:max_points_to_display], color = 'blue', marker='.')
    ax.plot(y_predicted[:max_points_to_display], color="red", marker='.', alpha=0.35)
    ax.fill_between(x_plot[:max_points_to_display], thigh[:max_points_to_display],
                    tlow[:max_points_to_display], color='k', alpha=.25)

In [30]:
# Preview a grid of 10 logarithmically spaced values running from 1e-10 up to 1e4.
np.logspace(-10, 4, num=10)

array([  1.00000000e-10,   3.59381366e-09,   1.29154967e-07,
         4.64158883e-06,   1.66810054e-04,   5.99484250e-03,
         2.15443469e-01,   7.74263683e+00,   2.78255940e+02,
         1.00000000e+04])

---
# Try Ridge
---

In [23]:
#Let's call our brute force function and see if we can find a good model
#This may take an hour or two to run with the parameters below!
#ridge_results = brute_force(df_scaled, 
#                        'Salary', 
#                        predictors, 
#                        Ridge, 
#                        alpha_list=np.logspace(-10, 4, 10), 
#                        degree_list = [2, 3, 4],
#                        max_num_features_to_try = 5)

. . . . .


In [24]:
#print_essential_results(ridge_results)

MSE = 0.3729795
Best predictors =  ['AtBat', 'Walks', 'Years', 'CRBI', 'NewLeague']
Input dimension =  126
Optimal degree polynomial =  4
Optimal regularization value =  7.74263682681


In [26]:
# Correlation between the stored test targets and the best Ridge pipeline's predictions for them.
# NOTE(review): ridge_results is only assigned by the brute_force call that is commented out
# above - confirm it is defined before running this cell.
pd.DataFrame(dict(actual=ridge_results[8],
                  predicted=ridge_results[1].predict(ridge_results[7]))).corr()

Unnamed: 0,actual,predicted
actual,1.0,0.851834
predicted,0.851834,1.0


---
# Try Lasso
---

In [27]:
#Let's call our brute force function and see if we can find a good model
#This may take an hour or two to run with the parameters below!
#lasso_results = brute_force(df_scaled, 
#                        'Salary', 
#                        predictors, 
#                        Lasso, 
#                        alpha_list=np.logspace(-10, 4, 10), 
#                        degree_list = [2,  3, 4],
#                        max_num_features_to_try = 5)

. . . . .


In [28]:
#print_essential_results(lasso_results)

MSE = 0.3470978
Best predictors =  ['AtBat', 'Walks', 'Years', 'CHits', 'CRBI']
Input dimension =  126
Optimal degree polynomial =  4
Optimal regularization value =  0.00599484250319


In [34]:
# Correlation between the stored test targets and the best Lasso pipeline's predictions for them.
# NOTE(review): lasso_results is only assigned by the brute_force call that is commented out
# above - confirm it is defined before running this cell.
pd.DataFrame(dict(actual=lasso_results[8],
                  predicted=lasso_results[1].predict(lasso_results[7]))).corr()

Unnamed: 0,actual,predicted
actual,1.0,0.839877
predicted,0.839877,1.0


In [9]:
#Introduce a new parameter 'max_num_features_to_try' to limit the number of features this function
#passes as an argument to combinations

def brute_force_SVR(data, target_variable, predictors, C_list = [1.0], gamma_list = [1.0],
                max_num_features_to_try = 3):
    '''Exhaustively search predictor subsets, C values and gamma values for an SVR model.

    For every combination of 1 to max_num_features_to_try predictors, every C in C_list and
    every gamma in gamma_list, an SVR(C=..., gamma=...) is fitted on the training half of a
    fixed 50/50 split (random_state=42) and scored by mean squared error on the other half.
    The lowest-MSE configuration wins.

    Parameters
    ----------
    data : object indexable by a list of column names and by target_variable
    target_variable : name of the column to predict
    predictors : iterable of candidate predictor column names
    C_list : values of the SVR C parameter to try
    gamma_list : values of the SVR gamma parameter to try
    max_num_features_to_try : largest predictor-subset size to enumerate

    Returns
    -------
    tuple
        (min_mse, min_clf, min_predictors, min_C, min_gamma,
         min_X_train, min_y_train, min_X_test, min_y_test, input_dimension)
        where input_dimension is the number of predictor columns of the winning model.

    Raises
    ------
    ValueError
        If no candidate produced an MSE below the initial sentinel (for example when
        predictors, C_list or gamma_list is empty, or max_num_features_to_try < 1).
    '''
    import sys  # local import: only needed for the version-neutral progress indicator

    min_mse = 1e99
    test_size_split = 0.5
    best = None

    #search over every combination of the predictors - using the itertools functionality
    for subset_size in range(1, max_num_features_to_try + 1):
        #progress indicator: one dot per subset size (written directly so that the same
        #code is valid on both Python 2 and Python 3)
        sys.stdout.write('. ')
        sys.stdout.flush()

        #build and test a model for each combination of predictors
        for combo in combinations(predictors, subset_size):
            test_predictors = list(combo)

            #the split depends only on the chosen predictors, so do it once per combination
            X_train, X_test, y_train, y_test = train_test_split(
                data[test_predictors], data[target_variable],
                test_size=test_size_split, random_state=42)

            #grid over the two SVR hyper-parameters
            for C_param in C_list:
                for gamma_param in gamma_list:
                    #fit on the training half only
                    clf = SVR(C=C_param, gamma=gamma_param)
                    clf.fit(X_train, y_train)

                    #measure the mean squared error of the test set
                    mse = mean_squared_error(y_test, clf.predict(X_test))

                    #remember ALL information for the minimum
                    if mse < min_mse:
                        min_mse = mse
                        best = (mse, clf, test_predictors, C_param, gamma_param,
                                X_train, y_train, X_test, y_test, X_train.shape[1])

    if best is None:
        raise ValueError("brute_force_SVR evaluated no model: check predictors, C_list, "
                         "gamma_list and max_num_features_to_try")

    #return a tuple for the minimum model and parameters
    return best

In [None]:
#svr_results = brute_force_SVR(df_scaled, 
#                            'Salary', 
#                            predictors,  
#                            C_list = [100.0, 500.0, 750.0], 
#                            gamma_list = [0.025, 0.05, 0.1],
#                            max_num_features_to_try = 5)

. . 

In [51]:
#print_essential_results(svr_results)

MSE = 0.3671368
Best predictors =  ['AtBat', 'Walks', 'Years', 'CRBI', 'League']
Input dimension =  5
Optimal degree polynomial =  500.0
Optimal regularization value =  0.05


In [52]:
#pd.DataFrame({'actual' : svr_results[8], 'predicted' : svr_results[1].predict(svr_results[7])}).corr()

Unnamed: 0,actual,predicted
actual,1.0,0.832005
predicted,0.832005,1.0
