In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold

In [2]:
def pad_ones_column(X):
    """Prepend a column of ones to a data matrix.

    The leading column of ones lets a linear model fit an intercept term.

    Parameters
    ----------
    X : array-like
        Data with one observation per row (``len(X)`` rows).

    Returns
    -------
    The column-wise concatenation of a ones column and ``X``; the result
    has one more column than ``X`` and the same number of rows.
    """
    # Size the ones column from X itself; the previous version read the row
    # count from a global (ORIGINAL_DF) that this notebook never defines.
    return np.c_[np.ones(len(X)), X]

def estimate_beta(X, y):
    """Compute the least-squares estimate of beta given X and y.

    Mathematically this is

        Beta = (X^T X)^-1 X^T y

    but the inverse is never formed explicitly: the normal equations
    ``(X^T X) Beta = X^T y`` are solved directly, which is more accurate
    and cheaper than inverting and multiplying.

    Parameters
    ----------
    X : 2-D array or matrix of regressors, one observation per row.
    y : targets, with as many rows/entries as X has rows.

    Returns
    -------
    The coefficient estimate, with one row per column of X.

    Raises
    ------
    numpy.linalg.LinAlgError
        If ``X^T X`` is singular (e.g. perfectly collinear columns).
    """
    gram = np.matmul(X.T, X)      # X^T X
    moment = np.matmul(X.T, y)    # X^T y
    return np.linalg.solve(gram, moment)

def run_complete_linear_regression(X, y, beta_hat):
    """Score a fitted linear model against observed targets.

    Predictions are formed as ``X`` times ``beta_hat``; the squared
    differences between ``y`` and those predictions are then summarised.

    Returns
    -------
    (MSE, SSE) : the mean and the sum of the squared prediction errors.
    """
    # Residuals between the observed targets and the model's predictions.
    residuals = y - np.matmul(X, beta_hat)
    squared_residuals = np.square(residuals)

    return np.mean(squared_residuals), np.sum(squared_residuals)

def run_k_fold(X, y, n_splits=10, print_stuff=False):
    """Run k-fold cross validation of the linear regression.

    For each of the ``n_splits`` folds produced by ``KFold``, beta is
    estimated on the training rows and scored on the held-out rows.

    Parameters
    ----------
    X, y : data and targets; both are indexed with the fold index arrays.
    n_splits : number of folds handed to ``KFold``.
    print_stuff : when true, print per-fold and average MSE/SSE.

    Returns
    -------
    The mean of the per-fold test MSE values.
    """
    fold_mse = []
    fold_sse = []
    splitter = KFold(n_splits=n_splits)

    for fold_no, (train_idx, test_idx) in enumerate(splitter.split(X)):
        # Carve out this fold's training and held-out rows.
        X_fit, X_held = X[train_idx], X[test_idx]
        y_fit, y_held = y[train_idx], y[test_idx]

        # Fit on the training rows, score on the held-out rows.
        coefficients = estimate_beta(X_fit, y_fit)
        mse, sse = run_complete_linear_regression(X_held, y_held, coefficients)

        fold_mse.append(mse)
        fold_sse.append(sse)

        if print_stuff:
            print("Iteration:", fold_no, "MSE: ", mse, "SSE:", sse)

    average_mse = np.mean(fold_mse)
    if print_stuff:
        print("Average MSE:", average_mse, "Average SSE:", np.mean(fold_sse))

    return average_mse

In [3]:
from sklearn import datasets

In [4]:
# Load the diabetes dataset bundled with scikit-learn.
diabetes = datasets.load_diabetes()
# Plain ndarrays are used instead of np.matrix, which NumPy no longer
# recommends; the shapes are unchanged (y remains a single column).
X = np.asarray(diabetes.data)
y = np.asarray(diabetes.target).reshape(-1, 1)

In [5]:
# 10-fold cross validation with per-fold statistics printed.
run_k_fold(X, y, n_splits=10, print_stuff=True)

Iteration: 0 MSE:  27186.2494606 SSE: 1223381.22573
Iteration: 1 MSE:  26253.4576563 SSE: 1181405.59453
Iteration: 2 MSE:  29313.8384269 SSE: 1289808.89078
Iteration: 3 MSE:  27200.3084226 SSE: 1196813.57059
Iteration: 4 MSE:  25036.3190957 SSE: 1101598.04021
Iteration: 5 MSE:  27302.712211 SSE: 1201319.33728
Iteration: 6 MSE:  30986.7821607 SSE: 1363418.41507
Iteration: 7 MSE:  25838.6890476 SSE: 1136902.3181
Iteration: 8 MSE:  29982.644804 SSE: 1319236.37137
Iteration: 9 MSE:  24756.7550612 SSE: 1089297.22269
Average MSE: 27385.7756347 Average SSE: 1210318.09864


27385.775634650414

In [31]:
# Read the beer data from a CSV file given by a relative path
# (resolved against the current working directory).
test_data = pd.read_csv("fixed_beer_data.csv")

In [32]:
# Display the column labels of the loaded table.
test_data.columns

Index(['beer_style', 'beer_name', 'avg_palate', 'avg_aroma', 'avg_overall',
       'avg_taste', 'avg_appear', 'count', 'review_palate', 'review_aroma',
       'review_taste', 'review_appearance', 'review_overall',
       'review_profilename', 'brewery_id'],
      dtype='object')

In [33]:
# Regression target: the 'review_overall' column as a single-column 2-D
# ndarray (plain array instead of np.matrix, which NumPy no longer recommends).
y = np.asarray(test_data[['review_overall']])

In [34]:
# Columns excluded from the regressors; every remaining column of
# test_data is used as a feature.
non_feature_columns = [
    'beer_style',
    'beer_name',
    'count',
    'review_palate',
    'review_aroma',
    'review_taste',
    'review_appearance',
    'review_overall',
    'review_profilename',
    'brewery_id'
]
# Plain ndarray instead of np.matrix, which NumPy no longer recommends.
X = np.asarray(test_data.drop(non_feature_columns, axis=1))

In [26]:
# 10-fold cross validation with per-fold statistics printed.
run_k_fold(X, y, n_splits=10, print_stuff=True)

Iteration: 0 MSE:  0.380749415182 SSE: 1427.81030693
Iteration: 1 MSE:  0.389149900631 SSE: 1459.31212737
Iteration: 2 MSE:  0.415899796538 SSE: 1559.62423702
Iteration: 3 MSE:  0.382379156874 SSE: 1433.92183828
Iteration: 4 MSE:  0.385648735633 SSE: 1446.18275862
Iteration: 5 MSE:  0.383382955534 SSE: 1437.68608325
Iteration: 6 MSE:  0.3918267125 SSE: 1469.35017187
Iteration: 7 MSE:  0.375169973281 SSE: 1406.8873998
Iteration: 8 MSE:  0.373794174049 SSE: 1401.72815268
Iteration: 9 MSE:  0.367544124607 SSE: 1378.29046727
Average MSE: 0.384554494483 Average SSE: 1442.07935431


0.384554494482787

In [30]:
# Write the table back out to a CSV file given by a relative path.
# NOTE(review): to_csv also writes the row index as an extra leading column
# unless index=False is passed — confirm that is intended for this file.
test_data.to_csv("test_dat.csv")