In [14]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Load the training data into feature matrix, class labels, and event ids:

In [15]:
from proj1_helpers import *
# Relative path to the training CSV; the file must be downloaded beforehand.
DATA_TRAIN_PATH = '../../data/train.csv' # adjust this path if train.csv is stored elsewhere
# Per the section heading: y holds the class labels, tX the feature matrix and
# ids the event ids (presumably in that order -- confirm against load_csv_data).
y, tX, ids = load_csv_data(DATA_TRAIN_PATH)

## Do your crazy machine learning thing here :) ...

In [62]:
from franksun_function import *
from costs import *
from numpy import float64 as f64
from helpers import *


# --- Baseline: least squares fitted on the complete training set -----------
# (y, tX) is the full training data loaded above; nothing is held out here, so
# `train_err` is an in-sample error only.
optimal_weight = least_squares(y, tX)
train_err = compute_loss(y, tX, optimal_weight)
print("optimal weight: ", optimal_weight)
print("train error: ", train_err)
print("train error in root sqrt: ", np.sqrt(2 * train_err))

# --- Repeated train/test splits (bias-variance style experiment) -----------
# Every (ratio, seed) pair produces one split of the training data.
ratios = np.arange(0.1, 0.9, 0.1, dtype=f64)
seeds = np.array([1, 6, 68, 20, 42])

# Largest split ratio that takes part in the min/max summary printed below.
MAX_SUMMARY_RATIO = 0.7

# NOTE(review): all len(ratios) * len(seeds) splits stay referenced by this list
# for the lifetime of the kernel; if split_data returns copies this multiplies
# the memory footprint of the data set -- confirm this is acceptable.
data_split = []
for ratio in ratios:
    for seed in seeds:
        data_split.append(((ratio, seed), (split_data(tX, y, ratio, seed))))

# Fit least squares on each split and record (weight, train error, test error)
# keyed by the (ratio, seed) pair that generated the split.
stat_dict = dict()
for data in data_split:
    (ratio, seed), (x_train, y_train, x_test, y_test) = data
    weight = least_squares(y_train, x_train)
    train_err = compute_loss(y_train, x_train, weight)
    test_err = compute_loss(y_test, x_test, weight)

    stat_dict[(ratio, seed)] = (weight, train_err, test_err)

# Keep only the splits whose ratio is at most MAX_SUMMARY_RATIO.
# The ratios come from np.arange with a float step, so the entry meant to be 0.7
# is not bit-for-bit equal to the literal 0.7; np.isclose keeps it in the summary.
# Filtering (instead of substituting sentinel values such as 0 or 10) guarantees
# the reported extrema are always real, measured errors.
summary_errs = [
    (split_train_err, split_test_err)
    for (split_ratio, _seed), (_weight, split_train_err, split_test_err) in stat_dict.items()
    if split_ratio < MAX_SUMMARY_RATIO or np.isclose(split_ratio, MAX_SUMMARY_RATIO)
]
summary_train_errs = [split_train_err for split_train_err, _split_test_err in summary_errs]
summary_test_errs = [split_test_err for _split_train_err, split_test_err in summary_errs]

max_train_err = np.max(summary_train_errs)
min_train_err = np.min(summary_train_errs)
max_test_err = np.max(summary_test_errs)
min_test_err = np.min(summary_test_errs)

# One weight vector per split, in stat_dict insertion order.
weights = [split_weight for (split_weight, _train_err, _test_err) in stat_dict.values()]

# Element-wise mean over the splits (axis=0): one averaged coefficient per
# feature. Without `axis` the whole stack would collapse into a single scalar.
average_weight = np.average(weights, axis=0)

print(max_test_err)
print(min_test_err)
print(max_train_err)
print(min_train_err)
    


optimal weight:  [  8.03494350e-05  -7.20202266e-03  -6.05417274e-03  -5.47559077e-04
  -1.93874701e-02   4.73451615e-04  -2.60379061e-02   3.25106299e-01
  -3.80780002e-05  -2.72787186e+00  -2.21220141e-01   9.50794097e-02
   6.40351625e-02   2.73613154e+00  -3.31801108e-04  -9.54325151e-04
   2.74088828e+00  -5.34165287e-04   9.73498892e-04   3.69225050e-03
   3.54487160e-04  -5.43344617e-04  -3.30448034e-01  -1.40800496e-03
   8.31432873e-04   1.02117271e-03  -1.68047418e-03  -5.83664769e-03
  -1.11088005e-02   2.72833179e+00]
train error:  0.339686809477
train error in root sqrt:  0.824241238324
0.341702726908
0.33917265009
0.341919243822
0.336025433365


## Generate predictions and save output in CSV format for submission:

In [63]:
# Relative path to the test CSV; the file must be downloaded beforehand.
DATA_TEST_PATH = '../../data/test.csv'
# The label column gets a real name because it is used below. `_` is reserved
# for throwaway values and, in IPython, for the previous cell output.
y_test_csv, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)

# NOTE(review): if test.csv does not ship real labels, the "test error" values
# printed here are measured against placeholder labels and are not meaningful --
# confirm what load_csv_data returns for the label column of this file.

# Evaluate the full-data solution first, then every per-split weight vector,
# using one code path instead of a copy-pasted block.
for weight in [optimal_weight] + list(weights):
    test_err = compute_loss(y_test_csv, tX_test, weight)
    print("weight: ", weight)
    print("test error: ", test_err)
    print("test error in root sqrt: ", np.sqrt(2 * test_err))

weight:  [  8.03494350e-05  -7.20202266e-03  -6.05417274e-03  -5.47559077e-04
  -1.93874701e-02   4.73451615e-04  -2.60379061e-02   3.25106299e-01
  -3.80780002e-05  -2.72787186e+00  -2.21220141e-01   9.50794097e-02
   6.40351625e-02   2.73613154e+00  -3.31801108e-04  -9.54325151e-04
   2.74088828e+00  -5.34165287e-04   9.73498892e-04   3.69225050e-03
   3.54487160e-04  -5.43344617e-04  -3.30448034e-01  -1.40800496e-03
   8.31432873e-04   1.02117271e-03  -1.68047418e-03  -5.83664769e-03
  -1.11088005e-02   2.72833179e+00]
test error:  0.97275879473
test error in root sqrt:  1.39481812057
weight:  [  7.78337889e-05  -7.11211729e-03  -6.13485000e-03  -5.91749429e-04
  -1.85697406e-02   4.74066467e-04  -2.58630320e-02   3.29037018e-01
  -2.47459926e-04  -1.84895951e+00  -2.21742587e-01   9.62865591e-02
   6.33782407e-02   1.85744954e+00  -1.05518986e-03  -5.60180973e-04
   1.86213022e+00  -8.40036419e-04  -1.65184987e-04   3.62360956e-03
  -8.93313375e-04  -5.11908638e-04  -3.42211923e-01

In [65]:
# Destination of the submission file.
OUTPUT_PATH = '../../data/least_squares_sample_output.csv'
# Predict with the least-squares solution fitted on the full training set.
# `optimal_weight` is passed directly: rebinding `weights` here would replace
# the list of per-split weight vectors that the evaluation cell iterates over,
# so re-running that cell afterwards would loop over single coefficients.
y_pred = predict_labels(optimal_weight, tX_test)
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)