In [1]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2

# Additionally added libraries

## Load the training data into feature matrix, class labels, and event ids:

In [2]:
from proj1_helpers import *
DATA_TRAIN_PATH = '../data/train.csv'
y, tX, ids = load_csv_data(DATA_TRAIN_PATH)
#### Remember to unpack data from .zip folders

Investigating loaded data

In [3]:
tX

array([[ 138.47 ,   51.655,   97.827, ...,    1.24 ,   -2.475,  113.497],
       [ 160.937,   68.768,  103.235, ..., -999.   , -999.   ,   46.226],
       [-999.   ,  162.172,  125.953, ..., -999.   , -999.   ,   44.251],
       ...,
       [ 105.457,   60.526,   75.839, ..., -999.   , -999.   ,   41.992],
       [  94.951,   19.362,   68.812, ..., -999.   , -999.   ,    0.   ],
       [-999.   ,   72.756,   70.831, ..., -999.   , -999.   ,    0.   ]])

In [4]:
tX.shape
print(tX[0:10,22])

[2. 1. 1. 0. 0. 3. 2. 1. 0. 1.]


In [5]:
# Setting -999 to NaN values
tX[tX==-999] = np.nan
tX
print(tX[0:10,22])

[2. 1. 1. 0. 0. 3. 2. 1. 0. 1.]


In [6]:
# Numerical exploratory data analysis
# row1: mean | row2: variance | row3: standard deviation | row4: minimum value | row5: maximum value | row6: number of NaN's
feature_details = np.zeros([7, tX.shape[1]])
for i in range(tX.shape[1]):
    feature_details[0, i] = np.nanmean(tX[:,i])
    feature_details[1, i] = np.nanvar(tX[:,i])
    feature_details[2, i] = np.nanstd(tX[:,i])
    feature_details[3, i] = np.nanmin(tX[:,i])
    feature_details[4, i] = np.nanmax(tX[:,i])
    feature_details[5, i] = np.isnan(tX[:,i]).sum()
    feature_details[6, i] = np.nanmedian(tX[:,i])
#.... I dont know how to represent this in a nice way without pandas.DataFrame, without doing a lot of coding...
#print(feature_details)

In [7]:
# Importing pandas just to make this numerical EDA nicer to view. 
# IMPORTANT!!! DO NOT USE IN PROJECT SUBMISSION
import pandas as pd
df = pd.DataFrame(feature_details)
df.index = ['Mean', 'Variance', 'Std', 'min', 'max', 'n-NaNs', 'median']
df.index.name = 'Statistics'
df

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
Statistics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Mean,121.858528,49.239819,81.181982,57.895962,2.403735,371.78336,-0.821688,2.3731,18.917332,158.432217,...,-0.010119,209.797178,0.979176,84.822105,-0.003275,-0.012393,57.679474,-0.011845,-0.001582,73.064591
Variance,3283.063262,1249.255942,1666.975303,4052.029594,3.035311,158162.573194,12.847474,0.612947,496.106539,13387.851528,...,3.284138,16002.060938,0.955358,3679.887218,3.184583,3.288345,1023.076126,4.127921,3.301261,9607.031571
Std,57.298021,35.344815,40.828609,63.655554,1.742214,397.696584,3.584337,0.78291,22.273449,115.705884,...,1.812219,126.499253,0.977424,60.662074,1.78454,1.813379,31.985561,2.031729,1.816937,98.015466
min,9.044,0.0,6.329,0.0,0.0,13.602,-18.066,0.208,0.0,46.104,...,-3.142,13.678,0.0,30.0,-4.499,-3.142,30.0,-4.5,-3.142,0.0
max,1192.026,690.075,1349.351,2834.999,8.503,4974.979,16.69,5.684,2834.999,1852.462,...,3.142,2003.976,3.0,1120.573,4.499,3.141,721.456,4.5,3.142,1633.433
n-NaNs,38114.0,0.0,0.0,0.0,177457.0,177457.0,177457.0,0.0,0.0,0.0,...,0.0,0.0,0.0,99913.0,99913.0,99913.0,177457.0,177457.0,177457.0,0.0
median,112.406,46.524,73.752,38.4675,2.107,225.885,-0.244,2.4915,12.3155,120.6645,...,-0.024,179.739,1.0,65.561,0.0,-0.033,47.902,-0.01,-0.002,40.5125


We have to standardize dataset - a wide variety of ranges can be observed in the dataset. Would lead to a biased learning algorithm

In [8]:
# store mean and variance
feature_mean = feature_details[0, :]
feature_std = feature_details[2, :]
feature_median = feature_details[6, :]

In [26]:
#Cleaning nans
from clean_nan import *

tx_ = clean_nan(tX, feature_median)

In [27]:
# create standardized dataset
#should be put into a method
for i in range(tx_.shape[1]):
    if (i!=22):
        tx_[:,i] = (tx_[:,i] - feature_mean[i])/feature_std[i]

print(tx_)

[[-1.64971287e-01 -1.43248771e+00 -2.03681582e+00 ...  1.57963171e-01
  -4.11010970e-01 -7.53001799e-01]
 [-1.64971287e-01 -7.68378412e-02 -1.81979789e-01 ...  9.08223761e-04
  -2.29898355e-04 -7.53073239e-01]
 [-1.64971287e-01 -7.68378412e-02 -1.81979789e-01 ...  9.08223761e-04
  -2.29898355e-04 -7.53075336e-01]
 ...
 [-1.64971287e-01 -7.68378412e-02 -1.81979789e-01 ...  9.08223761e-04
  -2.29898355e-04 -7.53077735e-01]
 [-1.64971287e-01 -7.68378412e-02 -1.81979789e-01 ...  9.08223761e-04
  -2.29898355e-04 -7.53122330e-01]
 [-1.64971287e-01 -7.68378412e-02 -1.81979789e-01 ...  9.08223761e-04
  -2.29898355e-04 -7.53122330e-01]]


In [28]:
tx_[0:10,22]

array([2., 1., 1., 0., 0., 3., 2., 1., 0., 1.])

In [29]:
# Double checking array
tx_.shape

(250000, 30)

In [30]:
# Saving arrays to .py file. Can easily be loaded with np.load('path' + 'filename')
####np.save('tX_cleaned', tX)
####np.save('tX_standardized', tX_standardized)
# Commented this section out to prevent overwriting of dataset

## Do your thing crazy machine learning thing here :) ...

### Linear regression using gradient descent

In [32]:
#from least_squares_GD import *
from costs import compute_mse
from least_squares_GD import *
from parameter_tuning import gamma_tuning_SGD
from parameter_tuning import gamma_tuning_GD

# Initialization of the weights
initial_w = np.zeros(tx_.shape[1])

# Define the parameters necessary for gradient descent: need to tune gamma. 
# we use the gamma_tuning_SGD for less costly tuning
max_iters = 50
gamma = gamma_tuning_GD(y, tx_, initial_w, max_iters)

w1, loss1 = least_squares_GD(y, tx_, initial_w, max_iters, gamma)
#print(w1)
#print(loss1)

"""w1_ = w1.copy()
print(w1_)
for i in range(len(w1)):
    w1_[i] = w1_[i]*feature_std[i] +  feature_mean[i]
    """

'w1_ = w1.copy()\nprint(w1_)\nfor i in range(len(w1)):\n    w1_[i] = w1_[i]*feature_std[i] +  feature_mean[i]\n    '

In [16]:
print(loss1)

0.9976909649072614


### Least squares with SGD

In [20]:
from least_squares_SGD import *
#No need to re-tune gamma BUT NOT SURE
#Here batch_size is set at 1


# Initialization of the weights BUT we could use w1 as a start AND careful with tuning
initial_w = np.zeros(tx_.shape[1])
gamma = gamma_tuning_GD(y, tx_, initial_w, max_iters)

w2, loss2 = least_squares_SGD(y, tx_, initial_w, 1, max_iters, gamma)
#print(w2)
print(loss2)
w2_ = w2.copy()
print(w1_)
for i in range(len(w2)):
    w2_[i] = w2_[i]*feature_std[i] +  feature_mean[i]

0.9970194269760377


### Least Squares using Normal Equations

In [33]:
from least_squares import *
from build_polynomial import *
from parameter_tuning import degree_tuning_LS

# degree = degree_tuning_LS(y , tx_)
# Initialization of the weights BUT could use w2
initial_w = np.zeros(tx_.shape[1]) 

#poly_x = build_poly(tx_, degree)

w3, loss3 = least_squares(y, tx_)
#_, loss3_expand = least_squares(y, poly_x)
print(loss3)
#print(loss3_expand)

"""w3_ = w3.copy()
for i in range(len(w3)):
    w3_[i] = w3_[i]*feature_std[i] +  feature_mean[i]
    """

0.9004464774102637


'w3_ = w3.copy()\nfor i in range(len(w3)):\n    w3_[i] = w3_[i]*feature_std[i] +  feature_mean[i]\n    '

In [29]:
print(degree)

10


### Ridge regression

In [23]:
from ridge_regression import *
from parameter_tuning import lambda_tuning_ridge

# Tuning of lambda
lambda_ = lambda_tuning_ridge(y, tx_)
print(lambda_)

# Computing the loss
w4, loss4 = ridge_regression(y, tx_, lambda_)
print(loss4)
w4_ = w4.copy()
for i in range(len(w1)):
    w4_[i] = w4_[i]*feature_std[i] +  feature_mean[i]

3.831186849557293e-11
0.949203122152439


### Logistic Regression

In [24]:
from logistic_regression import *
from parameter_tuning import gamma_tuning_log

# Initialization of the weights BUT could use w2
initial_w = np.zeros(tx_.shape[1])
# Tuning of gamma
#gamma = gamma_tuning_log(y, tx_, initial_w, max_iters)
gamma=0.0000000000000001
w5, loss5 = logistic_regression(y, tx_, initial_w, max_iters, gamma)
print(w5)
print(loss5)
w5_ = w5.copy()
for i in range(len(w5)):
    w5_[i] = w5_[i]*feature_std[i] +  feature_mean[i]

(250000, 30)
[ 2.16573262e-09  1.41866310e-09  2.02480684e-09  9.26190885e-10
  1.40499103e-09  9.51977365e-10 -2.33446140e-10  3.08668956e-09
  8.64890158e-10  1.39436538e-09  1.73302866e-09 -1.09465863e-10
  1.17059266e-09  1.75873919e-09 -9.20385699e-12 -4.58004931e-12
  2.15344440e-09 -1.57038417e-11  2.44087416e-11  1.29145439e-09
 -5.68622019e-12  1.68888554e-09  1.02015495e-09  1.42390274e-09
 -1.86861070e-12 -6.95937415e-12  1.83635167e-09 -5.93700744e-12
 -8.86817921e-13  7.59103311e-10]
173286.78524797945


### Unstandardize the weights

In [47]:
w1_ = w1.copy()
print(w1_)
for i in range(len(w1)):
    w1_[i] = w1_[i]*feature_std[i] +  feature_mean[i]

print(w1_)

[ 2.45108232e-06 -3.43700133e-05 -5.44718014e-06  1.14740597e-05
  4.73573474e-05  4.56846562e-05 -4.24550044e-05  2.20708422e-06
 -4.00225899e-05 -1.72940230e-05 -2.87181563e-05  2.08680153e-05
  4.44417221e-05  2.61484975e-05 -1.78429894e-06 -3.57904831e-07
 -1.22306800e-05 -1.72494696e-06  5.92511254e-07  2.17852349e-06
  1.28490515e-06 -2.31919542e-05 -3.37104443e-05  3.06211246e-07
  5.29120741e-07  2.49389382e-07 -3.26290649e-06  1.71041410e-07
 -9.07921975e-07  6.37310733e-05]
[ 1.21858669e+02  4.92386045e+01  8.11817592e+01  5.78966920e+01
  2.40381753e+00  3.71801529e+02 -8.21840344e-01  2.37310157e+00
  1.89164410e+01  1.58430216e+02  1.43758517e+00 -1.28279800e-01
  4.58307518e-01  3.87080052e+01 -1.09752143e-02 -8.17172223e-03
  4.66599374e+01 -1.95096500e-02  4.35440404e-02  4.17173062e+01
 -1.01168635e-02  2.09794244e+02  9.79143051e-01  8.48221231e+01
 -3.27364317e-03 -1.23923733e-02  5.76793701e+01 -1.18449167e-02
 -1.58393876e-03  7.30708380e+01]


# Predictions

In [26]:
## Generate predictions and save ouput in csv format for submission:

In [34]:
DATA_TEST_PATH = '../data/test.csv' # TODO: download train data and supply path here 
_, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)

In [42]:
# STANDARDIZE TX TEST
test_mean = np.zeros(tX_test.shape[1]).T
test_std = np.zeros(tX_test.shape[1]).T
for i in range(tX_test.shape[1]):
    test_mean[i] = np.nanmean(tX_test[:,i])
    test_std[i] = np.nanstd(tX_test[:,i])
for i in range(tX_test.shape[1]):
    if (i!=22):
        tX_test[:,i] = (tX_test[:,i] - test_mean[i])/test_std[i]

In [44]:
OUTPUT_PATH = '../data/sample-submission.csv' # TODO: fill in desired name of output file for submission
y_pred = predict_labels(w3, tX_test) # CAREFUL IT'S W3
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)

In [38]:
weights = w2

In [39]:
OUTPUT_PATH = '../data/sample-submission.csv' # TODO: fill in desired name of output file for submission
y_pred = predict_labels(weights, tX_test)
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)