In [1]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2

# Additionally added libraries

## Load the training data into feature matrix, class labels, and event ids:

In [2]:
from proj1_helpers import *
DATA_TRAIN_PATH = '../data/train.csv'
y, tX, ids = load_csv_data(DATA_TRAIN_PATH)
#### Remember to unpack data from .zip folders

Investigating loaded data

In [3]:
tX

array([[ 138.47 ,   51.655,   97.827, ...,    1.24 ,   -2.475,  113.497],
       [ 160.937,   68.768,  103.235, ..., -999.   , -999.   ,   46.226],
       [-999.   ,  162.172,  125.953, ..., -999.   , -999.   ,   44.251],
       ...,
       [ 105.457,   60.526,   75.839, ..., -999.   , -999.   ,   41.992],
       [  94.951,   19.362,   68.812, ..., -999.   , -999.   ,    0.   ],
       [-999.   ,   72.756,   70.831, ..., -999.   , -999.   ,    0.   ]])

In [4]:
tX.shape

(250000, 30)

In [5]:
# Setting -999 to NaN values
tX[tX==-999] = np.nan
tX

array([[138.47 ,  51.655,  97.827, ...,   1.24 ,  -2.475, 113.497],
       [160.937,  68.768, 103.235, ...,     nan,     nan,  46.226],
       [    nan, 162.172, 125.953, ...,     nan,     nan,  44.251],
       ...,
       [105.457,  60.526,  75.839, ...,     nan,     nan,  41.992],
       [ 94.951,  19.362,  68.812, ...,     nan,     nan,   0.   ],
       [    nan,  72.756,  70.831, ...,     nan,     nan,   0.   ]])

In [6]:
# Numerical exploratory data analysis
# row1: mean | row2: variance | row3: standard deviation | row4: minimum value | row5: maximum value | row6: number of NaN's
feature_details = np.zeros([6, tX.shape[1]])
for i in range(tX.shape[1]):
    feature_details[0, i] = np.nanmean(tX[:,i])
    feature_details[1, i] = np.nanvar(tX[:,i])
    feature_details[2, i] = np.nanstd(tX[:,i])
    feature_details[3, i] = np.nanmin(tX[:,i])
    feature_details[4, i] = np.nanmax(tX[:,i])
    feature_details[5, i] = np.isnan(tX[:,i]).sum()
#.... I dont know how to represent this in a nice way without pandas.DataFrame, without doing a lot of coding...
#print(feature_details)

In [7]:
# Importing pandas just to make this numerical EDA nicer to view. 
# IMPORTANT!!! DO NOT USE IN PROJECT SUBMISSION
import pandas as pd
df = pd.DataFrame(feature_details)
df.index = ['Mean', 'Variance', 'Std', 'min', 'max', 'n-NaNs']
df.index.name = 'Statistics'
df

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
Statistics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Mean,121.858528,49.239819,81.181982,57.895962,2.403735,371.78336,-0.821688,2.3731,18.917332,158.432217,...,-0.010119,209.797178,0.979176,84.822105,-0.003275,-0.012393,57.679474,-0.011845,-0.001582,73.064591
Variance,3283.063262,1249.255942,1666.975303,4052.029594,3.035311,158162.573194,12.847474,0.612947,496.106539,13387.851528,...,3.284138,16002.060938,0.955358,3679.887218,3.184583,3.288345,1023.076126,4.127921,3.301261,9607.031571
Std,57.298021,35.344815,40.828609,63.655554,1.742214,397.696584,3.584337,0.78291,22.273449,115.705884,...,1.812219,126.499253,0.977424,60.662074,1.78454,1.813379,31.985561,2.031729,1.816937,98.015466
min,9.044,0.0,6.329,0.0,0.0,13.602,-18.066,0.208,0.0,46.104,...,-3.142,13.678,0.0,30.0,-4.499,-3.142,30.0,-4.5,-3.142,0.0
max,1192.026,690.075,1349.351,2834.999,8.503,4974.979,16.69,5.684,2834.999,1852.462,...,3.142,2003.976,3.0,1120.573,4.499,3.141,721.456,4.5,3.142,1633.433
n-NaNs,38114.0,0.0,0.0,0.0,177457.0,177457.0,177457.0,0.0,0.0,0.0,...,0.0,0.0,0.0,99913.0,99913.0,99913.0,177457.0,177457.0,177457.0,0.0


We have to standardize dataset - a wide variety of ranges can be observed in the dataset. Would lead to a biased learning algorithm

In [8]:
# store mean and variance
feature_mean = feature_details[0, :]
feature_std = feature_details[2, :]

In [9]:
# create standardized dataset
#should be put into a method
tX_standardized = np.zeros(tX.shape)
for i in range(tX_standardized.shape[1]):
    tX_standardized[:,i] = (tX[:,i] - feature_mean[i])/feature_std[i]
print(tX_standardized)

[[ 0.28991353  0.06833197  0.40768027 ...  0.61614788 -1.36131161
   0.4125105 ]
 [ 0.68202131  0.55250482  0.54013641 ...         nan         nan
  -0.27381996]
 [        nan  3.19515553  1.09655998 ...         nan         nan
  -0.29396985]
 ...
 [-0.28624947  0.31931645 -0.13086367 ...         nan         nan
  -0.31701723]
 [-0.46960659 -0.84532397 -0.30297338 ...         nan         nan
  -0.74543941]
 [        nan  0.66533608 -0.25352276 ...         nan         nan
  -0.74543941]]


In [10]:
# Double checking array
tX_standardized.shape

(250000, 30)

In [11]:
#tX_standardized
np.isnan(tX_standardized)

array([[False, False, False, ..., False, False, False],
       [False, False, False, ...,  True,  True, False],
       [ True, False, False, ...,  True,  True, False],
       ...,
       [False, False, False, ...,  True,  True, False],
       [False, False, False, ...,  True,  True, False],
       [ True, False, False, ...,  True,  True, False]])

In [12]:
# Saving arrays to .py file. Can easily be loaded with np.load('path' + 'filename')
####np.save('tX_cleaned', tX)
####np.save('tX_standardized', tX_standardized)
# Commented this section out to prevent overwriting of dataset

In [13]:
# Cleaning nan values and replacing them with mean (can also be median or interpolation)
from clean_nan import clean_nan
tx_ = clean_nan(tX_standardized, feature_mean)


## Do your thing crazy machine learning thing here :) ...

### Linear regression using gradient descent

In [14]:
#from least_squares_GD import *
from costs import compute_mse
from least_squares_GD import *
from parameter_tuning import gamma_tuning_SGD

# Initialization of the weights
initial_w = np.zeros(tx_.shape[1])

# Define the parameters necessary for gradient descent: need to tune gamma. 
# we use the gamma_tuning_SGD for less costly tuning
max_iters = 50
gamma = gamma_tuning_SGD(tx_, y, initial_w, 1000, max_iters)
#gamma=0.0000001

#w1, loss1 = least_squares_GD(y, tx_, initial_w, max_iters, gamma)
#print(w1)
#print(loss1)

[-1.23032492e-04 -4.68683738e-05 -7.67613927e-05 -5.45887273e-05
 -1.79758431e-06 -3.50833288e-04  3.51704547e-07 -2.22081221e-06
 -1.82737099e-05 -1.49871156e-04 -1.64530880e-06  3.29941617e-07
  1.14363197e-08 -3.63119276e-05 -7.45562178e-09  4.15404186e-09
 -4.42101078e-05  1.20479257e-09 -3.52098663e-08 -3.93942645e-05
  2.24092356e-08 -1.98464242e-04 -1.26071155e-06 -8.01430558e-05
  8.38551156e-09  1.42248815e-08 -5.45339454e-05  1.29077320e-08
 -7.57439822e-09  4.22163069e-07]
0.45263941362763455


In [15]:
print(gamma)

1e-07


### Least squares with SGD

In [16]:
from least_squares_SGD import *
#No need to re-tune gamma BUT NOT SURE
#Here batch_size is set at 1


# Initialization of the weights BUT we could use w1 as a start
initial_w = np.zeros(tx_.shape[1])


w2, loss2 = least_squares_SGD(y, tx_, initial_w, 1, max_iters, gamma)
#print(w2)
#print(loss2)

### Least Squares using Normal Equations

In [81]:
from least_squares import *
from build_polynomial import *
from parameter_tuning import degree_tuning_LS

degree = degree_tuning_LS(y , tx_)
# Initialization of the weights BUT could use w2
initial_w = np.zeros(tx_.shape[1]) 

w3, loss3 = least_squares(y, tx_)
_, loss3_ = least_squares(y, poly_x)
print(loss3)
print(loss3_)

0.39799914050690927
0.37969719657315903


In [82]:
print(degree)

4


### Ridge regression

In [59]:
from ridge_regression import *
from parameter_tuning import lambda_tuning_ridge

# Tuning of lambda
lambda_ = lambda_tuning_ridge(y, tx_)
print(lambda_)

# Computing the loss
w4, loss4 = ridge_regression(y, tx_, lambda_)

1e-15


### Logistic Regression

In [47]:
from logistic_regression import *

# Initialization of the weights BUT could use w2
initial_w = w3
w5, loss5 = logistic_regression(y, tx_, initial_w, max_iters, gamma)
print(w5)
print(loss5)

[1.43825356 1.42560418 1.43287537 ... 1.44123761 1.60587793 1.60587793]
[inf inf inf ... inf inf inf]


  loss = calculate_loss_log(y,tx,w)
  # store w and loss
  # *************************************************


[inf inf inf ... inf inf inf]
[inf inf inf ... inf inf inf]
[inf inf inf ... inf inf inf]
[inf inf inf ... inf inf inf]
[inf inf inf ... inf inf inf]
[inf inf inf ... inf inf inf]
[inf inf inf ... inf inf inf]
[inf inf inf ... inf inf inf]
[inf inf inf ... inf inf inf]
[inf inf inf ... inf inf inf]
[inf inf inf ... inf inf inf]
[inf inf inf ... inf inf inf]
[inf inf inf ... inf inf inf]
[inf inf inf ... inf inf inf]
[inf inf inf ... inf inf inf]
[inf inf inf ... inf inf inf]
[inf inf inf ... inf inf inf]
[inf inf inf ... inf inf inf]
[inf inf inf ... inf inf inf]
[inf inf inf ... inf inf inf]
[inf inf inf ... inf inf inf]
[inf inf inf ... inf inf inf]
[inf inf inf ... inf inf inf]
[inf inf inf ... inf inf inf]
[inf inf inf ... inf inf inf]
[inf inf inf ... inf inf inf]
[inf inf inf ... inf inf inf]
[inf inf inf ... inf inf inf]
[inf inf inf ... inf inf inf]
[inf inf inf ... inf inf inf]
[inf inf inf ... inf inf inf]
[inf inf inf ... inf inf inf]
[inf inf inf ... inf inf inf]
[inf inf i

# Predictions

In [None]:
## Generate predictions and save ouput in csv format for submission:

In [None]:
DATA_TEST_PATH = '' # TODO: download train data and supply path here 
_, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)

In [None]:
OUTPUT_PATH = '' # TODO: fill in desired name of output file for submission
y_pred = predict_labels(weights, tX_test)
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)