In [11]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload

# Additionally added libraries

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Load the training data into feature matrix, class labels, and event ids:

In [22]:
# Import only the helpers this notebook actually calls instead of `import *`,
# so it stays obvious where each name comes from and nothing is shadowed.
from proj1_helpers import load_csv_data, predict_labels, create_csv_submission

# Remember to unpack the data from the .zip folders so that this path exists.
DATA_TRAIN_PATH = '../data/train.csv'
# y: class labels, tX: feature matrix, ids: event ids
y, tX, ids = load_csv_data(DATA_TRAIN_PATH)

Investigating loaded data

In [24]:
# Peek at the raw feature matrix as returned by load_csv_data
tX

array([[ 138.47 ,   51.655,   97.827, ...,    1.24 ,   -2.475,  113.497],
       [ 160.937,   68.768,  103.235, ..., -999.   , -999.   ,   46.226],
       [-999.   ,  162.172,  125.953, ..., -999.   , -999.   ,   44.251],
       ...,
       [ 105.457,   60.526,   75.839, ..., -999.   , -999.   ,   41.992],
       [  94.951,   19.362,   68.812, ..., -999.   , -999.   ,    0.   ],
       [-999.   ,   72.756,   70.831, ..., -999.   , -999.   ,    0.   ]])

In [25]:
# Dimensions of the feature matrix (rows, columns)
tX.shape

(250000, 30)

In [27]:
# Overwrite every entry equal to -999 with NaN (in place), so that the
# nan-aware NumPy reductions used further down ignore those entries.
tX[np.equal(tX, -999)] = np.nan
tX

array([[138.47 ,  51.655,  97.827, ...,   1.24 ,  -2.475, 113.497],
       [160.937,  68.768, 103.235, ...,     nan,     nan,  46.226],
       [    nan, 162.172, 125.953, ...,     nan,     nan,  44.251],
       ...,
       [105.457,  60.526,  75.839, ...,     nan,     nan,  41.992],
       [ 94.951,  19.362,  68.812, ...,     nan,     nan,   0.   ],
       [    nan,  72.756,  70.831, ...,     nan,     nan,   0.   ]])

In [51]:
# Numerical exploratory data analysis: one statistic per row, one feature per column.
# row1: mean | row2: variance | row3: standard deviation | row4: minimum value | row5: maximum value | row6: number of NaN's
# Each reduction runs once over axis 0 (down the rows of tX) instead of looping
# over the columns in Python; the nan* variants skip NaN entries.
feature_details = np.vstack([
    np.nanmean(tX, axis=0),
    np.nanvar(tX, axis=0),
    np.nanstd(tX, axis=0),
    np.nanmin(tX, axis=0),
    np.nanmax(tX, axis=0),
    np.isnan(tX).sum(axis=0),  # integer counts; vstack promotes them to float
])
# NOTE: a plain ndarray carries no row/column labels, so this table is hard to
# read without something like pandas.DataFrame.

In [81]:
# pandas is pulled in only to render the numerical EDA table readably.
# IMPORTANT!!! DO NOT USE IN PROJECT SUBMISSION
import pandas as pd
stat_labels = pd.Index(['Mean', 'Variance', 'Std', 'min', 'max', 'n-NaNs'], name='Statistics')
df = pd.DataFrame(feature_details, index=stat_labels)
df

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
Statistics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Mean,121.858528,49.239819,81.181982,57.895962,2.403735,371.78336,-0.821688,2.3731,18.917332,158.432217,...,-0.010119,209.797178,0.979176,84.822105,-0.003275,-0.012393,57.679474,-0.011845,-0.001582,73.064591
Variance,3283.063262,1249.255942,1666.975303,4052.029594,3.035311,158162.573194,12.847474,0.612947,496.106539,13387.851528,...,3.284138,16002.060938,0.955358,3679.887218,3.184583,3.288345,1023.076126,4.127921,3.301261,9607.031571
Std,57.298021,35.344815,40.828609,63.655554,1.742214,397.696584,3.584337,0.78291,22.273449,115.705884,...,1.812219,126.499253,0.977424,60.662074,1.78454,1.813379,31.985561,2.031729,1.816937,98.015466
min,9.044,0.0,6.329,0.0,0.0,13.602,-18.066,0.208,0.0,46.104,...,-3.142,13.678,0.0,30.0,-4.499,-3.142,30.0,-4.5,-3.142,0.0
max,1192.026,690.075,1349.351,2834.999,8.503,4974.979,16.69,5.684,2834.999,1852.462,...,3.142,2003.976,3.0,1120.573,4.499,3.141,721.456,4.5,3.142,1633.433
n-NaNs,38114.0,0.0,0.0,0.0,177457.0,177457.0,177457.0,0.0,0.0,0.0,...,0.0,0.0,0.0,99913.0,99913.0,99913.0,177457.0,177457.0,177457.0,0.0


We have to standardize the dataset: the features span widely different ranges, which would otherwise lead to a biased learning algorithm.

In [66]:
# Keep the per-feature mean (row 0) and standard deviation (row 2) of
# feature_details; these are the two statistics needed for standardization.
feature_mean, feature_std = feature_details[0], feature_details[2]

In [70]:
# Create the standardized dataset: (value - feature mean) / feature std.
# Broadcasting applies each column's statistics to every row at once, which
# replaces the explicit Python loop over the columns.
# A feature with zero spread would divide by zero, so its divisor falls back
# to 1 and that column simply becomes all zeros. NaN entries in tX stay NaN.
safe_std = np.where(feature_std == 0, 1.0, feature_std)
tX_standardized = (tX - feature_mean) / safe_std

In [71]:
# Double checking array: the standardized matrix should have the same shape as tX
tX_standardized.shape

(250000, 30)

In [74]:
# Inspect the standardized values; entries that were NaN in tX are still NaN
tX_standardized

array([[ 0.28991353,  0.06833197,  0.40768027, ...,  0.61614788,
        -1.36131161,  0.4125105 ],
       [ 0.68202131,  0.55250482,  0.54013641, ...,         nan,
                nan, -0.27381996],
       [        nan,  3.19515553,  1.09655998, ...,         nan,
                nan, -0.29396985],
       ...,
       [-0.28624947,  0.31931645, -0.13086367, ...,         nan,
                nan, -0.31701723],
       [-0.46960659, -0.84532397, -0.30297338, ...,         nan,
                nan, -0.74543941],
       [        nan,  0.66533608, -0.25352276, ...,         nan,
                nan, -0.74543941]])

In [80]:
# Saving arrays to .npy files. They can easily be loaded with np.load('path' + 'filename')
####np.save('tX_cleaned', tX)
####np.save('tX_standardized', tX_standardized)
# Commented this section out to prevent overwriting of dataset

## Do your crazy machine learning thing here :) ...

## Generate predictions and save output in csv format for submission:

In [10]:
# NOTE(review): tX_test is used exactly as loaded; the -999 -> NaN replacement and
# the standardization applied to tX above are not applied to it in this notebook.
# The first value returned by load_csv_data is discarded here.
DATA_TEST_PATH = '' # TODO: download test data and supply path here 
_, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)

In [31]:
OUTPUT_PATH = '' # TODO: fill in desired name of output file for submission
# NOTE(review): `weights` is not assigned in any cell visible in this notebook; it
# presumably comes from the model-training step above. Confirm it exists before running.
y_pred = predict_labels(weights, tX_test)
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)