In [1]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2

# Additionally added libraries

## Load the training data into feature matrix, class labels, and event ids:

In [2]:
from proj1_helpers import *
# Location of the training CSV, relative to the notebook's working directory.
DATA_TRAIN_PATH = '../data/train.csv'
# load_csv_data is presumably provided by the star-import of proj1_helpers above.
# Its three return values are bound, in order, to the class labels (y), the
# feature matrix (tX) and the event ids (ids), as stated in the section heading.
y, tX, ids = load_csv_data(DATA_TRAIN_PATH)
# Reminder: the data ships zipped - unpack the .zip archives before running this cell.

Investigating loaded data

In [3]:
tX

array([[ 138.47 ,   51.655,   97.827, ...,    1.24 ,   -2.475,  113.497],
       [ 160.937,   68.768,  103.235, ..., -999.   , -999.   ,   46.226],
       [-999.   ,  162.172,  125.953, ..., -999.   , -999.   ,   44.251],
       ...,
       [ 105.457,   60.526,   75.839, ..., -999.   , -999.   ,   41.992],
       [  94.951,   19.362,   68.812, ..., -999.   , -999.   ,    0.   ],
       [-999.   ,   72.756,   70.831, ..., -999.   , -999.   ,    0.   ]])

In [4]:
tX.shape


[2. 1. 1. 0. 0. 3. 2. 1. 0. 1.]


In [5]:
# Overwrite, in place, every entry of tX equal to -999 with NaN so that the
# nan-aware statistics computed in the following cells ignore those entries.
np.putmask(tX, tX == -999, np.nan)
tX


[2. 1. 1. 0. 0. 3. 2. 1. 0. 1.]


In [7]:
# Numerical exploratory data analysis: NaN-aware summary statistics, one column per feature.
# row1: mean | row2: variance | row3: standard deviation | row4: minimum value
# row5: maximum value | row6: number of NaN's | row7: median
# Every reduction runs over axis 0 (the samples), so no explicit per-feature loop is needed.
feature_details = np.vstack([
    np.nanmean(tX, axis=0),
    np.nanvar(tX, axis=0),
    np.nanstd(tX, axis=0),
    np.nanmin(tX, axis=0),
    np.nanmax(tX, axis=0),
    np.isnan(tX).sum(axis=0),  # integer counts; promoted to float when stacked
    np.nanmedian(tX, axis=0),
])
# The raw array is hard to read; a labelled table is built with pandas in the next cell (display only).

In [8]:
# pandas is pulled in only to render the numerical EDA table in a readable way.
# IMPORTANT!!! DO NOT USE IN PROJECT SUBMISSION
import pandas as pd
# One labelled row per statistic, in the same order as the rows of feature_details.
df = pd.DataFrame(
    feature_details,
    index=pd.Index(
        ['Mean', 'Variance', 'Std', 'min', 'max', 'n-NaNs', 'median'],
        name='Statistics',
    ),
)
df

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
Statistics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Mean,121.858528,49.239819,81.181982,57.895962,2.403735,371.78336,-0.821688,2.3731,18.917332,158.432217,...,-0.010119,209.797178,0.979176,84.822105,-0.003275,-0.012393,57.679474,-0.011845,-0.001582,73.064591
Variance,3283.063262,1249.255942,1666.975303,4052.029594,3.035311,158162.573194,12.847474,0.612947,496.106539,13387.851528,...,3.284138,16002.060938,0.955358,3679.887218,3.184583,3.288345,1023.076126,4.127921,3.301261,9607.031571
Std,57.298021,35.344815,40.828609,63.655554,1.742214,397.696584,3.584337,0.78291,22.273449,115.705884,...,1.812219,126.499253,0.977424,60.662074,1.78454,1.813379,31.985561,2.031729,1.816937,98.015466
min,9.044,0.0,6.329,0.0,0.0,13.602,-18.066,0.208,0.0,46.104,...,-3.142,13.678,0.0,30.0,-4.499,-3.142,30.0,-4.5,-3.142,0.0
max,1192.026,690.075,1349.351,2834.999,8.503,4974.979,16.69,5.684,2834.999,1852.462,...,3.142,2003.976,3.0,1120.573,4.499,3.141,721.456,4.5,3.142,1633.433
n-NaNs,38114.0,0.0,0.0,0.0,177457.0,177457.0,177457.0,0.0,0.0,0.0,...,0.0,0.0,0.0,99913.0,99913.0,99913.0,177457.0,177457.0,177457.0,0.0
median,112.406,46.524,73.752,38.4675,2.107,225.885,-0.244,2.4915,12.3155,120.6645,...,-0.024,179.739,1.0,65.561,0.0,-0.033,47.902,-0.01,-0.002,40.5125


We have to standardize the dataset: the features span a wide variety of ranges, and leaving them unscaled would lead to a biased learning algorithm.

In [9]:
# Keep the per-feature mean, standard deviation and median for later use
# (rows 0, 2 and 6 of feature_details respectively).
feature_mean, feature_std, feature_median = (feature_details[row] for row in (0, 2, 6))

In [10]:
#Cleaning nans
from clean_nan import *

# Clean the NaN entries of tX with the project helper, handing it the per-feature medians
# (presumably each NaN is replaced by its column's median - confirm in clean_nan.py).
# NOTE(review): if clean_nan modifies its argument in place, tX itself is changed too - verify.
tx_ = clean_nan(tX, feature_median)

In [11]:
# Standardize the dataset in place: subtract the stored mean and divide by the stored
# standard deviation for every column except column 22, which keeps its raw values
# (the summary table shows it ranging from 0 to 3).
# This should eventually be moved into a helper method.
scaled_cols = [col for col in range(tx_.shape[1]) if col != 22]
tx_[:, scaled_cols] = (tx_[:, scaled_cols] - feature_mean[scaled_cols]) / feature_std[scaled_cols]

print(tx_)

[[ 2.89913530e-01  6.83319669e-02  4.07680272e-01 ...  6.16147878e-01
  -1.36131161e+00  4.12510497e-01]
 [ 6.82021310e-01  5.52504823e-01  5.40136414e-01 ...  9.08223761e-04
  -2.29898355e-04 -2.73819964e-01]
 [-1.64971287e-01  3.19515553e+00  1.09655998e+00 ...  9.08223761e-04
  -2.29898355e-04 -2.93969845e-01]
 ...
 [-2.86249472e-01  3.19316447e-01 -1.30863670e-01 ...  9.08223761e-04
  -2.29898355e-04 -3.17017229e-01]
 [-4.69606588e-01 -8.45323970e-01 -3.02973380e-01 ...  9.08223761e-04
  -2.29898355e-04 -7.45439413e-01]
 [-1.64971287e-01  6.65336083e-01 -2.53522760e-01 ...  9.08223761e-04
  -2.29898355e-04 -7.45439413e-01]]


In [12]:
# Column 22 is excluded from standardization; show its first ten rows to confirm
# that the raw values were left untouched.
tx_[0:10,22]

array([2., 1., 1., 0., 0., 3., 2., 1., 0., 1.])

In [13]:
# Double checking array
tx_.shape

(250000, 30)

In [14]:
import pandas as pd
Test = pd.DataFrame(tx_) 
Test

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
0,0.289914,0.068332,0.407680,-0.469966,-0.857377,-0.621258,0.973036,0.882478,1.033099,0.339894,...,-0.147267,0.386847,2.0,-0.286622,1.206627,0.251681,-0.363210,0.616148,-1.361312,0.412510
1,0.682021,0.552505,0.540136,-0.153167,-0.170321,-0.366858,0.161170,1.404888,-0.756027,-0.287584,...,-1.051683,-0.357719,1.0,-0.636248,0.408102,0.645421,-0.305684,0.000908,-0.000230,-0.273820
2,-0.164971,3.195156,1.096560,-0.349710,-0.170321,-0.366858,0.161170,0.989770,-0.430168,0.340361,...,-1.200672,0.400135,1.0,-0.668805,1.152271,-1.111520,-0.305684,0.000908,-0.000230,-0.293970
3,0.384768,0.910379,-0.005853,-0.903016,-0.170321,-0.366858,0.161170,1.196690,-0.830735,-0.712705,...,0.038692,-0.978149,0.0,-0.317515,0.001835,-0.011364,-0.305684,0.000908,-0.000230,-0.745439
4,0.942536,-0.914556,1.313369,-0.651804,-0.170321,-0.366858,0.161170,1.938794,-0.112795,-0.868143,...,-0.475042,-1.238475,0.0,-0.317515,0.001835,-0.011364,-0.305684,0.000908,-0.000230,-0.745439
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
249995,-0.164971,0.643636,-1.093204,-0.830312,-0.170321,-0.366858,0.161170,-1.253146,-0.622954,-0.886214,...,1.583208,-0.514882,0.0,-0.317515,0.001835,-0.011364,-0.305684,0.000908,-0.000230,-0.745439
249996,-0.164971,0.252913,-0.320829,-0.557013,-0.170321,-0.366858,0.161170,0.270657,0.158111,-0.931795,...,-0.472835,-1.022845,0.0,-0.317515,0.001835,-0.011364,-0.305684,0.000908,-0.000230,-0.745439
249997,-0.286249,0.319316,-0.130864,-0.284955,-0.170321,-0.366858,0.161170,0.021586,0.146617,-0.328162,...,-1.589146,-0.086089,1.0,-0.706044,1.010498,-0.084708,-0.305684,0.000908,-0.000230,-0.317017
249998,-0.469607,-0.845324,-0.302973,-0.697378,-0.170321,-0.366858,0.161170,1.266941,-0.243040,-0.886500,...,0.453102,-0.767429,0.0,-0.317515,0.001835,-0.011364,-0.305684,0.000908,-0.000230,-0.745439


In [15]:
# Saving arrays to .npy files. They can easily be loaded again with np.load('path' + 'filename')
####np.save('tX_cleaned', tX)
####np.save('tX_standardized', tX_standardized)
# Commented this section out to prevent overwriting of dataset

## Do your crazy machine learning thing here :) ...

### Polynomial Expansion

In [16]:
from build_polynomial import*
# Degree-2 polynomial feature expansion through the project helper build_poly.
# NOTE: the result is assigned back to tx_, so this cell is not idempotent - running it
# a second time expands the already-expanded matrix. Re-run the cleaning and
# standardization cells first if this cell has to be executed again.
tx_ = build_poly(tx_, degree=2)

### Linear regression using gradient descent

In [17]:
#from least_squares_GD import *
from costs import compute_mse
from least_squares_GD import *
from parameter_tuning import gamma_tuning_SGD
from parameter_tuning import gamma_tuning_GD

# Initialization of the weights: one zero weight per column of the expanded matrix tx_.
initial_w = np.zeros(tx_.shape[1])
# Define the parameters necessary for gradient descent: the step size gamma needs tuning.
# gamma is tuned here with gamma_tuning_GD; gamma_tuning_SGD is imported above as a
# less costly alternative but is NOT the one called in this cell.
max_iters = 50
gamma = gamma_tuning_GD(y, tx_, initial_w, max_iters)

# Run gradient descent with the tuned step size; keeps the returned weights and loss.
w1, loss1 = least_squares_GD(y, tx_, initial_w, max_iters, gamma)


In [18]:
print(w1)
print(loss1)

[-1.56421763e-04  1.61064854e-05 -1.66509013e-04 -5.95784163e-06
  9.17488001e-05  7.13970251e-05  9.76098102e-05 -6.52239914e-05
  5.88701787e-06 -6.79307110e-06  7.32371563e-05 -9.23839491e-05
  1.28989331e-04  4.59095883e-05  1.11902521e-04 -4.43027894e-07
 -2.08955241e-06 -1.45673955e-05  7.26680141e-07  1.95210360e-06
  1.10745695e-05  3.54773351e-06  6.47505656e-05 -9.09667804e-05
  6.34418075e-05 -8.54267396e-08  1.04747183e-06  4.01699090e-05
  4.67764367e-08 -8.75563081e-07  6.41473778e-05 -3.43273837e-04
 -1.67167634e-04 -3.98190226e-04 -4.39932273e-06  2.67167610e-05
  5.54781528e-05  3.15983994e-05 -2.32935639e-04 -8.91964491e-05
 -1.19306228e-04 -1.68983872e-04 -1.18613587e-04 -7.28247491e-06
 -3.19504424e-05 -2.14063589e-04 -1.57096074e-04 -1.60774879e-04
 -2.27558879e-04 -1.57693530e-04  8.73117469e-06 -1.56585610e-04
 -1.46582835e-04 -1.89653725e-04 -4.10290982e-05 -2.96541303e-08
 -5.78263039e-05 -5.94804625e-05  1.87942388e-05 -1.50877087e-05
 -1.38330574e-04]
0.99831

### Least squares with SGD

In [19]:
from least_squares_SGD import *
# NOTE(review): it was unclear whether gamma needs re-tuning for SGD; the code below
# does re-tune it, and it does so with the full-gradient tuner gamma_tuning_GD.
# The literal 1 passed to least_squares_SGD below is the batch size.
# max_iters is not set in this cell: it is inherited from the gradient-descent cell.

# Initialization of the weights at zero. w1 could be used as a warm start instead,
# but then the tuning of gamma has to be handled with care.
initial_w = np.zeros(tx_.shape[1])
gamma = gamma_tuning_GD(y, tx_, initial_w, max_iters)

w2, loss2 = least_squares_SGD(y, tx_, initial_w, 1, max_iters, gamma)
#print(w2)
print(loss2)

0.9986094795174398


### Least Squares using Normal Equations

In [20]:
from least_squares import *
from build_polynomial import *
from parameter_tuning import degree_tuning_LS

# Least squares via the normal equations on the expanded feature matrix tx_.
# Degree tuning is currently disabled; to re-enable it use:
#     degree = degree_tuning_LS(y, tx_)
# least_squares is called with (y, tx_) only, so no initial weights, step size
# or iteration count are set up in this cell.
w3, loss3 = least_squares(y, tx_)
print(loss3)


0.7942644907345137


### Ridge regression

In [21]:
from ridge_regression import *
from parameter_tuning import lambda_tuning_ridge

# Tuning of lambda: the regularization strength is chosen by the project helper.
lambda_ = lambda_tuning_ridge(y, tx_)

# Fit ridge regression with the tuned lambda; keeps the returned weights and loss.
w4, loss4 = ridge_regression(y, tx_, lambda_)
#print(w4)
print(loss4)

[-2.63111831e-01  1.47266876e-01 -2.43829448e-01 -3.08236396e-01
  1.40559507e-01 -1.04769211e-01 -2.14882454e-02 -1.21353005e-01
  2.60084229e-01 -9.85768352e-03 -2.34804069e+02 -1.89024149e-01
  4.07272541e-02  1.28947317e-01  4.56793045e+01 -5.92619823e-04
  1.46679685e-04  4.50106722e+01  7.85046580e-04  1.89339154e-03
  7.42601090e-03  6.24533643e-04 -3.25812640e-02 -2.29069142e-02
  1.04650722e-01 -1.66705812e-04  9.85384053e-04  5.46322454e-02
  1.25020335e-03 -2.30359596e-03  1.98828722e+02 -2.23868264e-02
  3.62792138e-02  1.69459325e-02  3.31057858e-03  6.48220893e-02
 -1.35246520e-03 -2.69044573e-02 -5.96121517e-02  7.54987826e-04
 -7.00676127e-03  1.82416826e-02  5.61851664e-02  7.55385377e-02
 -1.42062796e-02 -3.37266499e-02 -1.90636841e-03 -1.97768807e-02
 -5.65040568e-02 -4.92036762e-04 -3.23446521e-03 -8.15830315e-04
 -2.18581124e-02 -2.84789367e-02 -1.72903131e-02  1.27378547e-01
  1.79342518e-04 -1.01928592e-03  1.44181245e-01 -1.57972934e-04
  1.71507065e-02]
0.79426

### Logistic regression

In [22]:
from logistic_regression import *
from parameter_tuning import param_tuning_log

# Initialization of the weights at zero (w2 could be used as a warm start instead).
initial_w = np.zeros(tx_.shape[1])

# Fixed hyper-parameters: lambda_ = 0.0 and 50 iterations.
# Only the step size gamma is tuned, by param_tuning_log.
lambda_=0.0
max_iters=50
gamma = param_tuning_log(y, tx_, initial_w, max_iters, lambda_)

# NOTE(review): the saved output of this cell echoes the np.exp / np.log lines of the
# loss computation, which looks like a runtime warning (overflow or log of zero?).
# Confirm that loss5 is numerically trustworthy.
w5, loss5 = logistic_regression(y, tx_, initial_w, max_iters, gamma, lambda_)
print(loss5)
#print(w5)
print(gamma)

  a = np.exp(-value)
  loss = - (1 / tx.shape[0]) * np.sum((y * np.log(a)) + ((1 - y) * np.log(1 - a)))


0.10290153506270981
0.001


### Regularized logistic regression

In [25]:
from reg_logistic_regression import *
from parameter_tuning import param_tuning_reg_log

# Initialization of the weights at zero (w2 could be used as a warm start instead).
initial_w = np.zeros(tx_.shape[1])

# lambda_ is fixed by hand at 5; only the step size gamma is tuned.
# max_iters is not set in this cell: it is inherited from the logistic-regression cell.
lambda_=5
gamma = param_tuning_reg_log(y, tx_, initial_w, max_iters, lambda_)
#gamma=0.00000001
# Careful: lambda_ is the THIRD positional argument here, whereas the
# logistic_regression call in the previous section passes it last.
w6, loss6 = reg_logistic_regression(y, tx_, lambda_, initial_w, max_iters, gamma)
#print(loss6)
#print(w6)
print(gamma)

0.002154434690031882


In [26]:
print(loss6)

0.20519960108320173


# Predictions

In [None]:
## Generate predictions and save output in csv format for submission:

In [None]:
DATA_TEST_PATH = '../data/test.csv' # TODO: download test data and supply path here
# The first return value is discarded; only the test feature matrix and the
# event ids are kept.
_, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)

In [42]:
# PREPROCESS TX TEST
# The test features must go through exactly the same pipeline as the training
# features, using the statistics computed on the TRAINING set. Otherwise the learned
# weights are applied to differently scaled features and to a matrix with a different
# number of columns (the weights were fitted on the degree-2 expansion).
# Run this cell once, right after loading tX_test: like the training pipeline it
# overwrites its input, so the guard below rejects an already-processed matrix.
if tX_test.shape[1] != feature_mean.shape[0]:
    raise ValueError(
        "tX_test has %d columns but %d raw features are expected; "
        "reload the test data before re-running this cell"
        % (tX_test.shape[1], feature_mean.shape[0])
    )

# 1. Mark the -999 sentinel as missing, as was done for tX.
tX_test[tX_test == -999] = np.nan
# 2. Clean the NaNs with the training medians, as was done for tX.
tX_test = clean_nan(tX_test, feature_median)
# 3. Standardize with the training mean / std; column 22 keeps its raw values,
#    mirroring the training standardization.
for i in range(tX_test.shape[1]):
    if i != 22:
        tX_test[:, i] = (tX_test[:, i] - feature_mean[i]) / feature_std[i]
# 4. Same polynomial expansion as the one applied to the training matrix.
tX_test = build_poly(tX_test, degree=2)

In [44]:
OUTPUT_PATH = '../data/sample-submission.csv' # TODO: fill in desired name of output file for submission
# NOTE(review): w3 was fitted on the cleaned, standardized and degree-2 expanded tx_;
# tX_test needs the same preprocessing for the shapes to match - confirm before running.
y_pred = predict_labels(w3, tX_test) # CAREFUL IT'S W3
# NOTE(review): a later cell writes to this same OUTPUT_PATH and will overwrite this file.
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)

In [38]:
# Select which fitted weight vector the next cell uses for prediction
# (w2 is the result of the SGD run).
weights = w2

In [39]:
# NOTE(review): this is the same path as the w3 submission cell above, so on a
# top-to-bottom run the file ends up holding the predictions made with `weights`.
# Pick a distinct file name if both submissions are needed.
OUTPUT_PATH = '../data/sample-submission.csv' # TODO: fill in desired name of output file for submission
y_pred = predict_labels(weights, tX_test)
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)