In [1]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2

## Load the training data into feature matrix, class labels, and event ids:

In [2]:
from proj1_helpers import *
DATA_TRAIN_PATH = '../data/train.csv' # Path to the training CSV, relative to the notebook; download the training data and adjust if needed.
# Load the class labels (y), the feature matrix (tX) and the event ids (ids) from the training CSV.
y, tX, ids = load_csv_data(DATA_TRAIN_PATH)

## Do your crazy machine learning thing here :) ...

### Prepare the data

In [4]:
from helpers import standardize

def prepare(x):
    """
    Clean and standardize a raw feature matrix.

    Entries equal to the sentinel value -999 (and entries that are already
    NaN) are replaced by the mean of the remaining values of their column,
    so that they do not distort the statistics. The cleaned matrix is then
    passed to `standardize`.

    Parameters
    ----------
    x : 2-D array-like
        Raw features, one row per data entry and one column per feature.
        The input is left untouched; all work happens on a float copy.

    Returns
    -------
    The first value returned by `standardize` for the cleaned matrix.

    Notes
    -----
    A column made up only of missing values has no mean and stays NaN.
    NOTE(review): every call standardizes with the statistics of the matrix
    it is given; confirm whether test data should reuse the training
    mean/std instead.
    """
    # Work on a float copy: NaN needs a float dtype, and copying keeps the
    # caller's array intact so re-running the cell gives the same result.
    cleaned = np.array(x, dtype=float)
    cleaned[cleaned == -999] = np.nan

    # Per-column mean computed over the valid (non-NaN) entries only.
    column_means = np.nanmean(cleaned, axis=0)
    missing_rows, missing_cols = np.where(np.isnan(cleaned))
    cleaned[missing_rows, missing_cols] = column_means[missing_cols]

    # Standardize the matrix that was passed in. The previous version used
    # the global `tX` here, so preparing the test set returned training data.
    tx, _mean, _std = standardize(cleaned)

    return tx
    
tx = prepare(tX)

### Exploratory data analysis

In [5]:
# Unpack the shape of the prepared matrix: one row per data entry, one column per feature.
rows, features = tx.shape
print('Number of data entries:', rows)
print('Number of feature:', features)
# Sanity checks: shape of the whole matrix and of a single column slice.
print(tx.shape)
print(tx[:, 1].shape)

Number of data entries: 250000
Number of feature: 31
(250000, 31)
(250000,)


In [6]:
# Set matplotlib's Agg path chunk size; presumably needed so that plots with
# a very large number of points can be rendered (see the disabled plot below).
plt.rcParams['agg.path.chunksize'] = 10000
# Take column index 2 of the prepared matrix.
# NOTE(review): the variable is called `feature1` although the index is 2; confirm which feature is meant.
feature1 = tx[:, 2]
print(feature1)
# Disabled: scatter plot of this column against the labels `y`.
#plt.scatter(feature1, y)
#plt.xlabel('x')
#plt.ylabel('y: prediction')
#plt.show()

[ 0.06833197  0.55250482  3.19515553 ...,  0.31931645 -0.84532397
  0.66533608]


In [7]:
#for f in range(0, features):
#    featureData = tx[:, f]
#    plt.scatter(featureData, feature1)
#    plt.xlabel('x')
#    plt.ylabel('y: prediction')
#    plt.show()

### Exploratory data comments

For feature : 

    4-  > 40 =>  1
    5-  > 6  =>  1
    
    1-  > 20 => -1 [10,20] => more likely to be -1
    3-  > 20 => -1
    21- > 10 => -1
    26- > 22 => -1
    29- > 13 => -1
    

## Process the Machine Learning

#### Different learning methods

In [116]:
from least_squares import *
from regression import *

def learn_with(method, y, tx, gamma=0.1, max_iters=5, lambda_=0.1):
    """
    Run the learning routine selected by `method` and return its result.

    Recognised names are 'least_squares', 'least_square_GD',
    'least_square_SGD', 'logistic_regression', 'pen_logistic_regression'
    and 'ridge_regression'. Any other name falls back to `least_squares`.

    `gamma` and `max_iters` are forwarded to the iterative methods and
    `lambda_` to the regularised ones; arguments a method does not take
    are ignored.
    """
    # Ordered (name, thunk) pairs; each thunk defers the actual call so that
    # only the selected routine is executed.
    candidates = (
        ('least_squares',
         lambda: least_squares(y, tx)),
        ('least_square_GD',
         lambda: least_squares_GD(y, tx, gamma, max_iters)),
        ('least_square_SGD',
         lambda: least_squares_SGD(y, tx, gamma, max_iters)),
        ('logistic_regression',
         lambda: logistic_regression(y, tx, gamma, max_iters)),
        ('pen_logistic_regression',
         lambda: pen_logisitic_regression(y, tx, lambda_, gamma, max_iters)),
        ('ridge_regression',
         lambda: ridge_regression(y, tx, lambda_)),
    )

    for name, run in candidates:
        if method == name:
            return run()

    # Unknown method name: default to plain least squares.
    return least_squares(y, tx)

#### Training phase

In [117]:
def compare_least_squares(y, tx):
    """
    Fit the three least-squares variants and print their results.

    Runs the closed-form solver, gradient descent and stochastic gradient
    descent through `learn_with` (the two iterative methods with gamma=0.1
    and 50 iterations). Each call is expected to return a (loss, weights)
    pair. The results are printed in the order SGD, GD, closed form, each
    as a label line followed by the loss and the weights.
    """
    # The closed-form solver is registered as "least_squares" in `learn_with`.
    # The previous spelling "least_square" only worked through the fallback
    # for unknown method names.
    exact_loss, exact_w = learn_with("least_squares", y, tx)
    gd_loss, gd_w = learn_with("least_square_GD", y, tx, 0.1, 50)
    sgd_loss, sgd_w = learn_with("least_square_SGD", y, tx, 0.1, 50)

    report = (
        ("SGD", sgd_loss, sgd_w),
        ("GD", gd_loss, gd_w),
        ("--", exact_loss, exact_w),
    )
    for label, loss, w in report:
        print(label)
        print(loss)
        print(w)
    

In [142]:
def compare_regression(y, tx):
    """
    Fit the regression variants one after another and print each result.

    For logistic regression, penalised logistic regression and ridge
    regression (in that order) the model is fitted through `learn_with`
    with its default hyper-parameters, then a label line, the loss and
    the weights are printed before the next model is fitted.
    """
    experiments = (
        ("Log reg", 'logistic_regression'),
        ("Pen Log reg", 'pen_logistic_regression'),
        ("Ridge reg", "ridge_regression"),
    )
    for title, method_name in experiments:
        loss, weights_found = learn_with(method_name, y, tx)
        for line in (title, loss, weights_found):
            print(line)

In [144]:
def train(y, tx):
    """
    Fit the model with least-squares gradient descent.

    Delegates to `learn_with` using its default hyper-parameters and
    returns the resulting pair as `(loss, w)`.
    """
    final_loss, final_w = learn_with("least_square_GD", y, tx)
    return (final_loss, final_w)

# Train the model and keep its weights: the submission cell at the end of the
# notebook reads `weights`, so it has to be defined on a fresh "Run All".
loss, weights = train(y, tx)
#compare_least_squares(y, tx)
compare_regression(y, tx)


Log reg
93254.5058299
[ -1.39849830e+00   1.78090752e-02  -4.40944685e-01  -4.66931976e-01
   8.10940708e-03   3.48395499e-02   1.60262222e-01   9.56700605e-03
   4.97598662e-01  -4.90049006e-02  -5.56885306e+02  -3.33278630e-01
   2.01937762e-01   1.33011428e-01   1.08189475e+02  -1.36787002e-03
  -1.40263168e-03   1.06704339e+02  -1.49117383e-03   4.37429465e-03
   1.75242900e-01   1.54414961e-03  -8.18800511e-02   7.25449437e-02
  -8.27775112e-02   1.09688268e-03   2.90379811e-04  -6.45691195e-02
   2.74633823e-03  -2.99914140e-03   4.71755446e+02]
Pen Log reg
93264.5647878
[ -1.39851035e+00   1.78122504e-02  -4.40988230e-01  -4.66910535e-01
   8.07499266e-03   3.47687882e-02   1.60251047e-01   9.49104963e-03
   4.97555280e-01  -4.90212645e-02  -1.90873585e+01  -3.33311594e-01
   2.01911950e-01   1.33009744e-01   4.01886845e+00  -1.37514024e-03
  -1.42422083e-03   4.14737720e+00  -1.49420016e-03   4.34613399e-03
   1.75253304e-01   1.52960369e-03  -8.18587052e-02   7.23107928e-02
  

Regression still produces questionable results, but at least it doesn't crash. Maybe there is a problem with the loss computation?

## Generate predictions and save output in CSV format for submission:

In [None]:
DATA_TEST_PATH = '../data/test.csv' # Path to the test CSV, relative to the notebook; download the test data and adjust if needed.
# Load the test features and event ids; the first returned value is discarded here.
_, tX_test, ids_test = load_csv_data(DATA_TEST_PATH)

# Apply the same preparation step that was used for the training features.
tx_test = prepare(tX_test)


In [None]:
OUTPUT_PATH = '../output/out.csv' # Destination file for the submission; change the name as desired.
# Predict labels for the prepared test features.
# NOTE(review): `weights` must already be defined by the training step (e.g. `loss, weights = train(y, tx)`).
y_pred = predict_labels(weights, tx_test)
# Write the event ids and their predicted labels to OUTPUT_PATH.
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)