In [1]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2
import datetime
from helpers import *

## Load the training data into feature matrix, class labels, and event ids:

In [2]:
from proj1_helpers import *
DATA_TRAIN_PATH = '../../train.csv' # TODO: download train data and supply path here 
y, tX_starting, ids = load_csv_data(DATA_TRAIN_PATH,sub_sample=False)

# Pandas

In [None]:
import pandas as pd

In [None]:
# General data description
pd_data=pd.read_csv(DATA_TRAIN_PATH)
pd_data=pd_data.replace({'s':1,'b':-1})
del pd_data['Id']
pd_data.Prediction.describe()

In [None]:
# Number of nan
pd_data[pd_data==-999].count()

In [None]:
# Correlation matrix
pd_data2=pd_data.replace({-999:0})
corr_matrix2=pd_data2.corr()
corr_matrix2.Prediction

In [None]:
pd_data3=pd_data.replace({-999:-10})
corr_matrix3=pd_data3.corr()

In [None]:
corr_matrix=pd_data.corr()
corr_matrix.Prediction


In [None]:
synthesis=corr_matrix[['Prediction']].copy()
synthesis['PredictionCorrected']=corr_matrix2.Prediction
synthesis['PredictionNan-10']=corr_matrix3.Prediction
synthesis['id']=range(-1,synthesis.shape[0]-1)
synthesis

In [None]:
plt.matshow(pd_data2.corr())

In [None]:
plt.scatter(pd_data2[[24]],pd_data2[[22]])

# Data analysis

In [None]:
names=pd_data.columns.values

for i in range(tX_starting.shape[1]):
    plt.figure()
    plt.title(names[i+1]+" "+str(i))
    plt.hist(tX_starting[:,i],bins=70)

In [None]:
names=pd_data.columns.values

for i in range(tX_starting.shape[1]):
    plt.figure()
    plt.title(names[i+1]+" "+str(i))
    plt.boxplot(tX_starting[:,i])

# Preprocessing

### Delete columns with low correlation

In [None]:
drop_columns=[]
for i in range(tX_starting.shape[1]):
    coeff=np.corrcoef(y,tX_starting[:,i])[0,1]
    if abs(coeff)<0.000:
        drop_columns.append(i)
tX=np.delete(tX_starting,drop_columns,axis=1)
tX.shape

### Categorical variables

In [None]:
cat_variable=22
values=[0,1,2]

added_matrix=np.zeros([tX.shape[0],3])
added_matrix[:,0]=np.array([tX[:,22]==0])
added_matrix[:,1]=np.array([tX[:,22]==1])
added_matrix[:,2]=np.array([tX[:,22]==2])

In [None]:
tX=np.delete(tX,[22],axis=1)
print(tX.shape)

### Polynomial regression

In [3]:
def build_poly(tX,degree,ones=True,columns_to_consider=False):
    if not columns_to_consider:
        columns_to_consider=range(tX.shape[1])
    # Add ones column
    if ones:
        ones=np.ones(tX.shape[0]).reshape([tX.shape[0],1])
        tX=np.concatenate((tX,ones),axis=1)
    # Add power of the matrix
    for i in range(2,degree+1):
        tX=np.concatenate((tX,tX[:,columns_to_consider]**i),axis=1)
    return tX

In [5]:
tX=tX_starting.copy()
tX=build_poly(tX,6)
tX.shape

(250000, 181)

### Append categorical variables

In [None]:
tX=np.concatenate((tX,added_matrix),axis=1)
tX.shape

### NaN treatment

In [None]:
tX[tX==-999]=0

### Normalizing

In [None]:
# Normalizing
#mean=np.sum(tX,axis=0)/tX.shape[0]
#std=np.sqrt(np.sum(tX**2,axis=0)/tX.shape[0])
#tX=(tX-mean)/std

In [None]:
tX.shape

# Splitting data

In [6]:
def split_data(x, y, ratio, seed=1):
    """split the dataset based on the split ratio."""
    # set seed
    np.random.seed(seed)
    print(x.shape)
    train_elements=int(ratio*x.shape[0])
    test_elements=x.shape[0]-train_elements
    print(train_elements,test_elements)
    indices = np.random.permutation(x.shape[0])
    training_idx, test_idx = indices[:train_elements], indices[train_elements:]
    x_train, x_test = x[training_idx], x[test_idx]
    y_train, y_test = y[training_idx], y[test_idx]
    return x_train,x_test,y_train,y_test

In [7]:
tX_train,tX_test,y_train,y_test=split_data(tX,y,0.5,1)
tX_test.shape

(250000, 181)
125000 125000


(125000, 181)

## Do your thing crazy machine learning thing here :) ...

In [8]:
def compute_predictions(tX,w):
    prediction=tX.dot(w)
    prediction[np.where(prediction <= 0)] = -1
    prediction[np.where(prediction > 0)] = 1
    return prediction

In [9]:
def evaluate_prediction(prediction,y):
    return (sum(y*prediction)/y.shape[0]+1)/2

In [10]:
def evaluate(y,tX,w):
    prediction=compute_predictions(tX,w)
    return evaluate_prediction(prediction,y)

In [None]:
def compute_loss(y, tX, w):
    """Calculate the loss.

    You can calculate the loss using mse or mae.
    """
    error= y-tX.dot(w)
    square=np.sum(error**2)/error.shape[0]
    return square

In [None]:
def compute_gradient(y, tX, w):
    """Compute the gradient."""
    N=tX.shape[0]
    error=y-tX.dot(w)
    gradient=-1.0/N*(np.transpose(tX).dot(error))
    return gradient
        
compute_gradient(y_train,tX_train,np.zeros([tX_train.shape[1]])).shape

In [None]:
def gradient_descent(y, tX, initial_w, max_iters, gamma): 
    """Gradient descent algorithm."""
    # Define parameters to store w and loss
    ws = [initial_w]
    losses = []
    w = initial_w
    for n_iter in range(max_iters):
        # Compute gradient and loss
        gradient=compute_gradient(y,tX,w)
        loss=compute_loss(y,tX,w)
        # Update w by gradient
        w=w-gamma*gradient
        # store w and loss
        ws.append(np.copy(w))
        losses.append(loss)
        print("Gradient Descent({bi}/{ti}): loss={l}".format(
              bi=n_iter, ti=max_iters - 1, l=loss))

    print(w.shape)
    return loss, w

In [None]:
def compute_stoch_gradient(y, tx, w):
    """Compute a stochastic gradient for batch data."""
    N=tx.shape[0]
    error=y-tx.dot(w)
    gradient=-1.0/N*(np.transpose(tx).dot(error))
    return gradient


def stochastic_gradient_descent(y, tx, initial_w, batch_size, max_epochs, gamma):
    """Stochastic gradient descent algorithm."""
    # Define parameters to store w and loss
    ws = [initial_w]
    losses = []
    w = initial_w
    y_shuffle=[]
    tx_shuffle=[]
    for minibatch_y, minibatch_tx in batch_iter(y, tx, batch_size):
        y_shuffle.append(minibatch_y)
        tx_shuffle.append(minibatch_tx)
    for n_iter in range(max_epochs):
        # compute stochastic gradient
        gradient=compute_stoch_gradient(y_shuffle[n_iter],tx_shuffle[n_iter],w)
        loss=compute_loss(y,tx,w)
        # update w
        w=w-gamma*gradient
        # store w and loss
        ws.append(np.copy(w))
        losses.append(loss)
        print("Gradient Descent({bi}/{ti}): loss={l}, w0={w0}, w1={w1}".format(
              bi=n_iter, ti=max_iters - 1, l=loss, w0=w[0], w1=w[1]))

    return loss, w

In [21]:
from implementations import *

gradient_descent = least_squares_GD

tX

array([[  1.38470000e+02,   5.16550000e+01,   9.78270000e+01, ...,
          3.63521508e+00,   2.29853552e+02,   2.13750084e+12],
       [  1.60937000e+02,   6.87680000e+01,   1.03235000e+02, ...,
          9.94014980e+17,   9.94014980e+17,   9.75703560e+09],
       [ -9.99000000e+02,   1.62172000e+02,   1.25953000e+02, ...,
          9.94014980e+17,   9.94014980e+17,   7.50824675e+09],
       ..., 
       [  1.05457000e+02,   6.05260000e+01,   7.58390000e+01, ...,
          9.94014980e+17,   9.94014980e+17,   5.48276155e+09],
       [  9.49510000e+01,   1.93620000e+01,   6.88120000e+01, ...,
          9.94014980e+17,   9.94014980e+17,   0.00000000e+00],
       [ -9.99000000e+02,   7.27560000e+01,   7.08310000e+01, ...,
          9.94014980e+17,   9.94014980e+17,   0.00000000e+00]])

In [12]:
# Define the parameters of the algorithm.
max_iters = 50
gamma = 0.00001

# Initialization
w_initial = np.ones(tX.shape[1])

# Start gradient descent.
start_time = datetime.datetime.now()
w, mse = gradient_descent(y, tX, w_initial, max_iters, gamma)
#gradient_losses, w = stochastic_gradient_descent(y_train, tX_train, w_initial,30, max_iters, gamma)
end_time = datetime.datetime.now()

# Print result
exection_time = (end_time - start_time).total_seconds()
print("Gradient Descent: execution time={t:.3f} seconds".format(t=exection_time))

print(w)
print(mse)
print(evaluate(y_train, tX_train, w))
print(evaluate(y_test, tX_test, w))

Gradient Descent: execution time=2.207 seconds
[ nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan
  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan
  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan
  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan
  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan
  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan
  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan
  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan
  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan
  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan
  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan
  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan
  nan]
nan
nan
nan


  app.launch_new_instance()


In [22]:
mse,w=least_squares(y_train,tX_train)
print(evaluate(y_train,tX_train,w))
print(evaluate(y_test,tX_test,w))

ValueError: operands could not be broadcast together with shapes (125000,) (125000,181) 

In [None]:
def ridge_regression(y, tx, lamb):
    """implement ridge regression."""
    # ***************************************************
    # INSERT YOUR CODE HERE
    # ridge regression: TODO
    # ***************************************************
    w=np.linalg.solve(tx.T.dot(tx)+lamb**2*np.identity(tx.shape[1]),tx.T.dot(y))
    mse=sum((y-tx.dot(w))**2)/tx.shape[0]
    return mse,w

mse,w=ridge_regression(y_train,tX_train,0)

In [None]:
perc_tr=[]
perc_te=[]

lambdas = np.linspace(-2,2,101)
for lamb in lambdas:
    mse,w=ridge_regression(y_train,tX_train,lamb)
    
    perc_tr.append(evaluate(y_train,tX_train,w))
    perc_te.append(evaluate(y_test,tX_test,w))
    

plt.plot(lambdas,perc_tr,label='train',color='r')
plt.plot(lambdas,perc_te,label='test')
plt.legend()

In [None]:
# Basic implementation of logistic regression using the least squares
def logistic_regression(y,tx,tx_test,threshold=0.5):
    mse,w=least_squares(y,tx)
    
    y_prev=tx.dot(w)
    y_prev=1/(1+np.exp(-y_prev))
    output_train=np.ones(y_prev.shape[0])
    output_train[np.where(y_prev<threshold)] = -1
    
    y_test=tx_test.dot(w)
    y_test=1/(1+np.exp(-y_test))
    output_test=np.ones(y_test.shape[0])
    output_test[np.where(y_test<threshold)] = -1
    
    return output_train,output_test

thresholds=np.linspace(0.48,0.52,101)
perc_log_tr=[]
perc_log_te=[]
for threshold in thresholds:
    output_train,output_test=logistic_regression(y_train,tX_train,tX_test,threshold)
    perc_log_tr.append(evaluate_prediction(output_train,y_train))
    perc_log_te.append(evaluate_prediction(output_test,y_test))

    
plt.plot(thresholds,perc_log_tr,'r',label='Train')
plt.plot(thresholds,perc_log_te,'b',label='Test')
plt.legend()

# Test on training dataset

In [None]:
tX_test_post=np.copy(tX_test)
tX_test_post[tX_test_post==-999]=0
prediction=compute_predictions(tX_test,w)
evaluate_prediction(prediction,y_test)

In [None]:
print(sum((y_test-tX_test.dot(w))**2)/tX_test.shape[0])
print(tX_test.dot(w))

## Generate predictions and save ouput in csv format for submission:

In [None]:
DATA_TEST_PATH = '../../test.csv' # TODO: download train data and supply path here 
_, tX_final_test, ids_test = load_csv_data(DATA_TEST_PATH)

In [None]:
tX_final_test=build_poly(tX_final_test,6)

In [None]:
OUTPUT_PATH = '../../predictions.csv' # TODO: fill in desired name of output file for submission
y_pred = predict_labels(w, tX_final_test)
y_train,y_test = logistic_regression(y_train,tX_train,tX_final_test,0.48)
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)
print(y_pred)