In [None]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2
import datetime
from helpers import *
from implementations import *

## Load the training data into feature matrix, class labels, and event ids:

In [None]:
from proj1_helpers import *
# Location of the training CSV, relative to the notebook's working directory.
DATA_TRAIN_PATH = '../../train.csv' # TODO: download train data and supply path here 
# Load the class labels, the raw feature matrix and the event ids (sub_sample=False is passed explicitly).
y_starting, tX_starting, ids = load_csv_data(DATA_TRAIN_PATH,sub_sample=False)

# Preprocessing

The preprocessing consists of the following steps:
    1. Delete the columns with low correlation with y (not used, since it does not improve the results).
    2. Replace the NaN values (encoded as -999) in the feature matrix with the median of the corresponding feature.
    3. Add 3 dummy variables corresponding to the 3 existing NaN patterns. Fully removing the NaN values is not a valid solution, since they seem to be correlated with y.
    4. Replace the existing categorical variable with dummy variables.
    5. Add more features.
    6. Normalize the data.
    7. Add a column of ones.

* The function before_poly() covers the first 4 steps.
* The function build_poly() adds all the wanted features.
* The function process_data() automatically calls all of these functions.

The median, mean and standard deviation computed on the training matrix are returned.
They are used to replace the NaN values in the test matrix and to normalize it.
We *don't* use the median, mean and standard deviation of the test matrix for this purpose.

### Delete columns with low correlation - replacing nan

In [None]:
# This function was not used since it does not improve the results

def drop_col(tX_starting):
    """Return a copy of ``tX_starting`` without its low-correlation columns.

    The correlation-based selection is switched off (it did not improve the
    results), so the list of columns to remove is always empty and the result
    is simply an unmodified copy of the input matrix.
    """
    # Intentionally empty: the disabled selection would have collected the
    # indices of the features whose |corrcoef(y, feature)| is below a threshold.
    columns_to_remove = []
    return np.delete(tX_starting, columns_to_remove, axis=1)
    

In [None]:
def replace_nan(tX_starting,median=False):
    """Replace the -999 missing-value markers with per-feature medians.

    Parameters
    ----------
    tX_starting : 2-D array of features. It must have at least 24 columns,
        because columns 0, 4 and 23 are read to build the indicators.
    median : ``False`` (or ``None``) to compute the medians from
        ``tX_starting`` itself (training data). Otherwise the medians to
        reuse (test data): a 1-D array with one value per column, or a single
        scalar applied to every column.

    Returns
    -------
    nan_position : int array of shape (n_rows, 3), computed before the
        replacement; 1 where columns 0, 4 and 23 hold a real value and 0
        where they hold -999.
    tX : copy of the input with every -999 replaced by the median of its own
        column (the input array is left untouched).
    median : 1-D array with the median used for each column, to be passed
        back in when processing the test matrix. A column made only of -999
        gets ``nan``.
    """
    missing_value=-999
    tX=tX_starting.copy()
    # Dummy features added corresponding to the nan pattern
    nan_position=(tX[:,[0,4,23]]!=missing_value)*1

    n_cols=tX.shape[1]
    if median is False or median is None:
        # One median per column, computed on the observed values only.
        # (Reusing a single variable here would make every column inherit
        # the median of column 0.)
        medians=np.empty(n_cols)
        for col in range(n_cols):
            observed=tX[:,col][tX[:,col]!=missing_value]
            medians[col]=np.median(observed) if observed.size else np.nan
    else:
        # Reuse the supplied (training) medians; a scalar is broadcast.
        medians=np.array(np.broadcast_to(np.asarray(median,dtype=float),(n_cols,)))

    for col in range(n_cols):
        tX[:,col][tX[:,col]==missing_value]=medians[col]

    return nan_position,tX,medians
    

### Categorical variables

In [None]:
def categorical_variables(tX_starting):
    """One-hot encode the categorical feature stored in column 22.

    Returns a pair ``(dummies, remaining)``:

    * ``dummies`` is a float array of shape (n_rows, 3); column k is 1.0
      where the categorical feature equals k (k = 0, 1, 2) and 0.0 elsewhere,
      so any other value produces an all-zero row.
    * ``remaining`` is a new array equal to the input without column 22.
    """
    categorical_col = 22
    levels = np.array([0, 1, 2])

    feature = tX_starting[:, categorical_col]
    # Compare every row against the three levels at once: (n, 1) == (3,) -> (n, 3)
    dummies = (feature[:, np.newaxis] == levels).astype(float)

    remaining = np.delete(tX_starting, [categorical_col], axis=1)
    return dummies, remaining

The function before_poly() automatically calls all the previous functions.

In [None]:
def before_poly(tX_starting,median=False):
    """Run the preprocessing steps that precede the feature expansion.

    Calls drop_col() and then replace_nan() on the result.

    Parameters
    ----------
    tX_starting : raw feature matrix.
    median : forwarded to replace_nan(); ``False`` to compute the medians
        from ``tX_starting``, otherwise the medians to reuse.

    Returns
    -------
    The three values returned by replace_nan(): the missing-value indicator
    columns, the matrix with the missing values replaced, and the median(s)
    used.
    """
    tX=drop_col(tX_starting)
    nan_position,tX,median=replace_nan(tX,median)
    # The dummy expansion of the categorical variable is currently disabled:
    #added_matrix,tX=categorical_variables(tX)
    #full_added_matrix=np.concatenate((added_matrix,nan_position),axis=1)
    return nan_position,tX,median

### Adding features

Several kinds of features are added:
    1. Powers of the existing features (not categorical, not dummy variables) up to a specified degree.
    2. Half powers of the absolute value of the features (i.e. 3/2, 5/2, 7/2, ... up to 17/2).
    3. Exponentials of the existing features (not categorical, not dummy variables), each divided by a constant first.
    4. Cross products of the features (i.e. x1*x2, x1*x3, ...). It is possible to retain only the ones most correlated with y, but this possibility was not used for the submission.

In [None]:
def build_poly(tX,degree,y,prod_to_exclude=False,train=True,columns_to_consider=False,exponential=False,cross_products=False,added_matrix_for_cross=False,threshold_power=0.0,threshold_cross=0.00,exclude=False):
    """Expand the feature matrix with powers, half powers, exponentials and cross products.

    New columns are always appended to the right of ``tX``, so the indices in
    ``columns_to_consider`` keep referring to the original features.

    Parameters
    ----------
    tX : 2-D feature matrix.
    degree : highest integer power added (powers 2..degree). Values below 2
        add no integer powers.
    y : labels; only read when ``train`` and ``cross_products`` are both true
        (correlation filter of the cross products).
    prod_to_exclude : pairs ``(i, j)`` of positions (within the columns used
        for the cross products) that must not be multiplied. ``False`` means
        none. The argument itself is not modified.
    train : if true, a cross product is kept only when
        ``abs(corrcoef(product, y)) > threshold_cross``; rejected pairs are
        added to the returned exclusion list. If false, every pair not in
        ``prod_to_exclude`` is kept.
    columns_to_consider : indices of the columns to expand; ``False`` means
        all columns of ``tX``.
    exponential : if true, add ``exp(x / c)`` for c in 100, 80, 60, 50, 40, 20.
    cross_products : if true, add pairwise products of the considered columns.
    added_matrix_for_cross : optional extra columns (e.g. dummy variables)
        appended to the matrix and included in the cross products. ``False``
        means none.
    threshold_power : unused; kept for backward compatibility.
    threshold_cross : correlation threshold used in train mode.
    exclude : column indices removed from ``columns_to_consider``.

    Returns
    -------
    tX : the expanded matrix.
    prod_to_exclude : list of excluded ``(i, j)`` pairs (input pairs plus the
        ones rejected in train mode).
    dict_cross : maps the output column index of each cross product kept in
        train mode to its ``(i, j)`` pair; empty otherwise.
    """
    def _unset(value):
        # False and None both mean "argument not supplied"; an identity test
        # is used so that numpy arrays can be passed safely.
        return value is None or value is False

    # Some features can be left out
    if _unset(columns_to_consider) or len(columns_to_consider)==0:
        columns_to_consider=range(tX.shape[1])
    # Cross products to exclude (copied so the caller's list is not mutated)
    if _unset(prod_to_exclude):
        prod_to_exclude=[]
    else:
        prod_to_exclude=[tuple(pair) for pair in prod_to_exclude]
    if _unset(exclude):
        exclude=[]
    dict_cross={}

    # Features to include in the model. Bound before any loop so that it is
    # defined even when degree < 2.
    cols=np.array([c for c in columns_to_consider if c not in exclude],dtype=int)

    # Integer powers 2..degree
    for power in range(2,degree+1):
        tX=np.concatenate((tX,tX[:,cols]**power),axis=1)
    # Half powers: sqrt(|x**k|) for odd k = 3, 5, ..., 17
    for odd_power in range(3,18,2):
        tX=np.concatenate((tX,np.sqrt(abs(tX[:,cols]**odd_power))),axis=1)
    # Exponentials of the rescaled features
    if exponential:
        for divisor in (100,80,60,50,40,20):
            tX=np.concatenate((tX,np.exp(tX[:,cols]/divisor)),axis=1)

    # Cross products of the features
    if cross_products:
        cross_cols=cols
        # The dummy variables, when supplied, take part in the cross products
        if not _unset(added_matrix_for_cross):
            added=np.asarray(added_matrix_for_cross)
            first_new=tX.shape[1]
            cross_cols=np.append(cols,np.arange(first_new,first_new+added.shape[1]))
            tX=np.concatenate((tX,added),axis=1)
        # The expanded matrix always comes first, followed by the products
        blocks=[tX]
        next_col=tX.shape[1]
        n_cross=len(cross_cols)
        for i in range(n_cross):
            for j in range(i+1,n_cross):
                if (i,j) in prod_to_exclude:
                    continue
                prod=tX[:,cross_cols[i]]*tX[:,cross_cols[j]]
                if train:
                    corr=np.corrcoef(prod,y)[0,1]
                    # A nan correlation (constant product) fails the test
                    # and is therefore excluded as well.
                    if abs(corr)>threshold_cross:
                        blocks.append(prod.reshape([prod.shape[0],1]))
                        dict_cross[next_col]=(i,j)
                        next_col+=1
                    else:
                        prod_to_exclude.append((i,j))
                else:
                    blocks.append(prod.reshape([prod.shape[0],1]))
        tX=np.concatenate(tuple(blocks),axis=1)
    return tX,prod_to_exclude,dict_cross


### Normalizing

In [None]:
# If train==False the mean and the std are the ones computed in the train matrix.
def normalize(tX,mean=False,std=False,train=False):
    """Standardize the columns of ``tX``: subtract the mean and divide by the standard deviation.

    Parameters
    ----------
    tX : 2-D feature matrix.
    mean, std : per-column statistics to reuse when ``train`` is false
        (the values returned by a previous call with ``train=True``).
        Ignored when ``train`` is true.
    train : if true, the statistics are computed from ``tX`` and returned.

    Returns
    -------
    ``(tX, mean, std)`` when ``train`` is true, otherwise only ``tX``.
    The returned ``std`` has its zero entries replaced by 1, so constant
    columns are mapped to 0 instead of producing a division by zero.

    Raises
    ------
    ValueError : if ``train`` is false and ``mean`` or ``std`` is not supplied.
    """
    if train:
        mean=np.mean(tX,axis=0)
        # Spread around the mean (the root mean square about zero is not a
        # standard deviation once the mean has been subtracted).
        std=np.std(tX,axis=0)
    elif mean is False or std is False:
        raise ValueError("mean and std must be provided when train=False")
    # Guard against constant columns
    std=np.where(np.asarray(std)==0,1.0,std)
    tX=(tX-mean)/std
    if train:
        return tX,mean,std
    else:
        return tX

In [None]:
def add_ones(tX_starting):
    """Return a new matrix equal to ``tX_starting`` with a column of ones appended on the right."""
    n_rows = tX_starting.shape[0]
    ones_column = np.ones((n_rows, 1))
    return np.concatenate((tX_starting, ones_column), axis=1)

It is necessary to distinguish between the training and the test matrix.
The median, mean and std computed on the training matrix are reused for the test matrix.
prod_to_exclude lists the cross products to exclude from the test matrix.

In [None]:
def process_data(tX_starting,y,prod_to_exclude=False,train=True,mean=False,std=False,median=False,exclude=False):
    """Run the whole preprocessing pipeline on a raw feature matrix.

    The steps are before_poly(), build_poly() (degree 14, exponentials and
    cross products enabled, correlation threshold 0.0), normalize() and
    add_ones().

    In train mode (``train=True``) the statistics are computed from the data
    and the function returns
    ``(tX, prod_to_exclude, mean, std, median, dict_cross)``.
    In test mode the ``prod_to_exclude``, ``mean``, ``std`` and ``median``
    obtained from the training data must be passed in, and only the processed
    matrix is returned.
    """
    dummy_columns,features,median=before_poly(tX_starting,median)
    features,prod_to_exclude,dict_cross=build_poly(
        features,14,y,
        exclude=exclude,train=train,prod_to_exclude=prod_to_exclude,
        exponential=True,cross_products=True,
        added_matrix_for_cross=dummy_columns,threshold_cross=0.0)

    # Test mode: reuse the training statistics and return the matrix alone.
    if not train:
        return add_ones(normalize(features,mean,std,train=False))

    # Train mode: compute the statistics and hand them back to the caller.
    features,mean,std=normalize(features,train=True)
    return add_ones(features),prod_to_exclude,mean,std,median,dict_cross

## Process data

In [None]:
# CALL THIS FOR CROSS VALIDATION
# Work on copies of the raw data: cross_validation() calls process_data() on
# every fold itself, so it must be given the unprocessed feature matrix.
tX=tX_starting.copy()
y=y_starting.copy()

In [None]:
# CALL THIS FOR COMPUTING PREDICTIONS WITHOUT CROSS VALIDATION (FOR SUBMISSION)
# NOTE: after this cell tX holds the *processed* training matrix. The returned
# prod_to_exclude, mean, std and median are needed later to process the test set.
exc=[]  # feature columns to leave out of the expansion (forwarded as `exclude`); none here
tX=tX_starting.copy()
y=y_starting.copy()
tX,prod_to_exclude,mean,std,median,dict_cross=process_data(tX,y,train=True,exclude=exc)

In [None]:
# Sanity check: dimensions of the current tX.
tX.shape

# Cross Validation

In [None]:
# Function to create the indices to split train and test matrix in Cross Validation
def build_k_indices(y, k_fold, seed):
    """Split the row indices of ``y`` into ``k_fold`` random, disjoint folds.

    Parameters
    ----------
    y : array whose first dimension gives the number of rows.
    k_fold : number of folds.
    seed : seed of the random permutation; the same seed always yields the
        same folds (``None`` gives a non-reproducible split).

    Returns
    -------
    Integer array of shape (k_fold, num_row // k_fold). When the number of
    rows is not a multiple of ``k_fold`` the leftover rows belong to no fold.
    """
    num_row = y.shape[0]
    interval = num_row // k_fold
    # A local generator honours the seed without touching numpy's global
    # random state.
    rng = np.random.RandomState(seed)
    indices = rng.permutation(num_row)
    k_indices = [indices[k * interval: (k + 1) * interval]
                 for k in range(k_fold)]
    return np.array(k_indices)
# Build the 4 folds used by the cross validation below and persist them
# (np.save appends the ".npy" extension to the file name).
k_indices=build_k_indices(y,4,1)
np.save("k_indices",k_indices)

In [None]:
# Function that operates the cross validation with a specified k and lambda_ (for ridge regression)
def cross_validation(y, x, k_indices, k, lambda_):
    """Run a k-fold cross validation of ridge regression on raw features.

    For every fold in ``k_indices`` the rows of the fold are held out, the
    remaining rows are processed with process_data(train=True), the held-out
    rows are processed with the statistics obtained from the training rows,
    and the weights fitted by ridge_regression() are scored with evaluate()
    on both parts.

    Parameters
    ----------
    y : labels.
    x : raw (unprocessed) feature matrix; processing happens inside each fold.
    k_indices : iterable of index arrays, one per fold.
    k : unused; kept for backward compatibility.
    lambda_ : regularisation parameter forwarded to ridge_regression().

    Returns
    -------
    ac_tr, ac_te : lists with the evaluate() score of each fold on the
        training rows and on the held-out rows.
    mean of ac_tr, mean of ac_te.
    w : weights fitted on the last fold (``None`` if there are no folds).
    """
    ac_tr=[]
    ac_te=[]
    w=None
    for k_index in k_indices:

        # Held-out part of this fold
        x_test=x[k_index]
        y_test=y[k_index]

        # Every other row is used for training
        mask = np.ones(x.shape[0], dtype=bool) # all elements included/True.
        mask[k_index] = False              # Set unwanted elements to False
        x_train=x[mask]
        y_train=y[mask]

        # The statistics come from the training rows only and are reused
        # for the held-out rows.
        exc=[]
        x_train,prod_to_exclude,mean,std,median,dict_cross=process_data(x_train,y_train,train=True,exclude=exc)
        x_test=process_data(x_test,y_test,prod_to_exclude=prod_to_exclude,mean=mean,std=std,median=median,train=False,exclude=exc)

        w,mse=ridge_regression(y_train,x_train,lambda_)
        ac_tr.append(evaluate(y_train,x_train,w))
        ac_te.append(evaluate(y_test,x_test,w))

    return ac_tr,ac_te,np.mean(ac_tr), np.mean(ac_te),w


In [None]:
# Regularisation parameter passed to cross_validation() in the next cell.
# NOTE(review): the value is negative, whereas a ridge penalty is normally >= 0 —
# confirm how ridge_regression() uses lambda_.
lamb=-5e-05

In [None]:
# RUN THIS TO TEST THE CROSS VALIDATION
# The raw data is passed explicitly: cross_validation() processes every fold
# itself, so it must not receive a tX that has already gone through
# process_data() (which is the case after the submission-preparation cell).
ac_tr,ac_te,mean_tr,mean_te,w=cross_validation(y_starting,tX_starting,k_indices,4,lamb)

In [None]:
# Mean held-out score, mean training score, then the per-fold held-out and training scores.
print(mean_te,mean_tr,ac_te,ac_tr)

The next cell is used to tune the lambda of the ridge regression.

In [None]:
# Sweep a grid of lambdas and record the mean cross-validation scores.
# The raw data is passed explicitly because cross_validation() processes
# every fold itself; an already processed tX must not be used here.
# NOTE(review): the grid is negative, whereas a ridge penalty is normally >= 0 —
# confirm how ridge_regression() uses lambda_.
perc_tr=[]
perc_te=[]

lambdas = np.linspace(-0.00008,-0.00001,5)
# A dedicated loop variable keeps the sweep from overwriting the global `lamb`.
for i,lambda_candidate in enumerate(lambdas):
    ac_tr,ac_te,mean_tr,mean_te,w=cross_validation(y_starting,tX_starting,k_indices,4,lambda_candidate)
    perc_tr.append(mean_tr)
    perc_te.append(mean_te)
    print(mean_tr,mean_te,lambda_candidate)
    # Progress counter
    print(i)

plt.plot(lambdas,perc_tr,label='train',color='r')
plt.plot(lambdas,perc_te,label='test')
plt.xlabel('lambda')
plt.ylabel('mean cross-validation score')
plt.legend()

# Submission

With the chosen lambda, compute the weights using the function ridge_regression().
Then check the predictions on the training dataset.

In [None]:
# Lambda used to fit the final weights for the submission.
# NOTE(review): the value is negative, whereas a ridge penalty is normally >= 0 —
# confirm how ridge_regression() uses lambda_.
lamb=-6e-05

In [None]:
# Fit the final weights on the whole training set. tX must be the processed
# matrix returned by process_data(..., train=True) in the submission-preparation
# cell, not the raw features.
w,loss=ridge_regression(y,tX,lamb)

In [None]:
# Score of the fitted weights on the training data, as computed by evaluate().
print(evaluate(y,tX,w))

## Generate predictions and save output in csv format for submission:

In [None]:
# Location of the test CSV, relative to the notebook's working directory.
DATA_TEST_PATH = '../../test.csv' 
# The first returned value is discarded; only the raw test features and the event ids are kept.
_, tX_final_test, ids_test = load_csv_data(DATA_TEST_PATH)

In [None]:
# Process the test features with the statistics computed on the training set
# (cross products to skip, mean, std, median). The same `exclude` list as for
# the training matrix is forwarded so that both matrices get the same columns.
# y is required by the signature but is not read when train=False.
tX_final=process_data(tX_final_test,y=y,prod_to_exclude=prod_to_exclude,mean=mean,std=std,median=median,train=False,exclude=exc)

In [None]:
# Sanity check: the number of columns must match the processed training matrix.
tX_final.shape

In [None]:
# Destination of the submission file, relative to the notebook's working directory.
OUTPUT_PATH = '../../predictions.csv' 
# Predict the labels of the processed test matrix with the fitted weights,
# then write them together with the event ids to OUTPUT_PATH.
y_pred = predict_labels(w, tX_final)
create_csv_submission(ids_test, y_pred, OUTPUT_PATH)
print(y_pred)