# Project 1: Higgs Boson Detection

In [1]:
import numpy as np
import csv
from regression_tools import * 
from cross_validation_ridge import *
from cross_validation_lasso import *
from preprocessing import *
from load_data import *
from implementations import *
import matplotlib.pyplot as plt

# Lasso


In [2]:
# Cross validation
# Load the training features/labels and the test features/ids from the two CSV files.
x_train,y_train,x_test,ids_test=load_data('train.csv','test.csv')
# Setting parameters
# NOTE(review): `degree` and `gamma` are not read by the lasso cells below, which
# use the `degrees` and `gammas` grids instead; both names are reassigned later.
degree=20
# Number of folds passed to the cross-validation routines.
k_fold=3
gamma=1e-7
# Regularisation grid; the same grid is assigned again in the next cell.
lambdas=np.logspace(-8,-1,num=5)
# Seed forwarded to the cross-validation routine.
seed=1

In [4]:
# Hyperparameter grids for the lasso cross-validation.
lambdas =np.logspace(-8,-1,num=5)
gammas=np.linspace(1e-4,5e-3,num=4)
degrees=np.arange(10,14)
max_iters=200
# NOTE(review): `batch_size` is not passed to the call below.
batch_size=1
# `mat3D` is indexed by gamma along its first axis in the next cell (mat3D[i] for gammas[i]).
mat3D=cross_validation_lasso_demo(y_train,x_train,degrees,k_fold,lambdas,gammas,max_iters,seed)

0.0001
0.00173333333333
0.00336666666667
0.005


In [5]:
# Save one plot per gamma value: mat3D[i] is the 2-D slice associated with gammas[i].
for i in range(len(gammas)):
    plot_cross_validation(lambdas,mat3D[i],degrees,'lasso'+str(i))
# Locate the largest entry of the 3-D grid; indices are unpacked in (gamma, lambda, degree) order.
result,[best_gamma_ind,best_lambda_ind,best_degree_ind]=find_the_maximum_3D(mat3D)

In [None]:
def plot_cross_validation(lambdas,cost_te,degrees,regression_type):
    """
    Draw one curve per lambda against the polynomial degree and save the figure.

    Row `s` of `cost_te` is plotted against `degrees` and labelled in the legend
    with `str(lambdas[s])`. The figure is written to
    'cross_validation <regression_type>.png' in the current working directory.
    """
    plt.figure()
    n_curves = lambdas.size
    for row in range(n_curves):
        plt.plot(degrees, cost_te[row])

    plt.xlabel('degree')
    # NOTE(review): the data argument is called `cost_te` while the axis label
    # says 'train accuracy' -- confirm which of the two is intended.
    plt.ylabel('train accuracy')
    plt.legend([str(lambdas[row]) for row in range(n_curves)])
    plt.savefig('cross_validation ' + regression_type + '.png')

In [None]:
def find_the_maximum_3D(tensor):
    """
    Return the largest value of a 3-D array and the position where it occurs.

    The first axis is collapsed with max/argmax, the 2-D position of the
    maximum is obtained from `find_the_maximum`, and the index along the first
    axis is then read back from the argmax map.

    Returns (max_value, [index_axis0, index_axis1, index_axis2]).
    """
    collapsed = np.max(tensor, axis=0)
    depth_of_max = np.argmax(tensor, axis=0)
    _, (row, col) = find_the_maximum(collapsed)
    depth = depth_of_max[row, col]
    return np.max(tensor), [depth, row, col]

In [6]:
# hyperparameters Lasso
# Prints, in order: best degree, best lambda, best gamma, and the best value found in mat3D.
print(degrees[best_degree_ind],lambdas[best_lambda_ind],gammas[best_gamma_ind],result)

10 1e-08 0.005 0.719990879964


In [None]:
# NOTE(review): `mat` is not assigned anywhere in the visible notebook (the grid
# search result is stored in `mat3D`), so this cell fails on a fresh kernel unless
# one of the star imports provides it -- confirm the intended argument.
plot_cross_validation_lasso(np.log10(lambdas),mat,gammas)

In [None]:
# Rebuild the train and test feature matrices with the degree selected by
# cross-validation, then fit the final lasso model on the whole training set.

# Training set: cleaning -> feature augmentation -> polynomial expansion -> normalisation.
x_train_cleaned,nmc_tr=cleaning_function(x_train,-999)
x_train_cleaned,noaf=features_augmentation(x_train_cleaned,not_augm_features=nmc_tr+1)
phi_train=build_polinomial(x_train_cleaned,degrees[best_degree_ind],not_poly_features=noaf+nmc_tr+1,nm=-999,already_cleaned=True)
phi_train=norm_data(phi_train,not_norm_features=nmc_tr+1,skip_first_col=True)

# Test set: same sequence of calls, with the counts returned for the test data.
x_test_cleaned,nmc_te=cleaning_function(x_test,-999)
x_test_cleaned,noaf=features_augmentation(x_test_cleaned,not_augm_features=nmc_te+1)
phi_test=build_polinomial(x_test_cleaned,degrees[best_degree_ind],not_poly_features=noaf+nmc_te+1,nm=-999,already_cleaned=True)
phi_test=norm_data(phi_test,not_norm_features=nmc_te+1,skip_first_col=True)

# Start the optimisation from the zero vector; previously `initial_w` was only
# defined by a later cell, so this cell failed on a fresh kernel.
initial_w=np.zeros(phi_train.shape[1])
# Use `best_lambda_ind`, the name produced by find_the_maximum_3D above
# (`best_lambda_idx` was never defined).
# NOTE(review): assumes lasso_regression_SGD returns (loss, w) in this order -- confirm.
loss,w=lasso_regression_SGD(y_train, phi_train, lambdas[best_lambda_ind],initial_w,max_iters,gammas[best_gamma_ind])

In [None]:
# Linear scores for the test set, thresholded at 0.5 into the labels 1 / -1
# that are written to the submission file.
y_test = phi_test.dot(w)
y_pred = [1 if score > 0.5 else -1 for score in y_test]

create_csv_submission(ids_test, y_pred, 'submission_lasso_sgd.csv')

## Try ridge


In [23]:
from AIC import *

In [24]:
# Reload the raw train/test data (same call as at the top of the notebook) so the
# ridge section starts from unmodified arrays.
x_train,y_train,x_test,ids_test=load_data('train.csv','test.csv')

In [46]:
# Grid search over polynomial degree and lambda for ridge regression.
seed = 1
degrees = np.arange(5,16)
k_fold = 4
# To use ridge regression
lambdas = np.logspace(-8,-1,num=5)
print(lambdas.size)
# cost_te has one row per lambda and one column per degree (see cross_validation_demo).
cost_te=cross_validation_demo(y_train,x_train,degrees,k_fold,lambdas,seed)
plot_cross_validation(lambdas,cost_te,degrees,'ridge')
# best_param_ind[0] indexes `lambdas`, best_param_ind[1] indexes `degrees`.
result_ridge,best_param_ind=find_the_maximum(cost_te)
print('best degree is '+str(degrees[best_param_ind[1]]))
print('best lambda is '+str(lambdas[best_param_ind[0]]))

5
1e-08


KeyboardInterrupt: 

In [None]:
# Hyperparameters selected by the ridge cross-validation cell.
best_degree = degrees[best_param_ind[1]]
best_lambda = lambdas[best_param_ind[0]]

# Training features: cleaning -> augmentation -> normalisation -> polynomial expansion.
x_train_cleaned, nmc_tr = cleaning_function(x_train, -999)
x_train_cleaned, noac_tr = features_augmentation(x_train_cleaned, not_augm_features=nmc_tr + 1)
x_train_cleaned = norm_data(x_train_cleaned, not_norm_features=nmc_tr + 1)
phi_tr = build_polinomial(x_train_cleaned, degree=best_degree, not_poly_features=nmc_tr + 1 + noac_tr)
w, loss = ridge_regression(y_train, phi_tr, best_lambda)

# Test features go through the same sequence of calls.
x_test_cleaned, nmc_te = cleaning_function(x_test, -999)
x_test_cleaned, noac_te = features_augmentation(x_test_cleaned, not_augm_features=nmc_te + 1)
x_test_cleaned = norm_data(x_test_cleaned, not_norm_features=nmc_te + 1)
phi_te = build_polinomial(x_test_cleaned, degree=best_degree, not_poly_features=nmc_te + 1 + noac_te)

# Threshold the linear scores at 0.5 into the labels 1 / -1 and write the submission.
y_test = phi_te.dot(w)
y_pred = [1 if score > 0.5 else -1 for score in y_test]

create_csv_submission(ids_test, y_pred, 'submission.csv')

In [44]:
def super_features_augmentation(x,y,lambda_=0,not_super_features=0,is_train=True,augmentation=True,skip_first_column=False):
    """
    Insert log / square-root transformed copies of a block of columns of `x`.

    The block is x[:, d:x.shape[1]-not_super_features], where d is 1 when
    `skip_first_column` is set and 0 otherwise. Each column of the block is
    transformed on its absolute value: sqrt for columns containing at least one
    zero, log for the others. The sqrt columns come first, then the log columns.

    Parameters:
        x: 2-D array of features.
        y: when `is_train` is True, the labels handed to `compare_aic_ridge`;
           when `is_train` is False, the column selection to reuse (it is applied
           directly as the index of the transformed columns).
        lambda_: forwarded to `compare_aic_ridge`.
        not_super_features: number of trailing columns left out of the block.
        is_train: select columns with `compare_aic_ridge` (True) or reuse `y` (False).
        augmentation: pass the transformed columns through `features_augmentation`.
        skip_first_column: leave the first column out of the block.

    Returns:
        (x_new, important_col) where x_new is the input with the selected
        transformed columns inserted before the trailing `not_super_features`
        columns, and important_col is the selection that was applied.
    """
    first = 1 if skip_first_column else 0
    last = x.shape[1] - not_super_features
    x_to_augm = x[:, first:last]

    abs_vals = np.absolute(x_to_augm)
    # A column can take the log only if none of its entries is zero.
    has_no_zero = np.min(abs_vals, axis=0) != 0
    rad_col = np.sqrt(abs_vals[:, ~has_no_zero])
    log_col = np.log(abs_vals[:, has_no_zero])
    # Concatenating with an empty (n, 0) array is a no-op, so no special cases are needed.
    rad_log_col = np.concatenate((rad_col, log_col), axis=1)

    if augmentation:
        # Every other call site in this notebook unpacks two values from
        # features_augmentation; keep only the feature matrix here.
        # NOTE(review): relies on not_augm_features having a default -- confirm.
        rad_log_col, _ = features_augmentation(rad_log_col)

    if is_train:
        # Use the labels passed in, not the notebook-level `y_train`.
        important_col = compare_aic_ridge(y, rad_log_col, lambda_)
    else:
        important_col = y
    rad_log_col = rad_log_col[:, important_col]

    # x[:, :first] and x[:, last:] are empty when nothing is skipped on that side.
    x = np.concatenate((x[:, :first], x_to_augm, rad_log_col, x[:, last:]), axis=1)
    return x, important_col
        
        
    

In [45]:
def cross_validation_ridge(y, phi, k_indices, k, lambda_, degree, not_poly_features):
    """
    Run one step of k-fold cross-validation for ridge / least-squares regression.

    Row `k` of `k_indices` is held out for evaluation and all the other rows are
    used for fitting. Ridge regression is used when `lambda_` is non-zero, plain
    least squares otherwise. Returns the proportion of held-out samples whose
    thresholded prediction (score > 0.5) equals the label.
    """
    # Split the samples: fold k is evaluated, the remaining folds are fitted.
    held_out = k_indices[k]
    fitted = np.delete(k_indices, k, 0).reshape(-1)
    x_eval, y_eval = phi[held_out, :], y[held_out]
    x_fit, y_fit = phi[fitted, :], y[fitted]

    # Polynomial expansion of both parts with the requested degree.
    tx_fit = build_polinomial(x_fit, degree, not_poly_features)
    tx_eval = build_polinomial(x_eval, degree, not_poly_features)

    # lambda_ == 0 means no regularisation, i.e. ordinary least squares.
    if lambda_ == 0:
        w, _ = least_squares(y_fit, tx_fit)
    else:
        w, _ = ridge_regression(y_fit, tx_fit, lambda_)

    # Fraction of held-out samples classified correctly.
    n_correct = (y_eval == (tx_eval.dot(w) > 0.5)).sum()
    return n_correct / y_eval.shape[0]

def cross_validation_demo(y_train,x_train,degrees,k_fold,lambdas,seed):
    """
    Performs cross-validation with ridge regression.
    Returns a matrix which stores the proportion of correct classifications where:
        rows: lambda
        columns: degree of polynomial of the features.

    For every non-zero lambda the features are first extended with
    `super_features_augmentation`. For lambda == 0 that selection step is
    skipped and the cleaned features are used directly; previously this case
    left `x_train_agm`, `noaf` and `super_col_nb` undefined (or stale from the
    previous lambda).
    """

    # Split data in k fold
    k_indices = build_k_indices(y_train, k_fold, seed)
    # Clean data
    x_train_cleaned, nmc_tr = cleaning_function(x_train, -999)
    # Cross validation steps
    cost_te = np.zeros((lambdas.size, degrees.size))
    for ind_lamb, lambda_ in enumerate(lambdas):
        # Progress trace: one line per lambda.
        print(lambda_)
        if lambda_ != 0:
            x_train_agm, super_col = super_features_augmentation(
                x_train_cleaned, y_train, lambda_,
                not_super_features=nmc_tr+1, is_train=True, augmentation=False)
            super_col_nb = len(super_col)
        else:
            # No regularisation: no extra selected columns are added.
            x_train_agm = x_train_cleaned
            super_col_nb = 0
        x_train_agm, noaf = features_augmentation(x_train_agm, not_augm_features=nmc_tr+1)
        x_train_agm = norm_data(x_train_agm, not_norm_features=nmc_tr+1)

        # Columns excluded from the polynomial expansion for this lambda.
        not_poly = nmc_tr + 1 + noaf + super_col_nb
        for ind_deg, degree_ in enumerate(degrees):
            loss_te = np.zeros(k_fold)
            for k in range(k_fold):
                loss_te[k] = cross_validation_ridge(
                    y_train, x_train_agm, k_indices, k, lambda_, degree_, not_poly)
            # Average the per-fold proportions of correct classifications.
            cost_te[ind_lamb, ind_deg] = loss_te.mean()
    return cost_te

In [None]:
# test logistic
# Fits logistic regression on the cleaned features only; the augmentation,
# polynomial expansion and normalisation steps are disabled (commented out below).
# NOTE(review): `degree` and `batch_size` are only referenced by the disabled lines
# or not at all in this cell.
degree=13
x_train_cleaned,nmc_tr=cleaning_function(x_train,-999)
#x_train_cleaned,noaf=features_augmentation(x_train_cleaned,not_augm_features=nmc_tr+1)
#print(x_train_cleaned.shape)
#phi_train=build_polinomial(x_train_cleaned,degree,not_poly_features=noaf+nmc_tr+1,nm=-999,already_cleaned=True)
#print(phi_train.shape)
#phi_train=norm_data(phi_train,not_norm_features=nmc_tr+1,skip_first_col=True)
#print(phi_train.shape)
phi_train=x_train_cleaned
# Start from the zero weight vector, one weight per feature column.
initial_w=np.zeros(phi_train.shape[1])
batch_size=1
max_iters=100
gamma=1e-4
w,loss=logistic_regression(y_train, phi_train, initial_w, max_iters, gamma)
# Proportion of training samples whose thresholded linear score equals the label.
# NOTE(review): the threshold 0.5 is applied to the raw score phi_train.dot(w), not
# to a sigmoid of it -- confirm this is the intended decision rule.
result=((phi_train.dot(w)[:]>0.5)[:]==y_train[:]).sum()/len(y_train)

In [None]:
# Number of weights that are exactly zero, followed by the training accuracy computed above.
print((w[:]==0).sum(),result)
# Full weight vector.
print(w)

# 81% DO NOT TOUCH

In [None]:
# Reload the data and repeat the ridge grid search over degree and lambda
# (same parameters as the earlier ridge cross-validation cell).
x_train,y_train,x_test,ids_test=load_data('train.csv','test.csv')
seed = 1
degrees = np.arange(5,16)
k_fold = 4
# To use ridge regression
lambdas = np.logspace(-8,-1,num=5)
print(lambdas.size)
# cost_te has one row per lambda and one column per degree (see cross_validation_demo).
cost_te=cross_validation_demo(y_train,x_train,degrees,k_fold,lambdas,seed)
plot_cross_validation(lambdas,cost_te,degrees,'ridge')
# best_param_ind[0] indexes `lambdas`, best_param_ind[1] indexes `degrees`.
_,best_param_ind=find_the_maximum(cost_te)
print('best degree is '+str(degrees[best_param_ind[1]]))
print('best lambda is '+str(lambdas[best_param_ind[0]]))


In [None]:
# Smallest and largest entries of whichever weight vector `w` was assigned last.
print(np.min(w),np.max(w))

In [None]:
# continuation of the previous script
# Refit ridge regression with the selected degree / lambda and write the submission.
best_degree = degrees[best_param_ind[1]]
best_lambda = lambdas[best_param_ind[0]]

# Training features: cleaning -> augmentation -> normalisation -> polynomial expansion.
x_train_cleaned,nmc_tr=cleaning_function(x_train,-999)
x_train_cleaned,noac_tr=features_augmentation(x_train_cleaned,not_augm_features=nmc_tr+1)
x_train_cleaned=norm_data(x_train_cleaned,not_norm_features=nmc_tr+1)
phi_tr=build_polinomial(x_train_cleaned,degree=best_degree,not_poly_features=nmc_tr+1+noac_tr)
# ridge_regression is unpacked as (w, loss) everywhere else in this notebook;
# the previous `loss,w=` here put the loss into `w` and broke the prediction below.
w,loss=ridge_regression(y_train,phi_tr,best_lambda)

# Test features go through the same sequence of calls.
x_test_cleaned,nmc_te=cleaning_function(x_test,-999)
x_test_cleaned,noac_te=features_augmentation(x_test_cleaned,not_augm_features=nmc_te+1)
x_test_cleaned=norm_data(x_test_cleaned,not_norm_features=nmc_te+1)
phi_te=build_polinomial(x_test_cleaned,degree=best_degree,not_poly_features=nmc_te+1+noac_te)

# Threshold the linear scores at 0.5 into the labels 1 / -1.
y_test=phi_te.dot(w)
y_pred=[1 if score>0.5 else -1 for score in y_test]

create_csv_submission(ids_test, y_pred, 'submission.csv')

In [None]:
def create_csv_submission(ids, y_pred, name):
    """
    Creates an output file in csv format for submission to kaggle
    Arguments: ids (event ids associated with each prediction)
               y_pred (predicted class labels)
               name (string name of .csv output file to be created)

    The file has the header 'Id,Prediction' followed by one row per
    (id, prediction) pair; both values are converted to int. If `ids` and
    `y_pred` differ in length, the extra entries of the longer one are ignored.
    """
    # newline='' lets the csv module control line endings itself; without it an
    # extra blank line is written after every row on Windows.
    with open(name, 'w', newline='') as csvfile:
        fieldnames = ['Id', 'Prediction']
        writer = csv.DictWriter(csvfile, delimiter=",", fieldnames=fieldnames)
        writer.writeheader()
        for r1, r2 in zip(ids, y_pred):
            writer.writerow({'Id':int(r1),'Prediction':int(r2)})

In [None]:
#logistic_regression_penalized_gradient_descent_demo(y, x)

In [None]:
# Reload the data and build the degree-1 design matrix used for the AIC experiment below.
x_train,y_train,x_test,ids_test=load_data('train.csv','test.csv')
# NOTE(review): other cells pass -999 as a second argument to cleaning_function;
# here it is omitted -- presumably a default exists, confirm.
x_train_cleaned,aa=cleaning_function(x_train)
tx=build_polinomial(x_train_cleaned,degree=1,not_poly_features=0)
# Display the shape of the design matrix.
tx.shape

In [None]:
# Run the forward variable selection on the degree-1 design matrix; the returned
# list of selected column indices is displayed as the cell output.
compare_aic(y_train,tx)

In [None]:
# Scratch check: append grows the list in place and indexing is zero-based.
a = [1, 2, 3]
a.append(4)
print(a)
a[1]

In [None]:
# Scratch check: extending a copy of a list leaves the original untouched.
a = [0]

temp = a + [2]
temp

In [None]:
def compare_aic(y,tx):
    """
    Greedy forward selection of the columns of `tx`.

    Starting from an empty model, each round fits one candidate model per
    remaining column (current selection plus that column) with
    `logistic_regression_gradient_descent_demo` and keeps the column giving the
    smallest loss. After all columns have been added, the model size with the
    smallest recorded loss is returned.

    NOTE(review): despite the name, no AIC penalty is applied -- the losses are
    compared directly. Assumes the fitting routine returns (loss, w) in this
    order, as the original unpacking did -- confirm.

    Arguments: y (labels), tx (2-D design matrix)
    Returns: list of selected column indices, in the order they were added
             (empty when `tx` has no columns).
    """
    dimx = tx.shape[1]
    if dimx == 0:
        return []

    best_loss = []              # best_loss[i]: loss of the best model with i+1 variables
    models = []                 # selected columns, in selection order
    variables = list(range(dimx))  # columns still available

    for _ in range(dimx):
        # Evaluate only the remaining columns, so an already selected column
        # can never be chosen again through a stale loss value.
        candidate_loss = {}
        for m in variables:
            candidate_loss[m], _w = logistic_regression_gradient_descent_demo(y, tx[:, models + [m]])
        b = min(candidate_loss, key=candidate_loss.get)
        models.append(b)
        variables.remove(b)
        best_loss.append(candidate_loss[b])

    # best_loss[i] belongs to the first i+1 selected columns, hence the +1.
    idx_loss = int(np.argmin(best_loss))
    return models[:idx_loss + 1]

In [None]:
def sign(x):
    """
    Element-wise sign of an array: 1 for positive entries, -1 for negative
    entries and 0 for everything else, returned as an integer array.
    """
    is_positive = x[:] > 0
    is_negative = x[:] < 0
    return is_positive.astype(int) - is_negative.astype(int)



In [None]:
# Scratch check: argmax along axis 0 returns, for each (row, col) position, the
# index along the first axis where the maximum occurs.
a = np.zeros((3, 2, 2))
a[2, 1, 1] = 1
print(a.argmax(axis=0))

In [None]:
# Show the documentation of np.argmax. help() is plain Python, so the cell also
# works when the notebook is exported to a script (the `?` prefix is IPython-only).
help(np.argmax)

In [17]:
# Scratch check: number of elements of a 1-D array.
a = np.arange(4)
print(a.shape[0])

4
