In [2]:
# Useful starting lines
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2
from IPython import display
# Import everything in the functions folder
from functions.costs import *
from functions.helpers import *
from functions.split import *
from functions.ridge_regression import *
from functions.helpers import *
from functions.least_squares_GD import *
from functions.clean_data import *
import pickle
from os.path import basename, splitext

In [3]:
# Folder holding the per-jet CSV splits.
DATA_FOLDER = 'data'

# One file per (jet number, mass flag) pair. For every jet number 0..3 the
# "wout_mass" file comes first and the "with_mass" file second, so that
# TRAINING_DATA[k] and TEST_DATA[k] always describe the same subset.
TRAINING_DATA = ['train_jet_%i_%s_mass.csv' % (jet, mass)
                 for jet in range(4)
                 for mass in ('wout', 'with')]

TEST_DATA = ['test_jet_%i_%s_mass.csv' % (jet, mass)
             for jet in range(4)
             for mass in ('wout', 'with')]

In [9]:
# PCA feature reduction.
#
# For every (train, test) file pair: fit a PCA on the standardized training
# features, keep the leading components until their cumulative explained
# variance exceeds `threshold` percent, project both splits onto those
# components and write the results next to the inputs as "<name>_pca.csv".

# Percentage of the total variance the retained components must exceed.
threshold = 90

# The two lists are consumed pairwise, so they must line up one-to-one.
assert len(TRAINING_DATA) == len(TEST_DATA), \
    "TRAINING_DATA and TEST_DATA must contain the same number of files"

for train_file, test_file in zip(TRAINING_DATA, TEST_DATA):
    base_name_train = splitext(basename(train_file))[0]
    base_name_test = splitext(basename(test_file))[0]
    print("PCA for file %s"%base_name_train)

    # Load the training data
    y_train, x_train, ids_train = load_csv_data(DATA_FOLDER + '/' + train_file)
    # Standardize the matrix once; the same array is reused for the projection
    # below instead of calling std(x_train) a second time.
    x = std(x_train)

    # Covariance matrix of the features. np.cov expects one variable per row,
    # hence the transpose; atleast_2d keeps the single-feature case a matrix.
    cov_mat = np.atleast_2d(np.cov(x.T))

    # A covariance matrix is symmetric, so use eigh: it returns real
    # eigenvalues and orthonormal eigenvectors, whereas the general-purpose
    # eig may produce complex output from rounding noise.
    eig_vals, eig_vecs = np.linalg.eigh(cov_mat)

    # Sanity check: eigenvectors are the COLUMNS of eig_vecs and each must
    # have unit length.
    np.testing.assert_array_almost_equal(
        np.linalg.norm(eig_vecs, axis=0), np.ones(len(eig_vals)), decimal=3)
    print('Everything ok!')

    # eigh returns eigenvalues in ascending order; reorder values and vectors
    # together, from largest to smallest, so both share one ordering.
    order = np.argsort(eig_vals)[::-1]
    eig_vals = eig_vals[order]
    eig_vecs = eig_vecs[:, order]

    # Percentage of variance explained by each component, and its running sum
    var_exp = eig_vals / eig_vals.sum() * 100
    cum_var_exp = np.cumsum(var_exp)

    # Keep components up to and including the first one that pushes the
    # cumulative explained variance above the threshold (all of them if the
    # threshold is never exceeded).
    above = np.nonzero(cum_var_exp > threshold)[0]
    n_components = above[0] + 1 if above.size else len(eig_vals)

    # Projection matrix: one column per retained eigenvector
    matrix_w = eig_vecs[:, :n_components]
    print("Reduced from %i features to %i features."%(matrix_w.shape[0], matrix_w.shape[1]))

    # Recreate the headers
    headers = ['Id', 'Prediction'] + ['Var %i'%k for k in range(n_components)]

    # Projected variables for training
    proj_train = x.dot(matrix_w)
    # Write the new train file
    write_data(DATA_FOLDER + '/' + base_name_train + '_pca.csv', y_train, proj_train,
               ids_train, headers, 'train')

    # Load the test data (the label column is only passed through to write_data)
    y_test, x_test, ids_test = load_csv_data(DATA_FOLDER + '/' + test_file)
    # Projected variables for test
    # NOTE(review): std(x_test) presumably standardizes the test set with its
    # own statistics rather than the training ones; confirm this is intended.
    proj_test = std(x_test).dot(matrix_w)
    # Write the new test file
    write_data(DATA_FOLDER + '/' + base_name_test + '_pca.csv', y_test, proj_test,
               ids_test, headers, 'test')
    print('------------------------')
print('PCA finished')

PCA for file train_jet_0_wout_mass
Everything ok!
Reduced from 17 features to 9 features.
------------------------
PCA for file train_jet_0_with_mass
Everything ok!
Reduced from 18 features to 10 features.
------------------------
PCA for file train_jet_1_wout_mass
Everything ok!
Reduced from 21 features to 13 features.
------------------------
PCA for file train_jet_1_with_mass
Everything ok!
Reduced from 22 features to 12 features.
------------------------
PCA for file train_jet_2_wout_mass
Everything ok!
Reduced from 28 features to 17 features.
------------------------
PCA for file train_jet_2_with_mass
Everything ok!
Reduced from 29 features to 16 features.
------------------------
PCA for file train_jet_3_wout_mass
Everything ok!
Reduced from 28 features to 17 features.
------------------------
PCA for file train_jet_3_with_mass
Everything ok!
Reduced from 29 features to 16 features.
------------------------
PCA finished
