In [1]:
import numpy as np
import matplotlib.pyplot as plt
from implementations import *
import os
from helpers import *
from preprocessing import *
import pickle

In [2]:
#x_train, x_test, y_train, train_ids, test_ids=load_csv_data("data")

In [3]:
DATA_FOLDER = 'data/'

In [4]:
# Load the arrays from the pickle cache when it exists; otherwise parse the CSV
# files once and write the cache for the next run.
# NOTE(security): pickle.load can execute arbitrary code. Only load cache files
# that this notebook produced itself.
# NOTE: train_ids is only defined on the CSV path below; it is not cached.
try:
    with open(DATA_FOLDER + 'x_train.pickle', 'rb') as f:
        x_train = pickle.load(f)
    with open(DATA_FOLDER + 'x_test.pickle', 'rb') as f:
        x_test = pickle.load(f)
    with open(DATA_FOLDER + 'y_train.pickle', 'rb') as f:
        y_train = pickle.load(f)
    with open(DATA_FOLDER + 'test_ids.pickle', 'rb') as f:
        test_ids = pickle.load(f)
    with open(DATA_FOLDER + 'names_map.pickle', 'rb') as f:
        names_map = pickle.load(f)
except (OSError, EOFError, pickle.UnpicklingError):
    # Cache missing, unreadable, or truncated: rebuild everything from the CSVs.
    # Any other exception (typo, interrupt, ...) is deliberately not swallowed.
    x_train, x_test, y_train, train_ids, test_ids = load_csv_data(DATA_FOLDER, sub_sample=False)

    # Header row of x_train.csv, with its first entry dropped, gives the
    # feature names; map each name to its position.
    names = np.genfromtxt(DATA_FOLDER + 'x_train.csv', delimiter=",", dtype=str, max_rows=1)
    names = np.delete(names, 0)
    names_map = {name: idx for idx, name in enumerate(names)}

    with open(DATA_FOLDER + 'x_train.pickle', 'wb') as f:
        pickle.dump(x_train, f)

    with open(DATA_FOLDER + 'x_test.pickle', 'wb') as f:
        pickle.dump(x_test, f)

    with open(DATA_FOLDER + 'y_train.pickle', 'wb') as f:
        pickle.dump(y_train, f)

    with open(DATA_FOLDER + 'test_ids.pickle', 'wb') as f:
        pickle.dump(test_ids, f)

    with open(DATA_FOLDER + 'names_map.pickle', 'wb') as f:
        pickle.dump(names_map, f)

## Preprocessing and Scalings

In [5]:
DATA_FOLDER = 'data/'

In [6]:
# Read only the header row of x_train.csv and drop its first entry, leaving
# the feature names; then map every name to its position.
names = np.delete(
    np.genfromtxt(DATA_FOLDER + 'x_train.csv', delimiter=",", dtype=str, max_rows=1),
    0,
)
names_map = {}
for i, column_name in enumerate(names):
    names_map[column_name] = i

In [7]:
# Recode the -1 labels as 0 and leave every other value untouched.
is_negative_label = (y_train == -1)
y_train = np.where(is_negative_label, 0, y_train)

# Work on copies so the loaded arrays stay intact.
x_tr, y_tr = x_train.copy(), y_train.copy()


In [8]:
# Clean the training set. Besides the cleaned features and labels, this returns
# the mean and median dictionaries (reused to clean other sets, e.g. the test
# set) and the feature names selected further down.
x_tr_clean, y_tr_clean, mean_dico, median_dico, intersting_features = clean_data(
    names_map,
    x_tr,
    y_tr,
    is_y=True,
    is_train_data=True,
)

# The test set has no labels, hence is_y=False; it is cleaned with the
# dictionaries computed on the training set.
x_test_clean, _, _, _, _ = clean_data(
    names_map,
    x_test,
    y_raw=None,
    is_y=False,
    is_train_data=False,
    mean_dico=mean_dico,
    median_dico=median_dico,
)

# Scale the training set and keep the mean and std that were used.
x_tr_scaled, train_mean, train_std = scale_data(x_tr_clean, is_train_data=True)

# Scale the test set with the training mean and std.
x_test_scaled, _, _ = scale_data(
    x_test_clean,
    is_train_data=False,
    train_mean=train_mean,
    train_std=train_std,
)

In [9]:
x_tr_scaled.shape

(328135, 321)

## Feature selection

We chose some initial features by reading the report and assessing how they seemed relevant for our predictions.

In [10]:
# Look up the position of every selected feature name in names_map.
intersting_features_indexes = np.array(
    [names_map[feature_name] for feature_name in intersting_features]
)

In [14]:
# Restrict both scaled splits to the selected feature columns.
X_train, X_test = (
    split[:, intersting_features_indexes]
    for split in (x_tr_scaled, x_test_scaled)
)

In [15]:
# Sanity check: distinct column indices of X_train that contain at least one
# NaN. An empty array means no NaN is left after cleaning and scaling.
np.unique(np.where(np.isnan(X_train))[1])

array([], dtype=int64)

In [16]:
intersting_features_indexes.shape

(86,)

#### By correlation with the output

In [17]:
# import seaborn  as sns

# plt.figure(figsize=(10,6))
# sns.heatmap(correlations,annot=False)

In [24]:
selected_features_names=intersting_features[:18]

In [21]:
X_test.shape

(109379, 86)

In [22]:
# Keep only the first 18 columns of both splits (same count as the slice used
# for selected_features_names).
leading_columns = np.arange(18)
X_train = X_train[:, leading_columns]
X_test = X_test[:, leading_columns]

#### Build polynomial expansion and interaction features

In [23]:
X_train.shape

(328135, 18)

In [25]:
# Expand both splits with the same settings (max_degree=3). Each call also
# returns the names of the generated features; the second call overwrites
# features_names with the result for the test split.
X_train_poly, features_names = build_poly_expension_with_interaction_features(
    X_train,
    features_names=selected_features_names,
    max_degree=3,
)
X_test_poly, features_names = build_poly_expension_with_interaction_features(
    X_test,
    features_names=selected_features_names,
    max_degree=3,
)

In [26]:
X_train_poly.shape

(328135, 207)

In [27]:
# Sanity check: (row indices, column indices) of every NaN entry in
# X_train_poly. Two empty arrays mean the expansion introduced no NaN.
np.where(np.isnan(X_train_poly))

(array([], dtype=int64), array([], dtype=int64))

In [28]:
# Scale the expanded training features and keep the mean and std that were
# used, then scale the expanded test features with those training statistics.
# NOTE(review): both names are overwritten in place, so re-running this cell
# rescales data that is already scaled.
X_train_poly, train_mean_poly, train_std_poly = scale_data(
    X_train_poly,
    is_train_data=True,
)
X_test_poly, _, _ = scale_data(
    X_test_poly,
    is_train_data=False,
    train_mean=train_mean_poly,
    train_std=train_std_poly,
)

In [None]:
X_train_poly.shape

(328135, 207)

#### Selecting features again by their correlation with the outcome y

In [None]:
# Pearson correlation between every expanded feature (column) and the label.
# - np.hstack takes ONE sequence of arrays, so the two arrays go in a tuple.
# - rowvar=False makes corrcoef treat columns as variables; the default would
#   correlate the samples (rows) with each other instead.
# - The label is stacked as the last column, so the last column of the
#   correlation matrix, minus the label's self-correlation, holds one
#   coefficient per feature.
np.corrcoef(
    np.hstack((X_train_poly, y_train.reshape(-1, 1))),
    rowvar=False,
)[:-1, -1]

#### Ridge regression

In [29]:
X_train_poly

array([[-7.89478099e-01, -1.26864627e-01, -2.77479888e-01, ...,
        -3.06623340e-03, -5.62366465e-03,  1.51645339e-02],
       [-1.01629713e-15, -3.36757191e-01, -2.33028204e-01, ...,
        -1.22971865e-02, -1.49412452e-02, -2.40876265e-02],
       [-7.89478099e-01, -1.26864627e-01, -2.77479888e-01, ...,
         1.87156946e-02,  1.40772764e-02,  1.20019247e-02],
       ...,
       [-7.89478099e-01, -1.26864627e-01, -2.77479888e-01, ...,
        -1.20076300e-02, -1.47744741e-02,  5.38062173e-02],
       [-1.01629713e-15, -3.36757191e-01, -2.33028204e-01, ...,
         3.64296914e-02,  1.01755335e-02,  3.15014222e-02],
       [-4.87532670e-01, -2.56714012e-01, -2.43496576e-01, ...,
         2.86653734e-01,  3.12581216e-01,  9.52659973e-01]])

In [31]:
# Baseline: ridge fit with lambda_=0.01, scored at a fixed threshold of 0.3.
# NOTE(review): every score in this cell is computed on the same data the
# model was fitted on (X_train_poly / y_tr_clean); no held-out split is used.
w_opt, loss = ridge_regression(y_tr_clean, X_train_poly, lambda_=0.01)
print(compute_scores_linear_model(X_train_poly, w_opt, y=y_tr_clean, threshold=0.3, apply_sigmoid=False))

# Sweep the decision threshold for several regularisation strengths and draw
# one curve per lambda value.
thr_l = np.arange(-0.5, 0.7, 0.02)
for lambda_ in [0.001, 0.01, 0.1, 0.3]:
    w_opt, loss = ridge_regression(y_tr_clean, X_train_poly, lambda_=lambda_)

    # Element [2] of the returned scores is the value plotted as the f1-score.
    f1_scores = [
        compute_scores_linear_model(X_train_poly, w_opt, y_tr_clean, threshold=t, apply_sigmoid=False)[2]
        for t in thr_l
    ]
    plt.plot(thr_l, f1_scores, marker='o', label=str(lambda_))

# Axis labels, legend and title do not depend on lambda, so set them once
# after the loop instead of on every iteration.
plt.xlabel("threshold")
plt.ylabel("f1-score")
plt.legend(title="$\\lambda$ value")
plt.title("Ridge regression results")
plt.show()

(0.3535980148883375, 0.009836065573770493, 0.01913971995567644)
