In [4]:
import sys
import os
from helpers import load_csv_data, create_csv_submission
from run_helpers import load_useless_features_file
from implementations import least_squares, reg_logistic_regression, ridge_regression, logistic_regression
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score

In [5]:
# Project root is the directory the notebook kernel was started from.
ROOT_DIR = os.path.abspath(os.curdir)
# Build the data path with os.path.join (as done for the useless-features
# file) instead of concatenating a hard-coded "/" separator.
DATA_DIR = os.path.join(ROOT_DIR, "data")

In [6]:
# Load everything once from DATA_DIR; the *_initial names are kept untouched
# so later cells can work on copies.
(
    x_train_initial,
    x_test_initial,
    y_train_initial,
    train_ids_initial,
    test_ids_initial,
    column_names_initial,
) = load_csv_data(DATA_DIR)

In [7]:
# Work on copies so the freshly loaded *_initial objects stay unchanged.
x_train = x_train_initial.copy()
x_test = x_test_initial.copy()
y_train = y_train_initial.copy()
train_ids = train_ids_initial.copy()
test_ids = test_ids_initial.copy()
column_names = column_names_initial.copy()

In [8]:
# Display the working copy of the training features before any cleaning.
x_train

array([[5.3000000e+01, 1.1000000e+01, 1.1162015e+07, ...,           nan,
                  nan, 2.0000000e+00],
       [3.3000000e+01, 1.2000000e+01, 1.2152015e+07, ...,           nan,
                  nan,           nan],
       [2.0000000e+01, 1.0000000e+01, 1.0202015e+07, ..., 1.0000000e+00,
        2.0000000e+00, 2.0000000e+00],
       ...,
       [3.9000000e+01, 1.0000000e+01, 1.0202015e+07, ..., 2.0000000e+00,
        2.0000000e+00, 2.0000000e+00],
       [3.3000000e+01, 1.2000000e+01, 1.2302015e+07, ...,           nan,
                  nan, 2.0000000e+00],
       [3.2000000e+01, 9.0000000e+00, 9.1220150e+06, ...,           nan,
                  nan, 2.0000000e+00]])

# Data cleaning

## Removing useless features

In [9]:
# Names of the features to discard are read from a CSV file in the project root.
useless_features_path = os.path.join(ROOT_DIR, "useless_features_names.csv")
useless_features_names = load_useless_features_file(useless_features_path)

In [10]:
# Positions of the columns whose name appears in the useless-features list.
# np.isin replaces np.in1d, which is deprecated since NumPy 2.0.
useless_columns_indices = np.flatnonzero(np.isin(column_names, useless_features_names))

In [11]:
# Drop the same column positions from both feature matrices and from the
# list of column names so that all three stay aligned.
x_train = np.delete(arr=x_train, obj=useless_columns_indices, axis=1)
x_test = np.delete(arr=x_test, obj=useless_columns_indices, axis=1)
column_names = np.delete(arr=column_names, obj=useless_columns_indices)

## Removing outliers, replacing NaNs and normalizing data

In [12]:
def clean_outliers(data, reference=None):
    """
    Replace outliers, column by column, with the median of the non-outlier values.

    A value is an outlier when it lies outside the fences
    [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR], where Q1 and Q3 are the 25th and 75th
    percentiles (NaNs ignored) and IQR = Q3 - Q1. NaN entries are never
    flagged and are left untouched.

    Args:
        data: 2-D array (samples x features) to clean. It is not modified.
        reference: optional 2-D array with the same number of columns. When
            given, the fences and the replacement median are computed from
            `reference` (e.g. the training set) and applied to `data`.
            Defaults to None, meaning the statistics come from `data` itself.

    Returns:
        A copy of `data` with the outliers replaced.

    Raises:
        ValueError: if `reference` does not have as many columns as `data`.
    """
    if reference is not None and reference.shape[1] != data.shape[1]:
        raise ValueError("reference must have the same number of columns as data")

    cleaned_data = np.copy(data)

    # Columns are handled one at a time to avoid full-size temporaries.
    for i in range(data.shape[1]):
        feature = data[:, i]
        stats_feature = feature if reference is None else reference[:, i]

        # Quartiles and inter-quartile range of the column providing the statistics
        q1, q3 = np.nanpercentile(stats_feature, [25, 75])
        iqr = q3 - q1
        lower_fence = q1 - 1.5 * iqr
        upper_fence = q3 + 1.5 * iqr

        # Median of the statistics column once its own outliers are set aside
        stats_outliers = (stats_feature < lower_fence) | (stats_feature > upper_fence)
        median_without_outliers = np.nanmedian(stats_feature[~stats_outliers])

        # Comparisons with NaN are False, so missing values are not replaced here
        outlier_mask = (feature < lower_fence) | (feature > upper_fence)
        cleaned_data[outlier_mask, i] = median_without_outliers

    return cleaned_data


In [13]:
# Outlier replacement is applied to each matrix separately.
x_train = clean_outliers(data=x_train)
x_test = clean_outliers(data=x_test)

In [14]:
# Per-column means that ignore NaN entries, for the train and test matrices.
x_train_averages, x_test_averages = (
    np.nanmean(matrix, axis=0) for matrix in (x_train, x_test)
)

In [15]:
# Replace NaN values of the training set with the training mean of each feature
for i in range(x_train.shape[1]):
    x_train[np.isnan(x_train[:, i]), i] = x_train_averages[i]

# Replace NaN values of the test set with the *training* mean of each feature:
# the test set must go through the same transformation as the data the models
# are fitted on, not one derived from its own statistics.
for i in range(x_test.shape[1]):
    x_test[np.isnan(x_test[:, i]), i] = x_train_averages[i]


In [16]:
# Drop every feature that is constant in the training set or in the test set
# (standard deviation of 0), as such columns cannot be z-score normalised.
# Columns whose standard deviation is not finite (e.g. still all-NaN) are
# dropped too, since they would propagate NaN into every model.
x_train_std_dev = np.std(x_train, axis=0)
x_test_std_dev = np.std(x_test, axis=0)
unusable_columns = (
    (x_train_std_dev == 0)
    | (x_test_std_dev == 0)
    | ~np.isfinite(x_train_std_dev)
    | ~np.isfinite(x_test_std_dev)
)
keep_columns = ~unusable_columns
x_train = x_train[:, keep_columns]
x_test = x_test[:, keep_columns]
# Keep the column names aligned with the remaining feature columns.
column_names = np.asarray(column_names)[keep_columns]

In [17]:
# Z-score normalization
def z_score_normalization(data, mean_vals=None, std_dev=None):
    """
    Standardise each column of `data` as (value - mean) / standard deviation.

    Args:
        data: 2-D array (samples x features).
        mean_vals: optional per-column means to subtract. Defaults to the
            column means of `data`.
        std_dev: optional per-column standard deviations to divide by.
            Defaults to the column standard deviations of `data`.
            Passing the training-set statistics lets another set be
            normalised with exactly the same transformation.

    Returns:
        A new array with the same shape as `data`.
    """
    if mean_vals is None:
        mean_vals = np.mean(data, axis=0)
    if std_dev is None:
        std_dev = np.std(data, axis=0)
    # A zero standard deviation would produce NaN/inf; dividing by 1 instead
    # leaves a centred constant column at 0.
    safe_std = np.where(np.asarray(std_dev) == 0, 1.0, std_dev)
    return (data - mean_vals) / safe_std


# The test set is scaled with the mean and standard deviation of the
# *training* set, so both matrices live in the same feature space as the one
# the models are fitted on. The statistics are captured before x_train is
# overwritten by its normalised version.
x_train_mean_vals = np.mean(x_train, axis=0)
x_train_scale = np.std(x_train, axis=0)
x_test = (x_test - x_train_mean_vals) / x_train_scale
x_train = z_score_normalization(x_train)

In [18]:
# Display the training features after the cleaning and normalisation cells above.
x_train

array([[-1.61190630e+00, -1.66692700e+00,  1.18089740e+00, ...,
         0.00000000e+00, -1.77600910e-15,  6.66941669e-01],
       [ 7.36126946e-01,  6.49440401e-01,  5.20900978e-16, ...,
         0.00000000e+00, -1.77600910e-15, -1.55128811e-15],
       [ 4.39249180e-01,  3.42451951e-01,  1.18089740e+00, ...,
        -1.18082469e+00,  3.08957382e+00,  6.66941669e-01],
       ...,
       [ 7.36126946e-01, -1.77855916e+00,  1.18089740e+00, ...,
         2.38003972e+00,  3.08957382e+00,  6.66941669e-01],
       [ 7.36126946e-01,  6.49440401e-01,  5.20900978e-16, ...,
         0.00000000e+00, -1.77600910e-15,  6.66941669e-01],
       [-1.44997297e+00, -1.61111091e+00, -1.97144530e+00, ...,
         0.00000000e+00, -1.77600910e-15,  6.66941669e-01]])

# Machine learning

## Splitting the training set in two

In [19]:
# Hold out 33% of the labelled data for validation; the fixed random_state
# makes the split reproducible.
x_train1, x_test1, y_train1, y_test1 = train_test_split(
    x_train,
    y_train,
    test_size=0.33,
    random_state=42,
)

In [20]:
# Ridge Regression
# Sweep over lambda on the held-out split. Five result slots are kept; a run
# replaces the slot with the lowest (F1 + accuracy) when it beats that value.
best_f1_score = [0] * 5
best_accuracy = [0] * 5
best_total = [0] * 5
lambda_value = [0] * 5
for lambda_ in (1e-5, 1e-4, 1e-3, 1e-2, 1e-1):
    w, loss = ridge_regression(y_train1, x_train1, lambda_)
    # Turn the linear scores into -1 / 1 labels with a decision threshold of 0
    y_pred1 = np.matmul(x_test1, w)
    np.putmask(y_pred1, y_pred1 >= 0, 1)
    np.putmask(y_pred1, y_pred1 < 0, -1)
    f1 = f1_score(y_test1, y_pred1)
    accuracy = accuracy_score(y_test1, y_pred1)
    total = f1 + accuracy
    lowest_total = min(best_total)
    if total > lowest_total:
        # First slot holding the current lowest total gets overwritten
        index = best_total.index(lowest_total)
        best_total[index] = total
        best_accuracy[index] = accuracy
        best_f1_score[index] = f1
        lambda_value[index] = lambda_


print(f"F1_score: {best_f1_score}")
print(f"Accuracy: {best_accuracy}")
print(f"lambda values: {lambda_value}")

F1_score: [0.2939978768946975, 0.29401120506663886, 0.29384973750999543, 0.29220499197902466, 0.28857904864791867]
Accuracy: [0.6253497714364871, 0.6252851272106017, 0.6248603222976404, 0.6210647827492266, 0.6107863508334488]
lambda values: [1e-05, 0.0001, 0.001, 0.01, 0.1]


In [21]:
# Refit ridge regression with a fixed regularisation strength and score it on
# the held-out split using a tuned decision threshold.
# The strength used to be written `lambda_/10000`, which silently relied on
# the loop variable left over from the sweep cell (0.1); it is now explicit.
ridge_lambda = 1e-5
decision_threshold = 0.23
w, loss = ridge_regression(y_train1, x_train1, ridge_lambda)
# Single-pass labelling: scores at or above the threshold become 1, all others
# -1. Unlike two successive in-place assignments, this stays correct for any
# threshold value.
y_pred1 = np.where(x_test1 @ w >= decision_threshold, 1.0, -1.0)
f1 = f1_score(y_test1, y_pred1)
accuracy = accuracy_score(y_test1,y_pred1)
print(f"F1_score: {f1}")
print(f"Accuracy: {accuracy}")

F1_score: 0.3813316835511561
Accuracy: 0.8522417694048113


In [22]:
# Logistic Regression
# Sweep over (gamma, max_iters) on the held-out split. Five result slots are
# kept; a run replaces the slot with the lowest (F1 + accuracy) when it beats it.
best_f1_score = [0] * 5
best_accuracy = [0] * 5
best_total = [0] * 5
best_max_iters = [0] * 5
best_gamma = [0] * 5
# NOTE(review): the same initial_w array is passed to every run; confirm that
# logistic_regression does not modify it in place.
initial_w = np.zeros(x_train1.shape[1])
for gamma in (1e-4, 1e-3, 1e-2, 1e-1):
    for max_iters in (1000, 5000, 10000):
        w, loss = logistic_regression(y_train1, x_train1, initial_w, max_iters, gamma)
        # Turn the linear scores into -1 / 1 labels with a decision threshold of 0
        y_pred1 = np.matmul(x_test1, w)
        np.putmask(y_pred1, y_pred1 >= 0, 1)
        np.putmask(y_pred1, y_pred1 < 0, -1)
        f1 = f1_score(y_test1, y_pred1)
        accuracy = accuracy_score(y_test1, y_pred1)
        total = f1 + accuracy
        lowest_total = min(best_total)
        if total > lowest_total:
            index = best_total.index(lowest_total)
            best_total[index] = total
            best_accuracy[index] = accuracy
            best_f1_score[index] = f1
            best_gamma[index] = gamma
            best_max_iters[index] = max_iters
    # Reached once per gamma, right after the max_iters == 10000 run
    print(f"Gamma = {gamma} done...")


print(f"F1_score: {best_f1_score}")
print(f"Accuracy: {best_accuracy}")
print(f"Max iters: {best_max_iters}")
print(f"Gamma: {best_gamma}")

KeyboardInterrupt: 

In [None]:
# Reg Logistic regression
# Sweep over (lambda_, max_iters, gamma) on the held-out split. Five result
# slots are kept; a run replaces the slot with the lowest (F1 + accuracy)
# when it beats that value.
best_f1_score = [0,0,0,0,0]
best_accuracy = [0,0,0,0,0]
best_total = [0,0,0,0,0]
lambda_value = [0,0,0,0,0]
best_max_iters = [0,0,0,0,0]
best_gamma = [0,0,0,0,0]
# NOTE(review): the same initial_w array is passed to every run; confirm that
# reg_logistic_regression does not modify it in place.
initial_w = np.zeros(x_train1.shape[1])
for lambda_ in [0.0001, 0.001, 0.01, 0.1]:
    for max_iters in [1000, 5000, 10000]:
        for gamma in [0.0001, 0.001, 0.01, 0.1]:
            w, loss = reg_logistic_regression(y_train1, x_train1, lambda_, initial_w, max_iters, gamma)
            # Turn the linear scores into -1 / 1 labels with a decision threshold of 0
            y_pred1 = x_test1 @ w
            y_pred1[y_pred1 >= 0] = 1
            y_pred1[y_pred1 < 0] = -1
            f1 = f1_score(y_test1, y_pred1)
            accuracy = accuracy_score(y_test1,y_pred1)
            total = f1 + accuracy
            if total > min(best_total):
                index = best_total.index(min(best_total))
                best_total[index] = total
                best_accuracy[index] = accuracy
                best_f1_score[index] = f1
                lambda_value[index] = lambda_
                best_max_iters[index] = max_iters
                best_gamma[index] = gamma
    # Progress message once all (max_iters, gamma) pairs are done for this
    # lambda. The original `if max_iters == 10000 and gamma == 0.1:` had an
    # un-indented body, which made the whole cell an IndentationError.
    print(f"Lambda = {lambda_} done...")

print(f"F1_score: {best_f1_score}")
print(f"Accuracy: {best_accuracy}")
print(f"lambda values: {lambda_value}")
print(f"Max iters: {best_max_iters}")
print(f"Gamma: {best_gamma}")

## Training the final models and creating submissions

In [None]:
# Hyper-parameters for the fits on the full training set in the cells below.
lambda_, max_iters, gamma = 0.1, 1000, 0.01
# Weights start at zero, one per feature column of x_train.
initial_w = np.zeros(x_train.shape[1])

In [None]:
# Fit ridge regression on the full training set.
(
    ridge_regression_w,
    ridge_regression_loss,
) = ridge_regression(y_train, x_train, lambda_)

In [None]:
# Fit regularised logistic regression on the full training set.
(
    reg_logistic_regression_w,
    reg_logistic_regression_loss,
) = reg_logistic_regression(
    y_train,
    x_train,
    lambda_,
    initial_w,
    max_iters,
    gamma,
)

In [None]:
# Fit ordinary least squares on the full training set.
(
    least_squares_w,
    least_squares_loss,
) = least_squares(y_train, x_train)

In [None]:
# Score the test set with the least-squares weights, then map the scores in
# place to labels: 1 for scores >= 0, -1 for scores < 0.
least_squares_y_pred = np.matmul(x_test, least_squares_w)
np.putmask(least_squares_y_pred, least_squares_y_pred >= 0, 1)
np.putmask(least_squares_y_pred, least_squares_y_pred < 0, -1)

In [None]:
# Write the least-squares predictions to a submission file.
least_squares_submission_name = "least_squares_test02.csv"
create_csv_submission(test_ids, least_squares_y_pred, least_squares_submission_name)

In [None]:
# Score the test set with the regularised logistic regression weights, then
# map the scores in place to labels: 1 for scores >= 0, -1 for scores < 0.
reg_logistic_regression_y_pred = np.matmul(x_test, reg_logistic_regression_w)
np.putmask(reg_logistic_regression_y_pred, reg_logistic_regression_y_pred >= 0, 1)
np.putmask(reg_logistic_regression_y_pred, reg_logistic_regression_y_pred < 0, -1)

In [None]:
# Display the labels predicted by the regularised logistic regression model.
reg_logistic_regression_y_pred

In [None]:
# Write the regularised logistic regression predictions to a submission file.
reg_logistic_regression_submission_name = "reg_logistic_regression_test02.csv"
create_csv_submission(test_ids, reg_logistic_regression_y_pred, reg_logistic_regression_submission_name)


In [None]:
# Score the test set with the ridge regression weights, then map the scores
# in place to labels: 1 for scores >= 0, -1 for scores < 0.
ridge_regression_y_pred = np.matmul(x_test, ridge_regression_w)
np.putmask(ridge_regression_y_pred, ridge_regression_y_pred >= 0, 1)
np.putmask(ridge_regression_y_pred, ridge_regression_y_pred < 0, -1)

In [None]:
# Write the ridge regression predictions to a submission file.
ridge_regression_submission_name = "ridge_regression_test02.csv"
create_csv_submission(test_ids, ridge_regression_y_pred, ridge_regression_submission_name)
