In [1]:
import sys
import os
from helpers import load_csv_data, create_csv_submission
from run_helpers import load_useless_features_file
from implementations import least_squares, reg_logistic_regression, ridge_regression, logistic_regression, mean_squared_error_gd, mean_squared_error_sgd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score

In [2]:
ROOT_DIR = os.path.abspath(os.curdir)
DATA_DIR = os.path.join(ROOT_DIR, "data")
PREDICTIONS_DIR = os.path.join(ROOT_DIR, "predictions")
HELPER_FILES_DIR = os.path.join(ROOT_DIR, "helper_files")

In [3]:
x_train_initial, x_test_initial, y_train_initial, train_ids_initial, test_ids_initial, column_names_initial = load_csv_data(DATA_DIR)

In [45]:
x_train, x_test, y_train, train_ids, test_ids, column_names = x_train_initial.copy(), x_test_initial.copy(), y_train_initial.copy(), train_ids_initial.copy(), test_ids_initial.copy(), column_names_initial.copy()

In [46]:
x_train

array([[5.3000000e+01, 1.1000000e+01, 1.1162015e+07, ...,           nan,
                  nan, 2.0000000e+00],
       [3.3000000e+01, 1.2000000e+01, 1.2152015e+07, ...,           nan,
                  nan,           nan],
       [2.0000000e+01, 1.0000000e+01, 1.0202015e+07, ..., 1.0000000e+00,
        2.0000000e+00, 2.0000000e+00],
       ...,
       [3.9000000e+01, 1.0000000e+01, 1.0202015e+07, ..., 2.0000000e+00,
        2.0000000e+00, 2.0000000e+00],
       [3.3000000e+01, 1.2000000e+01, 1.2302015e+07, ...,           nan,
                  nan, 2.0000000e+00],
       [3.2000000e+01, 9.0000000e+00, 9.1220150e+06, ...,           nan,
                  nan, 2.0000000e+00]])

In [47]:
column_names

array(['_STATE', 'FMONTH', 'IDATE', 'IMONTH', 'IDAY', 'IYEAR', 'DISPCODE',
       'SEQNO', '_PSU', 'CTELENUM', 'PVTRESD1', 'COLGHOUS', 'STATERES',
       'CELLFON3', 'LADULT', 'NUMADULT', 'NUMMEN', 'NUMWOMEN', 'CTELNUM1',
       'CELLFON2', 'CADULT', 'PVTRESD2', 'CCLGHOUS', 'CSTATE', 'LANDLINE',
       'HHADULT', 'GENHLTH', 'PHYSHLTH', 'MENTHLTH', 'POORHLTH',
       'HLTHPLN1', 'PERSDOC2', 'MEDCOST', 'CHECKUP1', 'BPHIGH4', 'BPMEDS',
       'BLOODCHO', 'CHOLCHK', 'TOLDHI2', 'CVDSTRK3', 'ASTHMA3', 'ASTHNOW',
       'CHCSCNCR', 'CHCOCNCR', 'CHCCOPD1', 'HAVARTH3', 'ADDEPEV2',
       'CHCKIDNY', 'DIABETE3', 'DIABAGE2', 'SEX', 'MARITAL', 'EDUCA',
       'RENTHOM1', 'NUMHHOL2', 'NUMPHON2', 'CPDEMO1', 'VETERAN3',
       'EMPLOY1', 'CHILDREN', 'INCOME2', 'INTERNET', 'WEIGHT2', 'HEIGHT3',
       'PREGNANT', 'QLACTLM2', 'USEEQUIP', 'BLIND', 'DECIDE', 'DIFFWALK',
       'DIFFDRES', 'DIFFALON', 'SMOKE100', 'SMOKDAY2', 'STOPSMK2',
       'LASTSMK2', 'USENOW3', 'ALCDAY5', 'AVEDRNK2', 'DRNK3GE5',
    

# Data cleaning

## Removing useless features

In [48]:
useless_features_names = load_useless_features_file(os.path.join(HELPER_FILES_DIR, "useless_features_names.csv"))

In [49]:
useless_columns_indices = np.where(np.in1d(column_names, useless_features_names))[0]

In [50]:
column_names

array(['_STATE', 'FMONTH', 'IDATE', 'IMONTH', 'IDAY', 'IYEAR', 'DISPCODE',
       'SEQNO', '_PSU', 'CTELENUM', 'PVTRESD1', 'COLGHOUS', 'STATERES',
       'CELLFON3', 'LADULT', 'NUMADULT', 'NUMMEN', 'NUMWOMEN', 'CTELNUM1',
       'CELLFON2', 'CADULT', 'PVTRESD2', 'CCLGHOUS', 'CSTATE', 'LANDLINE',
       'HHADULT', 'GENHLTH', 'PHYSHLTH', 'MENTHLTH', 'POORHLTH',
       'HLTHPLN1', 'PERSDOC2', 'MEDCOST', 'CHECKUP1', 'BPHIGH4', 'BPMEDS',
       'BLOODCHO', 'CHOLCHK', 'TOLDHI2', 'CVDSTRK3', 'ASTHMA3', 'ASTHNOW',
       'CHCSCNCR', 'CHCOCNCR', 'CHCCOPD1', 'HAVARTH3', 'ADDEPEV2',
       'CHCKIDNY', 'DIABETE3', 'DIABAGE2', 'SEX', 'MARITAL', 'EDUCA',
       'RENTHOM1', 'NUMHHOL2', 'NUMPHON2', 'CPDEMO1', 'VETERAN3',
       'EMPLOY1', 'CHILDREN', 'INCOME2', 'INTERNET', 'WEIGHT2', 'HEIGHT3',
       'PREGNANT', 'QLACTLM2', 'USEEQUIP', 'BLIND', 'DECIDE', 'DIFFWALK',
       'DIFFDRES', 'DIFFALON', 'SMOKE100', 'SMOKDAY2', 'STOPSMK2',
       'LASTSMK2', 'USENOW3', 'ALCDAY5', 'AVEDRNK2', 'DRNK3GE5',
    

In [51]:
x_train = np.delete(x_train, useless_columns_indices, axis=1)
x_test = np.delete(x_test, useless_columns_indices, axis=1)
column_names = np.delete(column_names, useless_columns_indices)

## Normalizing data, removing nans and outliers

In [52]:
def clean_outliers(data):
   """
   Cleans the dataset from outliers.
   The first quantile and last quantile outliers are replaced by the median without outliers.
   """
   cleaned_data = np.copy(data)
  
   for i in range(data.shape[1]):  # Iterate over features/columns
       feature = data[:, i]
      
       # Compute the first and last 3% and IQR
       Q1 = np.nanpercentile(feature, 3)
       Q3 = np.nanpercentile(feature, 97)
       IQR = Q3 - Q1
      
       # Identify the outliers
       outlier_mask = (feature < (Q1 - 1.5 * IQR)) | (feature > (Q3 + 1.5 * IQR))
      
       # Compute the median of the data without outliers
       median_without_outliers = np.nanmedian(feature[~outlier_mask])
      
       # Replace outliers with this median
       cleaned_data[outlier_mask, i] = median_without_outliers
      
   return cleaned_data


In [53]:
x_train = clean_outliers(x_train)
x_test = clean_outliers(x_test)

In [69]:
x_train

array([[-5.13164612e-01, -1.61190630e+00, -1.66692700e+00, ...,
         0.00000000e+00,  5.37176464e-16,  2.44635873e-02],
       [ 1.28108930e+00,  7.36126946e-01,  6.49440401e-01, ...,
         0.00000000e+00,  5.37176464e-16,  0.00000000e+00],
       [-5.13164612e-01,  4.39249180e-01,  3.42451951e-01, ...,
        -8.59212775e-01, -2.46031587e-01,  2.44635873e-02],
       ...,
       [ 3.83962342e-01,  7.36126946e-01, -1.77855916e+00, ...,
        -1.93109791e-01, -2.46031587e-01,  2.44635873e-02],
       [ 3.83962342e-01,  7.36126946e-01,  6.49440401e-01, ...,
         0.00000000e+00,  5.37176464e-16,  2.44635873e-02],
       [-5.13164612e-01, -1.44997297e+00, -1.61111091e+00, ...,
         0.00000000e+00,  5.37176464e-16,  2.44635873e-02]])

In [72]:
def remove_small_variance_features(data):
    cleaned_data = np.copy(data)
    # Calculate the variance for each feature
    variances = np.var(cleaned_data, axis=0)
    means = np.mean(cleaned_data, axis=0)
    print(means)
    
    # Set your threshold for variance (e.g., 0.01)
    threshold = 0.01
    
    # Find feature indices that meet the threshold
    features_to_keep = (variances/means) > threshold
    
    # Keep only the features with variance above the threshold
    data_reduced = cleaned_data[:, features_to_keep]
    
    return data_reduced
        
        

In [73]:
remove_small_variance_features(x_train)

[-2.64024220e-16 -1.20214754e-16  1.04220586e-16  5.72617737e-16
  1.28559654e-16  9.98085881e-17 -5.52934273e-17  9.65225972e-17
  1.35792082e-16 -8.62965072e-16  3.38386680e-16 -8.34111149e-17
  6.22660075e-17 -3.16878869e-17  2.68815163e-16 -1.88389590e-18
 -4.04701981e-16 -8.14460166e-18 -3.46355343e-16  7.73804826e-17
 -3.87173088e-17 -1.76912982e-17 -4.31444641e-16  1.09572366e-15
 -8.56198204e-17  1.14549532e-16  1.85455476e-16 -5.34095314e-17
 -2.40574589e-15 -5.59180362e-15  5.36823715e-16 -1.11712861e-16
  1.65111566e-17  2.83667083e-17 -1.46229299e-16 -4.00436148e-16
 -5.06053415e-17  3.84499904e-15 -2.95206487e-15  1.53862325e-16
 -9.72101110e-16  1.21435496e-16  1.26585353e-15 -1.03686003e-15
 -2.43142750e-15 -5.63912838e-16 -4.08902853e-16 -9.78045126e-16
 -1.28181793e-15 -1.20720915e-15 -5.27880623e-16 -2.08787635e-16
  2.92301606e-16 -5.68466940e-16 -3.65302572e-17 -6.53295037e-16
  7.46574952e-17  6.92992188e-16 -5.96377562e-16 -1.09607012e-16
 -1.07317104e-16 -1.25879

array([[-1.66692700e+00,  1.18089740e+00, -2.79395064e-01, ...,
         0.00000000e+00,  5.37176464e-16,  2.44635873e-02],
       [ 6.49440401e-01,  5.20900978e-16, -2.79395064e-01, ...,
         0.00000000e+00,  5.37176464e-16,  0.00000000e+00],
       [ 3.42451951e-01,  1.18089740e+00, -2.79395064e-01, ...,
        -8.59212775e-01, -2.46031587e-01,  2.44635873e-02],
       ...,
       [-1.77855916e+00,  1.18089740e+00, -2.79395064e-01, ...,
        -1.93109791e-01, -2.46031587e-01,  2.44635873e-02],
       [ 6.49440401e-01,  5.20900978e-16, -2.79395064e-01, ...,
         0.00000000e+00,  5.37176464e-16,  2.44635873e-02],
       [-1.61111091e+00, -1.97144530e+00, -2.79395064e-01, ...,
         0.00000000e+00,  5.37176464e-16,  2.44635873e-02]])

In [54]:
x_train_averages = np.nanmean(x_train, axis=0)
x_test_averages = np.nanmean(x_test, axis=0)

In [55]:
# Replace nan values with the computed means for each feature
for i in range(x_train.shape[1]):
   x_train[np.isnan(x_train[:, i]), i] = x_train_averages[i]
   
# Replace nan values with the computed means for each feature
for i in range(x_test.shape[1]):
   x_test[np.isnan(x_test[:, i]), i] = x_test_averages[i]


In [56]:
x_train_std_dev = np.std(x_train, axis=0)
x_train = np.delete(x_train, np.where(x_train_std_dev == 0), axis=1)
x_test = np.delete(x_test, np.where(x_train_std_dev == 0), axis=1)
x_test_std_dev = np.std(x_test, axis=0)
x_train = np.delete(x_train, np.where(x_test_std_dev == 0), axis=1)
x_test = np.delete(x_test, np.where(x_test_std_dev == 0), axis=1)

In [57]:
# Z-score normalization
def z_score_normalization(data):
   mean_vals = np.mean(data, axis=0)
   std_dev = np.std(data, axis=0)
   return (data - mean_vals) / std_dev


x_train = z_score_normalization(x_train)
x_test = z_score_normalization(x_test)

In [58]:
x_train

array([[-5.13164612e-01, -1.61190630e+00, -1.66692700e+00, ...,
         0.00000000e+00,  5.37176464e-16,  2.44635873e-02],
       [ 1.28108930e+00,  7.36126946e-01,  6.49440401e-01, ...,
         0.00000000e+00,  5.37176464e-16,  0.00000000e+00],
       [-5.13164612e-01,  4.39249180e-01,  3.42451951e-01, ...,
        -8.59212775e-01, -2.46031587e-01,  2.44635873e-02],
       ...,
       [ 3.83962342e-01,  7.36126946e-01, -1.77855916e+00, ...,
        -1.93109791e-01, -2.46031587e-01,  2.44635873e-02],
       [ 3.83962342e-01,  7.36126946e-01,  6.49440401e-01, ...,
         0.00000000e+00,  5.37176464e-16,  2.44635873e-02],
       [-5.13164612e-01, -1.44997297e+00, -1.61111091e+00, ...,
         0.00000000e+00,  5.37176464e-16,  2.44635873e-02]])

# Machine learning

## Splitting train sets in two

In [59]:
x_train1, x_test1, y_train1, y_test1 = train_test_split(x_train, y_train, test_size=0.33, random_state=42)

In [60]:
"""
# Ridge Regression
best_f1_score = [0,0,0,0,0]
best_accuracy = [0,0,0,0,0]
best_total = [0,0,0,0,0]
lambda_value = [0,0,0,0,0]
for lambda_ in [0.00001, 0.0001, 0.001, 0.01, 0.1]:
    w, loss = ridge_regression(y_train1, x_train1, lambda_)
    y_pred1 = x_test1 @ w
    y_pred1[y_pred1 >= 0] = 1
    y_pred1[y_pred1 < 0] = -1
    f1 = f1_score(y_test1, y_pred1)
    accuracy = accuracy_score(y_test1,y_pred1)
    total = f1 + accuracy
    if total > min(best_total):
        index = best_total.index(min(best_total))
        best_total[index] = total
        best_accuracy[index] = accuracy
        best_f1_score[index] = f1
        lambda_value[index] = lambda_

    
print(f"F1_score: {best_f1_score}")
print(f"Accuracy: {best_accuracy}")
print(f"lambda values: {lambda_value}")
"""

'\n# Ridge Regression\nbest_f1_score = [0,0,0,0,0]\nbest_accuracy = [0,0,0,0,0]\nbest_total = [0,0,0,0,0]\nlambda_value = [0,0,0,0,0]\nfor lambda_ in [0.00001, 0.0001, 0.001, 0.01, 0.1]:\n    w, loss = ridge_regression(y_train1, x_train1, lambda_)\n    y_pred1 = x_test1 @ w\n    y_pred1[y_pred1 >= 0] = 1\n    y_pred1[y_pred1 < 0] = -1\n    f1 = f1_score(y_test1, y_pred1)\n    accuracy = accuracy_score(y_test1,y_pred1)\n    total = f1 + accuracy\n    if total > min(best_total):\n        index = best_total.index(min(best_total))\n        best_total[index] = total\n        best_accuracy[index] = accuracy\n        best_f1_score[index] = f1\n        lambda_value[index] = lambda_\n\n    \nprint(f"F1_score: {best_f1_score}")\nprint(f"Accuracy: {best_accuracy}")\nprint(f"lambda values: {lambda_value}")\n'

In [61]:
"""
w, loss = ridge_regression(y_train1, x_train1, lambda_/10000)
y_pred1 = x_test1 @ w
y_pred1[y_pred1 >= 0.23] = 1
y_pred1[y_pred1 < 0.23] = -1
f1 = f1_score(y_test1, y_pred1)
accuracy = accuracy_score(y_test1,y_pred1)
print(f"F1_score: {f1}")
print(f"Accuracy: {accuracy}")
"""

'\nw, loss = ridge_regression(y_train1, x_train1, lambda_/10000)\ny_pred1 = x_test1 @ w\ny_pred1[y_pred1 >= 0.23] = 1\ny_pred1[y_pred1 < 0.23] = -1\nf1 = f1_score(y_test1, y_pred1)\naccuracy = accuracy_score(y_test1,y_pred1)\nprint(f"F1_score: {f1}")\nprint(f"Accuracy: {accuracy}")\n'

In [62]:
"""
# Logistic Regression
best_f1_score = [0,0,0,0,0]
best_accuracy = [0,0,0,0,0]
best_total = [0,0,0,0,0]
best_max_iters = [0,0,0,0,0]
best_gamma = [0,0,0,0,0]
initial_w = np.zeros(x_train1.shape[1])
for gamma in [0.0001, 0.001, 0.01, 0.1]:
    for max_iters in [1000, 5000, 10000]:
        w, loss = logistic_regression(y_train1, x_train1, initial_w, max_iters, gamma)
        y_pred1 = x_test1 @ w
        y_pred1[y_pred1 >= 0] = 1
        y_pred1[y_pred1 < 0] = -1
        f1 = f1_score(y_test1, y_pred1)
        accuracy = accuracy_score(y_test1,y_pred1)
        total = f1 + accuracy
        if total > min(best_total):
            index = best_total.index(min(best_total))
            best_total[index] = total
            best_accuracy[index] = accuracy
            best_f1_score[index] = f1
            best_gamma[index] = gamma
            best_max_iters[index] = max_iters
        if max_iters == 10000:
            print(f"Gamma = {gamma} done...")

    
print(f"F1_score: {best_f1_score}")
print(f"Accuracy: {best_accuracy}")
print(f"Max iters: {best_max_iters}")
print(f"Gamma: {best_gamma}")
"""

'\n# Logistic Regression\nbest_f1_score = [0,0,0,0,0]\nbest_accuracy = [0,0,0,0,0]\nbest_total = [0,0,0,0,0]\nbest_max_iters = [0,0,0,0,0]\nbest_gamma = [0,0,0,0,0]\ninitial_w = np.zeros(x_train1.shape[1])\nfor gamma in [0.0001, 0.001, 0.01, 0.1]:\n    for max_iters in [1000, 5000, 10000]:\n        w, loss = logistic_regression(y_train1, x_train1, initial_w, max_iters, gamma)\n        y_pred1 = x_test1 @ w\n        y_pred1[y_pred1 >= 0] = 1\n        y_pred1[y_pred1 < 0] = -1\n        f1 = f1_score(y_test1, y_pred1)\n        accuracy = accuracy_score(y_test1,y_pred1)\n        total = f1 + accuracy\n        if total > min(best_total):\n            index = best_total.index(min(best_total))\n            best_total[index] = total\n            best_accuracy[index] = accuracy\n            best_f1_score[index] = f1\n            best_gamma[index] = gamma\n            best_max_iters[index] = max_iters\n        if max_iters == 10000:\n            print(f"Gamma = {gamma} done...")\n\n    \nprint(f

In [63]:
"""
# Reg Logistic regression
best_f1_score = [0,0,0,0,0]
best_accuracy = [0,0,0,0,0]
best_total = [0,0,0,0,0]
lambda_value = [0,0,0,0,0]
best_max_iters = [0,0,0,0,0]
best_gamma = [0,0,0,0,0]
initial_w = np.zeros(x_train1.shape[1])
for lambda_ in [0.0001, 0.001, 0.01, 0.1]:
    for max_iters in [1000, 5000, 10000]:
        for gamma in [0.0001, 0.001, 0.01, 0.1]:
            w, loss = reg_logistic_regression(y_train1, x_train1, lambda_, initial_w, max_iters, gamma)
            y_pred1 = x_test1 @ w
            y_pred1[y_pred1 >= 0] = 1
            y_pred1[y_pred1 < 0] = -1
            f1 = f1_score(y_test1, y_pred1)
            accuracy = accuracy_score(y_test1,y_pred1)
            total = f1 + accuracy
            if total > min(best_total):
                index = best_total.index(min(best_total))
                best_total[index] = total
                best_accuracy[index] = accuracy
                best_f1_score[index] = f1
                lambda_value[index] = lambda_
                best_max_iters[index] = max_iters
                best_gamma[index] = gamma
            if max_iters == 10000 and gamma == 0.1:
                print(f"Lambda = {lambda_} done...")

print(f"F1_score: {best_f1_score}")
print(f"Accuracy: {best_accuracy}")
print(f"lambda values: {lambda_value}")
print(f"Max iters: {best_max_iters}")
print(f"Gamma: {best_gamma}")
"""

'\n# Reg Logistic regression\nbest_f1_score = [0,0,0,0,0]\nbest_accuracy = [0,0,0,0,0]\nbest_total = [0,0,0,0,0]\nlambda_value = [0,0,0,0,0]\nbest_max_iters = [0,0,0,0,0]\nbest_gamma = [0,0,0,0,0]\ninitial_w = np.zeros(x_train1.shape[1])\nfor lambda_ in [0.0001, 0.001, 0.01, 0.1]:\n    for max_iters in [1000, 5000, 10000]:\n        for gamma in [0.0001, 0.001, 0.01, 0.1]:\n            w, loss = reg_logistic_regression(y_train1, x_train1, lambda_, initial_w, max_iters, gamma)\n            y_pred1 = x_test1 @ w\n            y_pred1[y_pred1 >= 0] = 1\n            y_pred1[y_pred1 < 0] = -1\n            f1 = f1_score(y_test1, y_pred1)\n            accuracy = accuracy_score(y_test1,y_pred1)\n            total = f1 + accuracy\n            if total > min(best_total):\n                index = best_total.index(min(best_total))\n                best_total[index] = total\n                best_accuracy[index] = accuracy\n                best_f1_score[index] = f1\n                lambda_value[index]

## Running each algo once

In [64]:
THRESHOLD = 0
create_csv = False

In [65]:
# Ridge regression
lambda_ = 0.0001
w, loss = ridge_regression(y_train1, x_train1, lambda_)
y_pred = x_test1 @ w
y_pred[y_pred >= THRESHOLD] = 1
y_pred[y_pred < THRESHOLD] = -1
f1 = f1_score(y_test1, y_pred)
accuracy = accuracy_score(y_test1,y_pred)
print(f"Current model: Ridge Regression")
print(f"F1_score: {f1}")
print(f"Accuracy: {accuracy}")
if create_csv:
    create_csv_submission(test_ids, y_pred, os.path.join(PREDICTIONS_DIR, "ridge_regression_test_03.csv"))

Current model: Ridge Regression
F1_score: 0.3190324488277599
Accuracy: 0.670342152652722


In [66]:
# Logistic regression
max_iters = 100
gamma = 0.1
initial_w = np.zeros(x_train.shape[1])
w, loss = logistic_regression(y_train1, x_train1, initial_w=initial_w,max_iters=max_iters, gamma=gamma)
y_pred = x_test1 @ w
y_pred[y_pred >= THRESHOLD] = 1
y_pred[y_pred < THRESHOLD] = -1
f1 = f1_score(y_test1, y_pred)
accuracy = accuracy_score(y_test1,y_pred)
print(f"Current model: Logistic Regression")
print(f"F1_score: {f1}")
print(f"Accuracy: {accuracy}")
if create_csv:
    create_csv_submission(test_ids, y_pred, os.path.join(PREDICTIONS_DIR, "logistic_regression_test_03.csv"))

Current model: Logistic Regression
F1_score: 0.32052476323552237
Accuracy: 0.6766680519000785


In [67]:
# Reg Logistic regression
lambda_ = 0.0001
max_iters = 100
gamma = 0.1
initial_w = np.zeros(x_train.shape[1])
w, loss = reg_logistic_regression(y_train1, x_train1, lambda_=lambda_, initial_w=initial_w, max_iters=max_iters, gamma=gamma)
y_pred = x_test1 @ w
y_pred[y_pred >= THRESHOLD] = 1
y_pred[y_pred < THRESHOLD] = -1
f1 = f1_score(y_test1, y_pred)
accuracy = accuracy_score(y_test1,y_pred)
print(f"Current model: Reg Logistic Regression")
print(f"F1_score: {f1}")
print(f"Accuracy: {accuracy}")
if create_csv:
    create_csv_submission(test_ids, y_pred, os.path.join(PREDICTIONS_DIR, "reg_logistic_regression_test_03.csv"))

Current model: Reg Logistic Regression
F1_score: 0.32049988356749204
Accuracy: 0.6766311123424297


In [68]:
# Least Squares
w, loss = least_squares(y_train1, x_train1)
y_pred = x_test1 @ w
y_pred[y_pred >= THRESHOLD] = 1
y_pred[y_pred < THRESHOLD] = -1
f1 = f1_score(y_test1, y_pred)
accuracy = accuracy_score(y_test1,y_pred)
print(f"Current model: Least Squares")
print(f"F1_score: {f1}")
print(f"Accuracy: {accuracy}")
if create_csv:
    create_csv_submission(test_ids, y_pred, os.path.join(PREDICTIONS_DIR, "least_squares_test_03.csv"))

Current model: Least Squares
F1_score: 0.31891964064318007
Accuracy: 0.6702498037586
