In [33]:
import sys
import os
from helpers import load_csv_data, create_csv_submission
from run_helpers import load_useless_features_file, get_pearson_coefficients, get_spearman_coefficients, load_column_names_by_type, clean_data
from implementations import least_squares, reg_logistic_regression, ridge_regression, logistic_regression
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score

In [34]:
ROOT_DIR = os.path.abspath(os.curdir)
DATA_DIR = os.path.join(ROOT_DIR, "data")
PREDICTIONS_DIR = os.path.join(ROOT_DIR, "predictions")
HELPER_FILES_DIR = os.path.join(ROOT_DIR, "helper_files")

In [35]:
x_train_initial, x_test_initial, y_train_initial, train_ids_initial, test_ids_initial, column_names_initial = load_csv_data(DATA_DIR)

In [36]:
x_train, x_test, y_train, train_ids, test_ids, column_names = x_train_initial.copy(), x_test_initial.copy(), y_train_initial.copy(), train_ids_initial.copy(), test_ids_initial.copy(), column_names_initial.copy()

In [37]:
bools, seven_nines, seventyseven_ninetynine, specials, eight, eithgy_eight, fruits = load_column_names_by_type(os.path.join(HELPER_FILES_DIR, "variables_by_values.csv"))
x_train = clean_data(x_train, column_names, bools, seven_nines, seventyseven_ninetynine, specials, eight, eithgy_eight, fruits)
x_test = clean_data(x_test, column_names, bools, seven_nines, seventyseven_ninetynine, specials, eight, eithgy_eight, fruits)

In [38]:
x_test

array([[4.4000000e+01, 2.0000000e+00, 2.0820150e+06, ..., 1.0000000e+00,
        1.0000000e+00, 2.0000000e+00],
       [2.7000000e+01, 1.0000000e+00, 1.1920150e+06, ...,           nan,
                  nan, 2.0000000e+00],
       [3.5000000e+01, 5.0000000e+00, 5.2620150e+06, ..., 1.0000000e+00,
        1.0000000e+00, 2.0000000e+00],
       ...,
       [9.0000000e+00, 1.1000000e+01, 1.1272015e+07, ..., 9.0000000e+00,
        9.0000000e+00,           nan],
       [1.5000000e+01, 1.2000000e+01, 1.2122015e+07, ..., 1.0000000e+00,
        1.0000000e+00, 2.0000000e+00],
       [4.5000000e+01, 1.2000000e+01, 1.2282015e+07, ...,           nan,
                  nan, 2.0000000e+00]])

In [39]:
column_names

array(['_STATE', 'FMONTH', 'IDATE', 'IMONTH', 'IDAY', 'IYEAR', 'DISPCODE',
       'SEQNO', '_PSU', 'CTELENUM', 'PVTRESD1', 'COLGHOUS', 'STATERES',
       'CELLFON3', 'LADULT', 'NUMADULT', 'NUMMEN', 'NUMWOMEN', 'CTELNUM1',
       'CELLFON2', 'CADULT', 'PVTRESD2', 'CCLGHOUS', 'CSTATE', 'LANDLINE',
       'HHADULT', 'GENHLTH', 'PHYSHLTH', 'MENTHLTH', 'POORHLTH',
       'HLTHPLN1', 'PERSDOC2', 'MEDCOST', 'CHECKUP1', 'BPHIGH4', 'BPMEDS',
       'BLOODCHO', 'CHOLCHK', 'TOLDHI2', 'CVDSTRK3', 'ASTHMA3', 'ASTHNOW',
       'CHCSCNCR', 'CHCOCNCR', 'CHCCOPD1', 'HAVARTH3', 'ADDEPEV2',
       'CHCKIDNY', 'DIABETE3', 'DIABAGE2', 'SEX', 'MARITAL', 'EDUCA',
       'RENTHOM1', 'NUMHHOL2', 'NUMPHON2', 'CPDEMO1', 'VETERAN3',
       'EMPLOY1', 'CHILDREN', 'INCOME2', 'INTERNET', 'WEIGHT2', 'HEIGHT3',
       'PREGNANT', 'QLACTLM2', 'USEEQUIP', 'BLIND', 'DECIDE', 'DIFFWALK',
       'DIFFDRES', 'DIFFALON', 'SMOKE100', 'SMOKDAY2', 'STOPSMK2',
       'LASTSMK2', 'USENOW3', 'ALCDAY5', 'AVEDRNK2', 'DRNK3GE5',
    

# Data cleaning

## Removing useless features

In [40]:
useless_features_names = load_useless_features_file(os.path.join(HELPER_FILES_DIR, "useless_features_names.csv"))

In [41]:
useless_columns_indices = np.where(np.in1d(column_names, useless_features_names))[0]

In [42]:
column_names

array(['_STATE', 'FMONTH', 'IDATE', 'IMONTH', 'IDAY', 'IYEAR', 'DISPCODE',
       'SEQNO', '_PSU', 'CTELENUM', 'PVTRESD1', 'COLGHOUS', 'STATERES',
       'CELLFON3', 'LADULT', 'NUMADULT', 'NUMMEN', 'NUMWOMEN', 'CTELNUM1',
       'CELLFON2', 'CADULT', 'PVTRESD2', 'CCLGHOUS', 'CSTATE', 'LANDLINE',
       'HHADULT', 'GENHLTH', 'PHYSHLTH', 'MENTHLTH', 'POORHLTH',
       'HLTHPLN1', 'PERSDOC2', 'MEDCOST', 'CHECKUP1', 'BPHIGH4', 'BPMEDS',
       'BLOODCHO', 'CHOLCHK', 'TOLDHI2', 'CVDSTRK3', 'ASTHMA3', 'ASTHNOW',
       'CHCSCNCR', 'CHCOCNCR', 'CHCCOPD1', 'HAVARTH3', 'ADDEPEV2',
       'CHCKIDNY', 'DIABETE3', 'DIABAGE2', 'SEX', 'MARITAL', 'EDUCA',
       'RENTHOM1', 'NUMHHOL2', 'NUMPHON2', 'CPDEMO1', 'VETERAN3',
       'EMPLOY1', 'CHILDREN', 'INCOME2', 'INTERNET', 'WEIGHT2', 'HEIGHT3',
       'PREGNANT', 'QLACTLM2', 'USEEQUIP', 'BLIND', 'DECIDE', 'DIFFWALK',
       'DIFFDRES', 'DIFFALON', 'SMOKE100', 'SMOKDAY2', 'STOPSMK2',
       'LASTSMK2', 'USENOW3', 'ALCDAY5', 'AVEDRNK2', 'DRNK3GE5',
    

In [43]:
x_train = np.delete(x_train, useless_columns_indices, axis=1)
x_test = np.delete(x_test, useless_columns_indices, axis=1)
column_names = np.delete(column_names, useless_columns_indices)

## Normalizing data, removing nans and outliers

In [44]:
def clean_outliers(data):
   """
   Cleans the dataset from outliers.
   The first quantile and last quantile outliers are replaced by the median without outliers.
   """
   cleaned_data = np.copy(data)
  
   for i in range(data.shape[1]):  # Iterate over features/columns
       feature = data[:, i]
      
       # Compute the first and last 3% and IQR
       Q1 = np.nanpercentile(feature, 3)
       Q3 = np.nanpercentile(feature, 97)
       IQR = Q3 - Q1
      
       # Identify the outliers
       outlier_mask = (feature < (Q1 - 1.5 * IQR)) | (feature > (Q3 + 1.5 * IQR))
      
       # Compute the median of the data without outliers
       median_without_outliers = np.nanmedian(feature[~outlier_mask])
      
       # Replace outliers with this median
       cleaned_data[outlier_mask, i] = median_without_outliers
      
   return cleaned_data


In [45]:
x_train = clean_outliers(x_train)
x_test = clean_outliers(x_test)

In [46]:
x_train

array([[ 2.,  1.,  5., ..., nan, nan,  2.],
       [ 4., 88., 88., ..., nan, nan, nan],
       [ 2., 77., 77., ...,  1.,  2.,  2.],
       ...,
       [ 3., 88.,  1., ...,  2.,  2.,  2.],
       [ 3., 88., 88., ..., nan, nan,  2.],
       [ 2.,  7.,  7., ..., nan, nan,  2.]])

In [47]:
def remove_small_variance_features(data_train, data_test):
    cleaned_data_train = np.copy(data_train)
    cleaned_data_test = np.copy(data_test)
    # Calculate the variance for each feature
    variances_over_means = np.abs(np.nanvar(cleaned_data_train, axis=0)
                                  /
                                  np.nanmean(cleaned_data_train, axis=0)
                                  )
    
    # Set your threshold for variance (e.g., 0.01)
    threshold = 0.01
    
    # Find feature indices that meet the threshold
    features_to_keep = variances_over_means >= threshold
    
    # Keep only the features with variance above the threshold
    data_reduced_train = cleaned_data_train[:, features_to_keep]
    data_reduced_test = cleaned_data_test[:, features_to_keep]
    
    return data_reduced_train, data_reduced_test
        
        

In [48]:
x_train, x_test = remove_small_variance_features(x_train, x_test)


In [49]:
x_train

array([[ 2.,  1.,  5., ..., nan, nan,  2.],
       [ 4., 88., 88., ..., nan, nan, nan],
       [ 2., 77., 77., ...,  1.,  2.,  2.],
       ...,
       [ 3., 88.,  1., ...,  2.,  2.,  2.],
       [ 3., 88., 88., ..., nan, nan,  2.],
       [ 2.,  7.,  7., ..., nan, nan,  2.]])

In [50]:
x_train_averages = np.nanmean(x_train, axis=0)
x_test_averages = np.nanmean(x_test, axis=0)

In [51]:
# Replace nan values with the computed means for each feature
for i in range(x_train.shape[1]):
   x_train[np.isnan(x_train[:, i]), i] = x_train_averages[i]
   
# Replace nan values with the computed means for each feature
for i in range(x_test.shape[1]):
   x_test[np.isnan(x_test[:, i]), i] = x_test_averages[i]


In [52]:
x_train_std_dev = np.std(x_train, axis=0)
x_train = np.delete(x_train, np.where(x_train_std_dev == 0), axis=1)
x_test = np.delete(x_test, np.where(x_train_std_dev == 0), axis=1)
x_test_std_dev = np.std(x_test, axis=0)
x_train = np.delete(x_train, np.where(x_test_std_dev == 0), axis=1)
x_test = np.delete(x_test, np.where(x_test_std_dev == 0), axis=1)

In [53]:
# Z-score normalization
def z_score_normalization(data):
   mean_vals = np.mean(data, axis=0)
   std_dev = np.std(data, axis=0)
   return (data - mean_vals) / std_dev


x_train = z_score_normalization(x_train)
x_test = z_score_normalization(x_test)

In [54]:
x_train

array([[-5.13164612e-01, -1.61190630e+00, -1.66692700e+00, ...,
         0.00000000e+00,  5.37176464e-16,  2.44635873e-02],
       [ 1.28108930e+00,  7.36126946e-01,  6.49440401e-01, ...,
         0.00000000e+00,  5.37176464e-16,  0.00000000e+00],
       [-5.13164612e-01,  4.39249180e-01,  3.42451951e-01, ...,
        -8.59212775e-01, -2.46031587e-01,  2.44635873e-02],
       ...,
       [ 3.83962342e-01,  7.36126946e-01, -1.77855916e+00, ...,
        -1.93109791e-01, -2.46031587e-01,  2.44635873e-02],
       [ 3.83962342e-01,  7.36126946e-01,  6.49440401e-01, ...,
         0.00000000e+00,  5.37176464e-16,  2.44635873e-02],
       [-5.13164612e-01, -1.44997297e+00, -1.61111091e+00, ...,
         0.00000000e+00,  5.37176464e-16,  2.44635873e-02]])

In [55]:
def remove_features_with_small_pearson_correlation(data_train, target_train, data_test):
    cleaned_data_train = np.copy(data_train)
    cleaned_data_test = np.copy(data_test)
    
    pearson_coeffs = get_pearson_coefficients(data_train, target_train)
    
    threshold = 0.01
    
    # Find feature indices that meet the threshold
    features_to_keep = np.abs(pearson_coeffs) >= threshold
    
    # Keep only the features with variance above the threshold
    data_reduced_train = cleaned_data_train[:, features_to_keep]
    data_reduced_test = cleaned_data_test[:, features_to_keep]
    
    return data_reduced_train, data_reduced_test
    

In [56]:
def remove_features_with_small_spearman_correlation(data_train, target_train, data_test):
    cleaned_data_train = np.copy(data_train)
    cleaned_data_test = np.copy(data_test)
    
    spearman_coeffs = get_spearman_coefficients(data_train, target_train)

    threshold = 0.01
    
    # Find feature indices that meet the threshold
    features_to_keep = np.abs(spearman_coeffs) >= threshold
    
    # Keep only the features with variance above the threshold
    data_reduced_train = cleaned_data_train[:, features_to_keep]
    data_reduced_test = cleaned_data_test[:, features_to_keep]
    
    return data_reduced_train, data_reduced_test

In [57]:
x_train, x_test = remove_features_with_small_pearson_correlation(x_train, y_train, x_test)
x_train, x_test = remove_features_with_small_spearman_correlation(x_train, y_train, x_test)

In [58]:
x_test

array([[-5.13108560e-01,  7.35319805e-01, -2.60790151e-16, ...,
        -8.52901726e-01, -8.46366394e-01,  2.33658706e-02],
       [-1.41237633e+00, -1.55854667e+00,  1.17838179e+00, ...,
        -5.98026484e-16,  2.70090630e-16,  2.33658706e-02],
       [-5.13108560e-01,  7.35319805e-01, -2.60790151e-16, ...,
        -8.52901726e-01, -8.46366394e-01,  2.33658706e-02],
       ...,
       [-5.13108560e-01,  7.35319805e-01,  1.17838179e+00, ...,
         4.53364197e+00,  4.01915384e+00,  0.00000000e+00],
       [ 1.28542697e+00, -1.55854667e+00,  1.17838179e+00, ...,
        -8.52901726e-01, -8.46366394e-01,  2.33658706e-02],
       [ 3.86159205e-01, -1.58553334e+00,  1.17838179e+00, ...,
        -5.98026484e-16,  2.70090630e-16,  2.33658706e-02]])

# Machine learning

## Splitting train sets in two

In [59]:
x_train1, x_test1, y_train1, y_test1 = train_test_split(x_train, y_train, test_size=0.33, random_state=42)

## Running each algo once

In [60]:
THRESHOLD = 0
create_csv = False

In [61]:
# Ridge regression
lambda_ = 0.0001
w, loss = ridge_regression(y_train1, x_train1, lambda_)
y_pred = x_test1 @ w
y_pred[y_pred >= THRESHOLD] = 1
y_pred[y_pred < THRESHOLD] = -1
f1 = f1_score(y_test1, y_pred)
accuracy = accuracy_score(y_test1,y_pred)
print(f"Current model: Ridge Regression")
print(f"F1_score: {f1}")
print(f"Accuracy: {accuracy}")
if create_csv:
    create_csv_submission(test_ids, y_pred, "ridge_regression_test_03.csv")

Current model: Ridge Regression
F1_score: 0.3195850733176973
Accuracy: 0.6704714411044927


In [62]:
# Logistic regression
max_iters = 100
gamma = 0.1
initial_w = np.zeros(x_train.shape[1])
w, loss = logistic_regression(y_train1, x_train1, initial_w=initial_w,max_iters=max_iters, gamma=gamma)
y_pred = x_test1 @ w
y_pred[y_pred >= THRESHOLD] = 1
y_pred[y_pred < THRESHOLD] = -1
f1 = f1_score(y_test1, y_pred)
accuracy = accuracy_score(y_test1,y_pred)
print(f"Current model: Logistic Regression")
print(f"F1_score: {f1}")
print(f"Accuracy: {accuracy}")
if create_csv:
    create_csv_submission(test_ids, y_pred, os.path.join(PREDICTIONS_DIR, "logistic_regression_test_03.csv"))

Current model: Logistic Regression
F1_score: 0.3207653130385949
Accuracy: 0.67608625386711


In [66]:
# Reg Logistic regression
lambda_ = 0.0001
max_iters = 1000000
gamma = 0.5
initial_w = np.zeros(x_train.shape[1])
w, loss = reg_logistic_regression(y_train1, x_train1, lambda_=lambda_, initial_w=initial_w, max_iters=max_iters, gamma=gamma)
y_pred = x_test1 @ w
y_pred[y_pred >= THRESHOLD] = 1
y_pred[y_pred < THRESHOLD] = -1
f1 = f1_score(y_test1, y_pred)
accuracy = accuracy_score(y_test1,y_pred)
print(f"Current model: Reg Logistic Regression")
print(f"F1_score: {f1}")
print(f"Accuracy: {accuracy}")
if create_csv:
    create_csv_submission(test_ids, y_pred, os.path.join(PREDICTIONS_DIR, "reg_logistic_regression_test_03.csv"))

Current model: Reg Logistic Regression
F1_score: 0.34972899728997286
Accuracy: 0.7340905942651337


In [68]:
create_csv_submission(test_ids, y_pred, os.path.join(PREDICTIONS_DIR, "reg_logistic_regression_test_03.csv"))

In [64]:
# Least Squares
w, loss = least_squares(y_train1, x_train1)
y_pred = x_test1 @ w
y_pred[y_pred >= THRESHOLD] = 1
y_pred[y_pred < THRESHOLD] = -1
f1 = f1_score(y_test1, y_pred)
accuracy = accuracy_score(y_test1,y_pred)
print(f"Current model: Least Squares")
print(f"F1_score: {f1}")
print(f"Accuracy: {accuracy}")
if create_csv:
    create_csv_submission(test_ids, y_pred, os.path.join(PREDICTIONS_DIR, "least_squares_test_03.csv"))

Current model: Least Squares
F1_score: 0.3198626871364546
Accuracy: 0.6706561388927368
