## Logistic Regression Training - Balanced

In [None]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression

from sklearn.metrics import roc_auc_score#, RocCurveDisplay
import matplotlib.pyplot as plt

# Use joblib to pickle the models (suggested by sklearn)
from joblib import dump, load

In [None]:
# Locations of the balanced training split: one CSV of features and one CSV
# of labels. The literals are split across lines purely for readability;
# adjacent string literals are joined into a single path.
training_feats_filepath = (
    "~/scratch/datasets/yale_new_haven/training_test_sets/balanced_dataset/"
    "features/normalized_preprocessing/regression_nn/"
    "yale_new_haven_balanced_training_features.csv"
)
training_labels_filepath = (
    "~/scratch/datasets/yale_new_haven/training_test_sets/balanced_dataset/"
    "labels/"
    "yale_new_haven_balanced_training_labels.csv"
)

In [None]:
# Load the training features and their labels into DataFrames.
X_train = pd.read_csv(filepath_or_buffer=training_feats_filepath)
y_train = pd.read_csv(filepath_or_buffer=training_labels_filepath)

In [None]:
# Set the 'ID' column aside (as int32) and remove it from the feature matrix
# so that it is not fed to the model as a feature.
train_ids = X_train.loc[:, 'ID'].astype('int32')
X_train = X_train.drop(columns=['ID'])

In [None]:
# LogisticRegression expects the labels as a 1D array, so pull the single
# label column (named '0' in the labels CSV) out of the DataFrame.
y_train = y_train['0'].to_numpy()

In [None]:
# Solver notes:
#   - all solvers ('liblinear', 'lbfgs', 'newton-cg', 'sag', 'saga') support l2 reg
#   - only liblinear and saga support l1 reg
#   - only sag and saga are fast for large datasets
#
# sag or saga might be best because they are good for large datasets with sparse features

solver = 'sag'        # learning algorithm
tol = 1e-4            # error tolerance
C = 1                 # inverse of regularization strength
fit_intercept = True  # whether constant is added for intercept
random_state = 2      # used when solver == 'sag', 'saga' or 'liblinear' to shuffle the data; if None, then it's random
max_iter = 1000       # upper bound on solver iterations

# Every hyperparameter is taken from the variables above. In particular the
# solver is passed through `solver` rather than repeated as a literal, so that
# changing the variable actually changes the model being trained.
lr = LogisticRegression(
    solver=solver,
    tol=tol,
    C=C,
    fit_intercept=fit_intercept,
    random_state=random_state,
    max_iter=max_iter,
)

In [None]:
# Train the model on the balanced training set; fit() hands back the fitted
# estimator, which is kept under the name `clf`.
clf = lr.fit(X=X_train, y=y_train)

In [None]:
# Score the fitted model on the same data it was trained on (for sklearn
# classifiers, score() is the mean accuracy). This is a training-set figure,
# not an estimate of performance on unseen data.
clf.score(X=X_train, y=y_train)

In [None]:
# Destination file for the serialized classifier.
# NOTE(review): "regresion" is misspelled in the filename; it is left as-is
# because other code may load the model from this exact path.
clf_filepath = (
    "/home/mila/d/david.hobson/scratch/models/balanced/"
    "yale_reproduce/normalized_data/"
    "logistic_regresion_balanced.joblib"
)

In [None]:
# Serialize the fitted classifier to disk with joblib.
dump(value=clf, filename=clf_filepath)