In [13]:
import os
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, balanced_accuracy_score, precision_score, recall_score, f1_score, precision_recall_curve, confusion_matrix, roc_auc_score
import matplotlib.pyplot as plt
from joblib import dump, load
from sklearn.utils.validation import column_or_1d

In [2]:
REDUCE_FEATURES = False

In [3]:
scriptDir = os.getcwd() # Assumes that script is executed from its actual location
relPath = r"../output/"
XtrainFilePath = os.path.join(scriptDir, relPath, "X_train_pp.csv")
ytrainFilePath = os.path.join(scriptDir, relPath, "y_train.csv")
XvalFilePath = os.path.join(scriptDir, relPath, "X_val_pp.csv")
yvalFilePath = os.path.join(scriptDir, relPath, "y_val.csv")
XtestFilePath = os.path.join(scriptDir, relPath, "X_test_pp.csv")
columnsNamesPath = os.path.join(scriptDir, relPath, "column_names.csv")
selectedColumnsNamesPath = os.path.join(scriptDir, relPath, "selectedColumns_Lasso.csv")

relPathOutput = r"../output/"
outputFolderPath = os.path.join(scriptDir, relPathOutput)

In [4]:
columns = pd.read_csv(columnsNamesPath,header=None)
columns = list(columns[0])

In [5]:
# Load features
X_train = pd.read_csv(XtrainFilePath, delimiter=',',header=None, names=columns)

In [6]:
# Check dimensions as expected
X_train.shape

(2430981, 177)

In [7]:
if REDUCE_FEATURES:
    selectedColumns = pd.read_csv(selectedColumnsNamesPath,header=None)
    selectedColumns = list(selectedColumns[0])
    X_train = X_train[selectedColumns]
    print(X_train.shape)

In [8]:
# Load labels
y_train = pd.read_csv(ytrainFilePath, delimiter=',',header=None)

In [9]:
# Train logistic regression model
clf = LogisticRegression(solver='liblinear').fit(X_train, y_train[0].ravel())

In [10]:
# SAVE MODEL
#dump(clf, r"../output/logRes_3_liblinear_l2_20190228.joblib")
#clf = load(r"../output/logRes_3_liblinear_l2_20190228.joblib")

In [11]:
clf.get_params() # check params of loaded model, confirm

{'C': 1.0,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'max_iter': 100,
 'multi_class': 'warn',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': None,
 'solver': 'liblinear',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

In [12]:
# Test on the training data
predictions = clf.predict(X_train)
predictionProb = clf.predict_proba(X_train) # note this gives probability for each class

In [14]:
np.sum(predictions)

54

In [15]:
np.sum(predictionProb[:,1])

1796.1144707583194

In [16]:
# histogram this?

In [18]:
accuracy = accuracy_score(y_train, predictions)
bAccuracy = balanced_accuracy_score(y_train, predictions)
precision = precision_score(y_train, predictions)
recall = recall_score(y_train, predictions)
f1 = f1_score(y_train, predictions)
tn, fp, fn, tp = np.reshape(confusion_matrix(y_train, predictions), (4,))
rocAuc = roc_auc_score(y_train, predictions)
print(accuracy, bAccuracy, precision, recall, f1, tn, fp, fn, tp, rocAuc)

0.9992484515510405 0.5027795658962866 0.18518518518518517 0.005577244841048522 0.010828370330265295 2429144 44 1783 10 0.5027795658962865


In [19]:
#prc = precision_recall_curve(y_sample, predictionProb[:,1])

In [20]:
# Test on the validation data
X_val = pd.read_csv(XvalFilePath, delimiter=',',header=None, names=columns)
y_val = pd.read_csv(yvalFilePath, delimiter=',',header=None)

In [21]:
if REDUCE_FEATURES:
    X_val = X_val[selectedColumns]

In [22]:
predictions = clf.predict(X_val)
predictionProb = clf.predict_proba(X_val) # note this gives probability for each class

In [25]:
accuracy = accuracy_score(y_val, predictions)
bAccuracy = balanced_accuracy_score(y_val, predictions)
precision = precision_score(y_val, predictions)
recall = recall_score(y_val, predictions)
f1 = f1_score(y_val, predictions)
tn, fp, fn, tp = np.reshape(confusion_matrix(y_val, predictions), (4,))
rocAuc = roc_auc_score(y_val, predictions)
print(accuracy, bAccuracy, precision, recall, f1, tn, fp, fn, tp, rocAuc)

0.9993254914863865 0.5024686625772838 0.2 0.0049504950495049506 0.00966183574879227 303719 4 201 1 0.5024686625772838


In [47]:
# Produce probabilities for the testing set
X_test = pd.read_csv(XtestFilePath, delimiter=',',header=None, names=columns)

In [48]:
if REDUCE_FEATURES:
    X_test = X_test[selectedColumns]

In [49]:
predictions = clf.predict(X_test)
predictionProb = clf.predict_proba(X_test) # note this gives probability for each class

In [50]:
#np.savetxt(r"../output/logRes_3_liblinear_l2_20190228_OUT.csv", predictionProb[:,1])