In [20]:
import os
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, balanced_accuracy_score, precision_score, recall_score, f1_score, precision_recall_curve, confusion_matrix
import matplotlib.pyplot as plt
from joblib import dump, load
from sklearn.utils.validation import column_or_1d

In [21]:
# Toggle: when True, the train/validation/test feature matrices are restricted
# to the columns listed in selectedColumns_Lasso.csv before fitting and scoring.
REDUCE_FEATURES = True

In [22]:
# All locations are resolved against the current working directory, so the
# notebook has to be launched from the folder it lives in.
scriptDir = os.getcwd()
relPath = r"../output/"


def _fromOutput(fileName):
    """Return the path of `fileName` inside the shared input/output folder."""
    return os.path.join(scriptDir, relPath, fileName)


# Pre-processed feature matrices and their label files
XtrainFilePath = _fromOutput("X_train_pp.csv")
ytrainFilePath = _fromOutput("y_train.csv")
XvalFilePath = _fromOutput("X_val_pp.csv")
yvalFilePath = _fromOutput("y_val.csv")
XtestFilePath = _fromOutput("X_test_pp.csv")

# Column-name lists: the full feature set and the Lasso-selected subset
columnsNamesPath = _fromOutput("column_names.csv")
selectedColumnsNamesPath = _fromOutput("selectedColumns_Lasso.csv")

# Folder that receives artefacts produced by this notebook
relPathOutput = r"../output/"
outputFolderPath = os.path.join(scriptDir, relPathOutput)

In [23]:
# The names file has no header row; its first column holds one feature name per line.
columns = list(pd.read_csv(columnsNamesPath, header=None)[0])

In [24]:
# Load the training features; the CSV has no header row, so the column names
# come from the separately loaded `columns` list.
X_train = pd.read_csv(XtrainFilePath, sep=',', header=None, names=columns)

In [25]:
# Check dimensions as expected: .shape is (number of rows, number of columns)
X_train.shape

(2430981, 547)

In [26]:
if REDUCE_FEATURES:
    # Keep only the features named in the selected-columns file (no header row,
    # names in its first column), then report the reduced matrix size.
    selectedColumns = list(pd.read_csv(selectedColumnsNamesPath, header=None)[0])
    X_train = X_train.loc[:, selectedColumns]
    print(X_train.shape)

(2430981, 232)


In [27]:
# Load the training labels (header-less CSV, so the label column is named 0)
y_train = pd.read_csv(ytrainFilePath, sep=',', header=None)

In [28]:
# Train logistic regression model (liblinear solver, all other settings left
# at the library defaults).
# `.values` hands scikit-learn the label column as a plain 1-D array without
# going through Series.ravel(), which recent pandas releases deprecate.
clf = LogisticRegression(solver='liblinear').fit(X_train, y_train[0].values)

In [29]:
# SAVE MODEL
# Persist the fitted classifier with joblib; the commented-out line below is a
# template for reloading a saved model instead of retraining.
modelFilePath = r"../output/logRes_3_liblinear_l2_20190228.joblib"
dump(clf, modelFilePath)
# clf = load(r"../output/xxxxx.joblib")

['../output/logRes_3_liblinear_l2_20190228.joblib']

In [30]:
clf.get_params() # show the estimator's hyper-parameters to confirm the settings of the fitted (or reloaded) model

{'C': 1.0,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'max_iter': 100,
 'multi_class': 'warn',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': None,
 'solver': 'liblinear',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

In [31]:
# Score the training data with the fitted model.
# predict_proba returns one probability column per class; predict returns the
# hard class labels.
predictionProb = clf.predict_proba(X_train)
predictions = clf.predict(X_train)

In [32]:
# Sum of the predicted labels; this equals the number of predicted positives
# only if the classes are encoded as 0/1 -- TODO confirm.
predictions.sum()

55

In [33]:
# Total probability assigned to the second class (column 1 of predict_proba)
predictionProb[:, 1].sum()

1796.3274053174553

In [34]:
# TODO: plot a histogram of the predicted probabilities (predictionProb[:,1])?

In [37]:
# Evaluate the training-set predictions against the training labels.
accuracy = accuracy_score(y_train, predictions)
bAccuracy = balanced_accuracy_score(y_train, predictions)
precision = precision_score(y_train, predictions)
recall = recall_score(y_train, predictions)
f1 = f1_score(y_train, predictions)
# Pin the label order so the matrix is always 2x2: without `labels`, a run in
# which only one class occurs in both the labels and the predictions yields a
# 1x1 matrix and the four-way unpacking fails. Assumes the classes are encoded
# as 0 and 1 (precision/recall/F1 above already use positive label 1 by default).
tn, fp, fn, tp = confusion_matrix(y_train, predictions, labels=[0, 1]).ravel()
print(accuracy, bAccuracy, precision, recall, f1, tn, fp, fn, tp)

0.9992472174813378 0.5025002919940414 0.16363636363636364 0.0050195203569436695 0.009740259740259738 2429142 46 1784 9


In [None]:
#prc = precision_recall_curve(y_sample, predictionProb[:,1])

In [41]:
# Load the validation features and labels; both CSVs are header-less, and the
# features reuse the column names applied to the training matrix.
X_val = pd.read_csv(XvalFilePath, sep=',', header=None, names=columns)
y_val = pd.read_csv(yvalFilePath, sep=',', header=None)

In [42]:
if REDUCE_FEATURES:
    # Apply the same column subset that was used for the training matrix
    X_val = X_val.loc[:, selectedColumns]

In [43]:
# Score the validation data: per-class probabilities first, then hard labels.
# Note that both names are reused from the training-set scoring.
predictionProb = clf.predict_proba(X_val)
predictions = clf.predict(X_val)

In [44]:
# Evaluate the validation-set predictions against the validation labels.
# If the model predicts no positives at all, precision and F1 are ill-defined;
# scikit-learn then reports 0.0 for them and emits a warning.
accuracy = accuracy_score(y_val, predictions)
bAccuracy = balanced_accuracy_score(y_val, predictions)
precision = precision_score(y_val, predictions)
recall = recall_score(y_val, predictions)
f1 = f1_score(y_val, predictions)
# Pin the label order so the matrix is always 2x2: without `labels`, a run in
# which only one class occurs in both the labels and the predictions yields a
# 1x1 matrix and the four-way unpacking fails. Assumes the classes are encoded
# as 0 and 1 (precision/recall/F1 above already use positive label 1 by default).
tn, fp, fn, tp = confusion_matrix(y_val, predictions, labels=[0, 1]).ravel()
print(accuracy, bAccuracy, precision, recall, f1, tn, fp, fn, tp)

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


0.9993353623426833 0.5 0.0 0.0 0.0 303723 0 202 0


In [47]:
# Load the test features (header-less CSV, same column names as the training
# matrix) so that probabilities can be produced for the testing set.
X_test = pd.read_csv(XtestFilePath, sep=',', header=None, names=columns)

In [48]:
if REDUCE_FEATURES:
    # Apply the same column subset that was used for the training matrix
    X_test = X_test.loc[:, selectedColumns]

In [49]:
# Score the test data: per-class probabilities first, then hard labels.
# Note that both names are reused from the earlier scoring cells.
predictionProb = clf.predict_proba(X_test)
predictions = clf.predict(X_test)

In [50]:
#np.savetxt(r"../output/logRes_3_liblinear_l2_20190228_OUT.csv", predictionProb[:,1])