In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, balanced_accuracy_score, precision_score, recall_score, f1_score, precision_recall_curve, confusion_matrix
import matplotlib.pyplot as plt
from joblib import dump, load
from sklearn.utils.validation import column_or_1d

In [2]:
# ANY PARAMETERS?

In [3]:
scriptDir = os.getcwd() # Assumes that script is executed from its actual location
relPath = r"../output/" # And data is located in a 'data' folder in the same parent directory as 'code' folder
XtrainFilePath = os.path.join(scriptDir, relPath, "X_train_pp.csv")
ytrainFilePath = os.path.join(scriptDir, relPath, "y_train.csv")
XvalFilePath = os.path.join(scriptDir, relPath, "X_val_pp.csv")
yvalFilePath = os.path.join(scriptDir, relPath, "y_val.csv")
XtestFilePath = os.path.join(scriptDir, relPath, "X_test_pp.csv")
columnsNamesPath = os.path.join(scriptDir, relPath, "column_names.csv")

relPathOutput = r"../output/"
outputFolderPath = os.path.join(scriptDir, relPathOutput)

In [4]:
columns = pd.read_csv(columnsNamesPath,header=None)
columns = list(columns[0])

In [5]:
# load features data
X_train = pd.read_csv(XtrainFilePath, delimiter=',',header=None)

In [6]:
# check expected dimensions
X_train.shape

(2430981, 547)

In [7]:
# Load labels
y_train = pd.read_csv(ytrainFilePath, delimiter=',',header=None)

In [8]:
# how many positive labels?
n = y_train[0].sum()
n
# this is how many samples we'll take from data where the label = 0
# giving a dataset containing 50% 1s, and 50% 0s

1793

In [9]:
# Downsampling
# take all the records with label = 1
X_train_1 = X_train[y_train[0] == 1]
y_train_1 = y_train[y_train[0] == 1]

In [10]:
# now get n random records with label = 0
idx = y_train[y_train[0] == 0].sample(n=n).index
X_train_0 = X_train.loc[idx]
y_train_0 = y_train.loc[idx] 

In [11]:
# combine to create training set
X_train_ds = pd.concat([X_train_0, X_train_1])
y_train_ds = pd.concat([y_train_0, y_train_1])

In [12]:
# check sizes
X_train_ds.shape

(3586, 547)

In [13]:
y_train_ds.shape

(3586, 1)

In [15]:
#####################################################
# SINGLE PASS EXAMPLE
clf = LogisticRegression(solver='liblinear').fit(X_train_ds, y_train_ds[0].ravel())

In [16]:
# save model
dump(clf, r"../output/logRes_2_liblinear_l2_ds_20190228.joblib")
# clf = load(r"../output/xxx.joblib")

['../output/logRes_2_liblinear_l2_ds_20190228.joblib']

In [17]:
clf.get_params() # check params of loaded model, confirm

{'C': 1.0,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'max_iter': 100,
 'multi_class': 'warn',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': None,
 'solver': 'liblinear',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

In [19]:
# Test on the validation data
X_val = pd.read_csv(XvalFilePath, delimiter=',',header=None)
y_val = pd.read_csv(yvalFilePath, delimiter=',',header=None)

In [20]:
predictions = clf.predict(X_val)
predictionProb = clf.predict_proba(X_val) # note this gives probability for each class

In [21]:
# calculate metrics
accuracy = accuracy_score(y_val, predictions)
bAccuracy = balanced_accuracy_score(y_val, predictions)
precision = precision_score(y_val, predictions)
recall = recall_score(y_val, predictions)
f1 = f1_score(y_val, predictions)
tn, fp, fn, tp = np.reshape(confusion_matrix(y_val, predictions), (4,))
print(accuracy, bAccuracy, precision, recall, f1, tn, fp, fn, tp)

0.4849189767212306 0.5691361132438844 0.0008428688189619942 0.6534653465346535 0.0016835660991008224 147247 156476 70 132


In [22]:
# how many zeros / non-zeros do we have?
# total attributes, and non-zero count
print(clf.coef_.shape[1], np.where(clf.coef_ == 0.0, 0, 1).sum())
# TODO check coefficients against the full column list - see what is zero (may remove later)

547 499


In [23]:
#####################################################
# Repeated random down-sampling

In [24]:
repeats = 100
scores = np.zeros((repeats, 5))
predictionsProbAll = np.zeros((y_val.shape[0], repeats))
for i in range(repeats):
    # random downsample
    idx = y_train[y_train[0] == 0].sample(n=n).index
    X_train_0 = X_train.loc[idx]
    y_train_0 = y_train.loc[idx]
    X_train_ds = pd.concat([X_train_0, X_train_1])
    y_train_ds = pd.concat([y_train_0, y_train_1])
    # train, predict, score
    clf = LogisticRegression(solver='liblinear' , penalty='l2').fit(X_train_ds, y_train_ds.ravel())
#     predictions = clf.predict(X_val)
    predictionsProb = clf.predict_proba(X_val)
    predictionsProbAll[:,i] = predictionsProb[:,1] # just prob for label=1
#     accuracy = accuracy_score(y_val, predictions)
#     bAccuracy = balanced_accuracy_score(y_val, predictions)
#     precision = precision_score(y_val, predictions)
#     recall = recall_score(y_val, predictions)
#     f1 = f1_score(y_val, predictions)
#     scores[i, :] = np.array([accuracy, bAccuracy, precision, recall, f1])

In [25]:
# take average predicted probability
predictionsProbMean = np.mean(predictionsProbAll,axis=1)

In [26]:
# convert this into class predictions using standard threshold of 0.5
predictionsMean = np.where(predictionsProbMean>0.5,1,0)

In [27]:
# calculate metrics
accuracy = accuracy_score(y_val, predictionsMean)
bAccuracy = balanced_accuracy_score(y_val, predictionsMean)
precision = precision_score(y_val, predictionsMean)
recall = recall_score(y_val, predictionsMean)
f1 = f1_score(y_val, predictionsMean)
tn, fp, fn, tp = np.reshape(confusion_matrix(y_val, predictionsMean), (4,))
print(accuracy, bAccuracy, precision, recall, f1, tn, fp, fn, tp)

0.5730624331660772 0.5810803359353329 0.000916844513956839 0.5891089108910891 0.001830839647678757 174049 129674 83 119


In [28]:
# Produce probabilities for the testing set
X_test = pd.read_csv(XtestFilePath, delimiter=',',header=None)

In [29]:
predictions = clf.predict(X_test)
predictionProb = clf.predict_proba(X_test) # note this gives probability for each class

In [30]:
np.savetxt(r"../output/logRes_2_liblinear_l2_ds_20190228_OUT.csv", predictionProb[:,1])