In [39]:
import os
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression, Lasso, LassoCV
from sklearn.metrics import accuracy_score, balanced_accuracy_score, precision_score, recall_score, f1_score, precision_recall_curve
import matplotlib.pyplot as plt
from joblib import dump, load
from sklearn.utils.validation import column_or_1d

In [2]:
# ANY PARAMETERS?

In [3]:
scriptDir = os.getcwd() # Assumes that script is executed from its actual location
relPath = r"../output/" # And data is located in a 'data' folder in the same parent directory as 'code' folder
XtrainFilePath = os.path.join(scriptDir, relPath, "X_train_pp.csv")
ytrainFilePath = os.path.join(scriptDir, relPath, "y_train.csv")
XvalFilePath = os.path.join(scriptDir, relPath, "X_val_pp.csv")
yvalFilePath = os.path.join(scriptDir, relPath, "y_val.csv")
XtestFilePath = os.path.join(scriptDir, relPath, "X_test_pp.csv")
columnsNamesPath = os.path.join(scriptDir, relPath, "column_names.csv")

relPathOutput = r"../output/"
outputFolderPath = os.path.join(scriptDir, relPathOutput)

In [4]:
columns = pd.read_csv(columnsNamesPath,header=None)
columns = list(columns[0])

In [5]:
#X_val = np.loadtxt(XvalFilePath, delimiter=',')
# check loading as pandas dataframe instead?
# it's fine, can use either in same way
#X_val = pd.read_csv(XvalFilePath, delimiter=',',header=None) # add column names if useful
#X_train = np.loadtxt(XtrainFilePath, delimiter=',') # seems to take ages
X_train = pd.read_csv(XtrainFilePath, delimiter=',',header=None)

In [6]:
X_train.shape

(2430981, 547)

In [7]:
#y_val = np.loadtxt(yvalFilePath, delimiter=',')
#y_val = pd.read_csv(yvalFilePath, delimiter=',',header=None)
#y_train = np.loadtxt(ytrainFilePath, delimiter=',')
y_train = pd.read_csv(ytrainFilePath, delimiter=',',header=None)

In [8]:
# No sampling required for this model (so far)

In [68]:
# TRAIN LOGISTIC REGRESSION MODEL
#clf = Lasso().fit(X_train, np.ravel(y_train))
# I GET MEMORY ERRORS IF TRY TO TRAIN ON FULL DATASET

In [None]:
# Ok, try repeated sampling/training instead

In [12]:
# Take all positive classes
X_train_1 = X_train[y_train[0] == 1]
y_train_1 = y_train[y_train[0] == 1]

In [84]:
n = y_train_1.shape[0]
repeats = 100
scores = np.zeros((repeats, 5))
coeffsAll = np.zeros((X_train.shape[1], repeats))
for i in range(repeats):
    # random downsample
    idx = y_train[y_train[0] == 0].sample(n=n).index
    X_train_0 = X_train.loc[idx]
    y_train_0 = y_train.loc[idx]
    X_train_ds = pd.concat([X_train_0, X_train_1])
    y_train_ds = pd.concat([y_train_0, y_train_1])
    #clf = Lasso(alpha=0.1).fit(X_train_ds, y_train_ds)
    clf = LogisticRegression(solver='liblinear', penalty='l1').fit(X_train_ds, np.ravel(y_train_ds))
    coeffsAll[:,i] = clf.coef_

In [98]:
# Count non-zero elements
nzCounts = np.count_nonzero(coeffsAll, axis=1)
#nzSum = np.sum(coeffsAll, axis=1) # could also take top n elements based on coeff size
# Save the list of columns to keep (boolean array), or named column list
keep = np.where(nzCounts > 10, True, False)
selectedColumnsLogReg = np.array(columns)[keep]

In [99]:
keep.sum()

397

In [100]:
np.savetxt(r"../output/coeffsAll_LogReg.csv", coeffsAll, fmt='%s')
np.savetxt(r"../output/selectedColumns_LogReg.csv", selectedColumnsLogReg, fmt='%s')

In [69]:
# Also try with LassoCV
# (Using Lasso, all the coeffs are zero, so use CV version)
n = y_train_1.shape[0]
repeats = 100
scores = np.zeros((repeats, 5))
coeffsAll = np.zeros((X_train.shape[1], repeats))
for i in range(repeats):
    # random downsample
    idx = y_train[y_train[0] == 0].sample(n=n).index
    X_train_0 = X_train.loc[idx]
    y_train_0 = y_train.loc[idx]
    X_train_ds = pd.concat([X_train_0, X_train_1])
    y_train_ds = pd.concat([y_train_0, y_train_1])
    clf = LassoCV(cv=3).fit(X_train_ds, np.ravel(y_train_ds))
    coeffsAll[:,i] = clf.coef_

In [81]:
# Count non-zero elements
nzCounts = np.count_nonzero(coeffsAll, axis=1)
#nzSum = np.sum(coeffsAll, axis=1) # could also take top n elements based on coeff size
# Save the list of columns to keep (boolean array), or named column list
keep = np.where(nzCounts > 5, True, False)
selectedColumnsLasso = np.array(columns)[keep]

In [82]:
keep.sum()

232

In [83]:
np.savetxt(r"../output/coeffsAll_Lasso.csv", coeffsAll, fmt='%s')
np.savetxt(r"../output/selectedColumns_Lasso.csv", selectedColumnsLasso, fmt='%s')

In [None]:
# Should I repeat the training until I have covered the full training set?
# If each iteration uses ~2000 records, it would take ~1200 slices (not random) to cover the full dataset
# For logReg, maybe need to be more stringent than taking any feature that had a non-zero coef in any run 
#  - in 10 runs this is giving about 400 features