In [1]:
import scipy.stats as ss, pandas as pd, numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, precision_score, f1_score, recall_score, accuracy_score

In [2]:
# Number of hyperparameter configurations drawn for the random search.
N = 15
# Seed handed to np.random.seed right before the hyperparameters are sampled.
SEED = 1

class DictDist:
    """Joint sampler built from a dictionary of independent samplers.

    Every value of ``dict_of_rvs`` must expose ``rvs(n)`` and return an
    indexable sequence holding ``n`` draws.
    """

    def __init__(self, dict_of_rvs):
        self.dict_of_rvs = dict_of_rvs

    def rvs(self, n):
        """Draw ``n`` joint samples.

        Returns a list of ``n`` dicts, each with the same keys as
        ``dict_of_rvs`` and one drawn value per key.
        """
        # Sample each key's full column first (in dict order), then pivot the
        # columns into one record per draw.
        columns = {}
        for name, dist in self.dict_of_rvs.items():
            columns[name] = dist.rvs(n)
        return [
            {name: draws[row] for name, draws in columns.items()}
            for row in range(n)
        ]
    
class Choice:
    """Sampler that picks uniformly from a fixed sequence of options."""

    def __init__(self, options):
        self.options = options

    def rvs(self, n):
        """Return a list of ``n`` options, each picked independently at random."""
        # Draw integer positions in [0, len(options)) and map them to options.
        index_dist = ss.randint(0, len(self.options))
        picks = []
        for idx in index_dist.rvs(n):
            picks.append(self.options[idx])
        return picks

In [3]:
# Labels: the two label files are pooled into one flat array so a fresh
# train/validation split can be drawn below. The raw frames get their own names
# so that y_train / y_valid only ever refer to the split arrays.
y_train_raw = pd.read_csv('Y_train.csv', index_col=0)
y_valid_raw = pd.read_csv('Y_valid.csv', index_col=0)
y = np.asarray(pd.concat([y_train_raw, y_valid_raw])).flatten()

# Features: the test frame is kept apart; train and valid are pooled in the
# same (train, valid) order as the labels above.
X_test = pd.read_csv('X_test_cc.csv', index_col=0)
x_train = pd.read_csv('X_train_cc.csv', index_col=0)
x_valid = pd.read_csv('X_valid_cc.csv', index_col=0)
X = pd.concat([x_train, x_valid])

# Seed the shuffle so that Restart & Run All reproduces the same 90/10
# partition; without random_state every run scores on a different split.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.1, random_state=SEED
)

In [4]:
# Search space for logistic regression. The key order is left untouched because
# DictDist samples the keys in dict order from the global NumPy random stream.
LR_dist = DictDist({
    'C': Choice(np.geomspace(1e-3, 1e3, 10000)),  # log-spaced grid of C values
    'penalty': Choice(['l1', 'l2']),
    'solver': Choice(['lbfgs', 'saga']),
    'max_iter': Choice([500]),  # single option: effectively a constant
})

np.random.seed(SEED)
LR_hyperparams_list = LR_dist.rvs(N)

# scikit-learn's lbfgs solver does not support the l1 penalty, so every lbfgs
# draw is coerced to l2 to keep all sampled configurations valid.
for sampled in LR_hyperparams_list:
    if sampled['solver'] == 'lbfgs':
        sampled['penalty'] = 'l2'

In [25]:
def metrics(y_true, y_pred):
    """Score hard label predictions with precision, recall, F1 and accuracy.

    Args:
        y_true: ground-truth labels.
        y_pred: predicted labels, aligned with ``y_true``.

    Returns:
        dict mapping each scorer's function name (``"precision_score"``,
        ``"recall_score"``, ``"f1_score"``, ``"accuracy_score"``) to its value.
    """
    scorers = (precision_score, recall_score, f1_score, accuracy_score)
    return {scorer.__name__: scorer(y_true, y_pred) for scorer in scorers}


def run_basic(model, hyperparams_list, X_train, X_valid, y_train=None, y_valid=None):
    """Pick the best hyperparameters by validation ROC AUC, then refit and predict.

    Each candidate is fit inside an impute -> scale -> ``model`` pipeline on the
    training split and scored by ROC AUC on the validation split. The first
    candidate with the strictly highest score wins and is handed to
    ``run_only_final``.

    Args:
        model: estimator class; instantiated as ``model(**hyperparams)``.
        hyperparams_list: non-empty sequence of keyword-argument dicts.
        X_train: training features.
        X_valid: validation features.
        y_train: training labels. When omitted, the notebook-level ``y_train``
            is used, which keeps existing four-argument calls working.
        y_valid: validation labels. When omitted, the notebook-level
            ``y_valid`` is used.

    Returns:
        Whatever ``run_only_final`` returns for the winning hyperparameters.

    Raises:
        ValueError: if ``hyperparams_list`` is empty.
    """
    # Backward compatibility: older call sites pass only the feature frames and
    # rely on the label arrays living in the notebook namespace.
    if y_train is None:
        y_train = globals()["y_train"]
    if y_valid is None:
        y_valid = globals()["y_valid"]

    # Without candidates there is nothing to select; fail with a clear message
    # instead of an opaque ``model(**None)`` TypeError further down.
    if len(hyperparams_list) == 0:
        raise ValueError("hyperparams_list must contain at least one candidate")

    # np.inf rather than np.Inf: the capitalised alias was removed in NumPy 2.0.
    best_s, best_hyperparams = -np.inf, None
    for i, hyperparams in enumerate(hyperparams_list):
        print("On sample %d / %d (hyperparams = %s)" % (i+1, len(hyperparams_list), repr((hyperparams))))
        pipeline = make_pipeline(SimpleImputer(), StandardScaler(), model(**hyperparams))
        pipeline.fit(X_train, y_train)

        # Column 1 of predict_proba is the probability of the positive class.
        s = roc_auc_score(y_valid, pipeline.predict_proba(X_valid)[:, 1])
        if s > best_s:
            best_s, best_hyperparams = s, hyperparams
            print("New Best Score: %.2f @ hyperparams = %s" % (100*best_s, repr((best_hyperparams))))

    return run_only_final(model, best_hyperparams, X_train, X_valid, y_train, y_valid)


def run_only_final(model, best_hyperparams, X_train, X_valid, y_train, y_valid, X_test=None):
    """Refit on train + validation with the chosen hyperparameters and predict the test set.

    Args:
        model: estimator class; instantiated as ``model(**best_hyperparams)``.
        best_hyperparams: keyword-argument dict for ``model``.
        X_train, X_valid: feature frames, pooled for the final fit.
        y_train, y_valid: label arrays, pooled in the same order.
        X_test: features to predict. When omitted, the notebook-level
            ``X_test`` is used, which keeps existing six-argument calls working.

    Returns:
        Tuple ``(y_score, y_pred, res)``: positive-class probabilities for
        ``X_test``, hard predictions for ``X_test``, and a metrics dict.

    Note:
        ``res`` is computed on the same pooled train + validation rows the
        pipeline was fit on, so it is an in-sample measure, not a held-out
        estimate.
    """
    # Backward compatibility: fall back to the notebook-level test frame.
    if X_test is None:
        X_test = globals()["X_test"]

    # Build the pooled data once instead of re-concatenating for every use.
    X_all = pd.concat([X_train, X_valid])
    y_all = np.concatenate((y_train, y_valid))

    pipeline = make_pipeline(SimpleImputer(), StandardScaler(), model(**best_hyperparams))
    pipeline.fit(X_all, y_all)

    res = metrics(y_all, pipeline.predict(X_all))
    # Column 1 of predict_proba is the probability of the positive class.
    res["roc_auc_score"] = roc_auc_score(y_all, pipeline.predict_proba(X_all)[:, 1])

    y_score = pipeline.predict_proba(X_test)[:, 1]
    y_pred = pipeline.predict(X_test)

    return y_score, y_pred, res

In [27]:
# Tune and refit every (estimator class, candidate list) pair. Only logistic
# regression is registered, so score / pred / res hold its results afterwards.
for model, params in [(LogisticRegression, LR_hyperparams_list)]:
    score, pred, res = run_basic(model, params, X_train, X_valid)

On sample 1 / 15 (hyperparams = {'C': 0.001383611303681924, 'penalty': 'l2', 'solver': 'lbfgs', 'max_iter': 500})
New Best Score: 89.61 @ hyperparams = {'C': 0.001383611303681924, 'penalty': 'l2', 'solver': 'lbfgs', 'max_iter': 500}
On sample 2 / 15 (hyperparams = {'C': 1.3047026700306064, 'penalty': 'l1', 'solver': 'saga', 'max_iter': 500})




On sample 3 / 15 (hyperparams = {'C': 0.003491839757169992, 'penalty': 'l1', 'solver': 'saga', 'max_iter': 500})




On sample 4 / 15 (hyperparams = {'C': 48.783036208459954, 'penalty': 'l1', 'solver': 'saga', 'max_iter': 500})




On sample 5 / 15 (hyperparams = {'C': 0.05459762073728651, 'penalty': 'l2', 'solver': 'saga', 'max_iter': 500})




On sample 6 / 15 (hyperparams = {'C': 1.081193410945589, 'penalty': 'l1', 'solver': 'saga', 'max_iter': 500})




On sample 7 / 15 (hyperparams = {'C': 0.0012201371230349724, 'penalty': 'l1', 'solver': 'saga', 'max_iter': 500})
On sample 8 / 15 (hyperparams = {'C': 0.3429679403524682, 'penalty': 'l2', 'solver': 'lbfgs', 'max_iter': 500})
On sample 9 / 15 (hyperparams = {'C': 44.77804273778909, 'penalty': 'l2', 'solver': 'saga', 'max_iter': 500})




On sample 10 / 15 (hyperparams = {'C': 0.11951096159304532, 'penalty': 'l2', 'solver': 'saga', 'max_iter': 500})




On sample 11 / 15 (hyperparams = {'C': 433.47464177487655, 'penalty': 'l2', 'solver': 'lbfgs', 'max_iter': 500})


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


On sample 12 / 15 (hyperparams = {'C': 1.7295128471299193, 'penalty': 'l2', 'solver': 'lbfgs', 'max_iter': 500})
On sample 13 / 15 (hyperparams = {'C': 1.6777315525707752, 'penalty': 'l2', 'solver': 'saga', 'max_iter': 500})




On sample 14 / 15 (hyperparams = {'C': 0.059893230394471704, 'penalty': 'l2', 'solver': 'lbfgs', 'max_iter': 500})
On sample 15 / 15 (hyperparams = {'C': 0.032340816118081595, 'penalty': 'l2', 'solver': 'lbfgs', 'max_iter': 500})


In [36]:
# One banner line per metric returned by the final refit.
for metric_name, metric_value in res.items():
    print(f"======= {metric_name}: {metric_value} =======")

# Blank separator, then the per-row test-set scores and hard predictions as the
# cell's rich output.
print("\n")
pd.DataFrame({"score": score, "pred": pred})





Unnamed: 0,score,pred
0,0.117659,0
1,0.011290,0
2,0.021243,0
3,0.010287,0
4,0.012446,0
...,...,...
4785,0.084856,0
4786,0.022218,0
4787,0.013455,0
4788,0.034203,0
