# Logistic regression model with CV and lasso regularisation 

**Questions :**
- What number of iterations to ensure convergence ?
- What lasso regularisation parameter ?
- What cross-validation method ?
- Validation/in/out sample sizes influence


**ISSUES :**
- missing values in the eeg : drop or try to replace ?
- convergence of the model is very slow
- model performance (whether accuracy, MSE or Kullback-Leibler divergence)

**Notes :**
- no significant class imbalance 

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
import numpy as np

## Preamble

In [None]:
from source.preamble import *

# List the file names found in the training EEG and "spc" directories
# (`os` and `Dir` are not imported explicitly in this cell; presumably they
# come from the star import of source.preamble).
train_eeg_names = os.listdir(Dir.eeg_train)
train_spc_names = os.listdir(Dir.spc_train)
# Cell output: number of entries in each directory.
len(train_eeg_names), len(train_spc_names)

## Functions definition

In [None]:
from source.classes import Eeg, ChainBuilder, EegChain, FeatureGenerator
from source.scoring import score
from source.train_algos import (
    train_logistic_regression_CV,
    test_model,
    predict_probas_test_set,
)
from source.process import (
    open_train_metadata,
    print_summary_metadata,
    convert_parquet_to_npy,
)
from source.pre_train import extract_validation_set
from source.plotting import plot_coefs, plot_distributions

## Process metadata

In [None]:
# Load the training metadata table used by every later cell.
# NOTE(review): the meaning of read=False is defined in source.process — confirm there.
meta_all = open_train_metadata(read=False)
print_summary_metadata(meta_all)
# Cell output: the first ten metadata rows.
meta_all.iloc[:10]

#### Convert parquet to npy

In [None]:
# Manual switch: set to True to convert the training EEGs of every unique
# eeg_id; the same directory is passed as both directory arguments.
run_npy_conversion = False
if run_npy_conversion:
    convert_parquet_to_npy(Dir.eeg_train, Dir.eeg_train, meta_all["eeg_id"].unique())
    # The switch is turned back off once the conversion has run.
    run_npy_conversion = False

## Sample class usage
For Eeg class :

In [None]:
# Build an Eeg from the second metadata row and inspect it.
eeg = Eeg(Dir.eeg_train, meta_all.iloc[1])
print("EEG sub id : ", eeg.eeg_sub_id)
display(eeg.open())  # the whole EEG
display(eeg.open_subs())  # only the selected subsample
# Plot two of the channels.
eeg.plot(columns=["Fp1", "T6"])

For FeatureGenerator and EegChain : 
- cascading methods
- reusable object

In [None]:
def eeg_chain_train(sample):
    """Run the demonstration EegChain on one metadata row and return its result.

    The chain opens the Eeg built from ``Dir.eeg_train`` and ``sample``, then
    applies, in order: ``_fillna``, ``_divide`` by 10000.0, ``mean`` on
    Fp1/EKG, ``var`` on EKG and a depth-3, time-augmented ``signature`` on
    five channels, keeping every signature coordinate.
    """
    signature_cols = ["Fp1", "EKG", "F7", "T3", "O2"]
    signature_depth = 3

    chain = EegChain().open(Eeg(Dir.eeg_train, sample))
    chain = chain._fillna()._divide(coef=10000.0)
    chain = chain.mean(cols=["Fp1", "EKG"]).var(cols=["EKG"])

    # Select all coordinates of the signature (5 channels, depth 3).
    all_coordinates = list(
        range(ChainBuilder.n_sig_coordinates(len(signature_cols), signature_depth))
    )
    chain = chain.signature(
        cols=signature_cols,
        depth=signature_depth,
        index=all_coordinates,
        time_augment=True,
    )
    return chain.result()


# Wrap the chain in a reusable FeatureGenerator.
feature_generator = FeatureGenerator(eeg_chain=eeg_chain_train)

# Only the first 100 metadata rows are used for this demonstration.
meta_sample = meta_all.iloc[:100]

# Run the generator over the sample (save=False is passed explicitly).
features = feature_generator.process(metadata=meta_sample, save=False)
# Cell output: the generated features.
features

## Train
- make sure not to generate class imbalance
- the random seed should be fixed when running locally, so that debugging is reproducible

In [None]:
# split validation
# extract_validation_set is called with ratio=0.05 and returns two metadata
# tables: the remainder (used for training below) and the validation set.
rest_meta, validation_meta = extract_validation_set(meta_all, ratio=0.05)
print_summary_metadata(rest_meta)
print_summary_metadata(validation_meta)

In [None]:
# parameters
max_nsample = 100 if KAGGLE else 10000
max_it = 20000
cs = 0.08  # 0.08 for only a few parameters

sig_cols = ["Fp1", "EKG"]
pre_norm_coef = 10e4
depth = 3


def eeg_chain_train(sample):
    """Build the training features of one metadata row.

    Opens the Eeg built from ``Dir.eeg_train`` and ``sample`` with
    ``open_npy``, applies the ``mean_npy`` and ``var_npy`` steps over the
    columns of ``EEG_COLS`` and returns the value of ``result()``.
    """
    chain = EegChain().open_npy(Eeg(Dir.eeg_train, sample))
    # Disabled pre-processing steps, kept for reference:
    #     ._center()
    #     ._fillna()
    chain = chain.mean_npy(cols=eeg_name_to_idx(EEG_COLS))
    chain = chain.var_npy(cols=eeg_name_to_idx(EEG_COLS))
    # Disabled signature features, kept for reference:
    #     ._divide(coef=pre_norm_coef)
    #     .signature_npy(
    #         cols=eeg_name_to_idx(sig_cols),
    #         depth=depth,
    #         time_augment=True,
    #         factorial_rescale=True
    #     )
    return chain.result()


# Feature generator used for training (saving to disk is left disabled).
feature_generator = FeatureGenerator(
    eeg_chain=eeg_chain_train,
    # save=os.path.join(Dir.intermediate_output, "eeg_features_train.parquet"),
)

# Keyword options of the fit: a single Cs value, scaling on, no intercept.
train_options = dict(
    max_it=max_it,
    max_nsample=max_nsample,
    grade=Grade.bad,
    scale=True,
    Cs=[cs],
    fit_intercept=False,
)

# The first returned item is the model; the remaining ones are collected in
# `other` (the next cell reads the scaler from other[0]).
model, *other = train_logistic_regression_CV(
    rest_meta, feature_generator, VOTE_COLS, **train_options
)

In [None]:
# keep the same preprocessing for the test and validation data
# The scaler is the first extra value returned by the training call.
scaler = other[0]

In [None]:
# Plot the coefficients of the fitted model.
plot_coefs(model)

### Tune lasso regularisation parameter

- with Logistic regression **10 fold CV** and **Z-score scaling** 
- 1000 train samples
- max solver iteration = 1000 (saga)
- **CONVERGENCE ISSUES** (max iteration reached)

In [None]:
# Outside Kaggle, the targets are the vote columns of the validation metadata.
if not KAGGLE:
    target_probas = validation_meta[VOTE_COLS]

In [None]:
# train and optimisation params
# The sample cap is the same on Kaggle and locally, so no KAGGLE switch is needed.
max_nsample = 1000
max_it = 1000

# penalisation
# Candidate Cs values: two evenly spaced points, 0.1 and 1.0.
csx = np.linspace(0.1, 1, 2, endpoint=True)


# feature space
def eeg_chain_train(sample):
    """Build the features of one metadata row for the Cs tuning runs.

    Opens the Eeg built from ``Dir.eeg_train`` and ``sample`` with
    ``open_npy``, applies the ``mean_npy`` and ``var_npy`` steps over the
    columns of ``EEG_COLS`` and returns the value of ``result()``.
    """
    chain = EegChain().open_npy(Eeg(Dir.eeg_train, sample))
    # Disabled pre-processing steps, kept for reference:
    #     ._center()
    #     ._fillna()
    chain = chain.mean_npy(cols=eeg_name_to_idx(EEG_COLS))
    chain = chain.var_npy(cols=eeg_name_to_idx(EEG_COLS))
    # Disabled signature features, kept for reference:
    #     ._divide(coef=pre_norm_coef)
    #     .signature_npy(
    #         cols=eeg_name_to_idx(sig_cols),
    #         depth=depth,
    #         time_augment=True,
    #         factorial_rescale=True
    #     )
    return chain.result()


feature_generator = FeatureGenerator(
    eeg_chain=eeg_chain_train,
)

# Every fit is kept, keyed by its Cs value, so the candidates can still be
# compared once the loop is over. `model`, `scaler` and `predicted_probas`
# keep holding the values of the LAST Cs, as later cells rely on them.
cv_results = {}

for cs in csx:
    print("=" * 100)
    print("\tCs=", cs)
    model, *other = train_logistic_regression_CV(
        rest_meta,
        feature_generator,
        VOTE_COLS,
        max_it=max_it,
        max_nsample=max_nsample,
        grade=Grade.certain,
        scale=True,
        Cs=[cs],
    )
    # The scaler is the first extra value returned by the training call.
    scaler = other[0]
    # Evaluate this candidate on the validation set with the matching scaler.
    predicted_probas = test_model(model, feature_generator, VOTE_COLS, validation_meta, scaler)
    cv_results[float(cs)] = {
        "model": model,
        "scaler": scaler,
        "predicted_probas": predicted_probas,
    }
    fig, _ = plot_coefs(model)
    display(fig)

## Test
Not yet defined how to deal with NA values in the signals 
=> they are skipped for now

Also need to fix the computation of the subsample length (at each change of file)
=> those are skipped as well

"clean" => keep only data that has no issue
In the future should not drop any test values

In [None]:
# in case we want to get rid of outliers ?
# For now nothing is filtered: the "clean" set is the validation set itself.
validation_meta_clean = validation_meta

In [None]:
# Outside Kaggle, rebuild the feature generator and predict on the clean
# validation set with the current `model` and `scaler` (the ones left by the
# most recently executed training cell).
if not KAGGLE:
    feature_generator = FeatureGenerator(
        eeg_chain=eeg_chain_train,
        # save=os.path.join(Dir.intermediate_output, "eeg_features_test.parquet"),
    )
    predicted_probas = test_model(
        model, feature_generator, VOTE_COLS, validation_meta_clean, scaler
    )

In [None]:
# UNIFORM PREDICTED PROBAS
# most basic benchmark
# One row per validation sample, every class given the same probability.
# The class count follows VOTE_COLS rather than a hard-coded 6, and np.full
# keeps the (n_samples, n_classes) shape even when the validation set is empty.
n_classes = len(VOTE_COLS)
predicted_probas_uniform = np.full(
    (len(validation_meta_clean), n_classes), 1.0 / n_classes
)

**Note** : 
the uniform benchmark is not even outperformed.

Most likely because the data is very noisy.

In [None]:
# Outside Kaggle, the targets are the vote columns of the clean validation metadata.
if not KAGGLE:
    target_probas = validation_meta_clean[VOTE_COLS]

In [None]:
# Hard prediction: 1.0 where a row reaches its own maximum probability and
# 0.0 elsewhere (tied maxima all get 1.0). The row maximum is kept as a column
# so it broadcasts against each row, with no hard-coded class count.
probas_arr = np.asarray(predicted_probas)
max_proba_predict = (
    probas_arr == probas_arr.max(axis=1, keepdims=True)
).astype(float)

##### Test score

In [None]:
# Benchmark: score of the uniform prediction against the target votes.
# target_probas is only defined when not running on Kaggle.
score(predicted_probas_uniform, target_probas.values)

In [None]:
# Score of the model's predicted probabilities against the target votes.
score(predicted_probas, target_probas.values)

> confusion matrix to visualise which classes are better classified

In [None]:
# Score of the hard (one-hot of the row maximum) predictions.
score(max_proba_predict, target_probas.values)

In [None]:
# Compare the predicted probabilities with the target votes.
plot_distributions(predicted_probas, target_probas.values)

In [None]:
# Same comparison for the hard (one-hot) predictions.
plot_distributions(max_proba_predict, target_probas.values)

> The logistic regression performs poorly: its predictions barely depart from the uniform distribution

## Conclusion

- Regularisation is useful in our case, as many features are redundant and many of them have no predictive power.
- It is still very inefficient: the model is very similar to the uniform predictor

This indicates that even in the time-augmented feature-space, the classes are not linearly separable.

It could be interesting not to perform the time-augmentation so that the scaling of patterns would be matched.

## Submission

In [None]:
# the true test samples
# Read from test.csv under Dir.root.
meta_test = pd.read_csv(os.path.join(Dir.root, "test.csv"))

In [None]:
# Convert the test EEGs of every unique eeg_id; the same directory is passed
# as both directory arguments. Unlike the training conversion, this call is
# not guarded by a switch and runs every time the cell is executed.
convert_parquet_to_npy(Dir.eeg_test, Dir.eeg_test, meta_test["eeg_id"].unique())

In [None]:
# recreate the objects for the submission folder
def eeg_chain_test(sample):
    """Build the features of one test metadata row.

    Same steps as the training chain (``mean_npy`` and ``var_npy`` over the
    columns of ``EEG_COLS``), but the Eeg is built from ``Dir.eeg_test`` and
    opened with ``subsample=False``. Returns the value of ``result()``.
    """
    chain = EegChain().open_npy(Eeg(Dir.eeg_test, sample), subsample=False)
    # Disabled pre-processing steps, kept for reference:
    #     ._center()
    #     ._fillna()
    chain = chain.mean_npy(cols=eeg_name_to_idx(EEG_COLS))
    chain = chain.var_npy(cols=eeg_name_to_idx(EEG_COLS))
    # Disabled signature features, kept for reference:
    #     ._divide(coef=pre_norm_coef)
    #     .signature_npy(
    #         cols=eeg_name_to_idx(sig_cols),
    #         depth=depth,
    #         time_augment=True,
    #         factorial_rescale=True
    #     )
    return chain.result()


# Feature generator for the test set, built on the test chain.
feature_generator_test = FeatureGenerator(
    eeg_chain=eeg_chain_test,
)

In [None]:
# Predict on the true test set and write the result to submission.csv in Dir.out.
# NOTE(review): the model was trained with scale=True, but no scaler is passed
# here (test_model receives one for the validation data) — confirm that
# predict_probas_test_set applies the same scaling.
sub = predict_probas_test_set(model, meta_test, feature_generator_test)
sub.to_csv(os.path.join(Dir.out, "submission.csv"))
# Cell output: the submission table.
sub

In [None]:
# One-hot version of the submission: in every row, 1.0 where the row reaches
# its own maximum and 0.0 elsewhere (tied maxima all get 1.0).
# The maximum is taken per row, not over the whole table, and every row is
# rewritten, so this also works when the test set has more than one sample.
# Assumes every column of `sub` is a class probability — TODO confirm.
# Only the in-memory frame changes: submission.csv is written before this point.
row_max = sub.values.max(axis=1, keepdims=True)
sub.iloc[:, :] = (sub.values == row_max).astype(float)
sub