In [4]:
import sys
import os
from glob import glob
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm
import pickle as pkl
import importlib
from ppca import PPCA

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_recall_fscore_support, roc_auc_score
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import RFECV

import utils
## Import the submodules explicitly so that they exist as attributes of
## `utils` before they are reloaded.
import utils.reporters
import utils.testers
import utils.helpers

## Reload FIRST, bind names AFTERWARDS: importlib.reload() re-executes the
## module in place but does not rebind names that were previously copied out
## of it with `from ... import ...`. Reloading after the from-imports would
## leave the stale (pre-reload) functions and classes in use in this notebook.
importlib.reload(utils.reporters)
importlib.reload(utils.testers)
importlib.reload(utils.helpers)

from utils.testers import ss, assess
from utils.reporters import basic_scores, bs_rankings, bs_curves
from utils.helpers import mask50, myLabelEncoder, tlabels, slabels, plabels

<module 'utils.helpers' from '/home/dvolo/uni/00-landoni/bigmac/utils/helpers.py'>

In [None]:
def compute_save_rfecv(X, y, filepath, step=1, verbose=1):
    """Run RFECV with a balanced random forest and pickle the fitted selector.

    Parameters
    ----------
    X : array-like
        Feature matrix, passed unchanged to ``RFECV.fit``.
    y : array-like
        Target labels, passed unchanged to ``RFECV.fit``.
    filepath : str or path-like
        Destination of the pickled, fitted ``RFECV`` object. The parent
        directory is created if it does not exist yet.
    step : int or float, default 1
        Forwarded to ``RFECV(step=...)``.
    verbose : int, default 1
        Forwarded to ``RFECV(verbose=...)``.

    Returns
    -------
    RFECV
        The fitted selector (the same object that was written to ``filepath``).
    """
    ## Create the output directory up front: a missing directory should fail
    ## (or be fixed) before the expensive fit, not after it.
    out_dir = os.path.dirname(filepath)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    ## BRF estimator
    clf = BalancedRandomForestClassifier(n_estimators=300,
                                    random_state=50,
                                    n_jobs=-1,
                                    sampling_strategy="auto",
                                    replacement=True,
                                    bootstrap=True)
    rfe = RFECV(estimator=clf, step=step, verbose=verbose)
    rfe.fit(X, y)
    print("Reduced to {} features".format(rfe.n_features_))
    with open(filepath, "wb") as f:
        pkl.dump(rfe, f)
    return rfe
    

## Dataset import: one feature table and one labelled table per family,
## read in the order transient -> stochastic -> periodic
tf, tl = pd.read_pickle("data/transient_features.pkl"), pd.read_pickle("data/transient_labeled.pkl")
sf, sl = pd.read_pickle("data/stochastic_features.pkl"), pd.read_pickle("data/stochastic_labeled.pkl")
pf, pl = pd.read_pickle("data/periodic_features.pkl"), pd.read_pickle("data/periodic_labeled.pkl")

## A single imputer is re-fitted on each feature table; missing entries are
## filled with the constant -999
si = SimpleImputer(strategy="constant", fill_value=-999)

## Transient: encoder built from `tlabels`, imputed features, encoded targets
tme = myLabelEncoder(tlabels)
Xt = si.fit_transform(tf.values)
yt = tme.fit_transform(tl["classALeRCE"].values)

## Stochastic: encoder built from `slabels`, imputed features, encoded targets
sme = myLabelEncoder(slabels)
Xs = si.fit_transform(sf.values)
ys = sme.fit_transform(sl["classALeRCE"].values)

## Periodic: encoder built from `plabels`, imputed features, encoded targets
pme = myLabelEncoder(plabels)
Xp = si.fit_transform(pf.values)
yp = pme.fit_transform(pl["classALeRCE"].values)

## Extra array loaded from disk
## NOTE(review): presumably a PPCA-based feature matrix for the periodic set,
## judging by the path only -- confirm against the cell that consumes XXp
XXp = np.load("data/npy/ppcas3/X/basicfull.npy")

In [None]:
## Fit and pickle one RFECV selector per family; step/verbose are left at the
## helper's defaults
compute_save_rfecv(X=Xt, y=yt, filepath="data/rfe/rfe-transient.pkl")
compute_save_rfecv(X=Xs, y=ys, filepath="data/rfe/rfe-stochastic.pkl")
compute_save_rfecv(X=Xp, y=yp, filepath="data/rfe/rfe-periodic.pkl")

In [8]:
## BRF estimator: the balanced random forest handed to the RFECV selectors
clf = BalancedRandomForestClassifier(
    n_estimators=300,
    sampling_strategy="auto",
    replacement=True,
    bootstrap=True,
    random_state=50,  # fixed seed so repeated runs are comparable
    n_jobs=-1,        # use every available core
)

In [None]:
%%time
## Recursive Feature Elimination with Cross Validation:
## three separate, still-unfitted selectors (transient, stochastic, periodic),
## each wrapping `clf` and dropping one feature per iteration
trfe, srfe, prfe = (RFECV(estimator=clf, step=1, verbose=1) for _ in range(3))

## Only the transient selector is fitted in this cell
trfe.fit(Xt, yt)
print("Transient rfe: {} features".format(trfe.n_features_))

In [13]:
## Persist the fitted transient selector to disk
with open("data/rfe-transient.pkl", "wb") as fh:
    pkl.dump(trfe, fh)

In [18]:
## Fit the stochastic selector and report how many features it retains
srfe.fit(Xs, ys)
print("Stochastic rfe: {} features".format(srfe.n_features_))

## Persist the fitted stochastic selector to disk
with open("data/rfe-stochastic.pkl", "wb") as fh:
    pkl.dump(srfe, fh)

Fitting estimator with 183 features.
Fitting estimator with 182 features.
Fitting estimator with 181 features.
Fitting estimator with 180 features.
Fitting estimator with 179 features.
Fitting estimator with 178 features.
Fitting estimator with 177 features.
Fitting estimator with 176 features.
Fitting estimator with 175 features.
Fitting estimator with 174 features.
Fitting estimator with 173 features.
Fitting estimator with 172 features.
Fitting estimator with 171 features.
Fitting estimator with 170 features.
Fitting estimator with 169 features.
Fitting estimator with 168 features.
Fitting estimator with 167 features.
Fitting estimator with 166 features.
Fitting estimator with 165 features.
Fitting estimator with 164 features.
Fitting estimator with 163 features.
Fitting estimator with 162 features.
Fitting estimator with 161 features.
Fitting estimator with 160 features.
Fitting estimator with 159 features.
Fitting estimator with 158 features.
Fitting estimator with 157 features.
F

In [19]:
## Fit the periodic selector and report how many features it retains
prfe.fit(Xp, yp)
print("Periodic rfe: {} features".format(prfe.n_features_))
## The file name follows the same "rfe-<family>.pkl" pattern as the transient
## and stochastic pickles (it used to be written as "prfe-periodic.pkl").
## NOTE(review): update any loader that still expects the old file name.
with open("data/rfe-periodic.pkl","wb") as f:
    pkl.dump(prfe, f)

Fitting estimator with 183 features.
Fitting estimator with 182 features.
Fitting estimator with 181 features.
Fitting estimator with 180 features.
Fitting estimator with 179 features.
Fitting estimator with 178 features.
Fitting estimator with 177 features.
Fitting estimator with 176 features.
Fitting estimator with 175 features.
Fitting estimator with 174 features.
Fitting estimator with 173 features.
Fitting estimator with 172 features.
Fitting estimator with 171 features.
Fitting estimator with 170 features.
Fitting estimator with 169 features.
Fitting estimator with 168 features.
Fitting estimator with 167 features.
Fitting estimator with 166 features.
Fitting estimator with 165 features.
Fitting estimator with 164 features.
Fitting estimator with 163 features.
Fitting estimator with 162 features.
Fitting estimator with 161 features.
Fitting estimator with 160 features.
Fitting estimator with 159 features.
Fitting estimator with 158 features.
Fitting estimator with 157 features.
F

KeyboardInterrupt: 