# Data preprocessing
RFE, PCA, PPCA and all of that.

In [None]:
import sys
import os
from glob import glob
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm
import pickle as pkl
import importlib
from ppca import PPCA

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_recall_fscore_support, roc_auc_score
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import RFECV

import utils
import utils.testers
import utils.reporters
import utils.helpers

# Reload the helper modules BEFORE binding names from them. A
# `from module import name` executed before `importlib.reload(module)` keeps
# pointing at the pre-reload object, so edits made to the helper modules
# during a notebook session would otherwise not be picked up by the names
# used in the cells below.
importlib.reload(utils.reporters)
importlib.reload(utils.testers)
importlib.reload(utils.helpers)

from utils.testers import ss, assess
from utils.reporters import basic_scores, bs_rankings, bs_curves
from utils.helpers import mask50, plabels

### Feature reduction
- RFE(CV) with 50 features is the single best thing to help the periodic classifier
- For assessment see the PCA section
- Apply just RFE to the other sets to see the list of features. A bit of science.

In [None]:
## Dataset import
# Load the feature and labelled tables of the three classes, build one label
# encoder per class, fill missing feature values with the constant -999 and
# encode the "classALeRCE" column as the target vectors.
raw_data_path = "data/pkl/divided/"

# Read order: features then labels, for transient, stochastic and periodic.
tf, tl, sf, sl, pf, pl = (
    pd.read_pickle(raw_data_path + filename)
    for filename in (
        "transient_features.pkl",
        "transient_labeled.pkl",
        "stochastic_features.pkl",
        "stochastic_labeled.pkl",
        "periodic_features.pkl",
        "periodic_labeled.pkl",
    )
)

# NOTE(review): myLabelEncoder, tlabels and slabels are neither defined nor
# imported in the visible cells (only plabels is) -- confirm their origin.
tme, sme, pme = (myLabelEncoder(names) for names in (tlabels, slabels, plabels))

# One imputer object is refit on each feature matrix in turn, so after this
# cell `si` is left fitted on the periodic features.
si = SimpleImputer(strategy="constant", fill_value=-999)
Xt, Xs, Xp = (si.fit_transform(frame.values) for frame in (tf, sf, pf))

yt, ys, yp = (
    encoder.fit_transform(labeled["classALeRCE"].values)
    for encoder, labeled in ((tme, tl), (sme, sl), (pme, pl))
)

In [None]:
## Compute and save RFECV transforms
# One call per class, each writing to data/pkl/rfe/rfe-<class>.pkl.
# NOTE(review): compute_save_rfecv is not defined in the visible cells --
# presumably it fits an RFECV selector on (X, y) and pickles it to the given
# path; confirm against its definition.
# NOTE: the stochastic output used to be written to "rfs-stochastic.pkl";
# rename any existing file (or loader) that still uses that name.
compute_save_rfecv(Xt, yt, "data/pkl/rfe/rfe-transient.pkl")
compute_save_rfecv(Xs, ys, "data/pkl/rfe/rfe-stochastic.pkl")
compute_save_rfecv(Xp, yp, "data/pkl/rfe/rfe-periodic.pkl")

In [None]:
## Get feature_names_in_

In [None]:
## Compare the features selected. 
# Present in one, two or three classes
# Comment scientifically

### PCA
## Intro
- Data is full of NaNs: we have to interpolate using a constant -999 (paper) or other ideas (mean, median...)
- Probabilistic PCA iteratively tries to get a good set of principal components starting from a random one,
  interpolating the data meanwhile
- Is PPCA just a better imputer, or is the rotation itself useful? Data suggests the rotation does not really help
- References: pca-magic, paper (...)

## PPCA transformation vs simple interpolating (ppcas1/):
- With the PPCA rotation we get well-separated roc_auc scores of 0.92 and 0.94 for the full and rfe-reduced datasets, largely independent of the number of components assumed during PPCA
- Using only the PPCA-interpolated data, without rotating onto the principal components, the score goes up to 0.972 and 0.974 respectively, so better.
- Try different random_states, just in case, but I think this shows PPCA is only good for filling in missing values
- Rankings

In [None]:
# compute_save_ppca(X, d=[10, 40, 50, 100, 150, 183])
# compute_save_ppca(X50, d=[10, 40, 50])

In [None]:
# test_datasets
# scores, scores_stds = basic_scores()
# bs_rankings(scores, scores_stds)

## Minimum to maximum number of assumed principal components (2->50) (ppcas2/):
- Focusing on the rfe-reduced dataset, it's interesting that even 10 components (out of 50 features) give a really good result
- Either way the best result is obtained assuming as many underlying components as features (50)
- Curve plot

In [None]:
# compute_save_ppca(X50, d=[2,...,50])

In [None]:
# test_datasets
# scores, scores_stds = basic_scores()
# bs_curves(scores, scores_stds)

## Raw dataset vs rfe vs ppca imputing (ppcas3/):
- Shows that, according to every metric except my frobenius_score, the rfe-reduced set wins. There isn't a clear winner between -999 filling and PPCA interpolation without rotation, so simplicity suggests just using constant-value imputing
- Rankings

In [None]:
# compute_save_ppca(x50, d=50)
# Build the constant-imputed (-999) baselines for the ppcas3 comparison and
# save them as .npy files under data/ppcas3/X/.
# SimpleImputer's keyword is `fill_value`; `fillvalue` raises a TypeError.
si = SimpleImputer(strategy="constant", fill_value=-999)
X = si.fit_transform(pf.values)
# NOTE(review): pf50 is not defined in the visible cells -- presumably the
# 50-feature RFE-reduced periodic frame; confirm it exists before this runs.
X50 = si.fit_transform(pf50.values)
np.save("data/ppcas3/X/basicfull.npy", X)
np.save("data/ppcas3/X/basic50.npy", X50)

In [None]:
# Assess the ppcas3 datasets and print/plot their score rankings.
# NOTE(review): these three settings are not passed to any call below --
# presumably they are read as globals by test_datasets; confirm.
n_splits = 50
n_estimators = 2000
rotate = False

# NOTE(review): test_datasets is not imported or defined in the visible
# cells; verify where it comes from before running this cell.
test_datasets("ppcas3")
# Collect the scores from the pickles matching the "ppcas3/*.pkl" pattern.
scores = basic_scores("ppcas3/*.pkl")
# Two ranking reports over the same scores: default, then winners=True.
bs_rankings(scores)
bs_rankings(scores,winners=True)

## Explained variances
- Concretely: compare explained variances of PPCA, PCA on -999 and PCA on PPCA (I know, stupid, but try), is there anything useful?
- More concretely: I want a plot of three curves of explained variance as a function of component rank.

In [None]:
# retrieve data from ppcas1
# ppca_50_50.var_exp -> get ratios
# perform PCA on -999 filled 50
# perform PCA on PPCA filled 50/50
# plot the three curves of explained variance ratio as a function of component rank