# Crossval-POC

We can't do *pure* cross-validation by holding out entire rows: to score a held-out row we first have to fit (project) it onto the model, and that step already uses the row's observed values.
But we can probably still do model selection by looking for the point where the improvement in fit diverges from the improvement in cross-validation error.

There's also an example [here](https://scikit-learn.org/stable/auto_examples/decomposition/plot_pca_vs_fa_model_selection.html#sphx-glr-auto-examples-decomposition-plot-pca-vs-fa-model-selection-py) I'm interested in.

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import matplotlib.pyplot as plt
%matplotlib inline

# Local files
import factor
import selection
import vis
import data

# Load models
from sklearn.decomposition import PCA
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the matrix to analyse through the local `data` module. The original
# note lists the usable dataset names as: helm, p70m or synthetic.
X = data.load("helm")  # helm, p70m or synthetic

# Open question: perhaps there are small predictive features that are getting
# clobbered by large spurious features (i.e. a poor signal-to-noise ratio).

# Sanity check: rows are reported as models, columns as features.
print(f"{X.shape[0]} models, {X.shape[1]} features")

In [None]:
# Row-wise cross-validation for model selection: for an increasing number of
# PCA components, compare the reconstruction RMSE measured on the data the
# model was fitted to against the RMSE measured on held-out rows.

# NOTE(review): the scaler is fitted on every row *before* the folds are
# split, so held-out rows influence the scaling statistics. Left unchanged so
# that all three curves below are computed on the same matrix Z - confirm
# this is acceptable for the comparison.
Z = StandardScaler().fit_transform(X)

n_components = np.arange(1, 12)
kf = KFold(n_splits=10, shuffle=True, random_state=42)
indices = np.arange(X.shape[0])


def _rmse(actual, reconstructed):
    """Return the root-mean-square error over every entry of two arrays."""
    return np.sqrt(np.mean((actual - reconstructed) ** 2))


# cross_val_score is deliberately not used: PCA's default scorer is the
# log-likelihood under probabilistic PCA, not the reconstruction error, and
# this experiment treats the problem like a regression.
RMSEs = []  # mean held-out-row RMSE, one entry per component count
FULLs = []  # in-sample (no holdout) RMSE, one entry per component count
for n in n_components:
    pca = PCA(n_components=n)

    # In-sample error: fit on all rows and reconstruct those same rows.
    Q = pca.fit_transform(Z)
    R = pca.inverse_transform(Q)
    FULLs.append(_rmse(Z, R))

    # Holdout error: fit on the training rows only, then project and
    # reconstruct the rows that were left out of the fit.
    scores = []
    for train_ix, test_ix in kf.split(indices):
        pca.fit(Z[train_ix])
        target = Z[test_ix]
        S = pca.transform(target)
        R = pca.inverse_transform(S)
        scores.append(_rmse(target, R))
    RMSEs.append(np.mean(scores))

# In-house partial-holdout cross-validation. The results are square-rooted so
# they are on the same scale as the RMSE curves above.
# NOTE(review): presumably returns one mean-squared error per component count
# from 1 to n_components[-1]; verify against selection.cross_validate.
al_MSEs, al_std, fit_err = selection.cross_validate(Z, factor.PCA(), n_components[-1], n_folds=10, repeats=2)
al_MSEs = al_MSEs ** .5
fit_err = np.array(fit_err) ** .5


# The original had `plt.figure` without parentheses, which only references the
# function and never creates a figure.
plt.figure()
plt.plot(n_components, RMSEs, label="Row holdout (scikit-learn)")
plt.plot(n_components, FULLs, label="No holdout (scikit-learn)")
plt.plot(n_components, al_MSEs, label="Partial holdout (inhouse)")
plt.xlabel("Number of components")
plt.ylabel("RMSE")
plt.legend()
plt.show()

# Thoughts:
* You can see the elbow in all three curves (even the non-holdout one) - so we'd *probably* guess the right number of dimensions
* Only my holdout method definitively shows overfitting
* I believe some methods respond better to row holdout, e.g. factor analysis