In [1]:
import pandas as pd
import numpy as np

# Preprocess

In [2]:
# Load the joined table from the Parquet file in the working directory.
# Presumably gene-expression values: the columns displayed further down are
# Glyma.* identifiers — confirm against the file's producer.
expr = pd.read_parquet(path='joined.parquet')

In [3]:
# Label the row index so it lines up with the 'biosample' index of the
# annotation table that is joined in afterwards.
expr.index.names = ['biosample']

In [4]:
# Read the stress/control annotation table; the first CSV column becomes the
# row index. The preview below shows a 'biosample' index and the two columns
# 'guido' and 'maxi'.
stress_control = pd.read_csv(
    filepath_or_buffer='../accession/stress_control.csv',
    index_col=0,
)
stress_control

Unnamed: 0_level_0,guido,maxi
biosample,Unnamed: 1_level_1,Unnamed: 2_level_1
SAMD00025071,,
SAMD00025072,,
SAMD00025073,,
SAMD00025074,,
SAMD00025075,,
...,...,...
SAMN41829221,,
SAMN41829222,,
SAMN41829223,,
SAMN41829224,,


In [5]:
# Attach the annotation columns to the expression table, matching rows on the
# index of both frames. A left merge keeps every row of `expr`; rows with no
# annotation get missing values. Empty suffixes reproduce DataFrame.join's
# behaviour of refusing overlapping column names instead of renaming them.
df = pd.merge(
    expr,
    stress_control,
    how='left',
    left_index=True,
    right_index=True,
    suffixes=('', ''),
)

In [6]:
# Display the joined frame: the Glyma.* columns followed by the 'guido' and
# 'maxi' annotation columns (missing where a sample has no annotation).
df

Unnamed: 0_level_0,Glyma.01G000100,Glyma.01G000137,Glyma.01G000174,Glyma.01G000211,Glyma.01G000248,Glyma.01G000285,Glyma.01G000322,Glyma.01G000359,Glyma.01G000400,Glyma.01G000600,...,Glyma.U044800,Glyma.U045000,Glyma.U045100,Glyma.U045402,Glyma.U045502,Glyma.U045602,Glyma.U045702,Glyma.U045802,guido,maxi
biosample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
SAMN00265078,5.453108,59.054636,1.408092,1.218809,0.000000,2.651369,2.801494,0.377480,0.523319,2.993846,...,0.0,0.0,0.0,4.317951,0.421438,11.072744,0.000000,0.053070,,
SAMN00265079,6.500151,56.896441,6.155613,2.903028,14.714201,12.752819,4.904175,6.120868,7.059885,6.651526,...,0.0,0.0,0.0,4.313916,0.000000,8.977273,0.518904,1.336911,,True
SAMN00265080,3.103596,0.000000,4.038168,0.000000,2.434158,7.362351,1.693301,4.158352,9.958392,9.768484,...,0.0,0.0,0.0,0.723192,0.000000,6.653363,0.000000,2.470588,,True
SAMN00265081,4.226230,0.000000,4.021155,0.441538,2.820968,4.016449,1.627095,4.222031,7.327622,10.667464,...,0.0,0.0,0.0,0.931953,0.297342,0.000000,0.390688,0.777894,,True
SAMN00265082,2.582089,0.000000,1.710374,0.980639,5.646594,3.689870,1.837848,8.406095,5.689184,4.331051,...,0.0,0.0,0.0,0.495017,0.000000,2.276112,0.000000,0.595369,,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SAMN09295126,6.011819,0.000000,2.234152,0.000000,0.000000,2.596353,1.998173,0.000000,9.105045,4.742704,...,0.0,0.0,0.0,8.817960,0.000000,32.194774,0.000000,0.000000,,
SAMN09295127,5.552961,21.092150,0.000000,0.000000,0.000000,0.000000,0.700031,1.792143,8.356985,3.354179,...,0.0,0.0,0.0,5.273037,0.000000,0.000000,0.000000,0.604034,,
SAMN09295128,5.692426,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,13.120328,4.756147,3.019845,...,0.0,0.0,0.0,1.545678,0.000000,0.000000,0.000000,0.000000,,
SAMN09295129,3.511474,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.860411,1.673083,...,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,,


In [7]:
# Feature matrix: every column except the two annotation columns, as a NumPy array.
X = df.drop(columns=['guido', 'maxi']).to_numpy()
# Target vector taken from 'guido'. No rows are filtered here, so samples
# without a label remain in `y` as missing values.
y = df.loc[:, 'guido'].to_numpy()

In [9]:
# Singular value decomposition of the feature matrix X.
# full_matrices=False requests the reduced ("economy") factorization:
# U is (n_samples, k) and Vh is (k, n_features) with k = min(n_samples, n_features).
# The singular values S are identical to those of the full decomposition, but
# the square (n_features, n_features) Vh — very large when there is one column
# per gene — is never allocated, and U @ np.diag(S) @ Vh reconstructs X directly.
U, S, Vh = np.linalg.svd(X, full_matrices=False)

# Modelo

In [None]:
from sklearn import neighbors, tree, linear_model, ensemble
from sklearn import model_selection
import matplotlib.pyplot as plt

In [9]:
# Only samples that carry a 'guido' label can be used for supervised training.
labelled = df[df['guido'].notna()]

# Features are all columns except the two annotation columns; the target is 'guido'.
# A fixed random_state makes the train/test partition identical on every run,
# so results survive a kernel restart.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    labelled.drop(columns=['guido', 'maxi']),
    labelled['guido'],
    random_state=0,
)

In [13]:
# Hyper-parameter search spaces, one dict per estimator family.
# Each dict maps a constructor argument name to the candidate values that a
# randomized search may sample from.

# Decision tree regressor.
tree_params = {
    'max_depth': np.arange(1, 50, 1),
    'criterion': ['absolute_error', 'poisson', 'friedman_mse', 'squared_error'],
    'max_features': ['sqrt', 'log2']
}

# k-nearest-neighbours regressor.
knn_params = {
    'n_neighbors': np.arange(1, 200, 1),
    'metric': ['l1', 'l2', 'cosine']
}

# Support vector regressor.
svm_params = {
    'C': np.linspace(1, 10, 100),
    'tol': np.logspace(-1, -6, 100),
    'kernel': ['linear', 'rbf']
}

# Elastic net (intercept fitting disabled).
elastic_params  = {
    'alpha': np.logspace(-5, 5, 50),
    'l1_ratio': np.linspace(0, 1, 50),
    'fit_intercept': [False]
}

# Random forest regressor (always bootstrapped).
forest_params = {
    'n_estimators': np.arange(100, 1000, 100),
    'max_depth': np.arange(20, 50, 10),
    'min_samples_split': np.arange(2, 15, 4),
    'min_samples_leaf': np.arange(2, 15, 4),
    'max_features': ['sqrt', 'log2'],
    'bootstrap': [True]
}

# Gradient boosting regressor.
gradient_params = {
    'n_estimators': np.arange(1, 1000, 10),
    # NOTE(review): the upper end of this range reaches 1e5, far above commonly
    # used learning rates — consider narrowing before enabling this model.
    'learning_rate': np.logspace(-5, 5, 100),
    'max_depth': np.arange(1, 20, 1),
    'min_samples_split': np.arange(2, 20, 1),
    'min_samples_leaf': np.arange(2, 20, 1),
    # subsample is the fraction of samples used per boosting stage and must lie
    # in (0, 1]; the previous grid (0 .. 10) contained no valid value at all.
    'subsample': np.linspace(0.1, 1, 10),
    'max_features': ['sqrt', 'log2']
}

In [19]:
# Estimator classes to tune, mapped to the hyper-parameter space sampled for each.
# Commented-out entries are excluded from the search.
# NOTE(review): `svm` is not among the sklearn modules imported in the visible
# import cell; it must be imported before the SVR entry is re-enabled.
modelos = {
    # tree.DecisionTreeRegressor: tree_params,
    neighbors.KNeighborsRegressor: knn_params,
    # linear_model.ElasticNet: elastic_params,
    ensemble.RandomForestRegressor: forest_params,
    # ensemble.GradientBoostingRegressor: gradient_params
    # svm.SVR: svm_params, # Very slow (use GPU + multithreading)
}

results = {}  # estimator class -> cv_results_ of its randomized search
mejores = []  # best refitted estimator of each search, in iteration order

for modelo, params in modelos.items():
    print(modelo)
    # Randomized search scored by negative mean absolute error, using all cores.
    # random_state fixes which 100 candidates are drawn, so a re-run evaluates
    # the same hyper-parameter combinations. Estimators that are themselves
    # stochastic are still constructed with their default seed handling.
    search = model_selection.RandomizedSearchCV(
        estimator=modelo(),
        param_distributions=params,
        scoring='neg_mean_absolute_error',
        n_iter=100,
        n_jobs=-1,
        random_state=0,
        verbose=1,
    )
    search.fit(X_train, y_train)
    results[modelo] = search.cv_results_
    mejores.append(search.best_estimator_)

<class 'sklearn.neighbors._regression.KNeighborsRegressor'>
Fitting 5 folds for each of 100 candidates, totalling 500 fits


KeyboardInterrupt: 

In [None]:
# Learning curves: one panel per tuned estimator, sharing the y axis.
# squeeze=False keeps `ax` a 2-D array even when there is a single estimator;
# without it plt.subplots returns a bare Axes and indexing it fails.
fig, ax = plt.subplots(1, len(mejores), figsize=(20, 4), sharey='row', squeeze=False)
for i, modelo in enumerate(mejores):
    panel = ax[0, i]
    model_selection.LearningCurveDisplay.from_estimator(
        modelo, X_train, y_train,
        std_display_style=None,
        # Start at 10% of the training data: a fraction of 1e-5 collapses to a
        # one-sample training set, which yields a meaningless (or failing) first tick.
        train_sizes=np.linspace(0.1, 1, 20),
        ax=panel,
    )
    panel.set_title(modelo.__class__.__name__)
fig.suptitle('Curvas de aprendizaje')

fig.savefig('curvas_aprendizaje.png')