In [None]:
import pandas as pd
import numpy as np

# Preprocesamiento

In [None]:
# Read the Parquet file into a DataFrame and display it.
# NOTE(review): presumably the gene-expression table (inferred from the path
# and the later plot titles) -- confirm against the data source.
venancio = pd.read_parquet(path='../expresion/venancio.parquet')
venancio

In [None]:
# Read the CSV using its first column as the row index, then display it.
# The 'condicion' column of this table is used further down as the label.
condicion = pd.read_csv(
    filepath_or_buffer='../estres/condicion.csv',
    index_col=0,
)
condicion

In [None]:
# Attach the condition table to `venancio` with DataFrame.join's default
# behaviour (index-aligned left join), then discard every row whose
# 'condicion' value is missing so only labelled samples remain.
joined = venancio.join(condicion).dropna(subset=['condicion'])
joined

In [None]:
# Feature matrix: every column of `joined` except the label.
X = joined.drop(columns='condicion')
# Target: the raw 'condicion' label of each row.
y = joined['condicion']

# Modelado

In [None]:
from sklearn import model_selection, ensemble, manifold
import matplotlib.pyplot as plt

In [None]:
# Binary target: True where the sample's condition is 'control', False otherwise.
# It is derived from joined['condicion'] instead of from `y` itself so the cell
# is idempotent: with `y = y == 'control'`, running the cell a second time
# compares booleans against the string 'control' and silently turns every
# label into False.
y = joined['condicion'] == 'control'
y

In [None]:
# Embed the full feature matrix with t-SNE (default settings otherwise).
# t-SNE is stochastic; a fixed random_state keeps the embedding, and therefore
# the scatter plots built from it, identical across kernel restarts.
tsne = manifold.TSNE(random_state=42)
X_tsne = tsne.fit_transform(X)

In [None]:
# Draw each class of the embedding as its own scatter series so it gets a
# separate colour and legend entry: rows where `y` is True are labelled
# 'control', the remaining rows 'estrés'.
for mask, label in ((y, 'control'), (~y, 'estrés')):
    plt.scatter(X_tsne[mask, 0], X_tsne[mask, 1], alpha=0.5, label=label)
plt.legend()
plt.title('t-SNE expresión genes')

In [None]:
# Hold out a test set (default split proportions).
# - random_state pins the shuffle so every downstream score is reproducible.
# - stratify=y keeps the control / non-control ratio the same in both parts,
#   which matters when the classes are imbalanced or the data set is small.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, stratify=y, random_state=42
)

In [None]:
# Base random-forest classifier handed to the hyper-parameter search below.
# A fixed random_state makes bootstrap sampling and feature sub-sampling
# repeatable, so the fitted forest and its feature importances do not change
# between runs.
random_forest = ensemble.RandomForestClassifier(random_state=42)
# Candidate hyper-parameter values sampled by the randomized search over the
# random forest. Each entry maps a constructor argument to its allowed values.
params = dict(
    n_estimators=np.arange(1, 10) * 100,        # 100, 200, ..., 900
    max_depth=np.arange(2, 5) * 10,             # 20, 30, 40
    min_samples_split=2 + 4 * np.arange(4),     # 2, 6, 10, 14
    min_samples_leaf=2 + 4 * np.arange(4),      # 2, 6, 10, 14
    max_features=['sqrt', 'log2'],
    bootstrap=[True],
)

In [None]:
# Randomized hyper-parameter search over `params` for the random forest,
# cross-validated on the training split and run on all available cores.
# random_state fixes which parameter combinations are sampled, so the selected
# model is the same on every run.
search = model_selection.RandomizedSearchCV(
    estimator=random_forest,
    param_distributions=params,
    n_jobs=-1,
    random_state=42,
)
search.fit(X_train, y_train)

In [None]:
# Mean cross-validated score of the *selected* configuration.
# Averaging cv_results_['mean_test_score'] over every sampled candidate mixes
# in the poor configurations and does not describe the model that is used below.
search.best_score_

In [None]:
# Keep the best model found by the search and list its feature importances
# from largest to smallest (ascending sort, then reversed).
estimator = search.best_estimator_
importances = np.sort(estimator.feature_importances_)[::-1]
importances

In [None]:
# Line plot of the sorted importances (x = rank, y = importance), drawn on the
# current axes.
ax = plt.gca()
ax.plot(importances)
ax.set_title('Importancias de features ordenadas (Random Forest)')

In [None]:
# Names of the features the fitted forest actually used, i.e. those whose
# importance is strictly greater than zero.
nonzero_importance = estimator.feature_importances_ > 0
X.columns[nonzero_importance]

In [None]:
# Re-run t-SNE using only the columns whose random-forest importance exceeds
# 0.001. This rebinds `X_tsne`, replacing the embedding of the full feature
# matrix computed earlier.
# NOTE(review): the threshold here (0.001) differs from the `> 0` used when
# listing the selected columns -- confirm this is intentional.
selected = estimator.feature_importances_ > 0.001
X_tsne = tsne.fit_transform(X.loc[:, selected])

In [None]:
# Learning curve of the tuned random forest, computed on the training split.
model_selection.LearningCurveDisplay.from_estimator(
    estimator=estimator,
    X=X_train,
    y=y_train,
)

In [None]:
# Split the current embedding by class, then plot each class as its own series
# so the legend distinguishes 'control' (y is True) from 'estrés' (y is False).
control_points = X_tsne[y]
stress_points = X_tsne[~y]
plt.scatter(control_points[:, 0], control_points[:, 1], alpha=0.5, label='control')
plt.scatter(stress_points[:, 0], stress_points[:, 1], alpha=0.5, label='estrés')
plt.legend()
plt.title('t-SNE expresión genes seleccionados')

In [None]:
# Base gradient-boosting classifier handed to the hyper-parameter search below.
# The search samples max_features ('sqrt' / 'log2'), which makes each split
# consider a random subset of features; a fixed random_state keeps the fitted
# model repeatable across runs.
gbr = ensemble.GradientBoostingClassifier(random_state=42)
# Candidate hyper-parameter values for the gradient-boosting search.
# learning_rate was np.arange(0, 10, 1): a rate of 0 means the ensemble never
# moves away from its initial prediction, and integer rates 1..9 are far above
# the useful range and make boosting unstable. A log-spaced grid inside (0, 1]
# is used instead.
params = {
    "loss": ['log_loss', 'exponential'],
    "learning_rate": [0.001, 0.003, 0.01, 0.03, 0.1, 0.3, 1.0],
    "n_estimators": np.arange(1, 200, 1),
    "max_depth": np.arange(1, 50, 1),
    "max_features": ['sqrt', 'log2']
}

In [None]:
# Randomized hyper-parameter search over `params` for the gradient-boosting
# classifier, cross-validated on the training split and run on all cores.
# random_state fixes which parameter combinations are sampled so the outcome
# is reproducible. Note that this rebinds `search` (previously the
# random-forest search).
search = model_selection.RandomizedSearchCV(
    gbr,
    param_distributions=params,
    n_jobs=-1,
    random_state=42,
)
search.fit(X_train, y_train)

In [None]:
# Mean cross-validated score of the *selected* gradient-boosting configuration.
# The average of cv_results_['mean_test_score'] over all sampled candidates is
# dominated by poor configurations and says little about the chosen model.
search.best_score_

In [None]:
# Learning curve of the *tuned* gradient-boosting model on the training split.
# Passing `gbr` would plot the untuned, default-parameter classifier and ignore
# the search result; the random-forest learning curve above likewise uses the
# search's best estimator.
model_selection.LearningCurveDisplay.from_estimator(search.best_estimator_, X_train, y_train)