In [2]:
import tqdm
import random
import itertools
import numpy as np
import pandas as pd
from matplotlib import cm
import matplotlib.pyplot as plt

from zeo_amd.hparams import HyperparameterOptimizer

import warnings
# Blanket filter: every warning category is suppressed for the rest of the
# session, which keeps the progress-bar output readable but also hides
# library warnings about ignored or deprecated estimator arguments.
warnings.filterwarnings('ignore')

In [3]:
# All three tables use their first CSV column as the row index.
dm = pd.read_csv("../data/iza_dm.csv", index_col=0)
feat = pd.read_csv("../data/iza_features.csv", index_col=0)

# Select the rows of the synthesis table in the row order of `dm`, so that
# row i of `synth` and row i of `dm` carry the same index label.
# `.loc` raises KeyError if `dm` has an index label missing from the CSV.
synth = pd.read_csv("../data/synthesis_fraction.csv", index_col=0).loc[dm.index]

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import xgboost as xgb

# Search space: a list of (estimator class, {hyperparameter: candidate values})
# pairs. Each pair is passed unchanged to HyperparameterOptimizer in the
# optimization loop, so one estimator class may appear more than once with a
# different grid.
classifiers_hyperparameters = [
    # Logistic regression, L2-regularized or unregularized.
    # NOTE(review): 'liblinear' does not support penalty='none' in
    # scikit-learn; whether such combinations are skipped or raise depends on
    # HyperparameterOptimizer -- TODO confirm. The string 'none' is also only
    # accepted by older scikit-learn releases (newer ones expect None).
    (LogisticRegression, {
        'penalty': ['l2', 'none'],
        'C': [0.001, 0.01, 0.1, 1, 10, 100],
        'solver': ['lbfgs', 'liblinear', 'sag', 'saga']
    }),
    # Logistic regression with an L1/L2 mix. `l1_ratio` is only honoured when
    # penalty='elasticnet' (with penalty='l1' it is ignored and all four values
    # would train the same model). l1_ratio=1 is equivalent to a pure L1
    # penalty, so the L1 case is still part of the grid.
    (LogisticRegression, {
        'penalty': ['elasticnet'],
        'C': [0.001, 0.01, 0.1, 1, 10, 100],
        'solver': ['saga'],
        "l1_ratio": [0.25, 0.5, 0.75, 1]
    }),
    # Random forest.
    (RandomForestClassifier, {
        'n_estimators': [10, 50, 100, 200],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'bootstrap': [True, False]
    }),
    # Support vector classifier. `degree` only affects the 'poly' kernel, so
    # for the other kernels the different degrees are redundant grid points.
    (SVC, {
        'C': [0.1, 1, 10, 100],
        'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
        'degree': [2, 3, 4, 5],
        'gamma': ['scale', 'auto']
    }),
    # Gradient-boosted trees; `n_jobs` is a fixed setting, not a tuned value.
    (xgb.XGBClassifier, {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 4, 5, 6],
        'min_child_weight': [1, 2, 3],
        'subsample': [0.5, 0.75, 1],
        'colsample_bytree': [0.5, 0.75, 1],
        "n_jobs": [4]
    })
]

## Performing the hyperparameter optimization

In [5]:
# --- Reproducibility and search budget ---
RANDOM_SEED = 1886   # forwarded to HyperparameterOptimizer(random_seed=...)
N_RUNS = 10          # number of runs requested per (label, classifier) pair

# --- Data splitting (forwarded to HyperparameterOptimizer) ---
TEST_SIZE = 0.2
VAL_SIZE = 0.2

# --- Label construction ---
LABELS = synth.columns   # every column of `synth` is a candidate target
MIN_SYNTHESIS = 0.25     # a row is positive when its value is > this threshold
MIN_POSITIVE = 11        # labels with fewer positives than this are skipped

In [None]:
# Run the hyperparameter search for every (label, classifier family) pair and
# collect whatever records `optimize_hyperparameters` returns into one table.

# The input matrix is the same for every label, so build it once.
X = dm.values

results = []
for _label in tqdm.tqdm(LABELS):
    # Binary target: True where the value for this label exceeds the threshold.
    y = (synth[_label] > MIN_SYNTHESIS).values

    # Skip labels with too few positive examples.
    n_pos = y.sum()
    if n_pos < MIN_POSITIVE:
        continue

    for cls, ranges in classifiers_hyperparameters:
        opt = HyperparameterOptimizer(
            cls,
            ranges,
            val_size=VAL_SIZE,
            test_size=TEST_SIZE,
            balanced=True,
            random_seed=RANDOM_SEED
        )

        # Use the configured run count rather than a hard-coded literal, so
        # that changing N_RUNS in the configuration cell takes effect here.
        results.extend(opt.optimize_hyperparameters(X, y, n_runs=N_RUNS))

results_df = pd.DataFrame(results)

  0%|                                                                                                                                                                                                                         | 0/38 [00:00<?, ?it/s]