In [3]:
import os
from time import time

import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, HalvingGridSearchCV, HalvingRandomSearchCV

import scipy.stats as stats
from scipy.stats import uniform


In [4]:
clinical_attributes = pd.read_csv('step_04/clinical_attributes.csv')
z_score = pd.read_csv('step_04/z_score.csv')
mutation = pd.read_csv('step_04/mutation.csv')
response = pd.read_csv('step_04/response.csv')

In [5]:
y = response['overall_survival'].to_numpy()
datasets = [
    ("clinical", clinical_attributes.to_numpy()),
    ("z_score", z_score.to_numpy()),
    ("mutation", mutation.to_numpy())
]

# GridSearch vs HalvingGridSearch vs Random

In [30]:
class Models():
    def __init__(self, linspace_size: int = 3, n_iter_search: int = 15):
        self.linspace_size = 3
        self.algorithms = {
                "knn": ( 
                    KNeighborsClassifier(),
                    {
                        "n_neighbors": np.linspace(3, 100, num=linspace_size, dtype=int),
                        "weights": ["uniform", "distance"],
                        "p": np.linspace(1, 10, num=3, dtype=int),
                        "algorithm": ["auto", "ball_tree", "kd_tree", "brute"],
                        "leaf_size": np.linspace(10, 100, num=linspace_size, dtype=int),
                        "metric": ['euclidean', 'manhattan', 'chebyshev', 'minkowski']
                    }
                ),
                "dt": (
                    DecisionTreeClassifier(),
                    {
                        "criterion": ["gini", "entropy", "log_loss"],
                        "splitter": ["best", "random"],
                        "max_depth": np.linspace(1, 100, num=linspace_size, dtype=int),
                        "min_samples_split": np.linspace(2, 100, num=linspace_size, dtype=int),
                        "min_samples_leaf": np.linspace(1, 100, num=linspace_size, dtype=int),
                        "max_features": ["auto", "sqrt", "log2"],
                        "ccp_alpha": np.linspace(0, 0.1, num=3),
                    }
                ),
                "svm": (
                    SVC(),
                    {
                        "C": np.linspace(0.1, 10, num=linspace_size),
                        "kernel": ["linear", "poly", "rbf", "sigmoid", "precomputed"],
                        "degree": np.linspace(1, 10, num=linspace_size, dtype=int),
                        "gamma": ["scale", "auto"],
                        "coef0": np.linspace(0, 10, num=linspace_size),
                        "shrinking": [True, False],
                        "probability": [True, False],
                        "tol": np.linspace(0.0001, 0.01, num=linspace_size),
                    }
                ),
                "rf": (
                    RandomForestClassifier(),
                    {
                        "n_estimators": np.linspace(10, 100, num=linspace_size, dtype=int),
                        "criterion": ["gini", "entropy", "log_loss"],
                        "max_depth": np.linspace(1, 100, num=linspace_size, dtype=int),
                        "min_samples_split": np.linspace(2, 100, num=linspace_size, dtype=int),
                        "min_samples_leaf": np.linspace(1, 100, num=linspace_size, dtype=int),
                        "max_leaf_nodes": np.linspace(10, 90, num=9, dtype=int),
                        "min_impurity_decrease": np.linspace(0, 0.1, num=linspace_size),
                        "max_features": ["auto", "sqrt", "log2"],
                        "ccp_alpha": np.linspace(0, 0.1, num=linspace_size),
                        "bootstrap": [True, False],
                        "oob_score": [True, False],
                        "class_weight": ["balanced", "balanced_subsample"]
                    }
                ),
                "ada": (
                    AdaBoostClassifier(),
                    {
                        "n_estimators": np.linspace(10, 100, num=linspace_size, dtype=int),
                        "learning_rate": np.linspace(0.1, 1, num=linspace_size),
                        "algorithm": ["SAMME", "SAMME.R"],
                    }
                )    
        }
        self.n_iter_search = n_iter_search

    def report(self, results, n_top: int = 3):
        export_list = []
        for i in range(1, n_top + 1):
            candidates = np.flatnonzero(results["rank_test_score"] == i)
            for candidate in candidates:
                export_list.append({
                    "rank": i,
                    "mean_test_score": results["mean_test_score"][candidate],
                    "std_test_score": results["std_test_score"][candidate],
                    "params": results["params"][candidate]
                })
        
        return pd.DataFrame(export_list, columns=["rank", "mean_test_score", "std_test_score", "params"])

    def random_search(self, algorithm: str, seed: int = 42):
        return RandomizedSearchCV(self.algorithms[algorithm][0], self.algorithms[algorithm][1], n_iter=self.n_iter_search, random_state=seed, n_jobs=-1)
    
    def grid_search(self, algorithm: str):
        return GridSearchCV(self.algorithms[algorithm][0], self.algorithms[algorithm][1], n_jobs=-1)
    
    def halving_search(self, algorithm: str):
        return HalvingGridSearchCV(self.algorithms[algorithm][0], self.algorithms[algorithm][1], n_jobs=-1)
    
    def compare_search(self, dataset: str, X: np.array, y: np.array, algorithm: str, seed: int = 42):
        export_list = []
        
        print(f"Running Grid Search with {dataset} and {algorithm}...")
        grid_search = self.grid_search(algorithm)
        
        start = time()
        grid_search.fit(X, y)
        time_spent = time() - start

        print("Grid Search took %.2f seconds" % (time_spent))

        results = grid_search.cv_results_

        candidate = np.flatnonzero(results["rank_test_score"] == 1)[0]
        
        export_list.append({
            "dataset": dataset,
            "type": "grid",
            "time": time_spent,
            "mean_test_score": results["mean_test_score"][candidate],
            "std_test_score": results["std_test_score"][candidate],
            "params": results["params"][candidate]
        })

        print("Running Random Search...")
        random_search = self.random_search(algorithm, seed)

        start = time()
        random_search.fit(X, y)
        time_spent = time() - start

        print("Random Search took %.2f seconds" % (time_spent))

        results = random_search.cv_results_

        candidate = np.flatnonzero(results["rank_test_score"] == 1)[0]

        export_list.append({
            "dataset": dataset,
            "type": "random",
            "time": time_spent,
            "mean_test_score": results["mean_test_score"][candidate],
            "std_test_score": results["std_test_score"][candidate],
            "params": results["params"][candidate]
        })

        print("Running Halving Grid Search...")
        halving_search = self.halving_search(algorithm)

        start = time()
        halving_search.fit(X, y)
        time_spent = time() - start

        print("Halving Grid Search took %.2f seconds" % (time_spent))

        results = halving_search.cv_results_
        
        candidate = np.flatnonzero(results["rank_test_score"] == 1)[0]

        export_list.append({
            "dataset": dataset,
            "type": "halving",
            "time": time_spent,
            "mean_test_score": results["mean_test_score"][candidate],
            "std_test_score": results["std_test_score"][candidate],
            "params": results["params"][candidate]
        })

        return pd.DataFrame(export_list, columns=["dataset", "type", "time", "mean_test_score", "std_test_score", "params"])

In [32]:
models_instance = Models(linspace_size=3, n_iter_search=15)
df = pd.DataFrame(columns=["dataset", "type", "time", "mean_test_score", "std_test_score", "params"])
for algorithm in models_instance.algorithms.keys():
    for name, data in datasets:

        tmp = models_instance.compare_search(name, data, np.squeeze(y), algorithm)

        df = pd.concat([df, tmp])

df.to_csv("step_07/compare_search.csv", index=False)
df


Running Grid Search with clinical and knn...
Grid Search took 16.50 seconds
Running Random Search...
Random Search took 0.42 seconds
Running Halving Grid Search...


Traceback (most recent call last):
  File "/home/diego/miniconda3/envs/pandas-env/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 971, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/diego/miniconda3/envs/pandas-env/lib/python3.12/site-packages/sklearn/metrics/_scorer.py", line 455, in __call__
    return estimator.score(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/diego/miniconda3/envs/pandas-env/lib/python3.12/site-packages/sklearn/base.py", line 764, in score
    return accuracy_score(y, self.predict(X), sample_weight=sample_weight)
                             ^^^^^^^^^^^^^^^
  File "/home/diego/miniconda3/envs/pandas-env/lib/python3.12/site-packages/sklearn/neighbors/_classification.py", line 271, in predict
    neigh_ind = self.kneighbors(X, return_distance=False)
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  

Halving Grid Search took 2.68 seconds
Running Grid Search with z_score and knn...


KeyboardInterrupt: 

# GridSearch

In [67]:
model_instance = Models(linspace_size=10, n_iter_search=15)
results = pd.DataFrame(columns=["dataset","type","rank", "mean_test_score", "std_test_score", "params"])

for algorithm in models_instance.algorithms.keys():
    for name, data in datasets:
            print(f"Running {algorithm} on {name} dataset")

            random_search = models_instance.random_search(algorithm)

            start = time()
            random_search.fit(data, np.squeeze(y))

            print(
                "RandomizedSearchCV took %.2f seconds for %d candidates parameter settings."
                % ((time() - start), models_instance.n_iter_search)
            )

            result = models_instance.report(random_search.cv_results_)
            result.insert(0, "dataset", name)
            result.insert(1, "type", "random")

            results = pd.concat([results, result])
            
            grid_search = models_instance.grid_search(algorithm)

            start = time()

            grid_search.fit(data, np.squeeze(y))

            print(
                "GridSearchCV took %.2f seconds for %d candidate parameter settings."
                % (time() - start, len(grid_search.cv_results_["params"]))
            )

            result = models_instance.report(grid_search.cv_results_)
            result.insert(0, "dataset", name)
            result.insert(1, "type", "grid")

            results = pd.concat([results, result])
            

results.to_csv("step_07/results.csv", index=False)

Running knn on clinical dataset
RandomizedSearchCV took 0.36 seconds for 15 candidates parameter settings.


  results = pd.concat([results, result])


GridSearchCV took 0.69 seconds for 40 candidate parameter settings.
Running knn on z_score dataset
RandomizedSearchCV took 0.51 seconds for 15 candidates parameter settings.
GridSearchCV took 1.34 seconds for 40 candidate parameter settings.
Running knn on mutation dataset
RandomizedSearchCV took 0.55 seconds for 15 candidates parameter settings.
GridSearchCV took 1.41 seconds for 40 candidate parameter settings.
Running dt on clinical dataset
RandomizedSearchCV took 0.12 seconds for 15 candidates parameter settings.




GridSearchCV took 0.12 seconds for 4 candidate parameter settings.
Running dt on z_score dataset




RandomizedSearchCV took 4.64 seconds for 15 candidates parameter settings.
GridSearchCV took 4.61 seconds for 4 candidate parameter settings.
Running dt on mutation dataset




RandomizedSearchCV took 0.79 seconds for 15 candidates parameter settings.
GridSearchCV took 0.77 seconds for 4 candidate parameter settings.
Running svm on clinical dataset
RandomizedSearchCV took 3.26 seconds for 15 candidates parameter settings.
GridSearchCV took 4.30 seconds for 20 candidate parameter settings.
Running svm on z_score dataset
RandomizedSearchCV took 9.54 seconds for 15 candidates parameter settings.
GridSearchCV took 13.10 seconds for 20 candidate parameter settings.
Running svm on mutation dataset
RandomizedSearchCV took 11.18 seconds for 15 candidates parameter settings.
GridSearchCV took 14.53 seconds for 20 candidate parameter settings.
Running rf on clinical dataset




RandomizedSearchCV took 3.99 seconds for 15 candidates parameter settings.
GridSearchCV took 4.06 seconds for 10 candidate parameter settings.
Running rf on z_score dataset




RandomizedSearchCV took 36.35 seconds for 15 candidates parameter settings.
GridSearchCV took 35.35 seconds for 10 candidate parameter settings.
Running rf on mutation dataset




RandomizedSearchCV took 7.14 seconds for 15 candidates parameter settings.
GridSearchCV took 6.93 seconds for 10 candidate parameter settings.
Running ada on clinical dataset




RandomizedSearchCV took 0.81 seconds for 15 candidates parameter settings.




GridSearchCV took 0.77 seconds for 2 candidate parameter settings.
Running ada on z_score dataset




RandomizedSearchCV took 29.68 seconds for 15 candidates parameter settings.




GridSearchCV took 30.15 seconds for 2 candidate parameter settings.
Running ada on mutation dataset




RandomizedSearchCV took 1.86 seconds for 15 candidates parameter settings.




GridSearchCV took 1.82 seconds for 2 candidate parameter settings.


# Teapot
Falta ver que onda con teapot

In [59]:
!pip install tpot

Collecting tpot
  Downloading TPOT-0.12.2-py3-none-any.whl.metadata (2.0 kB)
Collecting deap>=1.2 (from tpot)
  Downloading deap-1.4.1.tar.gz (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting update-checker>=0.16 (from tpot)
  Downloading update_checker-0.18.0-py3-none-any.whl.metadata (2.3 kB)
Collecting stopit>=1.1.1 (from tpot)
  Downloading stopit-1.1.2.tar.gz (18 kB)
  Preparing metadata (setup.py) ... [?25ldone
Collecting xgboost>=1.1.0 (from tpot)
  Downloading xgboost-2.1.1-py3-none-manylinux_2_28_x86_64.whl.metadata (2.1 kB)
Collecting nvidia-nccl-cu12 (from xgboost>=1.1.0->tpot)
  Downloading nvidia_nccl_cu12-2.23.4-py3-none-manylinux2014_x86_64.whl.metadata (1.8 kB)
Downloading TPOT-0.12.2-py3-none-any.whl (87 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m87.4/87.4 kB[0m [31m28.9 MB/s[0m e

In [4]:
from tpot import TPOTClassifier



In [5]:
seed = 42
generations = 5

In [6]:
X_clinical_train, X_clinical_test, y_train, y_test = train_test_split(clinical_attributes, response, test_size=0.2, random_state=seed)

X_mutation_train, X_mutation_test, _, _ = train_test_split(mutation, response, test_size=0.2, random_state=seed)

X_z_score_train, X_z_score_test, _, _ = train_test_split(z_score, response, test_size=0.2, random_state=seed)

In [7]:
tpot = TPOTClassifier(generations=generations, population_size=20, verbosity=2, random_state=42)
tpot.fit(X_clinical_train, np.squeeze(y_train))
print(tpot.score(X_clinical_test, np.squeeze(y_test)))
tpot.export('step_07/tpot_clinical_pipeline.py')

  y = column_or_1d(y, warn=True)


                                                                             
Generation 1 - Current best internal CV score: 0.6825396825396826
                                                                             
Generation 2 - Current best internal CV score: 0.6899470899470899
                                                                             
Generation 3 - Current best internal CV score: 0.6899470899470899
                                                                              
Generation 4 - Current best internal CV score: 0.692063492063492
                                                                              
Generation 5 - Current best internal CV score: 0.6962962962962963
                                                                              
Best pipeline: RandomForestClassifier(input_matrix, bootstrap=True, criterion=entropy, max_features=0.4, min_samples_leaf=13, min_samples_split=7, n_estimators=100)
0.6835443037974683


In [9]:
tpot = TPOTClassifier(generations=generations, population_size=20, verbosity=2, random_state=42)
tpot.fit(X_mutation_train, np.squeeze(y_train))
print(tpot.score(X_mutation_test, np.squeeze(y_test)))
tpot.export('step_07/tpot_mutation_pipeline.py')

                                                                             
                                                                             
TPOT closed during evaluation in one generation.
                                                                             
                                                                             
TPOT closed prematurely. Will use the current best pipeline.
                                                                             
Best pipeline: ExtraTreesClassifier(input_matrix, bootstrap=True, criterion=entropy, max_features=0.8, min_samples_leaf=19, min_samples_split=5, n_estimators=100)
0.5991561181434599


In [10]:
tpot = TPOTClassifier(generations=generations, population_size=20, verbosity=2, random_state=42)
tpot.fit(X_z_score_train, np.squeeze(y_train))
print(tpot.score(X_z_score_test, np.squeeze(y_test)))
tpot.export('step_07/tpot_z_score_pipeline.py')

                                                                             
Generation 1 - Current best internal CV score: 0.6338624338624339
                                                                             
Generation 2 - Current best internal CV score: 0.6338624338624339
                                                                             
Generation 3 - Current best internal CV score: 0.6338624338624339
                                                                              
Generation 4 - Current best internal CV score: 0.6380952380952382
                                                                              
Generation 5 - Current best internal CV score: 0.6380952380952382
                                                                              
Best pipeline: MLPClassifier(input_matrix, alpha=0.0001, learning_rate_init=0.001)
0.620253164556962
