# Protein Exploration Notebook

In [1]:
#Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.dummy import DummyClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import average_precision_score
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score
from sklearn.inspection import permutation_importance
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score


# Load Datasets

In [2]:
#Load Datasets

protein_df = pd.read_csv('../data/raw/ProteinAndPathologyQuantifications.csv')
Donor_df = pd.read_csv('../data/raw/DonorInformation.csv')
Stains_df = pd.read_csv('../data/raw/DescriptionOfStains.csv')
gene_expression_df = pd.read_csv('../data/raw/gene_expression_matrix_2016-03-03/fpkm_table_normalized.csv')
gene_expression_unnormalized_df = pd.read_csv('../data/raw/gene_expression_matrix_2016-03-03/fpkm_table_unnormalized.csv')
columns_samples_df = pd.read_csv('../data/raw/gene_expression_matrix_2016-03-03/columns-samples.csv')
rows_genes_df = pd.read_csv('../data/raw/gene_expression_matrix_2016-03-03/rows-genes.csv')


In [3]:
# Create df of donors ids and corresponding diagnosis, this will be used to map to the proteins data samples
donor_labels_df = Donor_df[['donor_id','act_demented']]

In [4]:
donor_labels_df

Unnamed: 0,donor_id,act_demented
0,326765665,No Dementia
1,326765656,No Dementia
2,326765654,Dementia
3,467056391,No Dementia
4,309335447,Dementia
...,...,...
102,309335458,Dementia
103,309335462,Dementia
104,309335454,No Dementia
105,309335486,No Dementia


In [5]:
# Map donor_labels_df to protein_df so each sample will be associated with the appropriate diagnosis 
donor_labels_dict = donor_labels_df.set_index('donor_id')['act_demented'].to_dict()
protein_df['dementia_status'] = protein_df['donor_id'].map(donor_labels_dict)

In [6]:
#Examine protein_df to ensure labels properly populated
protein_df

Unnamed: 0,donor_id,donor_name,structure_id,structure_acronym,ihc_a_syn,ihc_tau2_ffpe,ihc_at8_ffpe,ihc_at8,ihc_ptdp_43_ffpe,ihc_a_beta_ffpe,...,rantes_pg_per_mg,ab40_pg_per_mg,a_syn_pg_per_mg,ifn_g_pg_per_mg,mcp_1_pg_per_mg,bdnf_pg_per_mg,mip_1a_pg_per_mg,il_7_pg_per_mg,ab42_pg_per_mg,dementia_status
0,309335467,H14.09.030,10557,FWM,0.000078,0.002358,0.001137,0.000110,0.001259,0.008335,...,15.99,0.736100,0.122288,1.47,20.78,5.03736,9.38,11.78,523.292251,No Dementia
1,309335480,H14.09.043,10208,PCx,0.000063,0.002762,0.001272,0.000164,0.002354,0.005047,...,24.04,0.669094,0.111962,1.44,40.40,4.95462,8.10,45.02,81.493875,Dementia
2,309335493,H14.09.056,10557,FWM,0.000064,0.003468,0.013787,0.016023,0.001708,0.007365,...,129.80,0.736100,0.091084,0.54,46.88,5.88409,27.00,15.82,470.734514,Dementia
3,326765668,H14.09.081,10557,FWM,0.000049,0.003035,0.001707,0.000137,0.001729,0.004046,...,15.16,8.995575,0.054076,0.76,10.34,16.13524,8.06,24.22,568.368571,No Dementia
4,326765668,H14.09.081,10235,TCx,0.000080,0.002088,0.004489,0.000062,0.001513,0.015809,...,17.90,95.931000,0.030000,1.22,21.38,0.03000,10.52,23.72,438.863263,No Dementia
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
372,326765653,H14.09.066,10208,PCx,0.000046,0.001989,0.000533,0.000082,0.001130,0.003128,...,,,,,,,,,,Dementia
373,309335452,H14.09.015,10294,HIP,0.000089,0.001918,0.004685,0.000923,0.001181,0.003483,...,5.94,0.574500,0.248816,0.00,7.16,2.94819,17.82,1.38,0.054413,No Dementia
374,309335489,H14.09.052,10294,HIP,0.000066,0.002275,0.006337,0.009560,0.001395,0.001710,...,7.62,0.900050,0.183510,0.00,3.14,10.56419,21.96,0.60,0.181375,No Dementia
375,309335458,H14.09.021,10208,PCx,0.000793,0.004384,0.001266,0.000093,0.002505,0.009795,...,4.88,0.444250,0.045202,0.00,7.20,2.83064,0.00,12.46,205.886650,Dementia


In [7]:
#Check the distribution of Dementia in Protein samples dataset
print('No Dementia: ', len(protein_df[protein_df['dementia_status']=='No Dementia']))
print('Dementia: ', len(protein_df[protein_df['dementia_status']=='Dementia']))

No Dementia:  197
Dementia:  180


# Handling Missing Data

In [8]:
#check for Nan
total_nan_count = protein_df.isna().sum()


In [9]:
#total_nan_count

In [10]:
# Process dataframe by Dropping Nan
# - drop isoprostane_pg_per_mg because missing most of the data (63% of samples missing isoprostane data)
# - dropna for remaining columns (will still represent >70% of the samples)

protein_df_drop_cleaned = protein_df.drop('isoprostane_pg_per_mg',axis=1)
protein_df_drop_cleaned.dropna(inplace = True)

In [11]:
# Check for Nan after cleaning
total_nan_count = protein_df_drop_cleaned.isna().sum()
#total_nan_count

In [12]:
# protein_df_drop_cleaned has samples with corresponding dementia labels and no missing values. 
# Represents total of 279 samples from original 377 samples (74%)
# Dropping missing values was the strategy choosen since biological systems are complex and variability between
# individuals could be significant. Filling missing values through imputation could potentially skew the data.

In [13]:
# Protein groups
# There are 28 different protein measures, however there are only 14 distinct proteins. 
# Some measures are different methods of measuring the same protein, for example ihc_a_syn is measuring the 
# stain area coverage for α-synuclein, while a_syn_pg_per_mg is measuring the concentration of a_syn_pg_per_mg

# Amyloid: ihc_a_beta, ab42_over_ab40_ratio, ihc_a_beta_ffpe, ab42_pg_per_mg, ab40_pg_per_mg
# α-synuclein:ihc_a_syn, a_syn_pg_per_mg
# interferon: ifn_g_pg_per_mg
# RANTES: rantes_pg_per_mg
# Tau: ihc_tau2_ffpe, ptau_over_tau_ratio, ptau_ng_per_mg, ihc_at8, ihc_at8_ffpe, tau_ng_per_mg
# Isoprostane: isoprostane_pg_per_mg
# BDNF: bdnf_pg_per_mg
# MIP-1a: mip_1a_pg_per_mg
# Interleukins: il_4_pg_per_mg, il_1b_pg_per_mg, il_7_pg_per_mg, il_6_pg_per_mg, il_10_pg_per_mg
# MCP-1: mcp_1_pg_per_mg
# IBA-1: ihc_iba1_ffpe
# phospho-TDP43: ihc_ptdp_43_ffpe
# GFAP:ihc_gfap_ffpe
# TNF: Tumor Necrosis Factor alpha

In [14]:
#View list of columns
columns = protein_df_drop_cleaned.columns
columns

Index(['donor_id', 'donor_name', 'structure_id', 'structure_acronym',
       'ihc_a_syn', 'ihc_tau2_ffpe', 'ihc_at8_ffpe', 'ihc_at8',
       'ihc_ptdp_43_ffpe', 'ihc_a_beta_ffpe', 'ihc_a_beta', 'ihc_iba1_ffpe',
       'ihc_gfap_ffpe', 'ptau_ng_per_mg', 'vegf_pg_per_mg',
       'ab42_over_ab40_ratio', 'tnf_a_pg_per_mg', 'tau_ng_per_mg',
       'il_10_pg_per_mg', 'il_6_pg_per_mg', 'il_1b_pg_per_mg',
       'ptau_over_tau_ratio', 'il_4_pg_per_mg', 'rantes_pg_per_mg',
       'ab40_pg_per_mg', 'a_syn_pg_per_mg', 'ifn_g_pg_per_mg',
       'mcp_1_pg_per_mg', 'bdnf_pg_per_mg', 'mip_1a_pg_per_mg',
       'il_7_pg_per_mg', 'ab42_pg_per_mg', 'dementia_status'],
      dtype='object')

In [15]:
# columns to keep for training
# dropping catergorical columns as well as donor_id/donor_name and dementia status

columns_keep_for_training = [
       'ihc_a_syn', 'ihc_tau2_ffpe', 'ihc_at8_ffpe', 'ihc_at8',
       'ihc_ptdp_43_ffpe', 'ihc_a_beta_ffpe', 'ihc_a_beta', 'ihc_iba1_ffpe',
       'ihc_gfap_ffpe', 'ptau_ng_per_mg', 'vegf_pg_per_mg',
       'ab42_over_ab40_ratio', 'tnf_a_pg_per_mg', 'tau_ng_per_mg',
       'il_10_pg_per_mg', 'il_6_pg_per_mg', 'il_1b_pg_per_mg',
       'ptau_over_tau_ratio', 'il_4_pg_per_mg', 'rantes_pg_per_mg',
       'ab40_pg_per_mg', 'a_syn_pg_per_mg', 'ifn_g_pg_per_mg',
       'mcp_1_pg_per_mg', 'bdnf_pg_per_mg', 'mip_1a_pg_per_mg',
       'il_7_pg_per_mg', 'ab42_pg_per_mg']

In [16]:
protein_df_drop_cleaned

Unnamed: 0,donor_id,donor_name,structure_id,structure_acronym,ihc_a_syn,ihc_tau2_ffpe,ihc_at8_ffpe,ihc_at8,ihc_ptdp_43_ffpe,ihc_a_beta_ffpe,...,rantes_pg_per_mg,ab40_pg_per_mg,a_syn_pg_per_mg,ifn_g_pg_per_mg,mcp_1_pg_per_mg,bdnf_pg_per_mg,mip_1a_pg_per_mg,il_7_pg_per_mg,ab42_pg_per_mg,dementia_status
0,309335467,H14.09.030,10557,FWM,0.000078,0.002358,0.001137,0.000110,0.001259,0.008335,...,15.99,0.736100,0.122288,1.47,20.78,5.03736,9.38,11.78,523.292251,No Dementia
1,309335480,H14.09.043,10208,PCx,0.000063,0.002762,0.001272,0.000164,0.002354,0.005047,...,24.04,0.669094,0.111962,1.44,40.40,4.95462,8.10,45.02,81.493875,Dementia
2,309335493,H14.09.056,10557,FWM,0.000064,0.003468,0.013787,0.016023,0.001708,0.007365,...,129.80,0.736100,0.091084,0.54,46.88,5.88409,27.00,15.82,470.734514,Dementia
3,326765668,H14.09.081,10557,FWM,0.000049,0.003035,0.001707,0.000137,0.001729,0.004046,...,15.16,8.995575,0.054076,0.76,10.34,16.13524,8.06,24.22,568.368571,No Dementia
4,326765668,H14.09.081,10235,TCx,0.000080,0.002088,0.004489,0.000062,0.001513,0.015809,...,17.90,95.931000,0.030000,1.22,21.38,0.03000,10.52,23.72,438.863263,No Dementia
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
370,326765668,H14.09.081,10294,HIP,0.000122,0.006376,0.026056,0.025917,0.001646,0.009597,...,4.56,5.170612,0.115779,0.00,3.50,11.53213,19.72,1.38,457.402238,No Dementia
373,309335452,H14.09.015,10294,HIP,0.000089,0.001918,0.004685,0.000923,0.001181,0.003483,...,5.94,0.574500,0.248816,0.00,7.16,2.94819,17.82,1.38,0.054413,No Dementia
374,309335489,H14.09.052,10294,HIP,0.000066,0.002275,0.006337,0.009560,0.001395,0.001710,...,7.62,0.900050,0.183510,0.00,3.14,10.56419,21.96,0.60,0.181375,No Dementia
375,309335458,H14.09.021,10208,PCx,0.000793,0.004384,0.001266,0.000093,0.002505,0.009795,...,4.88,0.444250,0.045202,0.00,7.20,2.83064,0.00,12.46,205.886650,Dementia


# Custom Functions

In [17]:
# Adapted from functions created by Camaron Mangham

In [18]:
def train_models(models, model_names, X_train, y_train, X_val, y_val):
    score_names = ['accuracy',
                   'precision',
                   'recall',
                   'f1',
                    #   'roc_auc'
                      ]
    scores = []
    for name, model in zip(model_names, models):
        model.fit(X_train, y_train)
        preds = model.predict(X_val)
        accuracy_scores = accuracy_score(y_val, preds)
        precision_scores = precision_score(y_val, preds)
        recall_scores = recall_score(y_val, preds)
        f1_scores = f1_score(y_val,preds)
        # roc_auc_scores = roc_auc_score(y_val,preds)

        scores.append([accuracy_scores,
                       precision_scores,
                       recall_scores,
                       f1_scores,
                        # roc_auc_scores
                        ])

    model_scores = pd.DataFrame(scores, index= model_names, columns= score_names)

    return model_scores

In [19]:
def train_test_val_split(protein_df,donor_ids):
    # 70, 15, 15 Train, Validate, Test split

    train_ids, test_ids = train_test_split(donor_ids, test_size=0.33, random_state= 42)
    train_ids, validate_ids = train_test_split(train_ids, test_size=0.33, random_state= 42)

    # Now can filter by train, val, test, split
    train_df = protein_df[protein_df['donor_id'].isin(train_ids)]
    val_df = protein_df[protein_df['donor_id'].isin(validate_ids)]
    test_df = protein_df[protein_df['donor_id'].isin(test_ids)]

    # final data prep
    X_train = train_df.drop(columns='dementia_status')
    y_train = train_df['dementia_status']

    X_val = val_df.drop(columns='dementia_status')
    y_val = val_df['dementia_status']

    X_test = test_df.drop(columns='dementia_status')
    y_test = test_df['dementia_status']

    # Scale data and transform data
    scaler = StandardScaler()
    scaler.fit(X_train)

    X_train = scaler.transform(X_train)
    X_val = scaler.transform(X_val)
    X_test = scaler.transform(X_test)

    y_train = y_train.apply(lambda x: 1 if x == 'Dementia' else 0)
    y_val = y_val.apply(lambda x: 1 if  x=='Dementia' else 0)
    y_test = y_test.apply(lambda x: 1 if  x=='Dementia' else 0)
    
    return X_train, y_train, X_val, y_val, X_test, y_test

In [20]:
def custom_train_test_split(protein_df ,donor_ids):
    # 70, 30 Train, Test split
    # np.random.seed(42)
    train_ids, test_ids = train_test_split(donor_ids, test_size=0.30)
    # print(len(train_ids), len(test_ids))

    #samples (rna_profile_ids) by donor data splt
    train_df = protein_df[protein_df['donor_id'].isin(train_ids)]
    test_df = protein_df[protein_df['donor_id'].isin(test_ids)]

    # final data prep
    X_train = train_df.drop(columns='dementia_status')
    y_train = train_df['dementia_status']

    X_test = test_df.drop(columns='dementia_status')
    y_test = test_df['dementia_status']

    # Scale data and transform data
    scaler = StandardScaler()
    scaler.fit(X_train)

    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)
    
    y_train = y_train.apply(lambda x: 1 if x == 'Dementia' else 0)
    y_test = y_test.apply(lambda x: 1 if  x=='Dementia' else 0)


    return X_train, y_train, X_test, y_test

In [21]:
def train_models_boot_data(protein_df,donor_ids, models, model_names, iterations=1000):
    score_names = ['Accuracy',
                   'Precision',
                   'Recall',
                   'F1',
                      ]
    scores_accuracy = []
    scores_precision = []
    scores_recall = []
    scores_f1 = []
    for i in range(iterations + 1):
        X_train, y_train, X_test, y_test = custom_train_test_split(protein_df,donor_ids)
        scores_accuracy_i = []
        scores_precision_i = []
        scores_recall_i = []
        scores_f1_i = []
        for name, model in zip(model_names, models):
            model.fit(X_train, y_train)
            preds = model.predict(X_test)

            scores_accuracy_i.append(accuracy_score(y_test, preds))
            scores_precision_i.append(precision_score(y_test, preds))
            scores_recall_i.append(recall_score(y_test, preds))
            scores_f1_i.append(f1_score(y_test,preds))

        scores_accuracy.append(scores_accuracy_i)
        scores_precision.append(scores_precision_i)
        scores_recall.append(scores_recall_i)
        scores_f1.append(scores_f1_i)

    scores_list = [scores_accuracy, scores_precision, scores_recall, scores_f1]
    model_scores_dict = {}
    for name, scores in zip(score_names, scores_list):
        model_scores_dict[name] = pd.DataFrame(scores, columns= model_names)

    return model_scores_dict

In [22]:
def subplot_plot_model_scores(model_scores, protein_df, score_name, ax=None):
    sort_order = np.argsort(model_scores.mean().values)
    descending_sort = sort_order[::-1]
    sorted_labels = model_scores.iloc[:, descending_sort].columns

    # Use the provided subplot or create a new one
    if ax is None:
        plt.figure(figsize=(10, 6))
        ax = plt.gca()

    sns.boxplot(model_scores.iloc[:, descending_sort], orient='h', palette='Spectral', ax=ax)
    y_labels = sorted_labels
    ax.set_yticks(range(len(y_labels)))
    ax.set_yticklabels(y_labels)
    ax.set_xlabel(f'{score_name} Score')
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.axvline(0.5,color="gray",linestyle="--")

In [23]:
# Using nan dropped data

donor_ids = list(protein_df_drop_cleaned['donor_id'].unique())
protein_df = protein_df_drop_cleaned.drop(columns = ['donor_name', 'structure_id','structure_acronym'])

#protein_df

# Model Tuning

Based on work from our quantitative exploration of proteins, we found that Random Forest and Gradient Boosting gave the best results. We will now tune the hyperparameters of the models to see if we can increase improvement.

# Random Forest

In [24]:
# Random Hyperparameter Grid
# Grid Search with Cross Validation
# Tutorial from https://towardsdatascience.com/hyperparameter-tuning-the-random-forest-in-python-using-scikit-learn-28d2aa77dd74




In [25]:
from sklearn.model_selection import RandomizedSearchCV

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2,5,10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1,2,4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [27]:
X_train, y_train, X_val, y_val, X_test, y_test = train_test_val_split(protein_df, donor_ids)

rf = RandomForestClassifier(random_state=42)

rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 200, scoring = 'f1', 
                               cv = 10, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train, y_train)

Fitting 10 folds for each of 200 candidates, totalling 2000 fits
[CV] END bootstrap=True, max_depth=30, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=400; total time=   0.7s
[CV] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=2000; total time=   3.8s
[CV] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=2000; total time=   3.5s
[CV] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=4, min_samples_split=2, n_estimators=1600; total time=   2.7s
[CV] END bootstrap=False, max_depth=30, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=800; total time=   1.1s
[CV] END bootstrap=False, max_depth=30, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=800; total time=   1.1s
[CV] END bootstrap=False, max_depth=100, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=

[CV] END bootstrap=True, max_depth=30, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=400; total time=   0.7s
[CV] END bootstrap=True, max_depth=30, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=400; total time=   0.8s
[CV] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=2000; total time=   3.8s
[CV] END bootstrap=False, max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=1200; total time=   1.7s
[CV] END bootstrap=False, max_depth=30, max_features=auto, min_samples_leaf=4, min_samples_split=2, n_estimators=2000; total time=   0.0s
[CV] END bootstrap=False, max_depth=30, max_features=auto, min_samples_leaf=4, min_samples_split=2, n_estimators=2000; total time=   0.0s
[CV] END bootstrap=False, max_depth=30, max_features=auto, min_samples_leaf=4, min_samples_split=2, n_estimators=2000; total time=   0.0s
[CV] END bootstrap=False, max_depth=30,

[CV] END bootstrap=True, max_depth=30, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=400; total time=   0.7s
[CV] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=2000; total time=   3.8s
[CV] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=2000; total time=   3.5s
[CV] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=4, min_samples_split=2, n_estimators=1600; total time=   2.7s
[CV] END bootstrap=False, max_depth=30, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=800; total time=   1.1s
[CV] END bootstrap=False, max_depth=30, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=800; total time=   1.2s
[CV] END bootstrap=False, max_depth=100, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=1000; total time=   1.6s
[CV] END bootstrap=False, max_depth=60, 

[CV] END bootstrap=True, max_depth=30, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=400; total time=   0.7s
[CV] END bootstrap=True, max_depth=30, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=400; total time=   0.8s
[CV] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=2000; total time=   3.8s
[CV] END bootstrap=False, max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=1200; total time=   1.7s
[CV] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=4, min_samples_split=2, n_estimators=1600; total time=   2.7s
[CV] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=4, min_samples_split=2, n_estimators=1600; total time=   2.7s
[CV] END bootstrap=False, max_depth=100, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=1000; total time=   1.5s
[CV] END bootstrap=False, max_depth=60, 

[CV] END bootstrap=True, max_depth=30, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=400; total time=   0.7s
[CV] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=2000; total time=   3.9s
[CV] END bootstrap=False, max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=1200; total time=   1.8s
[CV] END bootstrap=False, max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=1200; total time=   1.7s
[CV] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=4, min_samples_split=2, n_estimators=1600; total time=   2.8s
[CV] END bootstrap=False, max_depth=30, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=800; total time=   1.1s
[CV] END bootstrap=False, max_depth=30, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=800; total time=   1.1s
[CV] END bootstrap=False, max_depth=100,

[CV] END bootstrap=True, max_depth=30, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=400; total time=   0.7s
[CV] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=2000; total time=   3.9s
[CV] END bootstrap=False, max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=1200; total time=   1.8s
[CV] END bootstrap=False, max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=1200; total time=   1.7s
[CV] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=4, min_samples_split=2, n_estimators=1600; total time=   2.7s
[CV] END bootstrap=False, max_depth=30, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=800; total time=   1.2s
[CV] END bootstrap=False, max_depth=30, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=800; total time=   1.1s
[CV] END bootstrap=False, max_depth=100,

[CV] END bootstrap=False, max_depth=70, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=1400; total time=   2.2s
[CV] END bootstrap=True, max_depth=100, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=1600; total time= 4.6min
[CV] END bootstrap=False, max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time=   0.4s
[CV] END bootstrap=False, max_depth=20, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time=   0.4s
[CV] END bootstrap=False, max_depth=30, max_features=sqrt, min_samples_leaf=4, min_samples_split=10, n_estimators=1800; total time=   2.9s
[CV] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=4, min_samples_split=10, n_estimators=1200; total time=   2.4s
[CV] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=4, min_samples_split=10, n_estimators=1200; total time=   2.3s
[CV] END bootstrap=False, max_depth

[CV] END bootstrap=True, max_depth=30, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=400; total time=   0.7s
[CV] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=2000; total time=   3.9s
[CV] END bootstrap=False, max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=1200; total time=   1.8s
[CV] END bootstrap=False, max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=1200; total time=   1.7s
[CV] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=4, min_samples_split=2, n_estimators=1600; total time=   2.7s
[CV] END bootstrap=False, max_depth=30, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=800; total time=   1.1s
[CV] END bootstrap=False, max_depth=100, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=1000; total time=   1.5s
[CV] END bootstrap=False, max_depth=10

[CV] END bootstrap=True, max_depth=40, max_features=auto, min_samples_leaf=2, min_samples_split=10, n_estimators=2000; total time=   0.0s
[CV] END bootstrap=True, max_depth=40, max_features=auto, min_samples_leaf=2, min_samples_split=10, n_estimators=2000; total time=   0.0s
[CV] END bootstrap=True, max_depth=40, max_features=auto, min_samples_leaf=2, min_samples_split=10, n_estimators=2000; total time=   0.0s
[CV] END bootstrap=True, max_depth=40, max_features=auto, min_samples_leaf=2, min_samples_split=10, n_estimators=2000; total time=   0.0s
[CV] END bootstrap=True, max_depth=40, max_features=auto, min_samples_leaf=2, min_samples_split=10, n_estimators=2000; total time=   0.0s
[CV] END bootstrap=True, max_depth=40, max_features=auto, min_samples_leaf=2, min_samples_split=10, n_estimators=2000; total time=   0.0s
[CV] END bootstrap=True, max_depth=30, max_features=auto, min_samples_leaf=1, min_samples_split=5, n_estimators=1400; total time=   0.0s
[CV] END bootstrap=True, max_depth=

920 fits failed out of a total of 2000.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
419 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/tonylan/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/tonylan/anaconda3/lib/python3.11/site-packages/sklearn/base.py", line 1144, in wrapper
    estimator._validate_params()
  File "/Users/tonylan/anaconda3/lib/python3.11/site-packages/sklearn/base.py", line 637, in _validate_params
    validate_parameter_constraints(
  File "/Users/tonylan/anaconda3/lib/python3.11/site-packages/sklearn/utils/_param_validation.py", line 95, in validate_

In [28]:
best_rf_params = rf_random.best_params_
best_rf_params

{'n_estimators': 200,
 'min_samples_split': 5,
 'min_samples_leaf': 1,
 'max_features': 'sqrt',
 'max_depth': 70,
 'bootstrap': False}

In [29]:
RF_list = [RandomForestClassifier(random_state=42),RandomForestClassifier(n_estimators = 200,
                                                                         min_samples_split = 5,
                                                                         min_samples_leaf = 1,
                                                                         max_features = 'sqrt',
                                                                         max_depth = 70,
                                                                         bootstrap = False)]

train_models(RF_list, ['Default RF', 'Tuned RF'], X_train, y_train, X_val, y_val)


Unnamed: 0,accuracy,precision,recall,f1
Default RF,0.776119,0.64,0.727273,0.680851
Tuned RF,0.761194,0.607143,0.772727,0.68


In [30]:
train_models(RF_list, ['Default RF', 'Tuned RF'], X_train, y_train, X_test, y_test)


Unnamed: 0,accuracy,precision,recall,f1
Default RF,0.655914,0.896552,0.472727,0.619048
Tuned RF,0.612903,0.806452,0.454545,0.581395


In [42]:
from sklearn.model_selection import GridSearchCV
# Create the parameter grid based on the results of random search 
param_grid = {
    'bootstrap': [False],
    'max_depth': [50, 60, 70, 80, 90],
    'max_features': [10,15,20,25],
    'min_samples_leaf': [1,2,3],
    'min_samples_split': [1,3,5,7,9],
    'n_estimators': [50, 100, 200, 300, 400]
}
# Create a based model
rf = RandomForestClassifier(random_state = 42)
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 5, n_jobs = -1, verbose = 2)

In [43]:
grid_search.fit(X_train, y_train)
best_gridsearch_param = grid_search.best_params_

Fitting 5 folds for each of 1500 candidates, totalling 7500 fits
[CV] END bootstrap=False, max_depth=80, max_features=20, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time=   0.6s
[CV] END bootstrap=False, max_depth=80, max_features=20, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time=   0.7s
[CV] END bootstrap=False, max_depth=80, max_features=20, min_samples_leaf=1, min_samples_split=5, n_estimators=400; total time=   1.4s
[CV] END bootstrap=False, max_depth=80, max_features=20, min_samples_leaf=1, min_samples_split=5, n_estimators=400; total time=   1.6s
[CV] END bootstrap=False, max_depth=80, max_features=20, min_samples_leaf=1, min_samples_split=7, n_estimators=50; total time=   0.2s
[CV] END bootstrap=False, max_depth=80, max_features=20, min_samples_leaf=1, min_samples_split=7, n_estimators=50; total time=   0.2s
[CV] END bootstrap=False, max_depth=80, max_features=20, min_samples_leaf=1, min_samples_split=7, n_estimators=300; total time=

[CV] END bootstrap=False, max_depth=80, max_features=20, min_samples_leaf=2, min_samples_split=7, n_estimators=100; total time=   0.3s
[CV] END bootstrap=False, max_depth=80, max_features=20, min_samples_leaf=2, min_samples_split=7, n_estimators=200; total time=   0.7s
[CV] END bootstrap=False, max_depth=80, max_features=20, min_samples_leaf=2, min_samples_split=7, n_estimators=200; total time=   0.7s
[CV] END bootstrap=False, max_depth=80, max_features=20, min_samples_leaf=2, min_samples_split=7, n_estimators=300; total time=   1.0s
[CV] END bootstrap=False, max_depth=80, max_features=20, min_samples_leaf=2, min_samples_split=7, n_estimators=300; total time=   1.2s
[CV] END bootstrap=False, max_depth=80, max_features=20, min_samples_leaf=2, min_samples_split=7, n_estimators=400; total time=   1.6s
[CV] END bootstrap=False, max_depth=80, max_features=20, min_samples_leaf=2, min_samples_split=9, n_estimators=100; total time=   0.4s
[CV] END bootstrap=False, max_depth=80, max_features=20

[CV] END bootstrap=False, max_depth=80, max_features=20, min_samples_leaf=3, min_samples_split=5, n_estimators=200; total time=   0.6s
[CV] END bootstrap=False, max_depth=80, max_features=20, min_samples_leaf=3, min_samples_split=5, n_estimators=200; total time=   0.6s
[CV] END bootstrap=False, max_depth=80, max_features=20, min_samples_leaf=3, min_samples_split=5, n_estimators=200; total time=   0.6s
[CV] END bootstrap=False, max_depth=80, max_features=20, min_samples_leaf=3, min_samples_split=5, n_estimators=200; total time=   0.6s
[CV] END bootstrap=False, max_depth=80, max_features=20, min_samples_leaf=3, min_samples_split=7, n_estimators=50; total time=   0.1s
[CV] END bootstrap=False, max_depth=80, max_features=20, min_samples_leaf=3, min_samples_split=7, n_estimators=50; total time=   0.2s
[CV] END bootstrap=False, max_depth=80, max_features=20, min_samples_leaf=3, min_samples_split=7, n_estimators=50; total time=   0.1s
[CV] END bootstrap=False, max_depth=80, max_features=20, m

[CV] END bootstrap=False, max_depth=80, max_features=25, min_samples_leaf=2, min_samples_split=7, n_estimators=100; total time=   0.4s
[CV] END bootstrap=False, max_depth=80, max_features=25, min_samples_leaf=2, min_samples_split=7, n_estimators=100; total time=   0.4s
[CV] END bootstrap=False, max_depth=80, max_features=25, min_samples_leaf=2, min_samples_split=7, n_estimators=200; total time=   0.8s
[CV] END bootstrap=False, max_depth=80, max_features=25, min_samples_leaf=2, min_samples_split=7, n_estimators=300; total time=   1.0s
[CV] END bootstrap=False, max_depth=80, max_features=25, min_samples_leaf=2, min_samples_split=7, n_estimators=400; total time=   1.3s
[CV] END bootstrap=False, max_depth=80, max_features=25, min_samples_leaf=2, min_samples_split=9, n_estimators=50; total time=   0.2s
[CV] END bootstrap=False, max_depth=80, max_features=25, min_samples_leaf=2, min_samples_split=9, n_estimators=50; total time=   0.2s
[CV] END bootstrap=False, max_depth=80, max_features=25, 

[CV] END bootstrap=False, max_depth=90, max_features=10, min_samples_leaf=3, min_samples_split=3, n_estimators=200; total time=   0.5s
[CV] END bootstrap=False, max_depth=90, max_features=10, min_samples_leaf=3, min_samples_split=5, n_estimators=100; total time=   0.3s
[CV] END bootstrap=False, max_depth=90, max_features=10, min_samples_leaf=3, min_samples_split=5, n_estimators=100; total time=   0.2s
[CV] END bootstrap=False, max_depth=90, max_features=10, min_samples_leaf=3, min_samples_split=5, n_estimators=100; total time=   0.3s
[CV] END bootstrap=False, max_depth=90, max_features=10, min_samples_leaf=3, min_samples_split=5, n_estimators=100; total time=   0.3s
[CV] END bootstrap=False, max_depth=90, max_features=10, min_samples_leaf=3, min_samples_split=5, n_estimators=200; total time=   0.5s
[CV] END bootstrap=False, max_depth=90, max_features=10, min_samples_leaf=3, min_samples_split=5, n_estimators=200; total time=   0.6s
[CV] END bootstrap=False, max_depth=90, max_features=10

[CV] END bootstrap=False, max_depth=90, max_features=20, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time=   0.9s
[CV] END bootstrap=False, max_depth=90, max_features=20, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time=   0.9s
[CV] END bootstrap=False, max_depth=90, max_features=20, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time=   0.9s
[CV] END bootstrap=False, max_depth=90, max_features=20, min_samples_leaf=1, min_samples_split=7, n_estimators=100; total time=   0.3s
[CV] END bootstrap=False, max_depth=90, max_features=20, min_samples_leaf=1, min_samples_split=7, n_estimators=200; total time=   0.6s
[CV] END bootstrap=False, max_depth=90, max_features=20, min_samples_leaf=1, min_samples_split=7, n_estimators=200; total time=   0.8s
[CV] END bootstrap=False, max_depth=90, max_features=20, min_samples_leaf=1, min_samples_split=7, n_estimators=300; total time=   1.1s
[CV] END bootstrap=False, max_depth=90, max_features=20

[CV] END bootstrap=False, max_depth=90, max_features=20, min_samples_leaf=1, min_samples_split=3, n_estimators=400; total time=   1.3s
[CV] END bootstrap=False, max_depth=90, max_features=20, min_samples_leaf=1, min_samples_split=5, n_estimators=400; total time=   1.2s
[CV] END bootstrap=False, max_depth=90, max_features=20, min_samples_leaf=1, min_samples_split=5, n_estimators=400; total time=   1.2s
[CV] END bootstrap=False, max_depth=90, max_features=20, min_samples_leaf=1, min_samples_split=5, n_estimators=400; total time=   1.5s
[CV] END bootstrap=False, max_depth=90, max_features=20, min_samples_leaf=1, min_samples_split=5, n_estimators=400; total time=   1.5s
[CV] END bootstrap=False, max_depth=90, max_features=20, min_samples_leaf=1, min_samples_split=7, n_estimators=400; total time=   1.5s
[CV] END bootstrap=False, max_depth=90, max_features=20, min_samples_leaf=1, min_samples_split=9, n_estimators=50; total time=   0.2s
[CV] END bootstrap=False, max_depth=90, max_features=20,

[CV] END bootstrap=False, max_depth=50, max_features=15, min_samples_leaf=1, min_samples_split=1, n_estimators=300; total time=   0.0s
[CV] END bootstrap=False, max_depth=50, max_features=15, min_samples_leaf=1, min_samples_split=1, n_estimators=300; total time=   0.0s
[CV] END bootstrap=False, max_depth=50, max_features=15, min_samples_leaf=1, min_samples_split=1, n_estimators=300; total time=   0.0s
[CV] END bootstrap=False, max_depth=50, max_features=15, min_samples_leaf=1, min_samples_split=1, n_estimators=400; total time=   0.0s
[CV] END bootstrap=False, max_depth=50, max_features=15, min_samples_leaf=1, min_samples_split=1, n_estimators=400; total time=   0.0s
[CV] END bootstrap=False, max_depth=50, max_features=15, min_samples_leaf=1, min_samples_split=1, n_estimators=400; total time=   0.0s
[CV] END bootstrap=False, max_depth=50, max_features=15, min_samples_leaf=1, min_samples_split=1, n_estimators=400; total time=   0.0s
[CV] END bootstrap=False, max_depth=50, max_features=15

[CV] END bootstrap=False, max_depth=90, max_features=25, min_samples_leaf=2, min_samples_split=3, n_estimators=400; total time=   1.8s
[CV] END bootstrap=False, max_depth=90, max_features=25, min_samples_leaf=2, min_samples_split=3, n_estimators=400; total time=   1.5s
[CV] END bootstrap=False, max_depth=90, max_features=25, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time=   1.0s
[CV] END bootstrap=False, max_depth=90, max_features=25, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time=   1.2s
[CV] END bootstrap=False, max_depth=90, max_features=25, min_samples_leaf=2, min_samples_split=5, n_estimators=400; total time=   1.4s
[CV] END bootstrap=False, max_depth=90, max_features=25, min_samples_leaf=2, min_samples_split=7, n_estimators=100; total time=   0.3s
[CV] END bootstrap=False, max_depth=90, max_features=25, min_samples_leaf=2, min_samples_split=7, n_estimators=100; total time=   0.4s
[CV] END bootstrap=False, max_depth=90, max_features=25

[CV] END bootstrap=False, max_depth=90, max_features=25, min_samples_leaf=2, min_samples_split=7, n_estimators=400; total time=   1.4s
[CV] END bootstrap=False, max_depth=90, max_features=25, min_samples_leaf=2, min_samples_split=7, n_estimators=400; total time=   1.4s
[CV] END bootstrap=False, max_depth=90, max_features=25, min_samples_leaf=2, min_samples_split=9, n_estimators=200; total time=   0.7s
[CV] END bootstrap=False, max_depth=90, max_features=25, min_samples_leaf=2, min_samples_split=9, n_estimators=300; total time=   1.0s
[CV] END bootstrap=False, max_depth=90, max_features=25, min_samples_leaf=2, min_samples_split=9, n_estimators=400; total time=   1.4s
[CV] END bootstrap=False, max_depth=90, max_features=25, min_samples_leaf=2, min_samples_split=9, n_estimators=400; total time=   1.4s
[CV] END bootstrap=False, max_depth=90, max_features=25, min_samples_leaf=3, min_samples_split=3, n_estimators=300; total time=   1.0s
[CV] END bootstrap=False, max_depth=90, max_features=25

[CV] END bootstrap=False, max_depth=70, max_features=10, min_samples_leaf=3, min_samples_split=7, n_estimators=50; total time=   0.1s
[CV] END bootstrap=False, max_depth=70, max_features=10, min_samples_leaf=3, min_samples_split=7, n_estimators=50; total time=   0.1s
[CV] END bootstrap=False, max_depth=70, max_features=10, min_samples_leaf=3, min_samples_split=7, n_estimators=400; total time=   0.9s
[CV] END bootstrap=False, max_depth=70, max_features=10, min_samples_leaf=3, min_samples_split=7, n_estimators=400; total time=   0.8s
[CV] END bootstrap=False, max_depth=70, max_features=10, min_samples_leaf=3, min_samples_split=9, n_estimators=50; total time=   0.1s
[CV] END bootstrap=False, max_depth=70, max_features=10, min_samples_leaf=3, min_samples_split=9, n_estimators=50; total time=   0.1s
[CV] END bootstrap=False, max_depth=70, max_features=15, min_samples_leaf=1, min_samples_split=1, n_estimators=50; total time=   0.0s
[CV] END bootstrap=False, max_depth=70, max_features=15, min

[CV] END bootstrap=False, max_depth=50, max_features=20, min_samples_leaf=2, min_samples_split=9, n_estimators=400; total time=   1.1s
[CV] END bootstrap=False, max_depth=50, max_features=20, min_samples_leaf=3, min_samples_split=3, n_estimators=100; total time=   0.3s
[CV] END bootstrap=False, max_depth=50, max_features=20, min_samples_leaf=3, min_samples_split=3, n_estimators=200; total time=   0.6s
[CV] END bootstrap=False, max_depth=50, max_features=20, min_samples_leaf=3, min_samples_split=5, n_estimators=50; total time=   0.1s
[CV] END bootstrap=False, max_depth=50, max_features=20, min_samples_leaf=3, min_samples_split=5, n_estimators=50; total time=   0.2s
[CV] END bootstrap=False, max_depth=50, max_features=20, min_samples_leaf=3, min_samples_split=5, n_estimators=50; total time=   0.2s
[CV] END bootstrap=False, max_depth=50, max_features=20, min_samples_leaf=3, min_samples_split=5, n_estimators=100; total time=   0.3s
[CV] END bootstrap=False, max_depth=50, max_features=20, m

[CV] END bootstrap=False, max_depth=90, max_features=25, min_samples_leaf=2, min_samples_split=9, n_estimators=200; total time=   0.7s
[CV] END bootstrap=False, max_depth=90, max_features=25, min_samples_leaf=2, min_samples_split=9, n_estimators=300; total time=   1.1s
[CV] END bootstrap=False, max_depth=90, max_features=25, min_samples_leaf=2, min_samples_split=9, n_estimators=400; total time=   1.3s
[CV] END bootstrap=False, max_depth=90, max_features=25, min_samples_leaf=2, min_samples_split=9, n_estimators=400; total time=   1.3s
[CV] END bootstrap=False, max_depth=90, max_features=25, min_samples_leaf=3, min_samples_split=3, n_estimators=300; total time=   0.9s
[CV] END bootstrap=False, max_depth=90, max_features=25, min_samples_leaf=3, min_samples_split=3, n_estimators=300; total time=   0.9s
[CV] END bootstrap=False, max_depth=90, max_features=25, min_samples_leaf=3, min_samples_split=3, n_estimators=300; total time=   0.9s
[CV] END bootstrap=False, max_depth=90, max_features=25

[CV] END bootstrap=False, max_depth=60, max_features=20, min_samples_leaf=1, min_samples_split=5, n_estimators=400; total time=   1.2s
[CV] END bootstrap=False, max_depth=60, max_features=20, min_samples_leaf=1, min_samples_split=5, n_estimators=400; total time=   1.2s
[CV] END bootstrap=False, max_depth=60, max_features=20, min_samples_leaf=1, min_samples_split=5, n_estimators=400; total time=   1.2s
[CV] END bootstrap=False, max_depth=60, max_features=20, min_samples_leaf=1, min_samples_split=9, n_estimators=400; total time=   1.2s
[CV] END bootstrap=False, max_depth=60, max_features=20, min_samples_leaf=2, min_samples_split=3, n_estimators=300; total time=   0.9s
[CV] END bootstrap=False, max_depth=60, max_features=20, min_samples_leaf=2, min_samples_split=3, n_estimators=300; total time=   0.9s
[CV] END bootstrap=False, max_depth=60, max_features=20, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time=   0.8s
[CV] END bootstrap=False, max_depth=60, max_features=20

[CV] END bootstrap=False, max_depth=50, max_features=20, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time=   0.9s
[CV] END bootstrap=False, max_depth=50, max_features=20, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time=   0.9s
[CV] END bootstrap=False, max_depth=50, max_features=20, min_samples_leaf=1, min_samples_split=7, n_estimators=200; total time=   0.7s
[CV] END bootstrap=False, max_depth=50, max_features=20, min_samples_leaf=1, min_samples_split=7, n_estimators=200; total time=   0.6s
[CV] END bootstrap=False, max_depth=50, max_features=20, min_samples_leaf=1, min_samples_split=7, n_estimators=400; total time=   1.3s
[CV] END bootstrap=False, max_depth=50, max_features=20, min_samples_leaf=1, min_samples_split=9, n_estimators=200; total time=   0.6s
[CV] END bootstrap=False, max_depth=50, max_features=20, min_samples_leaf=1, min_samples_split=9, n_estimators=400; total time=   1.3s
[CV] END bootstrap=False, max_depth=50, max_features=20

[CV] END bootstrap=False, max_depth=60, max_features=15, min_samples_leaf=2, min_samples_split=3, n_estimators=50; total time=   0.1s
[CV] END bootstrap=False, max_depth=60, max_features=15, min_samples_leaf=2, min_samples_split=3, n_estimators=50; total time=   0.1s
[CV] END bootstrap=False, max_depth=60, max_features=15, min_samples_leaf=2, min_samples_split=3, n_estimators=100; total time=   0.3s
[CV] END bootstrap=False, max_depth=60, max_features=15, min_samples_leaf=2, min_samples_split=3, n_estimators=200; total time=   0.6s
[CV] END bootstrap=False, max_depth=60, max_features=15, min_samples_leaf=2, min_samples_split=5, n_estimators=50; total time=   0.1s
[CV] END bootstrap=False, max_depth=60, max_features=15, min_samples_leaf=2, min_samples_split=5, n_estimators=50; total time=   0.1s
[CV] END bootstrap=False, max_depth=60, max_features=15, min_samples_leaf=2, min_samples_split=5, n_estimators=50; total time=   0.1s
[CV] END bootstrap=False, max_depth=60, max_features=15, min

[CV] END bootstrap=False, max_depth=70, max_features=10, min_samples_leaf=3, min_samples_split=5, n_estimators=400; total time=   0.9s
[CV] END bootstrap=False, max_depth=70, max_features=10, min_samples_leaf=3, min_samples_split=5, n_estimators=400; total time=   0.8s
[CV] END bootstrap=False, max_depth=70, max_features=10, min_samples_leaf=3, min_samples_split=5, n_estimators=400; total time=   0.9s
[CV] END bootstrap=False, max_depth=70, max_features=15, min_samples_leaf=1, min_samples_split=3, n_estimators=50; total time=   0.2s
[CV] END bootstrap=False, max_depth=70, max_features=15, min_samples_leaf=1, min_samples_split=3, n_estimators=100; total time=   0.3s
[CV] END bootstrap=False, max_depth=70, max_features=15, min_samples_leaf=1, min_samples_split=3, n_estimators=200; total time=   0.6s
[CV] END bootstrap=False, max_depth=70, max_features=15, min_samples_leaf=1, min_samples_split=3, n_estimators=300; total time=   0.8s
[CV] END bootstrap=False, max_depth=70, max_features=15,

[CV] END bootstrap=False, max_depth=60, max_features=20, min_samples_leaf=3, min_samples_split=3, n_estimators=300; total time=   0.9s
[CV] END bootstrap=False, max_depth=60, max_features=20, min_samples_leaf=3, min_samples_split=3, n_estimators=300; total time=   0.9s
[CV] END bootstrap=False, max_depth=60, max_features=20, min_samples_leaf=3, min_samples_split=5, n_estimators=200; total time=   0.6s
[CV] END bootstrap=False, max_depth=60, max_features=20, min_samples_leaf=3, min_samples_split=5, n_estimators=300; total time=   0.8s
[CV] END bootstrap=False, max_depth=60, max_features=20, min_samples_leaf=3, min_samples_split=7, n_estimators=100; total time=   0.3s
[CV] END bootstrap=False, max_depth=60, max_features=20, min_samples_leaf=3, min_samples_split=7, n_estimators=200; total time=   0.6s
[CV] END bootstrap=False, max_depth=60, max_features=20, min_samples_leaf=3, min_samples_split=7, n_estimators=300; total time=   0.9s
[CV] END bootstrap=False, max_depth=60, max_features=20

1500 fits failed out of a total of 7500.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1500 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/tonylan/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/tonylan/anaconda3/lib/python3.11/site-packages/sklearn/base.py", line 1144, in wrapper
    estimator._validate_params()
  File "/Users/tonylan/anaconda3/lib/python3.11/site-packages/sklearn/base.py", line 637, in _validate_params
    validate_parameter_constraints(
  File "/Users/tonylan/anaconda3/lib/python3.11/site-packages/sklearn/utils/_param_validation.py", line 95, in validat

In [44]:
best_gridsearch_param

{'bootstrap': False,
 'max_depth': 50,
 'max_features': 10,
 'min_samples_leaf': 3,
 'min_samples_split': 3,
 'n_estimators': 100}

In [45]:
{'bootstrap': False,
 'max_depth': 50,
 'max_features': 10,
 'min_samples_leaf': 3,
 'min_samples_split': 3,
 'n_estimators': 100}

RF_list = [RandomForestClassifier(random_state=42),RandomForestClassifier(n_estimators = 100,
                                                                         min_samples_split = 3,
                                                                         min_samples_leaf = 3,
                                                                         max_features = 10,
                                                                         max_depth = 50,
                                                                         bootstrap = False)]

train_models(RF_list, ['Default RF', 'Tuned RF'], X_train, y_train, X_val, y_val)


Unnamed: 0,accuracy,precision,recall,f1
Default RF,0.776119,0.64,0.727273,0.680851
Tuned RF,0.746269,0.592593,0.727273,0.653061


In [46]:
train_models(RF_list, ['Default RF', 'Tuned RF'], X_train, y_train, X_test, y_test)


Unnamed: 0,accuracy,precision,recall,f1
Default RF,0.655914,0.896552,0.472727,0.619048
Tuned RF,0.591398,0.774194,0.436364,0.55814


[CV] END bootstrap=False, max_depth=70, max_features=25, min_samples_leaf=3, min_samples_split=7, n_estimators=400; total time=   1.4s
[CV] END bootstrap=False, max_depth=70, max_features=25, min_samples_leaf=3, min_samples_split=9, n_estimators=300; total time=   1.0s
[CV] END bootstrap=False, max_depth=70, max_features=25, min_samples_leaf=3, min_samples_split=9, n_estimators=400; total time=   1.3s
[CV] END bootstrap=False, max_depth=80, max_features=10, min_samples_leaf=1, min_samples_split=3, n_estimators=400; total time=   1.0s
[CV] END bootstrap=False, max_depth=80, max_features=10, min_samples_leaf=1, min_samples_split=3, n_estimators=400; total time=   1.1s
[CV] END bootstrap=False, max_depth=80, max_features=10, min_samples_leaf=1, min_samples_split=7, n_estimators=50; total time=   0.1s
[CV] END bootstrap=False, max_depth=80, max_features=10, min_samples_leaf=1, min_samples_split=7, n_estimators=50; total time=   0.1s
[CV] END bootstrap=False, max_depth=80, max_features=10, 

[CV] END bootstrap=False, max_depth=70, max_features=15, min_samples_leaf=3, min_samples_split=3, n_estimators=100; total time=   0.3s
[CV] END bootstrap=False, max_depth=70, max_features=15, min_samples_leaf=3, min_samples_split=3, n_estimators=100; total time=   0.4s
[CV] END bootstrap=False, max_depth=70, max_features=15, min_samples_leaf=3, min_samples_split=3, n_estimators=400; total time=   1.2s
[CV] END bootstrap=False, max_depth=70, max_features=15, min_samples_leaf=3, min_samples_split=3, n_estimators=400; total time=   1.2s
[CV] END bootstrap=False, max_depth=70, max_features=15, min_samples_leaf=3, min_samples_split=7, n_estimators=300; total time=   0.8s
[CV] END bootstrap=False, max_depth=70, max_features=15, min_samples_leaf=3, min_samples_split=7, n_estimators=300; total time=   0.8s
[CV] END bootstrap=False, max_depth=70, max_features=15, min_samples_leaf=3, min_samples_split=7, n_estimators=300; total time=   0.8s
[CV] END bootstrap=False, max_depth=70, max_features=15

[CV] END bootstrap=False, max_depth=90, max_features=20, min_samples_leaf=1, min_samples_split=9, n_estimators=100; total time=   0.4s
[CV] END bootstrap=False, max_depth=90, max_features=20, min_samples_leaf=1, min_samples_split=9, n_estimators=100; total time=   0.3s
[CV] END bootstrap=False, max_depth=90, max_features=20, min_samples_leaf=1, min_samples_split=9, n_estimators=200; total time=   0.8s
[CV] END bootstrap=False, max_depth=90, max_features=20, min_samples_leaf=1, min_samples_split=9, n_estimators=400; total time=   1.4s
[CV] END bootstrap=False, max_depth=90, max_features=20, min_samples_leaf=2, min_samples_split=3, n_estimators=200; total time=   0.7s
[CV] END bootstrap=False, max_depth=90, max_features=20, min_samples_leaf=2, min_samples_split=3, n_estimators=300; total time=   1.0s
[CV] END bootstrap=False, max_depth=90, max_features=20, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time=   0.3s
[CV] END bootstrap=False, max_depth=90, max_features=20

[CV] END bootstrap=False, max_depth=70, max_features=15, min_samples_leaf=3, min_samples_split=1, n_estimators=300; total time=   0.0s
[CV] END bootstrap=False, max_depth=70, max_features=15, min_samples_leaf=3, min_samples_split=1, n_estimators=300; total time=   0.0s
[CV] END bootstrap=False, max_depth=70, max_features=15, min_samples_leaf=3, min_samples_split=1, n_estimators=400; total time=   0.0s
[CV] END bootstrap=False, max_depth=70, max_features=15, min_samples_leaf=3, min_samples_split=1, n_estimators=400; total time=   0.0s
[CV] END bootstrap=False, max_depth=70, max_features=15, min_samples_leaf=3, min_samples_split=1, n_estimators=400; total time=   0.0s
[CV] END bootstrap=False, max_depth=70, max_features=15, min_samples_leaf=3, min_samples_split=1, n_estimators=400; total time=   0.0s
[CV] END bootstrap=False, max_depth=70, max_features=15, min_samples_leaf=3, min_samples_split=1, n_estimators=400; total time=   0.0s
[CV] END bootstrap=False, max_depth=70, max_features=15

[CV] END bootstrap=False, max_depth=90, max_features=10, min_samples_leaf=3, min_samples_split=1, n_estimators=100; total time=   0.0s
[CV] END bootstrap=False, max_depth=90, max_features=10, min_samples_leaf=3, min_samples_split=1, n_estimators=200; total time=   0.0s
[CV] END bootstrap=False, max_depth=90, max_features=10, min_samples_leaf=3, min_samples_split=1, n_estimators=200; total time=   0.0s
[CV] END bootstrap=False, max_depth=90, max_features=10, min_samples_leaf=3, min_samples_split=1, n_estimators=200; total time=   0.0s
[CV] END bootstrap=False, max_depth=90, max_features=10, min_samples_leaf=3, min_samples_split=1, n_estimators=200; total time=   0.0s
[CV] END bootstrap=False, max_depth=90, max_features=10, min_samples_leaf=3, min_samples_split=1, n_estimators=200; total time=   0.0s
[CV] END bootstrap=False, max_depth=90, max_features=10, min_samples_leaf=3, min_samples_split=1, n_estimators=300; total time=   0.0s
[CV] END bootstrap=False, max_depth=90, max_features=10

# Gradient Boosting

In [58]:
X_train, y_train, X_val, y_val, X_test, y_test = train_test_val_split(protein_df, donor_ids)

param_grid = {
    "learning_rate": (0.01, 0.025, 0.1, 1, 10),
    "max_leaf_nodes": (3, 10, 30),
    "n_estimators": (10,50,100,500),
    "min_samples_split": (1,2,5,10)
}

model = GradientBoostingClassifier(random_state=42)
model_grid_search = GridSearchCV(model, param_grid=param_grid, n_jobs=2, cv=2)
model_grid_search.fit(X_train, y_train)
model_grid_search.best_params_

120 fits failed out of a total of 480.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
120 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/tonylan/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/tonylan/anaconda3/lib/python3.11/site-packages/sklearn/base.py", line 1144, in wrapper
    estimator._validate_params()
  File "/Users/tonylan/anaconda3/lib/python3.11/site-packages/sklearn/base.py", line 637, in _validate_params
    validate_parameter_constraints(
  File "/Users/tonylan/anaconda3/lib/python3.11/site-packages/sklearn/utils/_param_validation.py", line 95, in validate_p

{'learning_rate': 0.1,
 'max_leaf_nodes': 3,
 'min_samples_split': 2,
 'n_estimators': 10}

In [59]:
X_train, y_train, X_val, y_val, X_test, y_test = train_test_val_split(protein_df, donor_ids)


GB_list = [GradientBoostingClassifier(random_state=42),GradientBoostingClassifier(learning_rate = 0.1,
                                                                         max_leaf_nodes = 3,
                                                                         min_samples_leaf = 2,
                                                                         n_estimators = 10)]

train_models(GB_list, ['Default GB', 'Tuned GB'], X_train, y_train, X_val, y_val)


Unnamed: 0,accuracy,precision,recall,f1
Default GB,0.686567,0.515152,0.772727,0.618182
Tuned GB,0.671642,0.5,0.681818,0.576923


In [60]:
train_models(GB_list, ['Default GB', 'Tuned GB'], X_train, y_train, X_test, y_test)


Unnamed: 0,accuracy,precision,recall,f1
Default GB,0.602151,0.78125,0.454545,0.574713
Tuned GB,0.580645,0.75,0.436364,0.551724
