<a href="https://colab.research.google.com/github/efitzgerald763/Blood_brain_models/blob/Optimize_tree_models/XGBoost_Gradient_boosting.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [19]:
# Mount Google Drive into the Colab filesystem so files stored there can be read.
from google.colab import drive
drive.mount('/content/drive')

import numpy as np
import pandas as pd

# Location of the input CSV on the mounted Drive.
file_path = '/content/drive/My Drive/Colab Notebooks/Blood_brain_pred/ENSG00000096060_blood_brain.csv'

# Read the table, then promote its first column to the row index.
# Reassigning (rather than mutating in place) keeps the step easy to re-run.
data = pd.read_csv(file_path)
data = data.set_index(data.columns[0])

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [20]:
from sklearn.model_selection import train_test_split

# Transpose the dataframe so each row is a sample and each gene is a feature
# column. The shapes printed below are therefore (n_samples, n_features) for X
# and (n_samples,) for y.
data_transposed = data.T

# Separate the target variable. After the transpose the target gene is a
# column of `data_transposed`; the variable keeps its original name
# `target_row` so any other cell that reads it keeps working.
target_row = 'ENSG00000096060'
y = data_transposed[target_row]
X = data_transposed.drop(columns=[target_row])

# Check the shapes to ensure they are as expected
print(X.shape)
print(y.shape)

# Split the data into training and testing sets: 20% of the samples are held
# out, and the fixed random_state keeps the split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

(81, 18706)
(81,)


In [7]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score

# Candidate hyperparameters for Gradient Boosting (1 x 2 x 2 = 4 combinations).
param_grid_gb = dict(
    n_estimators=[100],
    learning_rate=[0.001, 0.01],
    max_depth=[3, 5],
)

# Base estimator with a pinned random_state.
gb = GradientBoostingRegressor(random_state=42)

# 5-fold cross-validated grid search over the training split.
# The scorer is negated MSE, so scores closer to zero are better.
grid_search_gb = GridSearchCV(
    estimator=gb,
    param_grid=param_grid_gb,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
)
grid_search_gb.fit(X_train, y_train)

# Report the winning combination and its cross-validated score.
print(f'Best Parameters for Gradient Boosting: {grid_search_gb.best_params_}')
print(f'Best Score for Gradient Boosting: {grid_search_gb.best_score_}')

# Score the best estimator found by the search on the held-out test split.
best_gb = grid_search_gb.best_estimator_
y_pred_gb = best_gb.predict(X_test)
mse_gb, r2_gb = mean_squared_error(y_test, y_pred_gb), r2_score(y_test, y_pred_gb)
print(f'Mean Squared Error for Gradient Boosting: {mse_gb}')
print(f'R-squared for Gradient Boosting: {r2_gb}')



Best Parameters for Gradient Boosting: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 100}
Best Score for Gradient Boosting: -0.5824742788899331
Mean Squared Error for Gradient Boosting: 0.398871261892831
R-squared for Gradient Boosting: 0.26343537990002686


In [None]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score

# Candidate hyperparameters for XGBoost (1 x 2 x 3 = 6 combinations).
param_grid_xgb = dict(
    n_estimators=[100],
    learning_rate=[0.01, 0.1],
    max_depth=[3, 5, 7],
)

# Base estimator with a pinned random_state.
xgb_model = xgb.XGBRegressor(random_state=42)

# 5-fold cross-validated grid search over the training split.
# NOTE: GridSearchCV is not imported in this cell; it must already be in scope
# from an earlier cell.
grid_search_xgb = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid_xgb,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
)
grid_search_xgb.fit(X_train, y_train)

# Report the winning combination and its cross-validated (negated MSE) score.
print(f'Best Parameters for XGBoost: {grid_search_xgb.best_params_}')
print(f'Best Score for XGBoost: {grid_search_xgb.best_score_}')

# Score the best estimator found by the search on the held-out test split.
best_xgb = grid_search_xgb.best_estimator_
y_pred_xgb = best_xgb.predict(X_test)
mse_xgb, r2_xgb = mean_squared_error(y_test, y_pred_xgb), r2_score(y_test, y_pred_xgb)
print(f'Mean Squared Error for XGBoost: {mse_xgb}')
print(f'R-squared for XGBoost: {r2_xgb}')


In [16]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV

# Step 1: search max_depth and min_child_weight while learning_rate and
# n_estimators stay fixed at 0.1 and 50.
param_grid_1 = dict(
    max_depth=[3, 5, 7],
    min_child_weight=[1, 3, 5],
)

xgb_model_1 = xgb.XGBRegressor(n_estimators=50, learning_rate=0.1, random_state=42)

# 5-fold cross-validated search on the training split, scored by negated MSE.
grid_search_1 = GridSearchCV(
    estimator=xgb_model_1,
    param_grid=param_grid_1,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
)
grid_search_1.fit(X_train, y_train)

# Keep the winning values; the later tuning steps read them from this dict.
best_params_1 = grid_search_1.best_params_
print(f'Best Parameters for Group 1: {best_params_1}')



Best Parameters for Group 1: {'max_depth': 3, 'min_child_weight': 3}


In [17]:
# Step 2: search subsample and colsample_bytree, reusing the max_depth and
# min_child_weight chosen in step 1.
param_grid_2 = dict(
    subsample=[0.6, 0.8, 1.0],
    colsample_bytree=[0.6, 0.8, 1.0],
)

xgb_model_2 = xgb.XGBRegressor(
    n_estimators=50,
    learning_rate=0.1,
    min_child_weight=best_params_1['min_child_weight'],
    max_depth=best_params_1['max_depth'],
    random_state=42,
)

# 5-fold cross-validated search on the training split, scored by negated MSE.
grid_search_2 = GridSearchCV(
    estimator=xgb_model_2,
    param_grid=param_grid_2,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
)
grid_search_2.fit(X_train, y_train)

# Keep the winning values; the later steps read them from this dict.
best_params_2 = grid_search_2.best_params_
print(f'Best Parameters for Group 2: {best_params_2}')


Best Parameters for Group 2: {'colsample_bytree': 0.8, 'subsample': 0.6}


In [18]:
# Step 3: search learning_rate and n_estimators, with the tree-shape and
# sampling parameters fixed to the values chosen in steps 1 and 2.
param_grid_3 = dict(
    learning_rate=[0.01, 0.05],
    n_estimators=[50, 100, 150],
)

# learning_rate and n_estimators are deliberately not set here: the grid
# supplies them for every candidate.
xgb_model_3 = xgb.XGBRegressor(
    colsample_bytree=best_params_2['colsample_bytree'],
    subsample=best_params_2['subsample'],
    min_child_weight=best_params_1['min_child_weight'],
    max_depth=best_params_1['max_depth'],
    random_state=42,
)

# 5-fold cross-validated search on the training split, scored by negated MSE.
grid_search_3 = GridSearchCV(
    estimator=xgb_model_3,
    param_grid=param_grid_3,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
)
grid_search_3.fit(X_train, y_train)

# Keep the winning values for the final model.
best_params_3 = grid_search_3.best_params_
print(f'Best Parameters for Group 3: {best_params_3}')


Best Parameters for Group 3: {'learning_rate': 0.05, 'n_estimators': 100}


In [15]:
# Collect the hyperparameters chosen by the three tuning steps in one place.
final_xgb_params = {
    'max_depth': best_params_1['max_depth'],
    'min_child_weight': best_params_1['min_child_weight'],
    'subsample': best_params_2['subsample'],
    'colsample_bytree': best_params_2['colsample_bytree'],
    'learning_rate': best_params_3['learning_rate'],
    'n_estimators': best_params_3['n_estimators'],
}
final_xgb_model = xgb.XGBRegressor(random_state=42, **final_xgb_params)

# Fit on the training split and predict the held-out test split.
final_xgb_model.fit(X_train, y_train)
y_pred = final_xgb_model.predict(X_test)

# Evaluate final model.
# NOTE: mean_squared_error and r2_score are not imported in this cell; they
# must already be in scope from an earlier cell.
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)
print(f'Final Model MSE: {mse}')
print(f'Final Model R-squared: {r2}')


Final Model MSE: 0.43788071007247586
Final Model R-squared: 0.1913996577915995
