In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, median_absolute_error, max_error
import shap

# --- Load and prepare the dataset --------------------------------------------
# 'yourfile.csv' is a placeholder path; point it at the real dataset.
# Any row containing a missing value is discarded right after loading.
data = pd.read_csv('yourfile.csv').dropna()

# One-hot encode the non-numeric columns. drop_first=True omits the first
# level of every encoded column.
data_encoded = pd.get_dummies(data, drop_first=True)

# 'Overall_Survival_Months' is the regression target; every other column of
# the encoded frame is used as a feature.
y = data_encoded['Overall_Survival_Months']
X = data_encoded.drop(columns='Overall_Survival_Months')

# Hold out 20% of the rows for testing; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Standardise the features. The scaler is fitted on the training rows only and
# the same fitted transform is then applied to both partitions.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Hyperparameter tuning with RandomizedSearchCV
#
# Both tree ensembles are searched with identical settings: 10 sampled
# candidates, 5-fold cross-validation, negative-MSE scoring and a fixed seed
# for the candidate sampler.
_SEARCH_OPTIONS = {
    'n_iter': 10,
    'cv': 5,
    'scoring': 'neg_mean_squared_error',
    'random_state': 42,
}

# Random Forest
rf_model = RandomForestRegressor(random_state=42)
param_dist_rf = dict(
    n_estimators=[10, 20, 50, 100, 150, 200, 250, 300, 350, 500, 750, 1000],
    max_depth=[None, 10, 20, 30, 40, 50],
    min_samples_split=[2, 5, 10, 15, 20, 50, 100],
    min_samples_leaf=[1, 2, 4, 8, 12, 15, 20],
)
random_search_rf = RandomizedSearchCV(
    rf_model,
    param_distributions=param_dist_rf,
    **_SEARCH_OPTIONS,
)
random_search_rf.fit(X_train_scaled, y_train)
# best_estimator_ is the winning candidate as exposed by the fitted search.
best_rf_model = random_search_rf.best_estimator_

# Gradient Boosting (same search, with learning_rate as an extra dimension)
gb_model = GradientBoostingRegressor(random_state=42)
param_dist_gb = dict(
    n_estimators=[10, 20, 50, 100, 150, 200, 250, 300, 350, 500, 750, 1000],
    max_depth=[3, 5, 7, 10, 20, 30, 40, 50],
    min_samples_split=[2, 5, 10, 15, 20, 50, 100],
    min_samples_leaf=[1, 2, 4, 8, 12, 15, 20],
    learning_rate=[0.01, 0.1, 0.2, 0.5],
)
random_search_gb = RandomizedSearchCV(
    gb_model,
    param_distributions=param_dist_gb,
    **_SEARCH_OPTIONS,
)
random_search_gb.fit(X_train_scaled, y_train)
best_gb_model = random_search_gb.best_estimator_

# Baseline models
#
# Each estimator below is fitted with its default hyperparameters on the
# scaled training features so it can be compared against the tuned ensembles.

# MLP Regression (seeded: weight initialisation is random)
mlp_model = MLPRegressor(random_state=42)
mlp_model.fit(X_train_scaled, y_train)

# Other models
LR_model = LinearRegression()
LR_model.fit(X_train_scaled, y_train)

Lasso_model = Lasso()
Lasso_model.fit(X_train_scaled, y_train)

SVR_model = SVR()
SVR_model.fit(X_train_scaled, y_train)

KNR_model = KNeighborsRegressor()
KNR_model.fit(X_train_scaled, y_train)

# Seeded like every other stochastic estimator in this notebook: the tree
# permutes features at each split, so without a seed tied splits can resolve
# differently between runs and the reported metrics would not be reproducible.
DTR_model = DecisionTreeRegressor(random_state=42)
DTR_model.fit(X_train_scaled, y_train)

# Fitted models and their display names, kept in matching order so they can be
# zipped together during evaluation.
models = [best_rf_model, best_gb_model, mlp_model, LR_model, Lasso_model, SVR_model, KNR_model, DTR_model]
model_names = ['Random Forest', 'Gradient Boosting', 'MLP', 'LinearRegression', 'Lasso', 'SVR', 'KNeighborsRegressor', 'DecisionTreeRegressor']

# Model evaluation
#
# For every fitted model this reports hold-out test metrics, a 5-fold
# cross-validated MSE on the training set, and a predicted-vs-actual scatter.
for model, name in zip(models, model_names):
    # Hold-out performance of the already-fitted model.
    predictions = model.predict(X_test_scaled)

    mse = mean_squared_error(y_test, predictions)
    mae = mean_absolute_error(y_test, predictions)
    medae = median_absolute_error(y_test, predictions)
    maxe = max_error(y_test, predictions)

    print(f"\n{name} Metrics:")
    print("Mean Squared Error:", mse)
    print("Mean Absolute Error:", mae)
    print("Median Absolute Error:", medae)
    print("Max Error:", maxe)

    # Cross-validation on the training set only. cross_val_score works on
    # clones of the estimator, so the fitted `model` above is not modified.
    cv_scores = cross_val_score(model, X_train_scaled, y_train, cv=5, scoring='neg_mean_squared_error')

    # The scorer returns negated MSE, hence the sign flip for the mean
    # (the standard deviation is unaffected by the sign).
    print(f"\n{name} Cross-Validation Scores:")
    print("Mean MSE:", -cv_scores.mean())
    print("Std MSE:", cv_scores.std())

    # Scatter plot. The axis range covers both the actual and the predicted
    # values: limits derived from y_test alone would silently hide any
    # prediction that is negative or larger than the largest test target.
    axis_min = min(0, predictions.min())
    axis_max = max(y_test.max(), predictions.max()) + 1

    plt.figure(figsize=(8, 8))
    plt.scatter(y_test, predictions)
    plt.title(f'{name} - Predicted vs Actual')
    plt.xlabel('Actual Values')
    plt.ylabel('Predicted Values')
    plt.xlim([axis_min, axis_max])
    plt.ylim([axis_min, axis_max])
    plt.gca().set_aspect('equal', adjustable='box')  # Same scale on both axes
    plt.show()
    
# SHAP explanations for the two tuned tree ensembles, computed on the scaled
# test features and labelled with the original feature column names.
def _explain_with_shap(fitted_model):
    """Explain `fitted_model` on X_test_scaled and draw its SHAP summary plot.

    Returns the (explainer, shap_values) pair so both stay available to the
    rest of the notebook.
    """
    explainer = shap.Explainer(fitted_model)
    values = explainer.shap_values(X_test_scaled)
    shap.summary_plot(values, X_test_scaled, feature_names=X.columns)
    return explainer, values


explainerrf, shap_valuesrf = _explain_with_shap(best_rf_model)
explainergb, shap_valuesgb = _explain_with_shap(best_gb_model)

In [None]:
from sklearn.ensemble import VotingRegressor

# Voting ensemble of the two tuned tree models
#
# Re-read the winners from the randomized searches so this cell does not
# depend on whatever `best_rf_model` / `best_gb_model` were last bound to.
best_rf_model = random_search_rf.best_estimator_
best_gb_model = random_search_gb.best_estimator_

# Create ensemble using VotingRegressor (averages its members' predictions)
ensemble_model = VotingRegressor([
    ('RandomForest', best_rf_model),
    ('GradientBoosting', best_gb_model)
])

# Train the ensemble model on the scaled training features
ensemble_model.fit(X_train_scaled, y_train)

# Evaluate the ensemble model on the held-out test set
predictions_ensemble = ensemble_model.predict(X_test_scaled)

mse_ensemble = mean_squared_error(y_test, predictions_ensemble)
mae_ensemble = mean_absolute_error(y_test, predictions_ensemble)
medae_ensemble = median_absolute_error(y_test, predictions_ensemble)
maxe_ensemble = max_error(y_test, predictions_ensemble)

print("\nEnsemble Model Metrics:")
print("Mean Squared Error:", mse_ensemble)
print("Mean Absolute Error:", mae_ensemble)
print("Median Absolute Error:", medae_ensemble)
print("Max Error:", maxe_ensemble)

# Scatter plot for the ensemble model. The axis range covers both the actual
# and the predicted values: limits derived from y_test alone would silently
# hide any prediction that is negative or larger than the largest test target.
axis_min = min(0, predictions_ensemble.min())
axis_max = max(y_test.max(), predictions_ensemble.max()) + 1

plt.figure(figsize=(8, 8))
plt.scatter(y_test, predictions_ensemble)
plt.title('Ensemble Model - Predicted vs Actual')
plt.xlabel('Actual Values')
plt.ylabel('Predicted Values')
plt.xlim([axis_min, axis_max])
plt.ylim([axis_min, axis_max])
plt.gca().set_aspect('equal', adjustable='box')  # Same scale on both axes
plt.show()

In [None]:
# Hyperparameter tuning for Random Forest Regressor (exhaustive grid search)
#
# NOTE: this grid has 13 * 10 * 11 * 12 = 17,160 candidates; with cv=5 that is
# 85,800 model fits plus the final refit, so expect a very long run time.
param_grid = {
    'n_estimators': [10, 15, 20, 25, 50, 75, 100, 150, 200, 250, 500, 750, 1000],
    'max_depth': [None, 1, 2, 4, 6, 8, 10, 20, 50, 100],
    'min_samples_split': [2, 4, 6, 8, 10, 15, 20, 25, 50, 75, 100],
    'min_samples_leaf': [1, 2, 4, 6, 8, 10, 15, 20, 25, 50, 75, 100]
}

# Seeded so the search (and therefore the selected model) is reproducible.
rf_model = RandomForestRegressor(random_state=42)
grid_search = GridSearchCV(rf_model, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
# Fit on the scaled training features: `best_rf_model` is rebound below and is
# later fed X_train_scaled / X_test_scaled (cross_val_predict, SHAP), so it
# must be trained in that same feature space.
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best Hyperparameters:", grid_search.best_params_)

# Use the best model from the grid search
best_rf_model = grid_search.best_estimator_

# Feature importance analysis. Scaling keeps the column order of X, so
# X.columns still lines up with the importances.
feature_importances = best_rf_model.feature_importances_
feature_importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': feature_importances})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
print("Feature Importance:")
print(feature_importance_df)

# Cross-validation on the training partition only, so the held-out test rows
# never contribute to the cross-validated estimate.
cv_results = cross_val_score(best_rf_model, X_train_scaled, y_train, cv=5, scoring='neg_mean_squared_error')
cv_mse_scores = -cv_results  # scorer returns negated MSE
print("Cross-Validation Mean Squared Error:", cv_mse_scores.mean())

# Make predictions on the (scaled) test set
predictions = best_rf_model.predict(X_test_scaled)

# Calculate and print regression metrics
mse = mean_squared_error(y_test, predictions)
mae = mean_absolute_error(y_test, predictions)
medae = median_absolute_error(y_test, predictions)
maxe = max_error(y_test, predictions)

# Print the results
print(f"Test Mean Squared Error: {mse}")
print(f"Test Mean Absolute Error: {mae}")
print(f"Test Median Absolute Error: {medae}")
print(f"Test Max Error: {maxe}")

In [None]:
# Hyperparameter tuning for Gradient Boosting Regressor (exhaustive grid search)
#
# NOTE: this grid has 13 * 10 * 11 * 12 = 17,160 candidates; with cv=5 that is
# 85,800 model fits plus the final refit, so expect a very long run time.
param_grid = {
    'n_estimators': [10, 15, 20, 25, 50, 75, 100, 150, 200, 250, 500, 750, 1000],
    'max_depth': [None, 1, 2, 4, 6, 8, 10, 20, 50, 100],
    'min_samples_split': [2, 4, 6, 8, 10, 15, 20, 25, 50, 75, 100],
    'min_samples_leaf': [1, 2, 4, 6, 8, 10, 15, 20, 25, 50, 75, 100]
}

# Seeded so the search (and therefore the selected model) is reproducible.
gb_model = GradientBoostingRegressor(random_state=42)
grid_search = GridSearchCV(gb_model, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
# Fit on the scaled training features: `best_gb_model` is rebound below and is
# later fed X_train_scaled / X_test_scaled (cross_val_predict, SHAP), so it
# must be trained in that same feature space.
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best Hyperparameters:", grid_search.best_params_)

# Use the best model from the grid search
best_gb_model = grid_search.best_estimator_

# Feature importance analysis. Scaling keeps the column order of X, so
# X.columns still lines up with the importances.
feature_importances = best_gb_model.feature_importances_
feature_importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': feature_importances})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
print("Feature Importance:")
print(feature_importance_df)

# Cross-validation on the training partition only, so the held-out test rows
# never contribute to the cross-validated estimate.
cv_results = cross_val_score(best_gb_model, X_train_scaled, y_train, cv=5, scoring='neg_mean_squared_error')
cv_mse_scores = -cv_results  # scorer returns negated MSE
print("Cross-Validation Mean Squared Error:", cv_mse_scores.mean())

# Make predictions on the (scaled) test set
predictions = best_gb_model.predict(X_test_scaled)

# Calculate and print regression metrics
mse = mean_squared_error(y_test, predictions)
mae = mean_absolute_error(y_test, predictions)
medae = median_absolute_error(y_test, predictions)
maxe = max_error(y_test, predictions)

# Print the results
print(f"Test Mean Squared Error: {mse}")
print(f"Test Mean Absolute Error: {mae}")
print(f"Test Median Absolute Error: {medae}")
print(f"Test Max Error: {maxe}")

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import mean_squared_error, mean_absolute_error, median_absolute_error, max_error

# Averaged two-model ensemble, scored on out-of-fold training predictions.
#
# Requires best_rf_model and best_gb_model to have been bound by an earlier cell.
models = [best_rf_model, best_gb_model]
model_names = ['Random Forest', 'Gradient Boosting']

# One array of 5-fold cross-validated training-set predictions per model.
all_fold_predictions = [
    cross_val_predict(estimator, X_train_scaled, y_train, cv=5)
    for estimator in models
]

# Element-wise average over the models (axis 0 indexes the models, not folds).
mean_predictions = np.mean(all_fold_predictions, axis=0)

# Score the averaged predictions against the training targets.
mse_mean = mean_squared_error(y_train, mean_predictions)
mae_mean = mean_absolute_error(y_train, mean_predictions)
medae_mean = median_absolute_error(y_train, mean_predictions)
maxe_mean = max_error(y_train, mean_predictions)

print("\nMean Ensemble Model Metrics:")
for metric_label, metric_value in (
    ("Mean Squared Error:", mse_mean),
    ("Mean Absolute Error:", mae_mean),
    ("Median Absolute Error:", medae_mean),
    ("Max Error:", maxe_mean),
):
    print(metric_label, metric_value)

# Predicted-vs-actual scatter over all training rows (no value filtering).
fig, ax = plt.subplots()
ax.scatter(y_train, mean_predictions)
ax.set_title('Mean Ensemble Model - Predicted vs Actual')
ax.set_xlabel('Actual Values')
ax.set_ylabel('Predicted Values')
plt.show()

In [None]:
# SHAP summary plots for the current best_rf_model and best_gb_model.
#
# NOTE(review): both names are rebound by several earlier cells, so this
# explains whichever models were assigned last. The explanation is only
# meaningful if those models were fitted on features scaled the same way as
# X_test_scaled - confirm before trusting the plots.
shap_inputs = X_test_scaled
feature_labels = X.columns

# Random Forest
explainerrf = shap.Explainer(best_rf_model)
shap_valuesrf = explainerrf.shap_values(shap_inputs)
shap.summary_plot(shap_valuesrf, shap_inputs, feature_names=feature_labels)

# Gradient Boosting
explainergb = shap.Explainer(best_gb_model)
shap_valuesgb = explainergb.shap_values(shap_inputs)
shap.summary_plot(shap_valuesgb, shap_inputs, feature_names=feature_labels)