<a href="https://colab.research.google.com/github/ctrivino1/YLearn/blob/main/Feature_Importance.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **4 ways to find feature importance**

## 1.   **Correlation Matrix:**
- Provides information about linear relationships between features. Useful for identifying multicollinearity.

## 2.   **XGBoost feature importance:**
- Is based on the internal structure and performance of the model, reflecting how often and how much each feature is used in the ensemble of trees.

## 3.    **Permutation importance:**
- Measures the decrease in model performance when the values of a specific feature are randomly shuffled.

## 4.   **SHAP feature importance:**
- SHAP values provide insights not only into feature importance but also into the direction and impact of each feature on individual predictions.
- SHAP values can also be estimated by permutation, but this was much more computationally expensive than scikit-learn's `permutation_importance` function.







In [None]:
# Import necessary libraries
# correlation matrix
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler

# --- Data -------------------------------------------------------------------
# California housing: features go into a labelled DataFrame, the target is
# kept as returned by the loader.
california_housing = fetch_california_housing()
X = pd.DataFrame(
    california_housing.data,
    columns=california_housing.feature_names,
)
y = california_housing.target

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# --- Baseline model on the raw (unscaled) features --------------------------
xgb = XGBRegressor(n_estimators=100)
xgb.fit(X_train, y_train)

# Importances reported by the fitted model, plus predictions on the held-out split.
feature_importances = xgb.feature_importances_
y_pred = xgb.predict(X_test)

# --- Held-out metrics -------------------------------------------------------
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mse)  # square root of the MSE computed just above
r_squared = r2_score(y_test, y_pred)

print(f'MAE: {mae}')
print(f'RMSE: {rmse}')
print(f'R-squared: {r_squared:.4f}')

import seaborn as sns
import matplotlib.pyplot as plt

def correlation_heatmap(data):
    """Show an annotated heatmap of the pairwise correlations in ``data``.

    Parameters
    ----------
    data : object exposing ``.corr()``
        ``data.corr()`` is computed and handed to ``sns.heatmap``.

    Returns
    -------
    None
        The figure is displayed via ``plt.show()``.
    """
    corr_matrix = data.corr()

    # Styling collected in one place; values are rendered with two decimals.
    heatmap_style = {
        "vmax": 1.0,
        "center": 0,
        "fmt": '.2f',
        "cmap": "YlGnBu",
        "square": True,
        "linewidths": .5,
        "annot": True,
        "cbar_kws": {"shrink": .70},
    }

    _, axis = plt.subplots(figsize=(10, 10))
    sns.heatmap(corr_matrix, ax=axis, **heatmap_style)
    plt.show()


# Correlations among the training features.
correlation_heatmap(X_train)



In [None]:
# XGBoost built-in feature importance WITHOUT NORMALIZATION
# Bars are ordered by ascending importance.
sorted_idx = np.argsort(feature_importances)

fig, ax = plt.subplots()
ax.barh(X.columns[sorted_idx], feature_importances[sorted_idx])
ax.set_xlabel("XGBoost Feature Importance")
plt.show()

In [None]:
# XGBRegressor WITH NORMALIZATION (features standardized with StandardScaler)
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.metrics import r2_score


# Standardize the features. The scaler is fit on the training split only and
# the same fitted parameters are then applied to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit a separate model on the scaled data.
# It deliberately gets its own name instead of rebinding `xgb`: later cells
# pass the *unscaled* `X_test` to `xgb`, which is only meaningful for a model
# that was fit on unscaled features. Rebinding `xgb` here made those cells
# evaluate a scaled-feature model on unscaled inputs.
xgb_scaled = XGBRegressor(n_estimators=100)
xgb_scaled.fit(X_train_scaled, y_train)

# Importances and held-out predictions of the scaled-feature model.
# Suffixed names keep the baseline results from the first cell intact.
feature_importances_scaled = xgb_scaled.feature_importances_
y_pred_scaled = xgb_scaled.predict(X_test_scaled)

# Held-out metrics for the scaled-feature model.
mse_scaled = mean_squared_error(y_test, y_pred_scaled)
mae_scaled = mean_absolute_error(y_test, y_pred_scaled)
rmse_scaled = np.sqrt(mse_scaled)
r_squared_scaled = r2_score(y_test, y_pred_scaled)

print(f'MAE: {mae_scaled}')
print(f'RMSE: {rmse_scaled}')
print(f'R-squared: {r_squared_scaled:.4f}')

# Visualize importances, ordered ascending. The scaled arrays carry no column
# labels, so labels are taken from X.columns (same column order as X_train).
sorted_idx_scaled = feature_importances_scaled.argsort()
plt.barh(X.columns[sorted_idx_scaled], feature_importances_scaled[sorted_idx_scaled])
plt.xlabel("XGBoost Feature Importance")
plt.show()

In [None]:
# Permutation feature importance
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.inspection import permutation_importance
import matplotlib.pyplot as plt


# Permutation importance of `xgb` on the held-out split: each feature is
# shuffled 300 times with a fixed seed.
# NOTE(review): the result is only meaningful if `xgb` was fit on features on
# the same scale as `X_test` - confirm which model `xgb` refers to when this
# cell runs.
perm_importance = permutation_importance(
    xgb,
    X_test,
    y_test,
    n_repeats=300,
    random_state=42,
)

# Bar chart of the mean importance over the repeats, ordered ascending.
mean_importance = perm_importance.importances_mean
sorted_idx = np.argsort(mean_importance)

fig, ax = plt.subplots()
ax.barh(X.columns[sorted_idx], mean_importance[sorted_idx])
ax.set_xlabel("Permutation Importance")
plt.show()


In [None]:
!pip install shap

In [None]:
# Shap values
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
import shap

# Tree-specific SHAP explainer built from the model currently bound to `xgb`.
# NOTE(review): `X_test` must be on the same feature scale that `xgb` was
# fit on - confirm which model `xgb` refers to when this cell runs.
explainer = shap.TreeExplainer(xgb)

# SHAP values for the held-out rows.
shap_values = explainer.shap_values(X_test)

# Bar-type summary plot of the SHAP values.
shap.summary_plot(
    shap_values,
    features=X_test,
    plot_type="bar",
)


In [None]:
# Shap permutation
import shap
from sklearn.ensemble import RandomForestRegressor  # Replace with your model import

# SHAP values estimated with the model-agnostic permutation explainer.

# Refit the regressor on the unscaled training split, so the model matches
# the unscaled X_test that is explained below.
xgb = XGBRegressor(n_estimators=100)
xgb.fit(X_train, y_train)

# Background data for masking: an Independent masker over X_train, capped at
# 100 samples.
background_data = shap.maskers.Independent(X_train, max_samples=100)

# The explainer only needs a prediction callable, so pass `xgb.predict`.
# The permutation estimate is stochastic; a fixed seed (42, as used for the
# split and the sklearn permutation importance) makes reruns reproducible.
explainer = shap.PermutationExplainer(xgb.predict, background_data, seed=42)

# Explain the whole test set with 3 permutations per row.
shap_values = explainer.shap_values(X_test, npermutations=3)

# To explain a single row instead of the whole test set:
#row_index = 0  # index of the row to explain
#shap_values = explainer.shap_values(X.iloc[row_index, :])

# Visualize the explanations: bar-type summary first, then the default summary plot.
shap.summary_plot(shap_values, X_test, plot_type="bar")
shap.summary_plot(shap_values, X_test)
