In [0]:
# Step 1: Import required libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
import shap
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder

# Enable inline plotting for Databricks
%matplotlib inline


In [0]:
# Load the housing CSV from the abfss:// storage location into a Spark DataFrame.
# NOTE(review): `spark` is not defined in this notebook; presumably it is the
# session object injected by the Databricks runtime - confirm.
source = "abfss://raw@cloudinfrastg.dfs.core.windows.net/00_data_source/"
data = "housing.csv"

# header=True: the first row holds the column names.
# inferSchema=True: let Spark infer the column types instead of reading all strings.
housing = spark.read.csv(source + data, header=True, inferSchema=True)
housing.display()

In [0]:
# Convert the Spark DataFrame into a pandas DataFrame for scikit-learn and SHAP.
housing_pd = housing.toPandas()

In [0]:
# Number of non-null values in every column (equivalent to DataFrame.count()).
housing_pd.notna().sum()

In [0]:
# Step 2: Preprocess the data
# Single seed shared by the train/test split and the forest so results are reproducible.
RANDOM_STATE = 42

# One-hot encode the categorical "ocean_proximity" feature.
# sparse_output=False returns a dense array; drop="if_binary" drops one of the
# two columns only when the feature has exactly two categories.
encoder = OneHotEncoder(sparse_output=False, drop="if_binary")
encoded_features = encoder.fit_transform(housing_pd[["ocean_proximity"]])
encoded_feature_names = encoder.get_feature_names_out(["ocean_proximity"])

# Build the encoded frame on housing_pd's own index so the column-wise concat
# below lines up row-for-row even if that index is not the default 0..n-1.
encoded_df = pd.DataFrame(
    encoded_features,
    columns=encoded_feature_names,
    index=housing_pd.index,
)

# Store the result under a new name instead of overwriting housing_pd:
# housing_pd keeps its "ocean_proximity" column, so this cell can be re-run
# without a KeyError.
housing_encoded = pd.concat(
    [housing_pd.drop(columns=["ocean_proximity"]), encoded_df],
    axis=1,
)

# Separate features (X) and target (y)
X = housing_encoded.drop(columns=["median_house_value"])
y = housing_encoded["median_house_value"]

# Display the feature matrix
display(X)

# Step 3: Split the dataset into train (80%) and test (20%) sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

# Step 4: Train a RandomForestRegressor model
# NOTE(review): if any feature column contains NaN, fit() may raise on older
# scikit-learn releases - check housing_pd for missing values and impute if needed.
model = RandomForestRegressor(n_estimators=10, random_state=RANDOM_STATE)
model.fit(X_train, y_train)

In [0]:
# Step 5: Use SHAP to analyze feature importance
# Wrap the fitted random forest (`model`) in SHAP's tree-specific explainer.
explainer = shap.TreeExplainer(model)

# Compute SHAP values for the held-out test features; shap_values[i] is used
# further down as the per-feature contributions for test row i.
shap_values = explainer.shap_values(X_test)



In [0]:
# Step 6: Visualize SHAP values
# Global importance summary plot over the test set; the columns of X_test are
# passed explicitly as the feature labels.
shap.summary_plot(shap_values, X_test, feature_names=X_test.columns)



In [0]:
# Force plot for an individual prediction.
# Positional row of X_test to explain (0 = the first test sample); change this
# one value to inspect a different prediction.
sample_idx = 0

# Load the JavaScript that the interactive force plot needs in the notebook output.
shap.initjs()

# NOTE(review): if the interactive plot does not render in this environment,
# force_plot accepts matplotlib=True for a static figure - confirm which is needed.
shap.force_plot(
    explainer.expected_value,
    shap_values[sample_idx],
    X_test.iloc[sample_idx],
    feature_names=X_test.columns,
)

In [0]:
# Show the feature values of the first test row - the same row explained by the force plot.
X_test.iloc[0]