# 1. Business Understanding

## Overview 

The Ministry of Water of the Republic of Tanzania has tasked Kuria's company with analyzing data on water wells around the country in order to formalize a plan and budget for improving access to water supply and implementing water resources development in preparation for the next financial year.

## Problem Statement

Kuria's company seeks to accomplish the following objectives in this project:

1. Identifying the factors influencing the functionality of wells.

2. Providing a strategy for building new wells to the government or donors.

3. Detecting patterns that lead to the non-functionality of some wells.

4. Recommending a strategy for repairing non-functional wells.

# 2. Data Understanding and Data Cleaning

In [120]:
#Importing necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [121]:
# Loading the features dataset (path is relative to the notebook's working
# directory) and showcasing the first five rows
training_data = pd.read_csv('./data/training_set_values.csv')
training_data.head()

Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,...,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group
0,69572,6000.0,2011-03-14,Roman,1390,Roman,34.938093,-9.856322,none,0,...,annually,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe
1,8776,0.0,2013-03-06,Grumeti,1399,GRUMETI,34.698766,-2.147466,Zahanati,0,...,never pay,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe
2,34310,25.0,2013-02-25,Lottery Club,686,World vision,37.460664,-3.821329,Kwa Mahundi,0,...,per bucket,soft,good,enough,enough,dam,dam,surface,communal standpipe multiple,communal standpipe
3,67743,0.0,2013-01-28,Unicef,263,UNICEF,38.486161,-11.155298,Zahanati Ya Nanyumbu,0,...,never pay,soft,good,dry,dry,machine dbh,borehole,groundwater,communal standpipe multiple,communal standpipe
4,19728,0.0,2011-07-13,Action In A,0,Artisan,31.130847,-1.825359,Shuleni,0,...,never pay,soft,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe


In [122]:
# Learning about the data: column names, non-null counts and dtypes
training_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59400 entries, 0 to 59399
Data columns (total 40 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   id                     59400 non-null  int64  
 1   amount_tsh             59400 non-null  float64
 2   date_recorded          59400 non-null  object 
 3   funder                 55765 non-null  object 
 4   gps_height             59400 non-null  int64  
 5   installer              55745 non-null  object 
 6   longitude              59400 non-null  float64
 7   latitude               59400 non-null  float64
 8   wpt_name               59400 non-null  object 
 9   num_private            59400 non-null  int64  
 10  basin                  59400 non-null  object 
 11  subvillage             59029 non-null  object 
 12  region                 59400 non-null  object 
 13  region_code            59400 non-null  int64  
 14  district_code          59400 non-null  int64  
 15  lg

In [123]:
# Displaying the column labels and identifying the ones necessary for tackling the problems
training_data.columns

Index(['id', 'amount_tsh', 'date_recorded', 'funder', 'gps_height',
       'installer', 'longitude', 'latitude', 'wpt_name', 'num_private',
       'basin', 'subvillage', 'region', 'region_code', 'district_code', 'lga',
       'ward', 'population', 'public_meeting', 'recorded_by',
       'scheme_management', 'scheme_name', 'permit', 'construction_year',
       'extraction_type', 'extraction_type_group', 'extraction_type_class',
       'management', 'management_group', 'payment', 'payment_type',
       'water_quality', 'quality_group', 'quantity', 'quantity_group',
       'source', 'source_type', 'source_class', 'waterpoint_type',
       'waterpoint_type_group'],
      dtype='object')

In [124]:
# Remove the columns that are not needed for the analysis. drop() returns a new
# frame, so training_data itself is left untouched.
columns_to_drop = [
    'id', 'wpt_name', 'recorded_by', 'date_recorded', 'scheme_name',
    'region_code', 'district_code', 'num_private',
    'extraction_type_group', 'management_group', 'quality_group',
    'quantity_group', 'source_type', 'source_class', 'waterpoint_type_group',
]
X = training_data.drop(columns=columns_to_drop)

X.head()

Unnamed: 0,amount_tsh,funder,gps_height,installer,longitude,latitude,basin,subvillage,region,lga,...,construction_year,extraction_type,extraction_type_class,management,payment,payment_type,water_quality,quantity,source,waterpoint_type
0,6000.0,Roman,1390,Roman,34.938093,-9.856322,Lake Nyasa,Mnyusi B,Iringa,Ludewa,...,1999,gravity,gravity,vwc,pay annually,annually,soft,enough,spring,communal standpipe
1,0.0,Grumeti,1399,GRUMETI,34.698766,-2.147466,Lake Victoria,Nyamara,Mara,Serengeti,...,2010,gravity,gravity,wug,never pay,never pay,soft,insufficient,rainwater harvesting,communal standpipe
2,25.0,Lottery Club,686,World vision,37.460664,-3.821329,Pangani,Majengo,Manyara,Simanjiro,...,2009,gravity,gravity,vwc,pay per bucket,per bucket,soft,enough,dam,communal standpipe multiple
3,0.0,Unicef,263,UNICEF,38.486161,-11.155298,Ruvuma / Southern Coast,Mahakamani,Mtwara,Nanyumbu,...,1986,submersible,submersible,vwc,never pay,never pay,soft,dry,machine dbh,communal standpipe multiple
4,0.0,Action In A,0,Artisan,31.130847,-1.825359,Lake Victoria,Kyanyamisa,Kagera,Karagwe,...,0,gravity,gravity,other,never pay,never pay,soft,seasonal,rainwater harvesting,communal standpipe


In [125]:
# Schema summary of the reduced feature frame (non-null counts and dtypes)
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59400 entries, 0 to 59399
Data columns (total 25 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   amount_tsh             59400 non-null  float64
 1   funder                 55765 non-null  object 
 2   gps_height             59400 non-null  int64  
 3   installer              55745 non-null  object 
 4   longitude              59400 non-null  float64
 5   latitude               59400 non-null  float64
 6   basin                  59400 non-null  object 
 7   subvillage             59029 non-null  object 
 8   region                 59400 non-null  object 
 9   lga                    59400 non-null  object 
 10  ward                   59400 non-null  object 
 11  population             59400 non-null  int64  
 12  public_meeting         56066 non-null  object 
 13  scheme_management      55523 non-null  object 
 14  permit                 56344 non-null  object 
 15  co

In [126]:
# Checking for duplicates: counts rows that exactly repeat an earlier row
# across all remaining feature columns
X.duplicated().sum()

200

In [127]:
# Checking for missing values: number of NaN entries per column
X.isna().sum()

amount_tsh                  0
funder                   3635
gps_height                  0
installer                3655
longitude                   0
latitude                    0
basin                       0
subvillage                371
region                      0
lga                         0
ward                        0
population                  0
public_meeting           3334
scheme_management        3877
permit                   3056
construction_year           0
extraction_type             0
extraction_type_class       0
management                  0
payment                     0
payment_type                0
water_quality               0
quantity                    0
source                      0
waterpoint_type             0
dtype: int64

In [128]:
# Fill gaps in the non-numeric columns with the placeholder string 'None'.
# Limiting the fill to non-numeric columns means a string can never be written
# into a numeric column (which would turn that column into object dtype).
# Rows are filled rather than dropped so the row count of X does not change.
non_numeric_columns = X.select_dtypes(exclude='number').columns
X = X.fillna({column: 'None' for column in non_numeric_columns})

The missing values are filled with the placeholder string 'None' rather than dropping the affected rows, so that the feature rows (X) stay aligned one-to-one with the target rows (y) when the model is trained.

In [129]:
# Confirmation that there are no missing values left in any column
X.isna().sum()

amount_tsh               0
funder                   0
gps_height               0
installer                0
longitude                0
latitude                 0
basin                    0
subvillage               0
region                   0
lga                      0
ward                     0
population               0
public_meeting           0
scheme_management        0
permit                   0
construction_year        0
extraction_type          0
extraction_type_class    0
management               0
payment                  0
payment_type             0
water_quality            0
quantity                 0
source                   0
waterpoint_type          0
dtype: int64

In [130]:
# Re-check non-null counts and dtypes of the feature frame
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59400 entries, 0 to 59399
Data columns (total 25 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   amount_tsh             59400 non-null  float64
 1   funder                 59400 non-null  object 
 2   gps_height             59400 non-null  int64  
 3   installer              59400 non-null  object 
 4   longitude              59400 non-null  float64
 5   latitude               59400 non-null  float64
 6   basin                  59400 non-null  object 
 7   subvillage             59400 non-null  object 
 8   region                 59400 non-null  object 
 9   lga                    59400 non-null  object 
 10  ward                   59400 non-null  object 
 11  population             59400 non-null  int64  
 12  public_meeting         59400 non-null  object 
 13  scheme_management      59400 non-null  object 
 14  permit                 59400 non-null  object 
 15  co

In [131]:
# (rows, columns) of the feature frame
X.shape

(59400, 25)

In [132]:
# Loading the target variable dataset (path is relative to the notebook's
# working directory) and showcasing the first five rows
y= pd.read_csv('./data/training_set_labels.csv')
y.head()

Unnamed: 0,id,status_group
0,69572,functional
1,8776,functional
2,34310,functional
3,67743,non functional
4,19728,functional


In [133]:
# Displaying the info on the target variable dataset (non-null counts and dtypes)
y.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59400 entries, 0 to 59399
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   id            59400 non-null  int64 
 1   status_group  59400 non-null  object
dtypes: int64(1), object(1)
memory usage: 928.2+ KB


In [134]:
# Checking for duplicates: counts label rows that exactly repeat an earlier row
y.duplicated().sum()

0

In [135]:
# Checking for missing values: number of NaN entries per column
y.isna().sum()

id              0
status_group    0
dtype: int64

In [136]:
# Confirmation on the same number of rows in both feature and target variable datasets:
# the first entry of y.shape should equal the first entry of X.shape
y.shape

(59400, 2)

In [None]:
# 1. Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.inspection import permutation_importance


In [None]:
# 2. Load Data
# Files are read from ./data/, the folder the earlier loading cells of this
# notebook read from; bare file names would be looked up in the working directory.
# NOTE(review): assumes test_set_values.csv sits next to the training files -- confirm.
X_train = pd.read_csv("./data/training_set_values.csv")
y_train = pd.read_csv("./data/training_set_labels.csv")["status_group"]
X_test  = pd.read_csv("./data/test_set_values.csv")

# Quick look
print(X_train.head())
print(y_train.value_counts())


In [None]:
## Train/Validation Split
# We split training data to evaluate models before finalizing.


In [None]:
# 3. Train/Validation Split
# Hold out 20% of the rows for validation; stratifying on the labels keeps the
# class proportions the same in both parts, and the fixed seed makes the split repeatable.
split_options = dict(test_size=0.2, stratify=y_train, random_state=42)
X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, **split_options)


In [None]:
## Data Preparation
# Pipelines prevent leakage and ensure consistent preprocessing.


In [None]:
# 4. Preprocessing Pipelines
# Split the columns by dtype, leaving identifier columns out of both lists
ID_COLUMNS = ("id", "row_id")
numeric_features = [
    name for name in X_tr.select_dtypes(include=["int64", "float64"]).columns
    if name not in ID_COLUMNS
]
categorical_features = [
    name for name in X_tr.select_dtypes(include=["object", "category"]).columns
    if name not in ID_COLUMNS
]

# Numeric columns: median imputation, then standardisation
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical columns: most-frequent imputation, then one-hot encoding
# (a category not seen during fit is ignored rather than raising an error)
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore")),
])

# Route each group of columns through its own transformer
preprocessor = ColumnTransformer([
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features),
])


In [None]:
## Baseline Model: Logistic Regression
# Addresses Problem 1 (factors influencing functionality).


In [None]:
# 5. Baseline Logistic Regression
# multi_class is deliberately not passed: the argument is deprecated in recent
# scikit-learn releases, and with the lbfgs solver a multiclass target is
# already fitted as a multinomial model by default.
baseline_clf = Pipeline(steps=[
    ("prep", preprocessor),
    ("clf", LogisticRegression(max_iter=1000, solver="lbfgs",
                               class_weight="balanced", random_state=42))
])

# Fit on the training part, score on the held-out validation part
baseline_clf.fit(X_tr, y_tr)
y_val_pred = baseline_clf.predict(X_val)

print(classification_report(y_val, y_val_pred, digits=3))
print(confusion_matrix(y_val, y_val_pred))


In [None]:
## Tuned Logistic Regression
# Addresses Problem 2 (strategy for building new wells).


In [None]:
# 6. Tuned Logistic Regression
# 5-fold grid search over the regularisation parameter C, ranked by macro F1
param_grid = {"clf__C": [0.01, 0.1, 1, 10]}
grid = GridSearchCV(estimator=baseline_clf, param_grid=param_grid,
                    scoring="f1_macro", cv=5)
grid.fit(X_tr, y_tr)

# Score the winning pipeline on the held-out validation part
best_logit = grid.best_estimator_
y_val_pred = best_logit.predict(X_val)

print("Best params:", grid.best_params_)
print(classification_report(y_val, y_val_pred, digits=3))


In [None]:
## Nonparametric Model: Decision Tree
# Addresses Problem 3 (patterns making wells non-functional).


In [None]:
# 7. Decision Tree
# Class-weighted tree placed behind the shared preprocessor
tree_pipe = Pipeline([
    ("prep", preprocessor),
    ("clf", DecisionTreeClassifier(class_weight="balanced", random_state=42)),
])

# Depth and node-size limits to search over
tree_params = dict(
    clf__max_depth=[5, 10, None],
    clf__min_samples_split=[2, 10, 50],
    clf__min_samples_leaf=[1, 5, 20],
)

# 5-fold grid search ranked by macro F1
tree_grid = GridSearchCV(estimator=tree_pipe, param_grid=tree_params,
                         scoring="f1_macro", cv=5)
tree_grid.fit(X_tr, y_tr)

# Score the winning tree on the held-out validation part
best_tree = tree_grid.best_estimator_
y_val_pred_tree = best_tree.predict(X_val)

print("Best tree params:", tree_grid.best_params_)
print(classification_report(y_val, y_val_pred_tree, digits=3))


In [None]:
## Ensemble Model: Random Forest
# Addresses Problem 4 (repair strategy).


In [None]:
# 8. Random Forest
# 200 class-weighted trees behind the shared preprocessor; fixed seed for repeatability
forest = RandomForestClassifier(n_estimators=200, class_weight="balanced", random_state=42)
rf_pipe = Pipeline([("prep", preprocessor), ("clf", forest)])

# Fit on the training part, score on the held-out validation part
rf_pipe.fit(X_tr, y_tr)
y_val_pred_rf = rf_pipe.predict(X_val)

print(classification_report(y_val, y_val_pred_rf, digits=3))


In [None]:
## Evaluation Visuals
# Confusion matrix heatmap for clarity.


In [None]:
# 9. Confusion Matrix Heatmap
# Rows and columns both follow the class order stored on the fitted forest
class_labels = rf_pipe.named_steps["clf"].classes_
cm = confusion_matrix(y_val, y_val_pred_rf, labels=class_labels)
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
            xticklabels=class_labels, yticklabels=class_labels)
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix")
plt.show()


In [None]:
## Feature Importance
# Permutation importance to interpret model drivers.


In [None]:
# 10. Feature Importance
# The whole pipeline is passed in, so permutation_importance shuffles the raw
# X_val columns and returns one score per input column. The scores are therefore
# labelled with X_val.columns; the one-hot names produced by the preprocessor
# have a different length and cannot be paired with them.
r = permutation_importance(rf_pipe, X_val, y_val, n_repeats=5, random_state=42)
feature_names = X_val.columns
imp_df = pd.DataFrame({"feature": feature_names, "importance": r.importances_mean})
print(imp_df.sort_values("importance", ascending=False).head(15))


In [None]:
## Final Model & Predictions
# Train on full dataset and generate predictions for test set.


In [None]:
# 11. Final Model & Predictions
from sklearn.base import clone

# Fit an unfitted copy on the full training data. Assigning rf_pipe directly
# would only create an alias, and the refit would overwrite the pipeline that
# was scored on the validation split.
final_model = clone(rf_pipe)  # or best_logit/best_tree depending on evaluation
final_model.fit(X_train, y_train)

# Predict the test set and write one prediction per row, in X_test row order
test_pred = final_model.predict(X_test)
pd.DataFrame({"prediction": test_pred}).to_csv("test_predictions.csv", index=False)


In [None]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from sklearn.inspection import permutation_importance


In [None]:
# Load data
# Files are read from ./data/, the folder the first loading cells of this
# notebook read from; bare file names would be looked up in the working directory.
# NOTE(review): assumes test_set_values.csv sits next to the training files -- confirm.
X_train = pd.read_csv("./data/training_set_values.csv")
y_train = pd.read_csv("./data/training_set_labels.csv")["status_group"]
X_test  = pd.read_csv("./data/test_set_values.csv")

print("Train shape:", X_train.shape, "Test shape:", X_test.shape)
print("Target distribution:\n", y_train.value_counts())


In [None]:
# Quick schema overview: preview the first rows, then list dtypes and non-null counts
display(X_train.head())
X_train.info()


In [None]:
# Visualize class distribution
# value_counts() lists the classes from most to least frequent; the bar colours
# are applied in that same order.
class_counts = y_train.value_counts()
ax = class_counts.plot(kind="bar", color=["green","red","orange"], rot=0)
ax.set_title("Class distribution (status_group)")
ax.set_xlabel("Class")
ax.set_ylabel("Count")
plt.show()


In [None]:
# Columns recommended to drop (IDs, metadata, redundant groupings)
DROP_COLS = [
    "id", "wpt_name", "recorded_by", "date_recorded", "scheme_name",
    "region_code", "district_code", "num_private",
    "extraction_type_group", "management_group", "quality_group",
    "quantity_group", "source_type", "source_class", "waterpoint_type_group"
]

# Optional feature engineering: well age = year recorded - construction year.
# To enable it, uncomment the block below. It has to run before the drop further
# down, because it reads date_recorded.
# if "date_recorded" in X_train.columns:
#     X_train["record_year"] = pd.to_datetime(X_train["date_recorded"], errors="coerce").dt.year
#     X_test["record_year"]  = pd.to_datetime(X_test["date_recorded"], errors="coerce").dt.year
#     X_train["age_years"] = X_train["record_year"] - X_train["construction_year"]
#     X_test["age_years"]  = X_test["record_year"]  - X_test["construction_year"]
#     DROP_COLS += ["date_recorded", "record_year"]

# Same drop list for train and test; names that are not present are skipped
X_train = X_train.drop(columns=DROP_COLS, errors="ignore")
X_test  = X_test.drop(columns=DROP_COLS, errors="ignore")

# Stratified 80/20 train/validation split for model selection
split_options = dict(test_size=0.2, stratify=y_train, random_state=42)
X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, **split_options)

print("Post-drop shapes -> Train:", X_train.shape, "Val:", X_val.shape, "Test:", X_test.shape)


In [None]:
# Identify feature types: numeric columns vs. text / categorical / boolean columns
numeric_dtypes = ["int64","float64"]
categorical_dtypes = ["object","category","bool"]
numeric_features = list(X_tr.select_dtypes(include=numeric_dtypes).columns)
categorical_features = list(X_tr.select_dtypes(include=categorical_dtypes).columns)

# Show at most the first ten names of each group
print("Numeric (sample):", numeric_features[:10])
print("Categorical (sample):", categorical_features[:10])


In [None]:
# Class imbalance (proportions): share of rows per class, computed on all of y_train
class_props = y_train.value_counts(normalize=True)
print("Class proportions:\n", class_props)


In [None]:
# Multicollinearity among numeric features: drop highly correlated (threshold=0.9)
to_drop_corr = []
if len(numeric_features) > 1:
    corr_matrix = X_tr[numeric_features].corr().abs()
    # Keep only the cells strictly above the diagonal so each pair is looked at
    # once and a column is never compared with itself
    above_diagonal = np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)
    upper = corr_matrix.where(above_diagonal)
    # A column is flagged when it correlates above 0.9 with a column listed before it
    to_drop_corr = [col for col in upper.columns if (upper[col] > 0.9).any()]

print("Highly correlated numeric features to drop:", to_drop_corr)
flagged = set(to_drop_corr)
numeric_features = [c for c in numeric_features if c not in flagged]


In [None]:
# Preprocessing: impute + scale numeric, impute + one-hot categorical

# Numeric columns: median imputation, then standardisation
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical columns: most-frequent imputation, then one-hot encoding
# (a category not seen during fit is ignored rather than raising an error)
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore")),
])

# Route each group of columns through its own transformer
preprocessor = ColumnTransformer([
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features),
])


In [None]:
# Baseline: multinomial logistic regression (interpretable, with class weights)
# multi_class is deliberately not passed: the argument is deprecated in recent
# scikit-learn releases, and with the lbfgs solver a multiclass target is
# already fitted as a multinomial model by default.
logit_baseline = Pipeline(steps=[
    ("prep", preprocessor),
    ("clf", LogisticRegression(
        max_iter=1000, solver="lbfgs",
        class_weight="balanced", random_state=42
    ))
])

# Fit on the training part, score on the held-out validation part
logit_baseline.fit(X_tr, y_tr)
y_val_pred_logit = logit_baseline.predict(X_val)

print("Baseline Logistic (Macro F1):", f1_score(y_val, y_val_pred_logit, average="macro"))
print(classification_report(y_val, y_val_pred_logit, digits=3))


In [None]:
# Tuned logistic regression: regularization strength (C) via CV, optimize macro F1
# multi_class is deliberately not passed: the argument is deprecated in recent
# scikit-learn releases, and with the lbfgs solver a multiclass target is
# already fitted as a multinomial model by default.
logit_pipe = Pipeline(steps=[
    ("prep", preprocessor),
    ("clf", LogisticRegression(
        max_iter=1000, solver="lbfgs",
        class_weight="balanced", random_state=42
    ))
])

# 5-fold grid search over C, using all available cores
param_grid_logit = {"clf__C": [0.01, 0.1, 1, 10]}
grid_logit = GridSearchCV(logit_pipe, param_grid_logit, cv=5, scoring="f1_macro", n_jobs=-1)
grid_logit.fit(X_tr, y_tr)

# Score the winning pipeline on the held-out validation part
best_logit = grid_logit.best_estimator_
y_val_pred_best_logit = best_logit.predict(X_val)

print("Best logistic params:", grid_logit.best_params_)
print("Tuned Logistic (Macro F1):", f1_score(y_val, y_val_pred_best_logit, average="macro"))
print(classification_report(y_val, y_val_pred_best_logit, digits=3))


In [None]:
# Nonparametric: Decision Tree with tuning, class weights for imbalance
# The shared preprocessor also scales the numeric columns; a tree does not need
# that, but it does no harm.
tree_pipe = Pipeline([
    ("prep", preprocessor),
    ("clf", DecisionTreeClassifier(class_weight="balanced", random_state=42)),
])

# Depth and node-size limits to search over
param_grid_tree = dict(
    clf__max_depth=[5, 10, None],
    clf__min_samples_split=[2, 10, 50],
    clf__min_samples_leaf=[1, 5, 20],
)

# 5-fold grid search ranked by macro F1, using all available cores
grid_tree = GridSearchCV(estimator=tree_pipe, param_grid=param_grid_tree,
                         scoring="f1_macro", cv=5, n_jobs=-1)
grid_tree.fit(X_tr, y_tr)

# Score the winning tree on the held-out validation part
best_tree = grid_tree.best_estimator_
y_val_pred_tree = best_tree.predict(X_val)

print("Best tree params:", grid_tree.best_params_)
print("Decision Tree (Macro F1):", f1_score(y_val, y_val_pred_tree, average="macro"))
print(classification_report(y_val, y_val_pred_tree, digits=3))


In [None]:
# Ensemble: Random Forest (strong baseline, handles nonlinearity), class weights
# 300 trees with no depth limit, built on all available cores; fixed seed for repeatability
forest = RandomForestClassifier(
    n_estimators=300, max_depth=None, class_weight="balanced",
    random_state=42, n_jobs=-1,
)
rf_pipe = Pipeline([("prep", preprocessor), ("clf", forest)])

# Fit on the training part, score on the held-out validation part
rf_pipe.fit(X_tr, y_tr)
y_val_pred_rf = rf_pipe.predict(X_val)

print("Random Forest (Macro F1):", f1_score(y_val, y_val_pred_rf, average="macro"))
print(classification_report(y_val, y_val_pred_rf, digits=3))


In [None]:
# Choose best model by macro F1
# Each candidate is stored with its validation predictions so the winner's model
# and predictions can be looked up together.
candidates = {
    "Logistic (tuned)": (best_logit, y_val_pred_best_logit),
    "DecisionTree (tuned)": (best_tree, y_val_pred_tree),
    "RandomForest": (rf_pipe, y_val_pred_rf),
}
scores = {
    label: f1_score(y_val, predictions, average="macro")
    for label, (model, predictions) in candidates.items()
}
# max() returns the first label with the highest score
best_model_name = max(scores, key=scores.get)
best_model, best_pred = candidates[best_model_name]
print("Model selection by Macro F1:", scores, "\nBest:", best_model_name)


In [None]:
# Confusion matrix heatmap for the selected model
# Class labels in alphabetical order fix the row/column order of the matrix
classes = sorted(y_train.unique())
cm = confusion_matrix(y_val, best_pred, labels=classes)

ax = sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=classes, yticklabels=classes)
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
ax.set_title(f"Confusion matrix ({best_model_name})")
plt.show()


In [None]:
# Per-class recall/precision focus (minority class performance):
# validation-set report for the selected model's predictions, to three decimals
print(classification_report(y_val, best_pred, digits=3))


In [None]:
# Permutation importance on validation for selected model
# Works for any scikit-learn estimator inside Pipeline
perm = permutation_importance(best_model, X_val, y_val, n_repeats=5, random_state=42, n_jobs=-1)

# The whole pipeline is passed in, so the raw X_val columns are what gets
# shuffled and there is one score per input column. Label the scores with
# X_val.columns; the one-hot names produced by the preprocessor have a different
# length and cannot be paired with them.
feat_names = X_val.columns
imp_df = pd.DataFrame({"feature": feat_names, "importance": perm.importances_mean}) \
         .sort_values("importance", ascending=False)

imp_df.head(20)


In [None]:
# Plot top-15 feature importances
# Horizontal bars, one per feature, taken from the first rows of imp_df
topn = 15
fig, ax = plt.subplots(figsize=(8,6))
sns.barplot(data=imp_df.head(topn), x="importance", y="feature",
            orient="h", color="steelblue", ax=ax)
ax.set_title(f"Top {topn} features by permutation importance ({best_model_name})")
ax.set_xlabel("Importance (mean decrease in score)")
ax.set_ylabel("Feature")
fig.tight_layout()
plt.show()


In [None]:
# Refit the selected best model on the full training set
from sklearn.base import clone

# Fit an unfitted copy rather than best_model itself: best_model is the same
# object as one of the validated pipelines, so refitting it in place would
# overwrite the model that the validation metrics above were computed with.
final_model = clone(best_model)
final_model.fit(X_train, y_train)

# Predict on test set
test_pred = final_model.predict(X_test)

# Save predictions for stakeholders (one row per X_test row, in the same order)
pd.DataFrame({"prediction": test_pred}).to_csv("test_predictions.csv", index=False)
print("Saved: test_predictions.csv")
