# Water Pump Project: Baseline Modeling Notebook

# 📌 Section 1: Overview & Purpose
"""
This notebook establishes baseline models for the Tanzania Water Pump classification project.
Purpose:
- Provide a reference point before feature engineering
- Use consistent evaluation to compare future improvements
- Enable collaborative work where each team member can track their model impact
"""

# 📦 Section 2: Imports

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import seaborn as sns

# 📂 Section 3: Load Data (Change this path for your version)

In [9]:
data = pd.read_csv("../data/processed/cleaned_merged_data.csv")

In [20]:
df = data.copy()
df = df.drop(columns=["latitude_was_outlier", "longitude_was_outlier", "extraction_mismatch"], axis=1)


# 🎯 Section 4: Define Features & Target (adjust as needed)

In [21]:
X = df.drop("status_group", axis=1)  # Replace with your actual target column
y = data["status_group"]

# 🔄 Fit vs Transform Explanation
"""
What is the difference between `fit()` and `transform()`?
- `fit()`: Learns the parameters from data (e.g., mean, std, most frequent value).
- `transform()`: Applies those learned parameters to transform the data.
- `fit_transform()`: A convenience method that combines both for training data.

Use `fit()` only on the training set. Then `transform()` both training and validation/test using the learned parameters.
This avoids data leakage.
"""

# 🥪 Section 4.1: Fit vs Transform Code Examples
"""
Example: StandardScaler on training and validation sets
"""

In [None]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# Fit only on training data
scaler.fit(X_train)

# Transform both sets using the same scaler
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)

In [None]:
"""
Example: OneHotEncoder and SimpleImputer with training/validation data
"""
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Define columns
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = X.select_dtypes(include=['object', 'category']).columns.tolist()

# Define transformers
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

# Combine into a column transformer
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features)
])

# Fit on training, transform both
preprocessor.fit(X_train)
X_train_processed = preprocessor.transform(X_train)
X_val_processed = preprocessor.transform(X_val)

# 🌐 Section 4.2: Preprocessing Function
"""
Reusable preprocessing function for training and validation data
"""

In [None]:
def preprocess_data(X_train, X_val):
    numeric_features = X_train.select_dtypes(include=['int64', 'float64']).columns.tolist()
    categorical_features = X_train.select_dtypes(include=['object', 'category']).columns.tolist()

    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', OneHotEncoder(handle_unknown='ignore'))
    ])

    preprocessor = ColumnTransformer(transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

    preprocessor.fit(X_train)
    X_train_processed = preprocessor.transform(X_train)
    X_val_processed = preprocessor.transform(X_val)

    return X_train_processed, X_val_processed, preprocessor

# Example usage:
# X_train_proc, X_val_proc, processor = preprocess_data(X_train, X_val)

In [None]:
"""
Function to preprocess new (e.g. test) data using fitted preprocessor
"""
def preprocess_new_data(X_new, fitted_preprocessor):
    return fitted_preprocessor.transform(X_new)

# 🥪 Section 5: Stratified Train/Validation Split
# Stratification ensures the class proportions are maintained in both training and validation sets.
# This is important for fair baseline evaluation before applying class imbalance techniques (like SMOTE).
# Do not apply oversampling to the validation set — this should reflect the original distribution.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

print("Class distribution in training:")
print(y_train.value_counts(normalize=True))
print("\nClass distribution in validation:")
print(y_val.value_counts(normalize=True))


# 🧪 Section 5: Stratified Train/Validation Split

- Stratification ensures the class proportions are maintained in both training and validation sets.
- This is important for fair baseline evaluation before applying class imbalance techniques (like SMOTE).
- Do not apply oversampling to the validation set — this should reflect the original distribution.

In [23]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

In [17]:
print("Class distribution in training:")
print(y_train.value_counts(normalize=True))
print("\nClass distribution in validation:")
print(y_val.value_counts(normalize=True))

Class distribution in training:
status_group
functional                 0.545089
non functional             0.386636
functional needs repair    0.068275
Name: proportion, dtype: float64

Class distribution in validation:
status_group
functional                 0.545107
non functional             0.386646
functional needs repair    0.068247
Name: proportion, dtype: float64


# 🧠 Section 6: Dummy Classifier (Naive Baseline)

In [24]:
dummy = DummyClassifier(strategy="most_frequent")
dummy.fit(X_train, y_train)
y_dummy = dummy.predict(X_val)
print("\nDummy Classifier Report:")
print(classification_report(y_val, y_dummy))


Dummy Classifier Report:
                         precision    recall  f1-score   support

             functional       0.55      1.00      0.71      6278
functional needs repair       0.00      0.00      0.00       786
         non functional       0.00      0.00      0.00      4453

               accuracy                           0.55     11517
              macro avg       0.18      0.33      0.24     11517
           weighted avg       0.30      0.55      0.38     11517



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# 🌳 Section 7: Basic Models (Logistic & Tree)

In [25]:
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(max_depth=5, random_state=42)
}

results = []

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    print(f"\n{name} Report:")
    print(classification_report(y_val, y_pred))
    results.append({
        "Model": name,
        "Accuracy": model.score(X_val, y_val),
        "F1 Macro": cross_val_score(model, X, y, cv=5, scoring="f1_macro").mean()
    })

ValueError: could not convert string to float: 'Germany Republi'

# 🧮 Section 8: Evaluation Table

In [None]:
results_df = pd.DataFrame(results)
print("\nBenchmark Comparison Table:")
print(results_df)

# 📊 Section 9: Confusion Matrix Example

In [None]:
fig, ax = plt.subplots(figsize=(6, 4))
ConfusionMatrixDisplay.from_estimator(models["Decision Tree"], X_val, y_val, ax=ax)
plt.title("Decision Tree Confusion Matrix")
plt.show()

# 🧪 Section 10: Feature Testing Cell
"""
Use this cell to test modified datasets:
- Replace X and y with your processed versions
- Re-run training and evaluation to see improvement
"""
# Example:
# X_new = processed_data.drop("status_group", axis=1)
# y_new = processed_data["status_group"]
# Then re-run model training and update benchmark table

# 📝 Notes:
# - Make sure all team members use the same seed (random_state=42)
# - Save your results to outputs/ folder if needed