In [20]:
import pandas as pd
# save filepath to variable for easier access
pump_file_path = '../data/processed/pump_dataset.csv'
# read the data and store data in DataFrame titled pomp_data
pump_data = pd.read_csv(pump_file_path) 
# print a summary of the data in water pomp data
pump_data.describe()

Unnamed: 0,id,amount_tsh,gps_height,longitude,latitude,num_private,region_code,district_code,population,construction_year
count,59400.0,59400.0,59400.0,59400.0,59400.0,59400.0,59400.0,59400.0,59400.0,59400.0
mean,37115.131768,317.650385,668.297239,34.077427,-5.706033,0.474141,15.297003,5.629747,179.909983,1300.652475
std,21453.128371,2997.574558,693.11635,6.567432,2.946019,12.23623,17.587406,9.633649,471.482176,951.620547
min,0.0,0.0,-90.0,0.0,-11.64944,0.0,1.0,0.0,0.0,0.0
25%,18519.75,0.0,0.0,33.090347,-8.540621,0.0,5.0,2.0,0.0,0.0
50%,37061.5,0.0,369.0,34.908743,-5.021597,0.0,12.0,3.0,25.0,1986.0
75%,55656.5,20.0,1319.25,37.178387,-3.326156,0.0,17.0,5.0,215.0,2004.0
max,74247.0,350000.0,2770.0,40.345193,-2e-08,1776.0,99.0,80.0,30500.0,2013.0


In [21]:
#drop_columns = ['id','recorded_by', 'num_private', 'payment']

In [22]:
from sklearn.model_selection import train_test_split

# Separate target from predictors
y = pump_data.status_group
X = pump_data.drop(['status_group'], axis=1)

# Divide data into training and validation subsets
X_train_full, X_valid_full, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2,
                                                                random_state=0)

# "Cardinality" means the number of unique values in a column
# Select categorical columns with relatively low cardinality (convenient but arbitrary)
categorical_cols = [cname for cname in X_train_full.columns if X_train_full[cname].nunique() < 10 and 
                        X_train_full[cname].dtype == "object"]

# Select numerical columns
numerical_cols = [cname for cname in X_train_full.columns if X_train_full[cname].dtype in ['int64', 'float64']]

# Keep selected columns only
my_cols = categorical_cols + numerical_cols
X_train = X_train_full[my_cols].copy()
X_valid = X_valid_full[my_cols].copy()

In [23]:
X_train.head()


Unnamed: 0,basin,public_meeting,recorded_by,permit,extraction_type_class,management_group,payment,payment_type,water_quality,quality_group,...,id,amount_tsh,gps_height,longitude,latitude,num_private,region_code,district_code,population,construction_year
55424,Pangani,True,GeoData Consultants Ltd,True,gravity,user-group,never pay,never pay,soft,good,...,70623,0.0,1565,38.267292,-4.783454,0,4,1,1,1994
28170,Internal,False,GeoData Consultants Ltd,False,submersible,user-group,pay per bucket,per bucket,salty,salty,...,55587,50.0,1522,36.56155,-5.309293,0,21,4,1,1997
20007,Internal,,GeoData Consultants Ltd,True,gravity,user-group,never pay,never pay,soft,good,...,7929,0.0,1838,35.269396,-3.217454,0,2,5,180,1974
7842,Pangani,True,GeoData Consultants Ltd,True,gravity,commercial,never pay,never pay,soft,good,...,39061,0.0,1278,37.624801,-3.141919,0,3,1,1,1988
22217,Lake Tanganyika,True,GeoData Consultants Ltd,True,gravity,user-group,pay monthly,monthly,soft,good,...,44902,250.0,1242,29.904637,-4.534989,0,16,2,320,1999


In [24]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Preprocessing for numerical data
numerical_transformer = SimpleImputer(strategy='constant')

# Preprocessing for categorical data
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

In [25]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(n_estimators=100, random_state=0)

In [26]:
from sklearn.metrics import accuracy_score

# Bundle preprocessing and modeling code in a pipeline
my_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('model', model)
                             ])

# Preprocessing of training data, fit model 
my_pipeline.fit(X_train, y_train)

# Preprocessing of validation data, get predictions
preds = my_pipeline.predict(X_valid)

# Evaluate the model
score = accuracy_score(y_valid, preds)
print('acc:', score)

acc: 0.8005050505050505


In [27]:
print(my_pipeline)


Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  SimpleImputer(strategy='constant'),
                                                  ['id', 'amount_tsh',
                                                   'gps_height', 'longitude',
                                                   'latitude', 'num_private',
                                                   'region_code',
                                                   'district_code',
                                                   'population',
                                                   'construction_year']),
                                                 ('cat',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('onehot',
   

In [28]:
X_train.head()

Unnamed: 0,basin,public_meeting,recorded_by,permit,extraction_type_class,management_group,payment,payment_type,water_quality,quality_group,...,id,amount_tsh,gps_height,longitude,latitude,num_private,region_code,district_code,population,construction_year
55424,Pangani,True,GeoData Consultants Ltd,True,gravity,user-group,never pay,never pay,soft,good,...,70623,0.0,1565,38.267292,-4.783454,0,4,1,1,1994
28170,Internal,False,GeoData Consultants Ltd,False,submersible,user-group,pay per bucket,per bucket,salty,salty,...,55587,50.0,1522,36.56155,-5.309293,0,21,4,1,1997
20007,Internal,,GeoData Consultants Ltd,True,gravity,user-group,never pay,never pay,soft,good,...,7929,0.0,1838,35.269396,-3.217454,0,2,5,180,1974
7842,Pangani,True,GeoData Consultants Ltd,True,gravity,commercial,never pay,never pay,soft,good,...,39061,0.0,1278,37.624801,-3.141919,0,3,1,1,1988
22217,Lake Tanganyika,True,GeoData Consultants Ltd,True,gravity,user-group,pay monthly,monthly,soft,good,...,44902,250.0,1242,29.904637,-4.534989,0,16,2,320,1999


## Second Pipeline

In [29]:
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.experimental import enable_iterative_imputer  # This enables IterativeImputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import BayesianRidge

# Transformer: Replace 0 with random longitude
class LongitudeRandomImputer(BaseEstimator, TransformerMixin):
    def __init__(self, column='longitude', min_val=29.2, max_val=40.4):
        self.column = column
        self.min_val = min_val
        self.max_val = max_val

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()
        X[self.column] = X[self.column].replace(0, np.nan)
        X[self.column] = X[self.column].apply(
            lambda x: np.random.uniform(self.min_val, self.max_val) if pd.isna(x) else x
        )
        return X

# Transformer: Group-based mode imputation with global fallback
class GroupModeImputer(BaseEstimator, TransformerMixin):
    def __init__(self, col, group_cols):
        self.col = col
        self.group_cols = group_cols
        self.global_mode = None

    
    def fit(self, X, y=None):
        if self.col not in X.columns:
            raise ValueError(f"Column '{self.col}' not found in input data. Make sure it's included before using this imputer.")
        self.global_mode = X[self.col].mode(dropna=True)[0]
        return self

    def transform(self, X):
        X = X.copy()
        try:
            X[self.col] = X.groupby(self.group_cols)[self.col].transform(
                lambda x: x.fillna(x.mode()[0]) if not x.mode().empty else x
            )
        except Exception as e:
            print(f"Warning during group imputation of {self.col}: {e}")
        X[self.col].fillna(self.global_mode, inplace=True)
        return X

# Transformer: Iterative imputer for numeric columns
class IterativeNumericImputer(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns
        self.imputer = IterativeImputer(estimator=BayesianRidge(), random_state=42, max_iter=10)

    def fit(self, X, y=None):
        self.imputer.fit(X[self.columns])
        return self

    def transform(self, X):
        X = X.copy()
        X[self.columns] = self.imputer.transform(X[self.columns])
        return X


In [30]:
from sklearn.pipeline import Pipeline

# List of all imputers and their logic
imputation_pipeline = Pipeline(steps=[
    ('longitude_random', LongitudeRandomImputer()),

    ('funder_impute', GroupModeImputer('funder', ['region', 'lga', 'ward'])),
    ('installer_impute', GroupModeImputer('installer', ['region', 'funder'])),
    ('subvillage_impute', GroupModeImputer('subvillage', ['lga', 'ward'])),
    ('public_meeting_impute', GroupModeImputer('public_meeting', ['region', 'ward'])),
    ('scheme_name_impute', GroupModeImputer('scheme_name', ['basin', 'region', 'longitude', 'latitude'])),
    ('scheme_management_impute', GroupModeImputer('scheme_management', ['scheme_name', 'source', 'region'])),
    ('permit_impute', GroupModeImputer('permit', ['region', 'source'])),

    ('iterative_numeric', IterativeNumericImputer(['construction_year', 'population', 'latitude', 'longitude', 'gps_height']))
])


In [31]:
from sklearn.model_selection import train_test_split

# Separate target from predictors
y = pump_data.status_group
X = pump_data.drop(['status_group'], axis=1)

# Divide data into training and validation subsets
X_train_full, X_valid_full, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2,
                                                                random_state=0)

# "Cardinality" means the number of unique values in a column
# Select categorical columns with relatively low cardinality (convenient but arbitrary)
categorical_cols = [cname for cname in X_train_full.columns if X_train_full[cname].nunique() < 10 and 
                        X_train_full[cname].dtype == "object"]

# Select numerical columns
numerical_cols = [cname for cname in X_train_full.columns if X_train_full[cname].dtype in ['int64', 'float64']]

# Keep selected columns only
my_cols = categorical_cols + numerical_cols
X_train = X_train_full[my_cols].copy()
X_valid = X_valid_full[my_cols].copy()

In [32]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(n_estimators=100, random_state=0)

In [33]:
from sklearn.metrics import accuracy_score

# Bundle preprocessing and modeling code in a pipeline
my_pipeline = Pipeline(steps=[('preprocessor', imputation_pipeline),
                              ('model', model)
                             ])

# Preprocessing of training data, fit model 
my_pipeline.fit(X_train, y_train)

# Preprocessing of validation data, get predictions
preds = my_pipeline.predict(X_valid)

# Evaluate the model
score = accuracy_score(y_valid, preds)
print('acc:', score)

ValueError: Column 'funder' not found in input data. Make sure it's included before using this imputer.