# Predicting Water Pump Functionality in Tanzania (HydroLogic Project)
Authored by Farzaneh Gerami, Subaye Opoku-Acquah & Mariam Farda | 
March – June 2025

# Imports

In [1]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from scipy import stats
from scipy.stats import f_oneway

import math

import datetime

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import f_classif
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.experimental import enable_iterative_imputer
from sklearn.linear_model import BayesianRidge
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from scipy.stats import chi2_contingency, ttest_ind # checking if the data is random or depends on other

import joblib

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer, StandardScaler, LabelEncoder
from sklearn.base import BaseEstimator, TransformerMixin



# Sprint 1 - Data Cleaning and Preprocessing

## Read cleaned Data

In [2]:
# Read Merged data as an original data
url_merge_data = "../data/processed/cleaned_Training_data.csv"
df = pd.read_csv(url_merge_data)
df.head()

Unnamed: 0,amount_tsh,funder,gps_height,installer,longitude,latitude,wpt_name,basin,subvillage,region,...,quantity,source,source_class,waterpoint_type,status_group,year_recorded,month_recorded,dayofweek_recorded,funder_installer,subvillage_funder_installer
0,6000.0,roman,1390.0,roman,34.938093,-9.856322,none,Lake Nyasa,roman,Iringa,...,enough,spring,groundwater,communal standpipe,functional,2011,3,0,roman_roman,roman_roman_roman
1,0.0,grumeti,1399.0,grumeti,34.698766,-2.147466,Zahanati,Lake Victoria,grumeti,Mara,...,insufficient,rainwater harvesting,surface,communal standpipe,functional,2013,3,2,grumeti_grumeti,grumeti_grumeti_grumeti
2,25.0,lottery club,686.0,world vision,37.460664,-3.821329,Kwa Mahundi,Pangani,lottery club,Manyara,...,enough,dam,surface,communal standpipe multiple,functional,2013,2,0,lottery club_world vision,lottery club_lottery club_world vision
3,0.0,unicef,263.0,unicef,38.486161,-11.155298,Zahanati Ya Nanyumbu,Ruvuma / Southern Coast,unicef,Mtwara,...,dry,machine dbh,groundwater,communal standpipe multiple,non functional,2013,1,0,unicef_unicef,unicef_unicef_unicef
4,0.0,action in a,0.0,artisan,31.130847,-1.825359,Shuleni,Lake Victoria,action in a,Kagera,...,seasonal,rainwater harvesting,surface,communal standpipe,functional,2011,7,2,action in a_artisan,action in a_action in a_artisan


In [3]:
df.shape

(47520, 36)

In [4]:

# Column groups
bool_cols = ['public_meeting', 'permit']
freq_encode_cols = ['funder', 'installer', 'subvillage', 'wpt_name', 'scheme_name']
label_encode_cols = list(set(df.select_dtypes(include='object').columns) - set(freq_encode_cols))
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns.difference(bool_cols)

# Custom frequency encoder
class FrequencyEncoder(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        self.freq_maps = {col: X[col].value_counts().to_dict() for col in X.columns}
        return self
    def transform(self, X):
        return X.apply(lambda col: col.map(self.freq_maps[col.name]).fillna(0))

# Custom label encoder
class MultiColumnLabelEncoder(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        self.encoders = {col: LabelEncoder().fit(X[col].astype(str)) for col in X.columns}
        return self
    def transform(self, X):
        return X.apply(lambda col: self.encoders[col.name].transform(col.astype(str)))
    
# Specify which numerical columns to scale (excluding year_recorded and construction_year)
columns_to_scale = list(numerical_cols.difference(['year_recorded', 'construction_year']))
columns_to_passthrough = ['year_recorded', 'construction_year']

# Define the column transformer
# Updated column transformer
preprocessor = ColumnTransformer(transformers=[
    ('bools', FunctionTransformer(lambda x: x.astype(int)), bool_cols),
    ('freq', FrequencyEncoder(), freq_encode_cols),
    ('label', MultiColumnLabelEncoder(), label_encode_cols),
    ('scale', StandardScaler(), columns_to_scale),
    ('passthrough_years', 'passthrough', columns_to_passthrough)
])

# Wrap in pipeline
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor)
])

# Fit and transform the data
df_encoded_array = pipeline.fit_transform(df)

# Combine into DataFrame
processed_columns = (
    bool_cols +
    freq_encode_cols +
    label_encode_cols +
    columns_to_scale +
    columns_to_passthrough
)

df_encoded = pd.DataFrame(df_encoded_array, columns=processed_columns)


In [5]:
# Print the full pipeline steps
print(pipeline)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('bools',
                                                  FunctionTransformer(func=<function <lambda> at 0x000002880DCC50D0>),
                                                  ['public_meeting', 'permit']),
                                                 ('freq', FrequencyEncoder(),
                                                  ['funder', 'installer',
                                                   'subvillage', 'wpt_name',
                                                   'scheme_name']),
                                                 ('label',
                                                  MultiColumnLabelEncoder(),
                                                  ['funder_installer',
                                                   'quality_group',
                                                   'source_class',
                                                   'status_...
      

In [6]:
from sklearn import set_config

# Display as diagram (in Jupyter or notebook environments)
set_config(display='diagram')  # Use 'text' if not using Jupyter

# This will render the pipeline visually if using Jupyter
pipeline

In [7]:
df_encoded.head()

Unnamed: 0,public_meeting,permit,funder,installer,subvillage,wpt_name,scheme_name,funder_installer,quality_group,source_class,...,dayofweek_recorded,district_code,gps_height,latitude,longitude,month_recorded,population,region_code,year_recorded,construction_year
0,1.0,0.0,217.0,72.0,217.0,2883.0,112.0,2138.0,2.0,0.0,...,-1.507557,-0.063252,1.041618,-1.409594,-0.075453,-0.453624,-0.148887,-0.244909,2011.0,1999.0
1,0.0,1.0,39.0,29.0,39.0,671.0,23488.0,843.0,2.0,1.0,...,-0.482809,-0.377167,1.054603,1.207517,-0.166424,-0.453624,0.211544,0.26494,2013.0,2010.0
2,1.0,1.0,7.0,573.0,7.0,2.0,8.0,1328.0,2.0,1.0,...,-1.507557,-0.16789,0.025921,0.63925,0.883415,-0.784064,0.148311,0.32159,2013.0,2009.0
3,1.0,1.0,860.0,190.0,860.0,1.0,23488.0,2694.0,2.0,0.0,...,-1.507557,6.005768,-0.584363,-1.850589,1.273221,-1.114504,-0.256385,4.230433,2013.0,1986.0
4,1.0,1.0,1.0,113.0,1.0,1400.0,23488.0,28.0,2.0,1.0,...,-0.482809,-0.481805,-0.963807,1.31687,-1.522644,0.868134,-0.378636,0.15164,2011.0,1996.0


In [8]:
df_encoded.to_csv("../data/processed/df_encoded.csv", index=False)


In [9]:
df_train = pd.read_csv("../data/processed/df_encoded.csv")


In [10]:
df_train.head()

Unnamed: 0,public_meeting,permit,funder,installer,subvillage,wpt_name,scheme_name,funder_installer,quality_group,source_class,...,dayofweek_recorded,district_code,gps_height,latitude,longitude,month_recorded,population,region_code,year_recorded,construction_year
0,1.0,0.0,217.0,72.0,217.0,2883.0,112.0,2138.0,2.0,0.0,...,-1.507557,-0.063252,1.041618,-1.409594,-0.075453,-0.453624,-0.148887,-0.244909,2011.0,1999.0
1,0.0,1.0,39.0,29.0,39.0,671.0,23488.0,843.0,2.0,1.0,...,-0.482809,-0.377167,1.054603,1.207517,-0.166424,-0.453624,0.211544,0.26494,2013.0,2010.0
2,1.0,1.0,7.0,573.0,7.0,2.0,8.0,1328.0,2.0,1.0,...,-1.507557,-0.16789,0.025921,0.63925,0.883415,-0.784064,0.148311,0.32159,2013.0,2009.0
3,1.0,1.0,860.0,190.0,860.0,1.0,23488.0,2694.0,2.0,0.0,...,-1.507557,6.005768,-0.584363,-1.850589,1.273221,-1.114504,-0.256385,4.230433,2013.0,1986.0
4,1.0,1.0,1.0,113.0,1.0,1400.0,23488.0,28.0,2.0,1.0,...,-0.482809,-0.481805,-0.963807,1.31687,-1.522644,0.868134,-0.378636,0.15164,2011.0,1996.0
