In [1]:
import numpy as np
import pandas as pd

import statsmodels.api as sm

from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, FunctionTransformer, PolynomialFeatures
from sklearn.impute import SimpleImputer, MissingIndicator
from sklearn.metrics import mean_squared_error, ConfusionMatrixDisplay, confusion_matrix, recall_score, \
    accuracy_score, precision_score, f1_score

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, make_column_selector as selector
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImPipeline

from matplotlib import pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
pd.set_option('display.max_columns', None)

In [None]:
LogisticRegression()

In [2]:
empty = np.zeros([5,5], dtype=float)
np.ones_like(empty), empty

(array([[1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1.]]),
 array([[0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.]]))

In [8]:
df1 = pd.read_csv('./Data/0bf8bc6e-30d0-4c50-956a-603fc693d966.csv')
df2 = pd.read_csv('./Data/702ddfc5-68cd-4d1d-a0de-f5f566f76d91.csv')
df3 = pd.read_csv('./Data/4910797b-ee55-40a7-8668-10efd5c1b960.csv')

# DF1 is the target value

In [35]:
df1['status_group'].value_counts(), df1.shape

(functional                 32259
 non functional             22824
 functional needs repair     4317
 Name: status_group, dtype: int64,
       id    status_group
 0  69572      functional
 1   8776      functional
 2  34310      functional
 3  67743  non functional
 4  19728      functional,
 (59400, 2))

In [37]:
df1.head()

Unnamed: 0,id,status_group
0,69572,functional
1,8776,functional
2,34310,functional
3,67743,non functional
4,19728,functional


# DF2 is one part of the independent variables

In [33]:
df2.head()

Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,basin,subvillage,region,region_code,district_code,lga,ward,population,public_meeting,recorded_by,scheme_management,scheme_name,permit,construction_year,extraction_type,extraction_type_group,extraction_type_class,management,management_group,payment,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group
0,50785,0.0,2013-02-04,Dmdd,1996,DMDD,35.290799,-4.059696,Dinamu Secondary School,0,Internal,Magoma,Manyara,21,3,Mbulu,Bashay,321,True,GeoData Consultants Ltd,Parastatal,,True,2012,other,other,other,parastatal,parastatal,never pay,never pay,soft,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,other,other
1,51630,0.0,2013-02-04,Government Of Tanzania,1569,DWE,36.656709,-3.309214,Kimnyak,0,Pangani,Kimnyak,Arusha,2,2,Arusha Rural,Kimnyaki,300,True,GeoData Consultants Ltd,VWC,TPRI pipe line,True,2000,gravity,gravity,gravity,vwc,user-group,never pay,never pay,soft,good,insufficient,insufficient,spring,spring,groundwater,communal standpipe,communal standpipe
2,17168,0.0,2013-02-01,,1567,,34.767863,-5.004344,Puma Secondary,0,Internal,Msatu,Singida,13,2,Singida Rural,Puma,500,True,GeoData Consultants Ltd,VWC,P,,2010,other,other,other,vwc,user-group,never pay,never pay,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,other,other
3,45559,0.0,2013-01-22,Finn Water,267,FINN WATER,38.058046,-9.418672,Kwa Mzee Pange,0,Ruvuma / Southern Coast,Kipindimbi,Lindi,80,43,Liwale,Mkutano,250,,GeoData Consultants Ltd,VWC,,True,1987,other,other,other,vwc,user-group,unknown,unknown,soft,good,dry,dry,shallow well,shallow well,groundwater,other,other
4,49871,500.0,2013-03-27,Bruder,1260,BRUDER,35.006123,-10.950412,Kwa Mzee Turuka,0,Ruvuma / Southern Coast,Losonga,Ruvuma,10,3,Mbinga,Mbinga Urban,60,,GeoData Consultants Ltd,Water Board,BRUDER,True,2000,gravity,gravity,gravity,water board,user-group,pay monthly,monthly,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe


# DF 3 is another part of the independent variables

In [34]:
df3.head()

Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,basin,subvillage,region,region_code,district_code,lga,ward,population,public_meeting,recorded_by,scheme_management,scheme_name,permit,construction_year,extraction_type,extraction_type_group,extraction_type_class,management,management_group,payment,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group
0,69572,6000.0,2011-03-14,Roman,1390,Roman,34.938093,-9.856322,none,0,Lake Nyasa,Mnyusi B,Iringa,11,5,Ludewa,Mundindi,109,True,GeoData Consultants Ltd,VWC,Roman,False,1999,gravity,gravity,gravity,vwc,user-group,pay annually,annually,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe
1,8776,0.0,2013-03-06,Grumeti,1399,GRUMETI,34.698766,-2.147466,Zahanati,0,Lake Victoria,Nyamara,Mara,20,2,Serengeti,Natta,280,,GeoData Consultants Ltd,Other,,True,2010,gravity,gravity,gravity,wug,user-group,never pay,never pay,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe
2,34310,25.0,2013-02-25,Lottery Club,686,World vision,37.460664,-3.821329,Kwa Mahundi,0,Pangani,Majengo,Manyara,21,4,Simanjiro,Ngorika,250,True,GeoData Consultants Ltd,VWC,Nyumba ya mungu pipe scheme,True,2009,gravity,gravity,gravity,vwc,user-group,pay per bucket,per bucket,soft,good,enough,enough,dam,dam,surface,communal standpipe multiple,communal standpipe
3,67743,0.0,2013-01-28,Unicef,263,UNICEF,38.486161,-11.155298,Zahanati Ya Nanyumbu,0,Ruvuma / Southern Coast,Mahakamani,Mtwara,90,63,Nanyumbu,Nanyumbu,58,True,GeoData Consultants Ltd,VWC,,True,1986,submersible,submersible,submersible,vwc,user-group,never pay,never pay,soft,good,dry,dry,machine dbh,borehole,groundwater,communal standpipe multiple,communal standpipe
4,19728,0.0,2011-07-13,Action In A,0,Artisan,31.130847,-1.825359,Shuleni,0,Lake Victoria,Kyanyamisa,Kagera,18,1,Karagwe,Nyakasimbi,0,True,GeoData Consultants Ltd,,,True,0,gravity,gravity,gravity,other,other,never pay,never pay,soft,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe


In [19]:
df3.columns

Index(['id', 'amount_tsh', 'date_recorded', 'funder', 'gps_height',
       'installer', 'longitude', 'latitude', 'wpt_name', 'num_private',
       'basin', 'subvillage', 'region', 'region_code', 'district_code', 'lga',
       'ward', 'population', 'public_meeting', 'recorded_by',
       'scheme_management', 'scheme_name', 'permit', 'construction_year',
       'extraction_type', 'extraction_type_group', 'extraction_type_class',
       'management', 'management_group', 'payment', 'payment_type',
       'water_quality', 'quality_group', 'quantity', 'quantity_group',
       'source', 'source_type', 'source_class', 'waterpoint_type',
       'waterpoint_type_group'],
      dtype='object')

In [21]:
df2.columns

Index(['id', 'amount_tsh', 'date_recorded', 'funder', 'gps_height',
       'installer', 'longitude', 'latitude', 'wpt_name', 'num_private',
       'basin', 'subvillage', 'region', 'region_code', 'district_code', 'lga',
       'ward', 'population', 'public_meeting', 'recorded_by',
       'scheme_management', 'scheme_name', 'permit', 'construction_year',
       'extraction_type', 'extraction_type_group', 'extraction_type_class',
       'management', 'management_group', 'payment', 'payment_type',
       'water_quality', 'quality_group', 'quantity', 'quantity_group',
       'source', 'source_type', 'source_class', 'waterpoint_type',
       'waterpoint_type_group'],
      dtype='object')

In [28]:
df2.shape, df3.shape

((14850, 40), (59400, 40))

In [23]:
df3['id'].duplicated().sum(), df2['id'].duplicated().sum(), 

(0, 0)

In [27]:
df2['id'].sort_values(), df3['id'].sort_values()

(3402        10
 14136       13
 5702        14
 11739       29
 785         32
          ...  
 10876    74241
 7696     74244
 6407     74245
 11433    74248
 5493     74249
 Name: id, Length: 14850, dtype: int64,
 9410         0
 18428        1
 12119        2
 10629        3
 2343         4
          ...  
 15137    74240
 8667     74242
 22584    74243
 108      74246
 39131    74247
 Name: id, Length: 59400, dtype: int64)

In [41]:
X_df = pd.concat([df2, df3], axis=0)

In [166]:
# X_df.head()

In [167]:
# This has additional info on independent variables for a hidden "y" set

X_df.shape

(74250, 40)

In [168]:
# X_df.info()

In [46]:
df1.shape, X_df.shape

((59400, 2), (74250, 40))

In [49]:
X_df[X_df['id'].duplicated()].shape

(0, 40)

In [50]:
X_df['id'] = X_df['id'].astype(int)

In [None]:
# Trying to

In [169]:
# useless merge because it includes additional hidden data

both = pd.merge(X_df, df1, on='id')

In [57]:
both.isna().sum()

id                           0
amount_tsh                   0
date_recorded                0
funder                    3635
gps_height                   0
installer                 3655
longitude                    0
latitude                     0
wpt_name                     0
num_private                  0
basin                        0
subvillage                 371
region                       0
region_code                  0
district_code                0
lga                          0
ward                         0
population                   0
public_meeting            3334
recorded_by                  0
scheme_management         3877
scheme_name              28166
permit                    3056
construction_year            0
extraction_type              0
extraction_type_group        0
extraction_type_class        0
management                   0
management_group             0
payment                      0
payment_type                 0
water_quality                0
quality_

In [61]:
both.shape, df1.shape, df2.shape[0] + df3.shape[0]

((59400, 41), (59400, 2), 74250)

# This is the working df object for some actual feature selection/cleaining

In [360]:
X_known = pd.merge(df3, df1, on='id')

In [361]:
X_known.shape

(59400, 41)

In [362]:
X_known.describe()

Unnamed: 0,id,amount_tsh,gps_height,longitude,latitude,num_private,region_code,district_code,population,construction_year
count,59400.0,59400.0,59400.0,59400.0,59400.0,59400.0,59400.0,59400.0,59400.0,59400.0
mean,37115.131768,317.650385,668.297239,34.077427,-5.706033,0.474141,15.297003,5.629747,179.909983,1300.652475
std,21453.128371,2997.574558,693.11635,6.567432,2.946019,12.23623,17.587406,9.633649,471.482176,951.620547
min,0.0,0.0,-90.0,0.0,-11.64944,0.0,1.0,0.0,0.0,0.0
25%,18519.75,0.0,0.0,33.090347,-8.540621,0.0,5.0,2.0,0.0,0.0
50%,37061.5,0.0,369.0,34.908743,-5.021597,0.0,12.0,3.0,25.0,1986.0
75%,55656.5,20.0,1319.25,37.178387,-3.326156,0.0,17.0,5.0,215.0,2004.0
max,74247.0,350000.0,2770.0,40.345193,-2e-08,1776.0,99.0,80.0,30500.0,2013.0


In [363]:
# X_known['recorded_by'].value_counts(), X_known['scheme_management'].value_counts(), X_known['scheme_name'].value_counts(ascending=False)

In [69]:
X_known.head(2)

Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,basin,subvillage,region,region_code,district_code,lga,ward,population,public_meeting,recorded_by,scheme_management,scheme_name,permit,construction_year,extraction_type,extraction_type_group,extraction_type_class,management,management_group,payment,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group,status_group
0,69572,6000.0,2011-03-14,Roman,1390,Roman,34.938093,-9.856322,none,0,Lake Nyasa,Mnyusi B,Iringa,11,5,Ludewa,Mundindi,109,True,GeoData Consultants Ltd,VWC,Roman,False,1999,gravity,gravity,gravity,vwc,user-group,pay annually,annually,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe,functional
1,8776,0.0,2013-03-06,Grumeti,1399,GRUMETI,34.698766,-2.147466,Zahanati,0,Lake Victoria,Nyamara,Mara,20,2,Serengeti,Natta,280,,GeoData Consultants Ltd,Other,,True,2010,gravity,gravity,gravity,wug,user-group,never pay,never pay,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe,functional


In [171]:
# X_known.isna().sum()

In [170]:
# X_known.info()

In [365]:
X_known['subvillage'].value_counts()  # could maybe be included

Madukani       508
Shuleni        506
Majengo        502
Kati           373
Mtakuja        262
              ... 
Nkuukati         1
Mwigoe           1
Nachaomba        1
Kamanga B        1
Kiha Pachan      1
Name: subvillage, Length: 19287, dtype: int64

In [364]:
X_known['basin'].value_counts()  # should be included defintely

Lake Victoria              10248
Pangani                     8940
Rufiji                      7976
Internal                    7785
Lake Tanganyika             6432
Wami / Ruvu                 5987
Lake Nyasa                  5085
Ruvuma / Southern Coast     4493
Lake Rukwa                  2454
Name: basin, dtype: int64

# Parsed and selected distinct columns for categorical checks......

In [377]:
cat_cols = ["funder", 'installer', 'wpt_name', 'basin', 'region', 'region_code', "district_code", 'lga', 'ward', \
            'scheme_management', 'scheme_name', 'permit', 'extraction_type', 'extraction_type_class', 'management', \
           'payment', 'water_quality', 'quality_group', 'quantity', 'source', "source_type", 'source_class',\
           'waterpoint_type']

I was not sure how to use this datetime object because it would require continual updating relative to the current date for accurate tracking information, especially because the construction date data is missing many values. Date recorded is also not very useful because it doesn't necessarily relate to the construction date at all.

In [367]:
# Changed date-recorded to a datetime object

X_known['date_time'] = pd.to_datetime(X_known['date_recorded'])

In [368]:
# X_known['date_time']

In [369]:
# X_known[X_known['population'] == 0]

In [136]:
# X_known[(X_known['construction_year'] != 0) & (X_known['population'] != 0)]

In [370]:
X_known[["construction_year", 'population']].describe()

Unnamed: 0,construction_year,population
count,59400.0,59400.0
mean,1300.652475,179.909983
std,951.620547,471.482176
min,0.0,0.0
25%,0.0,0.0
50%,1986.0,25.0
75%,2004.0,215.0
max,2013.0,30500.0


In [371]:
# needs median fill strat
X_known['construction_year'].value_counts()

# Needs mean fill strat
X_known['population']

X_known.head(1)

Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,basin,subvillage,region,region_code,district_code,lga,ward,population,public_meeting,recorded_by,scheme_management,scheme_name,permit,construction_year,extraction_type,extraction_type_group,extraction_type_class,management,management_group,payment,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group,status_group,date_time
0,69572,6000.0,2011-03-14,Roman,1390,Roman,34.938093,-9.856322,none,0,Lake Nyasa,Mnyusi B,Iringa,11,5,Ludewa,Mundindi,109,True,GeoData Consultants Ltd,VWC,Roman,False,1999,gravity,gravity,gravity,vwc,user-group,pay annually,annually,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe,functional,2011-03-14


In [372]:
X_known.select_dtypes(np.number)

Unnamed: 0,id,amount_tsh,gps_height,longitude,latitude,num_private,region_code,district_code,population,construction_year
0,69572,6000.0,1390,34.938093,-9.856322,0,11,5,109,1999
1,8776,0.0,1399,34.698766,-2.147466,0,20,2,280,2010
2,34310,25.0,686,37.460664,-3.821329,0,21,4,250,2009
3,67743,0.0,263,38.486161,-11.155298,0,90,63,58,1986
4,19728,0.0,0,31.130847,-1.825359,0,18,1,0,0
...,...,...,...,...,...,...,...,...,...,...
59395,60739,10.0,1210,37.169807,-3.253847,0,3,5,125,1999
59396,27263,4700.0,1212,35.249991,-9.070629,0,11,4,56,1996
59397,37057,0.0,0,34.017087,-8.750434,0,12,7,0,0
59398,31282,0.0,0,35.861315,-6.378573,0,1,4,0,0


# This is my selection of RELEVANT numerical columns, excludes:
### - id, num_private, region_code/district_code -> to categorical column, so from 10 features  to 6

In [373]:
num_cols = ['id', 'amount_tsh', 'gps_height', 'longitude', 'latitude', 'population', 'construction_year']

In [374]:
X_known.head(2)

Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,basin,subvillage,region,region_code,district_code,lga,ward,population,public_meeting,recorded_by,scheme_management,scheme_name,permit,construction_year,extraction_type,extraction_type_group,extraction_type_class,management,management_group,payment,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group,status_group,date_time
0,69572,6000.0,2011-03-14,Roman,1390,Roman,34.938093,-9.856322,none,0,Lake Nyasa,Mnyusi B,Iringa,11,5,Ludewa,Mundindi,109,True,GeoData Consultants Ltd,VWC,Roman,False,1999,gravity,gravity,gravity,vwc,user-group,pay annually,annually,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe,functional,2011-03-14
1,8776,0.0,2013-03-06,Grumeti,1399,GRUMETI,34.698766,-2.147466,Zahanati,0,Lake Victoria,Nyamara,Mara,20,2,Serengeti,Natta,280,,GeoData Consultants Ltd,Other,,True,2010,gravity,gravity,gravity,wug,user-group,never pay,never pay,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe,functional,2013-03-06


In [375]:
num_df = X_known[num_cols]
num_df.head()

Unnamed: 0,id,amount_tsh,gps_height,longitude,latitude,population,construction_year
0,69572,6000.0,1390,34.938093,-9.856322,109,1999
1,8776,0.0,1399,34.698766,-2.147466,280,2010
2,34310,25.0,686,37.460664,-3.821329,250,2009
3,67743,0.0,263,38.486161,-11.155298,58,1986
4,19728,0.0,0,31.130847,-1.825359,0,0


In [85]:
# X_known['extraction_type_group'].value_counts(), X_known['extraction_type_class'].value_counts()

In [146]:
# X_known['permit'].value_counts(), X_known['funder'].isna().sum(),\
# X_known['installer'].value_counts(), X_known['installer'].isna().sum()

In [147]:
# X_known.head(2)

# Categoricals

In [378]:
cat_df = X_known[cat_cols]
cat_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 59400 entries, 0 to 59399
Data columns (total 23 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   funder                 55765 non-null  object
 1   installer              55745 non-null  object
 2   wpt_name               59400 non-null  object
 3   basin                  59400 non-null  object
 4   region                 59400 non-null  object
 5   region_code            59400 non-null  int64 
 6   district_code          59400 non-null  int64 
 7   lga                    59400 non-null  object
 8   ward                   59400 non-null  object
 9   scheme_management      55523 non-null  object
 10  scheme_name            31234 non-null  object
 11  permit                 56344 non-null  object
 12  extraction_type        59400 non-null  object
 13  extraction_type_class  59400 non-null  object
 14  management             59400 non-null  object
 15  payment            

# converting numeric categorical columns to objects
### region_code and district_code

In [379]:
cat_df['region_code'] = cat_df['region_code'].astype(str)
cat_df['district_code'] = cat_df['district_code'].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cat_df['region_code'] = cat_df['region_code'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cat_df['district_code'] = cat_df['district_code'].astype(str)


In [380]:
cat_df.isna().sum()

funder                    3635
installer                 3655
wpt_name                     0
basin                        0
region                       0
region_code                  0
district_code                0
lga                          0
ward                         0
scheme_management         3877
scheme_name              28166
permit                    3056
extraction_type              0
extraction_type_class        0
management                   0
payment                      0
water_quality                0
quality_group                0
quantity                     0
source                       0
source_type                  0
source_class                 0
waterpoint_type              0
dtype: int64

### Doing exploration to indentify patterns in null data

In [122]:
cat_df[cat_df['permit'].isna()]

Unnamed: 0,funder,installer,wpt_name,region,region_code,district_code,lga,ward,scheme_management,scheme_name,permit,extraction_type,extraction_type_class,management,payment,water_quality,quality_group,quantity,source,source_type,source_class,waterpoint_type
43,,,Mvae Primary,Singida,13,2,Singida Rural,Merya,VWC,K,,mono,motorpump,vwc,unknown,unknown,unknown,dry,machine dbh,borehole,groundwater,communal standpipe
47,,,Mahakamani,Mbeya,12,4,Rungwe,Kiwira,VWC,K,,gravity,gravity,vwc,never pay,soft,good,enough,spring,spring,groundwater,communal standpipe
65,,,Nyambi,Singida,13,2,Singida Rural,Naintiri,VWC,M,,mono,motorpump,vwc,unknown,unknown,unknown,dry,machine dbh,borehole,groundwater,communal standpipe
109,,,Kwa Mzee Kaiga,Mbeya,12,4,Rungwe,Lutebe,VWC,N,,other,other,vwc,never pay,soft,good,enough,river,river/lake,surface,communal standpipe
118,Hesawa,Hesawa,Tattabigo Shule Ya Msingi,Mara,20,2,Serengeti,Natta,Other,,,gravity,gravity,other,never pay,soft,good,insufficient,rainwater harvesting,rainwater harvesting,surface,communal standpipe
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59339,,,Kwa Mzee Gwalugano,Mbeya,12,4,Rungwe,Mpuguso,VWC,K,,gravity,gravity,vwc,never pay,soft,good,insufficient,spring,spring,groundwater,communal standpipe
59344,,,Pentecoste Swidish,Arusha,2,7,Meru,Ngarenanyuki,,,,gravity,gravity,unknown,unknown,unknown,unknown,unknown,spring,spring,groundwater,communal standpipe
59357,,,Shabani,Singida,13,2,Singida Rural,Ntuntu,VWC,,,nira/tanira,handpump,vwc,unknown,unknown,unknown,dry,shallow well,shallow well,groundwater,hand pump
59366,,,Joshoni,Singida,13,2,Singida Rural,Puma,VWC,,,nira/tanira,handpump,vwc,never pay,soft,good,insufficient,shallow well,shallow well,groundwater,hand pump


# Preparing categorical data for the pipeline

###  Investigating low frequency names as candidates for binning

In [381]:
cat_df['funder'].value_counts(normalize=True, ascending=False) 
# cat_df['installer'].value_counts().shape

Government Of Tanzania    0.162898
Danida                    0.055841
Hesawa                    0.039487
Rwssp                     0.024639
World Bank                0.024191
                            ...   
Upendo Primary School     0.000018
Lgcgd                     0.000018
Villagers Mpi             0.000018
Nginila                   0.000018
Samlo                     0.000018
Name: funder, Length: 1897, dtype: float64

In [382]:
cat_df.isna().sum()

funder                    3635
installer                 3655
wpt_name                     0
basin                        0
region                       0
region_code                  0
district_code                0
lga                          0
ward                         0
scheme_management         3877
scheme_name              28166
permit                    3056
extraction_type              0
extraction_type_class        0
management                   0
payment                      0
water_quality                0
quality_group                0
quantity                     0
source                       0
source_type                  0
source_class                 0
waterpoint_type              0
dtype: int64

# AT THIS POINT I HAVE ISOLATED ALL OF THE USEFUL DATA EXCEPT THE DATE TIME

In [384]:
cat_df.shape

(59400, 23)

In [385]:
num_df.shape

(59400, 7)

# Final DF object of all features combined

In [377]:
cat_cols = ["funder", 'installer', 'wpt_name', 'basin', 'region', 'region_code', "district_code", 'lga', 'ward', \
            'scheme_management', 'scheme_name', 'permit', 'extraction_type', 'extraction_type_class', 'management', \
           'payment', 'water_quality', 'quality_group', 'quantity', 'source', "source_type", 'source_class',\
           'waterpoint_type']

In [None]:
num_cols = ['id', 'amount_tsh', 'gps_height', 'longitude', 'latitude', 'population', 'construction_year']

# This is the beginning of real feature preparation for pipeline preprocessing

In [440]:
X = pd.concat([num_df, cat_df], axis=1)

In [441]:
X.describe()

Unnamed: 0,id,amount_tsh,gps_height,longitude,latitude,population,construction_year
count,59400.0,59400.0,59400.0,59400.0,59400.0,59400.0,59400.0
mean,37115.131768,317.650385,668.297239,34.077427,-5.706033,179.909983,1300.652475
std,21453.128371,2997.574558,693.11635,6.567432,2.946019,471.482176,951.620547
min,0.0,0.0,-90.0,0.0,-11.64944,0.0,0.0
25%,18519.75,0.0,0.0,33.090347,-8.540621,0.0,0.0
50%,37061.5,0.0,369.0,34.908743,-5.021597,25.0,1986.0
75%,55656.5,20.0,1319.25,37.178387,-3.326156,215.0,2004.0
max,74247.0,350000.0,2770.0,40.345193,-2e-08,30500.0,2013.0


### There are several columns with values of '0' that really mean that the value is null
- Need to modify: Convert 0's to NaNs 
- amount_tsh -> mean
- population -> mean
- construction_year -> mean/median, will need to investigate

### Replacing the 0 placeholder values with np.nan's

In [442]:
X['amount_tsh'].replace(0, np.nan, inplace=True)
X['population'].replace(0, np.nan, inplace=True)
X['construction_year'].replace(0, np.nan, inplace=True)

In [445]:
 X.describe()

Unnamed: 0,id,amount_tsh,gps_height,longitude,latitude,population,construction_year
count,59400.0,17761.0,59400.0,57588.0,59400.0,38019.0,38691.0
mean,37115.131768,1062.351942,668.297239,35.149669,-5.706033,281.087167,1996.814686
std,21453.128371,5409.34494,693.11635,2.607428,2.946019,564.68766,12.472045
min,0.0,0.2,-90.0,29.607122,-11.64944,1.0,1960.0
25%,18519.75,50.0,0.0,33.2851,-8.540621,40.0,1987.0
50%,37061.5,250.0,369.0,35.005943,-5.021597,150.0,2000.0
75%,55656.5,1000.0,1319.25,37.233712,-3.326156,324.0,2008.0
max,74247.0,350000.0,2770.0,40.345193,-2e-08,30500.0,2013.0


### After replacing the zeros with nans, mean construction year is much more centally located, I also noticed that longitude of 0 doesnt make sense, so I am going to convert the 0's to np.nans in longitude

In [444]:
X['longitude'].replace(0, np.nan, inplace=True)

### gps_height would be a lot to parse, so for the time being, I will be leaving the gps_height values as is for now
###  I Also think 'mean' is a good strategy for filling nulls atm

In [446]:
X[X['gps_height'] != 0].shape

(38962, 30)

In [447]:
X.describe()

Unnamed: 0,id,amount_tsh,gps_height,longitude,latitude,population,construction_year
count,59400.0,17761.0,59400.0,57588.0,59400.0,38019.0,38691.0
mean,37115.131768,1062.351942,668.297239,35.149669,-5.706033,281.087167,1996.814686
std,21453.128371,5409.34494,693.11635,2.607428,2.946019,564.68766,12.472045
min,0.0,0.2,-90.0,29.607122,-11.64944,1.0,1960.0
25%,18519.75,50.0,0.0,33.2851,-8.540621,40.0,1987.0
50%,37061.5,250.0,369.0,35.005943,-5.021597,150.0,2000.0
75%,55656.5,1000.0,1319.25,37.233712,-3.326156,324.0,2008.0
max,74247.0,350000.0,2770.0,40.345193,-2e-08,30500.0,2013.0


In [448]:
X.columns

Index(['id', 'amount_tsh', 'gps_height', 'longitude', 'latitude', 'population',
       'construction_year', 'funder', 'installer', 'wpt_name', 'basin',
       'region', 'region_code', 'district_code', 'lga', 'ward',
       'scheme_management', 'scheme_name', 'permit', 'extraction_type',
       'extraction_type_class', 'management', 'payment', 'water_quality',
       'quality_group', 'quantity', 'source', 'source_type', 'source_class',
       'waterpoint_type'],
      dtype='object')

# Handling placeholder/nulls/binning for categorical data!

### Categorical data preparation for pipelines

In [449]:
for i in X.select_dtypes(object).columns:
    print(i)

funder
installer
wpt_name
basin
region
region_code
district_code
lga
ward
scheme_management
scheme_name
permit
extraction_type
extraction_type_class
management
payment
water_quality
quality_group
quantity
source
source_type
source_class
waterpoint_type


# -Funder

In [450]:
len(X['funder'].unique())

1898

In [451]:
X['funder'].value_counts()[:100].sum()

44177

In [452]:
X['funder'].value_counts()[98:100]

Aict    85
Gtz     84
Name: funder, dtype: int64

### Given that the top 100 funders have accounted for the top 80% of wells, i feel ok about binning the bottom 20% into an 'other' bin

In [453]:
X['funder'].value_counts().quantile(.5)

1.0

In [454]:
X['funder'].value_counts()[X['funder'].value_counts() < 84]

Japan                    82
Cmsr                     81
Rc Ch                    80
Ndrdp                    78
Vwc                      78
                         ..
Upendo Primary School     1
Lgcgd                     1
Villagers Mpi             1
Nginila                   1
Samlo                     1
Name: funder, Length: 1797, dtype: int64

### CODE BLOCK FOR BINNING  - small values - still funder

In [455]:
# identifies the index values of entries with fewer than 84 value counts.

to_replace = X["funder"].value_counts()[X["funder"].value_counts() < 84].index.values

# investigating the values that were have isolated
to_replace, len(to_replace)

(array(['Japan', 'Cmsr', 'Rc Ch', ..., 'Villagers Mpi', 'Nginila', 'Samlo'],
       dtype=object),
 1797)

In [456]:
# creating a new dataframe for safety with binned 'funder' values
check =  X.replace(to_replace, value="other") 
check['funder'].value_counts()

other                     11588
Government Of Tanzania     9084
Danida                     3114
Hesawa                     2202
Rwssp                      1374
                          ...  
H                            86
Undp                         86
Mdrdp                        86
Aict                         85
Gtz                          84
Name: funder, Length: 101, dtype: int64

In [457]:
check['funder'].value_counts()

other                     11588
Government Of Tanzania     9084
Danida                     3114
Hesawa                     2202
Rwssp                      1374
                          ...  
H                            86
Undp                         86
Mdrdp                        86
Aict                         85
Gtz                          84
Name: funder, Length: 101, dtype: int64

# CHECK DF now has binned funders, next column for binning? 
# - installer
Top 10% of values have more than 31 installations, top 100 account for roughly 77% of all installations
top installer did over 31% of all installations, so is 23% acceptable for "other"?

In [460]:
check['installer'].value_counts()[:100].sum(), len(check['installer']), check.shape

(46028, 59400, (59400, 30))

In [461]:
check['installer'].value_counts()[98:100]

Oikos E .Africa    80
Adra               80
Name: installer, dtype: int64

Initially, my value_counts numbers weren't aligned but then i added the dropna=False argument, and they lined up.

In [462]:
(check['installer'].value_counts(dropna=False)[:100].sum()) / len(check)

0.8350673400673401

In [463]:
check['installer'].value_counts(normalize=True, dropna=False)[:100].sum()

0.8350673400673401

Again, for the sake of feature reduction, I will bin all values below the top 100 as "other"

# Creating new df with binned 'INSTALLERS'

In [464]:
# identifies the index values of entries with fewer than 84 value counts.

to_replace = check["installer"].value_counts()[check["installer"].value_counts() < 80].index.values

# creating a new dataframe for safety with binned 'funder' values
funder =  check.replace(to_replace, value="other").copy() 
funder['installer'].value_counts()

DWE                17402
other              15075
RWE                 1206
DANIDA              1050
KKKT                 898
                   ...  
MDRDP                 84
DA                    84
Water board           81
Adra                  80
Oikos E .Africa       80
Name: installer, Length: 100, dtype: int64

# New temp df funder
# Next category is wpt_name
- this seems to just be the literal name of the water well location, therefore, I don't see value in its inclusion

In [465]:
funder['wpt_name'].value_counts()

none                3563
other               2639
Shuleni             1748
Zahanati             830
Bombani              271
                    ... 
Kwa Iddi Bondeni       1
Kwa Kessy              1
Kwa Marijani           1
Kwa John Shirima       1
Kwa Mzee Chaira        1
Name: wpt_name, Length: 37106, dtype: int64

# Next category is region
- region, region_code both contain slightly different info, no nulls, no placeholders as far as I can see

In [466]:
funder['region'].value_counts()

other            6418
Iringa           5294
Shinyanga        4982
Mbeya            4639
Kilimanjaro      4379
Morogoro         4006
Arusha           3350
Kigoma           2816
Ruvuma           2640
Pwani            2635
Tanga            2547
Dodoma           2201
Singida          2093
Mara             1969
Tabora           1959
Rukwa            1808
Mtwara           1730
Manyara          1583
Lindi            1546
Dar es Salaam     805
Name: region, dtype: int64

# Region_code

In [467]:
funder['region_code'].value_counts()

11    5300
17    5011
12    4639
3     4379
5     4040
18    3324
19    3047
2     3024
16    2816
10    2640
4     2513
1     2201
13    2093
14    1979
20    1969
15    1808
6     1609
21    1583
80    1238
60    1025
90     917
7      805
99     423
9      390
24     326
8      300
40       1
Name: region_code, dtype: int64

# district_code - has no nulls, no placeholders

In [468]:
funder['district_code'].value_counts()

1     12203
2     11173
3      9998
4      8999
5      4356
6      4074
7      3343
8      1043
30      995
33      874
53      745
43      505
13      391
23      293
63      195
62      109
60       63
0        23
80       12
67        6
Name: district_code, dtype: int64

# LGA - 125 disctinct values, correspond to "geographic locations"
- i am inclined to include all of these values as the location can be significant

In [469]:
funder['lga'].value_counts(normalize=False)

Njombe          2503
Arusha Rural    1252
Moshi Rural     1251
Bariadi         1177
Rungwe          1106
                ... 
Moshi Urban       79
Kigoma Urban      71
Arusha Urban      63
Lindi Urban       21
Nyamagana          1
Name: lga, Length: 125, dtype: int64

# Ward - no nulls - 2084 unique values, no values above .5% so very small

In [470]:
funder['ward'].value_counts(normalize=False, ascending=False)[198:200]

Mang'oto    61
Buchambi    61
Name: ward, dtype: int64

In [471]:
funder['ward'].value_counts().quantile(.9)

60.0

I feel ok about binning the bottom 10%, it will still mean the addition of 200 more columns

### BINNING wards

In [472]:
# identifies the index values of WARDS with fewer than 84 value counts.

to_replace = funder["ward"].value_counts()[funder["ward"].value_counts() < 60].index.values

# creating a new dataframe for safety with binned 'ward' values
wards =  funder.replace(to_replace, value="other").copy() 
wards['ward'].value_counts()

other         40102
Igosi           307
Imalinyi        252
Siha Kati       232
Mdandu          231
              ...  
Mshewa           60
Endabash         60
Sukuma           60
Mamba            60
Ngerengere       60
Name: ward, Length: 212, dtype: int64

# New WARD database

### scheme_management looks totally clean

In [473]:
wards['scheme_management'].value_counts()

VWC                 36793
other               11526
WUG                  5206
Private operator     1063
Other                 766
SWC                    97
Trust                  72
Name: scheme_management, dtype: int64

### scheme_name -> 2514 unique values... already has an "other" column, next largest 546

In [474]:
wards['scheme_name'].value_counts()

other                          4737
Borehole                        546
Chalinze wate                   405
DANIDA                          379
Ngana water supplied scheme     270
                               ... 
Loronu water supply               1
CSPD PROJECT                      1
BL Kyongwa                        1
Ugalla water supply               1
BL Nshere                         1
Name: scheme_name, Length: 2514, dtype: int64

In [475]:
wards['scheme_name'].value_counts()[98:100]

Chanjare water supply                    42
Kabingo/kiobela gravity  water supply    42
Name: scheme_name, dtype: int64

In [476]:
wards['scheme_name'].value_counts().quantile(.9)

23.0

### i will bin these smaller values as "dif_other" to preserve some separation - 267 columns added

In [477]:
# identifies the index values of SCHEME_NAME with fewer than 23 value counts.

to_replace = wards["scheme_name"].value_counts()[wards["scheme_name"].value_counts() < 23].index.values

# creating a new dataframe for safety with binned 'ward' values
scheme =  wards.replace(to_replace, value="dif_other").copy() 
scheme['scheme_name'].value_counts()

dif_other                            12291
other                                 4737
Borehole                               546
Chalinze wate                          405
DANIDA                                 379
                                     ...  
Janda                                   23
Sabodo Borehole Scheme                  23
kaleng                                  23
Tanzania flowers pipe line              23
Maji ya Chai gravity water supply       23
Name: scheme_name, Length: 267, dtype: int64

# SCHEME df object created

### permit info is very clean, True, False, and null

In [478]:
scheme['permit'].value_counts()

True     38852
False    17492
Name: permit, dtype: int64

### EXTRACTION TYPE - looks clean

In [479]:
scheme['extraction_type'].value_counts()

gravity                      26780
nira/tanira                   8154
other                         6430
submersible                   4764
swn 80                        3670
mono                          2865
india mark ii                 2400
afridev                       1770
ksb                           1415
other - rope pump              451
other - swn 81                 229
windmill                       117
india mark iii                  98
cemo                            90
other - play pump               85
walimi                          48
climax                          32
other - mkulima/shinyanga        2
Name: extraction_type, dtype: int64

### EXTRACTION TYPE CLASS - similar but different to EXTRACTION TYPE

In [480]:
scheme['extraction_type_class'].value_counts()

gravity         26780
handpump        16456
other            6430
submersible      6179
motorpump        2987
rope pump         451
wind-powered      117
Name: extraction_type_class, dtype: int64

I think that after consideration, I will probably drop extraction_type_class, since extraction_type contains the same info but at a more granular level, but might wait until subsequent models are made.

### MANAGEMENT - distinct and adequate short list, no nulls

In [481]:
scheme['management'].value_counts()

vwc                 40507
wug                  6515
other                4338
wua                  2535
private operator     1971
parastatal           1768
water authority       904
company               685
other - school         99
trust                  78
Name: management, dtype: int64

### Payment - clear and clean, no null

In [482]:
scheme['payment'].value_counts()

never pay                25348
other                     9211
pay per bucket            8985
pay monthly               8300
pay when scheme fails     3914
pay annually              3642
Name: payment, dtype: int64

# Water  quality - no probs

In [483]:
scheme['water_quality'].value_counts()

soft                  50818
salty                  4856
other                  1876
milky                   804
coloured                490
salty abandoned         339
fluoride                200
fluoride abandoned       17
Name: water_quality, dtype: int64

# Quality_Group - same as above, so i will discard this one

In [484]:
scheme['quality_group'].value_counts()

good        50818
salty        5195
other        1876
milky         804
colored       490
fluoride      217
Name: quality_group, dtype: int64

# Quantity

In [485]:
scheme['quantity'].value_counts()

enough          33186
insufficient    15129
dry              6246
seasonal         4050
other             789
Name: quantity, dtype: int64

# Source - 

In [486]:
scheme['source'].value_counts()

spring                  17021
shallow well            16824
machine dbh             11075
river                    9612
rainwater harvesting     2295
hand dtw                  874
lake                      765
dam                       656
other                     278
Name: source, dtype: int64

# SOURCE_TYPE - will leave this out as it is the same as above w less granularity

In [487]:
scheme['source_type'].value_counts()

spring                  17021
shallow well            16824
borehole                11949
river/lake              10377
rainwater harvesting     2295
dam                       656
other                     278
Name: source_type, dtype: int64

# Source Class - looks fine

In [488]:
scheme['source_class'].value_counts()

groundwater    45794
surface        13328
other            278
Name: source_class, dtype: int64

# Waterpoint Type - looks good

In [489]:
scheme['waterpoint_type'].value_counts()

communal standpipe             28522
hand pump                      17488
other                           6380
communal standpipe multiple     6103
improved spring                  784
cattle trough                    116
dam                                7
Name: waterpoint_type, dtype: int64

In [490]:
scheme.head(2)

Unnamed: 0,id,amount_tsh,gps_height,longitude,latitude,population,construction_year,funder,installer,wpt_name,basin,region,region_code,district_code,lga,ward,scheme_management,scheme_name,permit,extraction_type,extraction_type_class,management,payment,water_quality,quality_group,quantity,source,source_type,source_class,waterpoint_type
0,69572,6000.0,1390,34.938093,-9.856322,109.0,1999.0,Roman,Roman,none,Lake Nyasa,Iringa,11,5,other,other,VWC,Roman,False,gravity,gravity,vwc,pay annually,soft,good,enough,spring,spring,groundwater,communal standpipe
1,8776,,1399,34.698766,-2.147466,280.0,2010.0,other,other,dif_other,dif_other,Mara,20,2,Serengeti,Natta,Other,,True,gravity,gravity,wug,never pay,soft,good,insufficient,rainwater harvesting,rainwater harvesting,surface,communal standpipe


# CREATING FINAL SELECTED DATAFRAME

In [491]:
final_look = scheme.drop(columns=['source_type', 'quality_group', 'extraction_type_class', 'wpt_name'], axis=1)

In [492]:
final_look.shape

(59400, 26)

In [493]:
final_look.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 59400 entries, 0 to 59399
Data columns (total 26 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 59400 non-null  int64  
 1   amount_tsh         17761 non-null  float64
 2   gps_height         59400 non-null  int64  
 3   longitude          57588 non-null  float64
 4   latitude           59400 non-null  float64
 5   population         38019 non-null  float64
 6   construction_year  38691 non-null  float64
 7   funder             55765 non-null  object 
 8   installer          55745 non-null  object 
 9   basin              59400 non-null  object 
 10  region             59400 non-null  object 
 11  region_code        59400 non-null  object 
 12  district_code      59400 non-null  object 
 13  lga                59400 non-null  object 
 14  ward               59400 non-null  object 
 15  scheme_management  55523 non-null  object 
 16  scheme_name        312

In [428]:
# scheme.info()

In [494]:
final_look.shape

(59400, 26)

In [495]:
final_look.head(2)

Unnamed: 0,id,amount_tsh,gps_height,longitude,latitude,population,construction_year,funder,installer,basin,region,region_code,district_code,lga,ward,scheme_management,scheme_name,permit,extraction_type,management,payment,water_quality,quantity,source,source_class,waterpoint_type
0,69572,6000.0,1390,34.938093,-9.856322,109.0,1999.0,Roman,Roman,Lake Nyasa,Iringa,11,5,other,other,VWC,Roman,False,gravity,vwc,pay annually,soft,enough,spring,groundwater,communal standpipe
1,8776,,1399,34.698766,-2.147466,280.0,2010.0,other,other,dif_other,Mara,20,2,Serengeti,Natta,Other,,True,gravity,wug,never pay,soft,insufficient,rainwater harvesting,surface,communal standpipe


# MERGING FEATURES WITH TARGET INTO ONE FINAL DATAFRAME OBJECT!!!!

In [497]:
feature_df = pd.merge(final_look, df1, on='id')
feature_df.shape

(59400, 27)

In [498]:
feature_df.head(2)

Unnamed: 0,id,amount_tsh,gps_height,longitude,latitude,population,construction_year,funder,installer,basin,region,region_code,district_code,lga,ward,scheme_management,scheme_name,permit,extraction_type,management,payment,water_quality,quantity,source,source_class,waterpoint_type,status_group
0,69572,6000.0,1390,34.938093,-9.856322,109.0,1999.0,Roman,Roman,Lake Nyasa,Iringa,11,5,other,other,VWC,Roman,False,gravity,vwc,pay annually,soft,enough,spring,groundwater,communal standpipe,functional
1,8776,,1399,34.698766,-2.147466,280.0,2010.0,other,other,dif_other,Mara,20,2,Serengeti,Natta,Other,,True,gravity,wug,never pay,soft,insufficient,rainwater harvesting,surface,communal standpipe,functional


### The final features have been selected and prepared for use in sklearn pipelines, and I will save this version as a new file

In [500]:
feature_df.to_csv('./Data/logistic_model_features.csv')

In [501]:
feature_df.to_csv('/Users/samalainabayeva/Desktop/Water Project CSVs/logistic_model_features.csv')

I will verify that a new file has been created, and then begin a new jupyter notebook for the modeling of the data

In [None]:
column_transformer = ColumnTransformer(
    transformers=[
        ('imputer1', SimpleImputer(strategy='mean'), ['column1']),
        ('imputer2', SimpleImputer(strategy='median'), ['column2']),
    ],
    remainder='passthrough'  # Optional: specify how to handle remaining columns
)

# Create the pipeline with the column transformer and additional steps
pipeline = Pipeline([
    ('preprocessor', column_transformer),
    ('scaler', StandardScaler()),
    # Add more steps as needed

In [None]:
subpipe_numerics = Pipeline(steps=[
    ('mean_impute', SimpleImputer(add_indicator=True, missing_values=strategy='mean', ['population'])),
    ('median_impute', SimpleImputer(add_indicator=True, strategy='mean', ['construction_date']))
    #('poly', PolynomialFeatures()),
    ('ss', StandardScaler()),
])

# strings are names of the steps, have to have a name for the step, can be called anything

sub_pipe_cat = Pipeline(steps=[
    
    ('cat_impute', SimpleImputer(strategy='constant', fill_value="Missing", add_indicator=True)),
    ('ohe', OneHotEncoder(handle_unknown='ignore', sparse=False))
], verbose=False)


# Verbose set to True is good for debugging, shows steps if they break, BUT!!!!
# It outputs a LOT more text, so probably will leave it False for now.....

In [None]:
CT = ColumnTransformer(transformers=[
    ('subpipe_numerics', subpipe_numerics, selector(dtype_include=np.number)),
    ('subpipe_cat', sub_pipe_cat, selector(dtype_include=object))
], remainder='passthrough')


# if we had DIFFERENT DATA TYPES, we need to set remainder argument
# np.number is a catchall for ANYTHING numeric
# takes a bunch of transformers and does the transformation

# args are: name, transformer, 