In [8]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np

### Load Data

In [9]:
df = pd.read_csv("project/data/train/features.csv")
labels = pd.read_csv("project/data/train/labels.csv")
df = pd.merge(labels, df, on='id')
pd.set_option('display.max_columns', None)

print("Numerical columns: ", df.select_dtypes(include=np.number).columns.tolist())

Numerical columns:  ['id', 'amount_tsh', 'gps_height', 'longitude', 'latitude', 'num_private', 'region_code', 'district_code', 'population', 'construction_year']


In [10]:
df.head(5)

Unnamed: 0,id,status_group,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,basin,subvillage,region,region_code,district_code,lga,ward,population,public_meeting,recorded_by,scheme_management,scheme_name,permit,construction_year,extraction_type,extraction_type_group,extraction_type_class,management,management_group,payment,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group
0,69572,functional,6000.0,2011-03-14,Roman,1390,Roman,34.938093,-9.856322,none,0,Lake Nyasa,Mnyusi B,Iringa,11,5,Ludewa,Mundindi,109,True,GeoData Consultants Ltd,VWC,Roman,False,1999,gravity,gravity,gravity,vwc,user-group,pay annually,annually,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe
1,8776,functional,0.0,2013-03-06,Grumeti,1399,GRUMETI,34.698766,-2.147466,Zahanati,0,Lake Victoria,Nyamara,Mara,20,2,Serengeti,Natta,280,,GeoData Consultants Ltd,Other,,True,2010,gravity,gravity,gravity,wug,user-group,never pay,never pay,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe
2,34310,functional,25.0,2013-02-25,Lottery Club,686,World vision,37.460664,-3.821329,Kwa Mahundi,0,Pangani,Majengo,Manyara,21,4,Simanjiro,Ngorika,250,True,GeoData Consultants Ltd,VWC,Nyumba ya mungu pipe scheme,True,2009,gravity,gravity,gravity,vwc,user-group,pay per bucket,per bucket,soft,good,enough,enough,dam,dam,surface,communal standpipe multiple,communal standpipe
3,67743,non functional,0.0,2013-01-28,Unicef,263,UNICEF,38.486161,-11.155298,Zahanati Ya Nanyumbu,0,Ruvuma / Southern Coast,Mahakamani,Mtwara,90,63,Nanyumbu,Nanyumbu,58,True,GeoData Consultants Ltd,VWC,,True,1986,submersible,submersible,submersible,vwc,user-group,never pay,never pay,soft,good,dry,dry,machine dbh,borehole,groundwater,communal standpipe multiple,communal standpipe
4,19728,functional,0.0,2011-07-13,Action In A,0,Artisan,31.130847,-1.825359,Shuleni,0,Lake Victoria,Kyanyamisa,Kagera,18,1,Karagwe,Nyakasimbi,0,True,GeoData Consultants Ltd,,,True,0,gravity,gravity,gravity,other,other,never pay,never pay,soft,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe


### Columns to Drop
amount_tsh: *70% missing values*



### Construction Year Imputation

35% of construction years are missing. https://github.com/NicolasGrimault/Project-IA replaces each empty row by a random value between -σ and σ (sigma for the standard deviation). This is implemented below.

In [11]:
df['construction_year'].replace(0, np.nan, inplace=True)

mean = df['construction_year'].mean(skipna=True)
std = df['construction_year'].std(skipna=True)
print(mean, std)

rand_values = np.random.normal(mean, std, df['construction_year'].isnull().sum())

df['construction_year'][df['construction_year'].isnull()] = rand_values

print(df["construction_year"])
print(df['construction_year'].isnull().sum())

1996.8146855857951 12.472045035085223
0        1999.000000
1        2010.000000
2        2009.000000
3        1986.000000
4        1977.498105
            ...     
59395    1999.000000
59396    1996.000000
59397    1996.052143
59398    1978.692236
59399    2002.000000
Name: construction_year, Length: 59400, dtype: float64
0


### GPS Height Imputation

Replace values randomly within 1std of the mean, following a normal distribution.

This probably needs verifying/improving but it'll do for now.

In [12]:
df['gps_height'].replace(0, np.nan, inplace=True)

mean = df['gps_height'].mean(skipna=True)
std = df['gps_height'].std(skipna=True)
print(mean, std)

rand_values = np.random.normal(mean, std, df['gps_height'].isnull().sum())

df['gps_height'][df['gps_height'].isnull()] = rand_values

print(df["gps_height"])
print(df['gps_height'].isnull().sum())

1018.8608387659771 612.5660915658171
0        1390.000000
1        1399.000000
2         686.000000
3         263.000000
4        1559.438315
            ...     
59395    1210.000000
59396    1212.000000
59397     108.740606
59398    2979.894777
59399     191.000000
Name: gps_height, Length: 59400, dtype: float64
0


### Latitude and Longitude Imputation
0 values are replaced by subvillage, ward, lga, or region's mean logitude and latitude values for each column. 

Taken from https://github.com/BrendaLoznik/waterpumps/blob/main/2.%20Data%20cleaning%20%26%20Feature%20engineering.ipynb

In [13]:
df['longitude'].replace(0, np.nan, inplace=True)
#create mean longitude on the lowest granularity level (subvillage)
means_longitude_subvillage = df.groupby(['region', 'lga', 'ward', 'subvillage'])['longitude'].mean().reset_index()
means_longitude_subvillage = means_longitude_subvillage.rename(columns={"longitude": "longitude_imputed_subvillage"})

#ward level
means_longitude_ward = df.groupby(['region', 'lga', 'ward',])['longitude'].mean().reset_index()
means_longitude_ward = means_longitude_ward.rename(columns={"longitude": "longitude_imputed_ward"})

#lga level
means_longitude_lga = df.groupby(['region', 'lga'])['longitude'].mean().reset_index()
means_longitude_lga = means_longitude_lga .rename(columns={"longitude": "longitude_imputed_lga"})

#region level
means_longitude_region = df.groupby(['region'])['longitude'].mean().reset_index()
means_longitude_region = means_longitude_region.rename(columns={"longitude": "longitude_imputed_region"})
means_longitude_region.head()

Unnamed: 0,region,longitude_imputed_region
0,Arusha,36.552713
1,Dar es Salaam,39.215799
2,Dodoma,36.044171
3,Iringa,34.895989
4,Kagera,31.233262


In [14]:
#merge the aggregated dataframes as new columns to the original df
raw= df.merge(means_altitude_subvillage, how = 'left', on = ['region', 'lga', 'ward', 'subvillage'])
raw = raw.merge(means_altitude_ward, how = 'left', on = ['region', 'lga', 'ward'])
raw = raw.merge(means_altitude_lga, how = 'left', on = ['region', 'lga'])
raw = raw.merge(means_altitude_region, how = 'left', on = ['region'])
raw = raw.merge(means_altitude_basin, how = 'left', on = ['basin'])

#create final imputed longitude column
raw['imputed_gps_height'] = np.where(raw['gps_height'].isna(), raw['gps_height_imputed_subvillage'], raw['gps_height']) #if longitude is missing, impute it by the mean of the subvillage
raw['imputed_gps_height'] = np.where(raw['imputed_gps_height'].isna(), raw['gps_height_imputed_ward'], raw['imputed_gps_height']) #if subvillage mean is missing, impute it by the ward
raw['imputed_gps_height'] = np.where(raw['imputed_gps_height'].isna(), raw['gps_height_imputed_lga'], raw['imputed_gps_height'])
raw['imputed_gps_height'] = np.where(raw['imputed_gps_height'].isna(), raw['gps_height_imputed_region'], raw['imputed_gps_height'])
raw['imputed_gps_height'] = np.where(raw['imputed_gps_height'].isna(), raw['gps_height_imputed_basin'], raw['imputed_gps_height'])

NameError: name 'means_altitude_subvillage' is not defined

In [None]:
df['latitude'].where(df["latitude"] <= -0.5, np.nan, inplace=True)
#create mean latitude on the lowest granularity level (subvillage)
means_latitude_subvillage = df.groupby(['region', 'lga', 'ward', 'subvillage'])['latitude'].mean().reset_index()
means_latitude_subvillage = means_latitude_subvillage.rename(columns={"latitude": "latitude_imputed_subvillage"})

#ward level
means_latitude_ward = df.groupby(['region', 'lga', 'ward',])['latitude'].mean().reset_index()
means_latitude_ward = means_latitude_ward.rename(columns={"latitude": "latitude_imputed_ward"})

#lga level
means_latitude_lga = df.groupby(['region', 'lga'])['latitude'].mean().reset_index()
means_latitude_lga = means_latitude_lga .rename(columns={"latitude": "latitude_imputed_lga"})

#region level
means_latitude_region = df.groupby(['region'])['latitude'].mean().reset_index()
means_latitude_region = means_latitude_region.rename(columns={"latitude": "latitude_imputed_region"})
means_latitude_region.head()

Unnamed: 0,region,latitude_imputed_region
0,Arusha,-3.246455
1,Dar es Salaam,-6.909677
2,Dodoma,-5.928734
3,Iringa,-8.9077
4,Kagera,-1.961466
