# feature engineering

In [1]:
# imports
import pandas as pd
import numpy as np

In [5]:
# Read the three raw CSV files, using each file's 'id' column as the index.
X_train, X_test, y_train = (
    pd.read_csv(csv_path, index_col='id')
    for csv_path in (
        'data/Training_set_values.csv',
        'data/Test_set_values.csv',
        'data/Training_set_labels.csv',
    )
)

In [6]:
# Join the training features with their labels on the shared 'id' key.
df = pd.merge(X_train, y_train, on='id')

In [7]:
# Show column names, non-null counts and dtypes of the merged training frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 59400 entries, 69572 to 26348
Data columns (total 40 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   amount_tsh             59400 non-null  float64
 1   date_recorded          59400 non-null  object 
 2   funder                 55765 non-null  object 
 3   gps_height             59400 non-null  int64  
 4   installer              55745 non-null  object 
 5   longitude              59400 non-null  float64
 6   latitude               59400 non-null  float64
 7   wpt_name               59400 non-null  object 
 8   num_private            59400 non-null  int64  
 9   basin                  59400 non-null  object 
 10  subvillage             59029 non-null  object 
 11  region                 59400 non-null  object 
 12  region_code            59400 non-null  int64  
 13  district_code          59400 non-null  int64  
 14  lga                    59400 non-null  object 
 15

### converting some categorical features represented as strings in our initial dataset into integer codes (one code per category), so that we can build additional features from them.

In [8]:
# Integer code for every water_quality label; a label's code is its position
# in the list below.
tern_water_quality = {
    label: code
    for code, label in enumerate([
        'soft',
        'salty',
        'unknown',
        'milky',
        'coloured',
        'salty abandoned',
        'fluoride',
        'fluoride abandoned',
    ])
}

# Integer code for every region label; a label's code is its position in the
# list below.
tern_region = {
    label: code
    for code, label in enumerate([
        'Iringa',
        'Shinyanga',
        'Mbeya',
        'Kilimanjaro',
        'Morogoro',
        'Arusha',
        'Kagera',
        'Mwanza',
        'Kigoma',
        'Ruvuma',
        'Pwani',
        'Tanga',
        'Dodoma',
        'Singida',
        'Mara',
        'Tabora',
        'Rukwa',
        'Mtwara',
        'Manyara',
        'Lindi',
        'Dar es Salaam',
    ])
}

# Integer code for every extraction_type label; a label's code is its position
# in the list below.
tern_extraction_type = {
    label: code
    for code, label in enumerate([
        'gravity',
        'nira/tanira',
        'other',
        'submersible',
        'swn 80',
        'mono',
        'india mark ii',
        'afridev',
        'ksb',
        'other - rope pump',
        'other - swn 81',
        'windmill',
        'india mark iii',
        'cemo',
        'other - play pump',
        'walimi',
        'climax',
        'other - mkulima/shinyanga',
    ])
}

# Integer code for every waterpoint_type label; a label's code is its position
# in the list below.
tern_waterpoint_type = {
    label: code
    for code, label in enumerate([
        'communal standpipe',
        'hand pump',
        'other',
        'communal standpipe multiple',
        'improved spring',
        'cattle trough',
        'dam',
    ])
}

# Integer code for every source label; a label's code is its position in the
# list below.
tern_source = {
    label: code
    for code, label in enumerate([
        'spring',
        'shallow well',
        'machine dbh',
        'river',
        'rainwater harvesting',
        'hand dtw',
        'lake',
        'dam',
        'other',
        'unknown',
    ])
}

# Integer code for every scheme_management label; a label's code is its
# position in the list below. Note that 'None' here is the literal string.
tern_scheme_management = {
    label: code
    for code, label in enumerate([
        'VWC',
        'WUG',
        'Water authority',
        'WUA',
        'Water Board',
        'Parastatal',
        'Private operator',
        'Company',
        'Other',
        'SWC',
        'Trust',
        'None',
    ])
}

### applying the mappings above to the dataframe columns, so that we can create some additional features as bivariate combinations which could be impactful for our modelling process

In [9]:
# Swap every water_quality label for its integer code; a label that is not a
# key of tern_water_quality raises KeyError.
df['water_quality'] = df['water_quality'].apply(lambda label: tern_water_quality[label])

In [10]:
# Swap every region label for its integer code; a label that is not a key of
# tern_region raises KeyError.
df['region'] = df['region'].apply(lambda label: tern_region[label])

In [11]:
# Swap every extraction_type label for its integer code; a label that is not a
# key of tern_extraction_type raises KeyError.
df['extraction_type'] = df['extraction_type'].apply(lambda label: tern_extraction_type[label])

In [12]:
# Swap every waterpoint_type label for its integer code; a label that is not a
# key of tern_waterpoint_type raises KeyError.
df['waterpoint_type'] = df['waterpoint_type'].apply(lambda label: tern_waterpoint_type[label])

In [13]:
# Swap every source label for its integer code; a label that is not a key of
# tern_source raises KeyError.
df['source'] = df['source'].apply(lambda label: tern_source[label])

### creating the new features (products of the encoded water quality with other columns) and adding them to our dataframe

In [14]:
# Interaction features: the water_quality code multiplied element-wise by each
# partner column. Keys are the new column names, values the partner columns.
# NOTE(review): the encoded operands are arbitrary integer codes, so different
# category pairs can share a product (code 0 always gives 0) - confirm that
# this is the intended encoding.
interaction_partners = {
    'water_quality_by_region': 'region',
    'water_quality_by_extraction_type': 'extraction_type',
    'water_quality_by_waterpoint_type': 'waterpoint_type',
    'water_quality_by_source': 'source',
    'water_quality_by_gps_height': 'gps_height',
}
for feature_name, partner in interaction_partners.items():
    df[feature_name] = df['water_quality'] * df[partner]

In [15]:
# Re-inspect the frame after the water_quality_by_* columns were added.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 59400 entries, 69572 to 26348
Data columns (total 45 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   amount_tsh                        59400 non-null  float64
 1   date_recorded                     59400 non-null  object 
 2   funder                            55765 non-null  object 
 3   gps_height                        59400 non-null  int64  
 4   installer                         55745 non-null  object 
 5   longitude                         59400 non-null  float64
 6   latitude                          59400 non-null  float64
 7   wpt_name                          59400 non-null  object 
 8   num_private                       59400 non-null  int64  
 9   basin                             59400 non-null  object 
 10  subvillage                        59029 non-null  object 
 11  region                            59400 non-null  int64  
 12  

In [16]:
# Columns removed before modelling. The last six entries are the raw inputs of
# the water_quality_by_* interaction features created earlier in the notebook.
cols_to_drop = [
    'amount_tsh',
    'scheme_name',
    'date_recorded',
    'wpt_name',
    'subvillage',
    'lga',
    'ward',
    'recorded_by',
    'quantity_group',
    'quality_group',
    'payment_type',
    'funder',
    'extraction_type_group',
    'extraction_type_class',
    'management_group',
    'source_type',
    'source_class',
    'waterpoint_type_group',
    'water_quality',
    'region',
    'extraction_type',
    'waterpoint_type',
    'source',
    'gps_height',
]

# Drop the listed columns, then replace every remaining NaN with the string
# 'missing'. fillna is deliberately called without axis=1: a scalar fill gives
# the same result along either axis, and some pandas releases route a scalar
# fill with axis=1 through a double transpose, which is slow and can turn the
# numeric columns into object dtype.
df = df.drop(columns=cols_to_drop).fillna('missing')

In [17]:
def engineer_features(frame):
    """Apply the notebook's feature-engineering steps to a raw feature frame.

    The steps mirror the ones applied to ``df`` above: integer-encode the five
    categorical columns with the ``tern_*`` mappings, add the
    ``water_quality_by_*`` product columns, drop ``cols_to_drop`` and replace
    remaining NaN values with the string 'missing'.

    Parameters
    ----------
    frame : pandas.DataFrame
        Raw feature frame with the same columns as the training values file.

    Returns
    -------
    pandas.DataFrame
        A new, engineered frame; ``frame`` itself is not modified.

    Raises
    ------
    KeyError
        If an encoded column holds a value (or NaN) that its mapping lacks.
    """
    out = frame.copy()
    code_maps = {
        'water_quality': tern_water_quality,
        'region': tern_region,
        'extraction_type': tern_extraction_type,
        'waterpoint_type': tern_waterpoint_type,
        'source': tern_source,
    }
    for col, codes in code_maps.items():
        encoded = out[col].map(codes)
        # Series.map yields NaN for unmapped labels; fail loudly instead of
        # letting NaN leak into the product features.
        if encoded.isna().any():
            unseen = sorted(set(out.loc[encoded.isna(), col].astype(str)))
            raise KeyError(f'unmapped values in {col!r}: {unseen}')
        out[col] = encoded.astype('int64')
    for partner in ('region', 'extraction_type', 'waterpoint_type', 'source', 'gps_height'):
        out['water_quality_by_' + partner] = out['water_quality'] * out[partner]
    return out.drop(columns=cols_to_drop).fillna('missing')


# Split the engineered training frame back into features and labels. The label
# columns are whatever the labels file contributed to the merge.
label_cols = list(y_train.columns)
X_train_eng = df.drop(columns=label_cols)
y_train_eng = df[label_cols]

# The test features must go through the same pipeline so that both files
# share one schema.
X_test_eng = engineer_features(X_test)
assert list(X_test_eng.columns) == list(X_train_eng.columns), 'train/test columns differ'

# Persist the engineered data. No y_test file is written: this notebook never
# loads labels for the test set, so there is nothing to save.
X_train_eng.to_csv('data/X_train_eng.csv')
X_test_eng.to_csv('data/X_test_eng.csv')
y_train_eng.to_csv('data/y_train_eng.csv')

In [18]:
# Final check of the columns that remain after dropping and NaN filling.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 59400 entries, 69572 to 26348
Data columns (total 21 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   installer                         59400 non-null  object 
 1   longitude                         59400 non-null  float64
 2   latitude                          59400 non-null  float64
 3   num_private                       59400 non-null  int64  
 4   basin                             59400 non-null  object 
 5   region_code                       59400 non-null  int64  
 6   district_code                     59400 non-null  int64  
 7   population                        59400 non-null  int64  
 8   public_meeting                    59400 non-null  object 
 9   scheme_management                 59400 non-null  object 
 10  permit                            59400 non-null  object 
 11  construction_year                 59400 non-null  int64  
 12  