DS Build Week Project

*Tanzania Waterpumps prediction*

---

In [4]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
DATA_PATH = '../notebooks/waterpumps/'

# Combine the training features with their labels into a single frame.
train = pd.merge(
    pd.read_csv(f'{DATA_PATH}train_features.csv'),
    pd.read_csv(f'{DATA_PATH}train_labels.csv'),
)

# Load the unlabeled test features and the submission template.
test = pd.read_csv(f'{DATA_PATH}test_features.csv')
sample_submission = pd.read_csv(f'{DATA_PATH}sample_submission.csv')

# Hold out 20% of the labeled rows for validation, stratified on the target
# so both splits keep the same class balance. The seed makes the split
# reproducible.
train, val = train_test_split(
    train,
    train_size=0.80,
    test_size=0.20,
    stratify=train['status_group'],
    random_state=42,
)


def wrangle(X):
    """Wrangle train, validate, and test sets in the same way.

    Parameters
    ----------
    X : pandas.DataFrame
        Raw waterpump frame. Must contain the columns ``latitude``,
        ``longitude``, ``construction_year``, ``gps_height``, ``population``,
        ``quantity_group``, ``payment_type``, ``recorded_by``, ``id`` and
        ``date_recorded``. Any other columns are passed through untouched.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) in which:

        * zeros in the location/year/height/population columns are replaced
          by NaN, each with a boolean ``<col>_MISSING`` indicator column;
        * ``quantity_group``, ``payment_type``, ``recorded_by`` and ``id``
          are removed;
        * ``date_recorded`` is replaced by ``year_recorded``,
          ``month_recorded`` and ``day_recorded``;
        * ``years`` (recording year minus construction year) and
          ``years_MISSING`` are added.

    Raises
    ------
    KeyError
        If any of the required columns is absent.
    """
    # Work on a copy so the caller's frame is left untouched (this also
    # prevents SettingWithCopyWarning when X is a slice of another frame).
    X = X.copy()

    # About 3% of the time, latitude has small values near zero,
    # outside Tanzania, so we'll treat these values like zero.
    X['latitude'] = X['latitude'].replace(-2e-08, 0)

    # When columns have zeros and shouldn't, they are like null values.
    # Replace the zeros with nulls (to be imputed later) and record a
    # "missing indicator" column, because the fact that a value is missing
    # may itself be a predictive signal.
    cols_with_zeros = ['longitude', 'latitude', 'construction_year',
                       'gps_height', 'population']
    for col in cols_with_zeros:
        X[col] = X[col].replace(0, np.nan)
        X[col + '_MISSING'] = X[col].isnull()

    # Drop duplicate columns, plus recorded_by (never varies) and
    # id (always varies, random).
    duplicates = ['quantity_group', 'payment_type']
    unusable_variance = ['recorded_by', 'id']
    X = X.drop(columns=duplicates + unusable_variance)

    # Convert date_recorded to datetime. The deprecated
    # `infer_datetime_format` argument is intentionally not passed:
    # since pandas 2.0 a consistent format is inferred by default and
    # passing the argument only emits a deprecation warning.
    X['date_recorded'] = pd.to_datetime(X['date_recorded'])

    # Extract components from date_recorded, then drop the original column
    X['year_recorded'] = X['date_recorded'].dt.year
    X['month_recorded'] = X['date_recorded'].dt.month
    X['day_recorded'] = X['date_recorded'].dt.day
    X = X.drop(columns='date_recorded')

    # Engineer feature: how many years from construction_year to date_recorded.
    # construction_year is NaN where it was originally zero, so `years`
    # inherits those NaNs and gets its own missing indicator.
    X['years'] = X['year_recorded'] - X['construction_year']
    X['years_MISSING'] = X['years'].isnull()

    # return the wrangled dataframe
    return X

# Apply the identical wrangling to all three splits, in order.
train, val, test = (wrangle(split) for split in (train, val, test))

In [5]:
# Separate each labeled split into a features matrix (everything except the
# target column) and a target vector.
target = 'status_group'
X_train, y_train = train.drop(columns=target), train[target]
X_val, y_val = val.drop(columns=target), val[target]
# The test set carries no labels, so it is used as the features matrix as-is.
X_test = test

In [6]:
import category_encoders as ce
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline

# Baseline model: ordinal-encode the categorical columns, fill remaining
# missing values with the column median, then fit a random forest.
# The pipeline is fit on the training split only.
pipeline = make_pipeline(
    ce.OrdinalEncoder(),
    SimpleImputer(strategy='median'),
    RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1),
).fit(X_train, y_train)

# Report accuracy on the held-out validation split.
print('Validation Accuracy', pipeline.score(X_val, y_val))

Validation Accuracy 0.8135521885521886


### Use permutation importance to find the top 5 most important features

In [7]:
from sklearn.inspection import permutation_importance
# Measure how much the validation score drops when each input column is
# shuffled. The whole pipeline is passed in, so importances are reported
# per original column of X_val.
r = permutation_importance(
    estimator=pipeline,
    X=X_val,
    y=y_val,
    random_state=42,
)

In [8]:
# Pair each validation column with its mean permutation importance and rank
# the columns from most to least important.
df_features = (
    pd.DataFrame({
        'columns': X_val.columns,
        'feature_importance': r.importances_mean,
    })
    .sort_values(by='feature_importance', ascending=False)
)

In [9]:
# Keep only the five highest-ranked features.
df_features_filtered = df_features.head(5)

In [10]:
# Names of the selected features as a plain Python list, in ranked order.
feature_list = list(df_features_filtered['columns'])

In [11]:
# Display the selected feature names (the cell's last expression is rendered).
feature_list 

['quantity',
 'amount_tsh',
 'waterpoint_type',
 'extraction_type_class',
 'longitude']

### Retrain the model using only these top 5 features

In [10]:
# Restrict both splits to the top-ranked features chosen above.
X_train_feature_filtered = X_train[feature_list]
X_val_feature_filtered = X_val[feature_list]

# Same preprocessing and model as the baseline pipeline. n_estimators is
# spelled out so the forest size matches the baseline explicitly instead of
# depending on the installed scikit-learn default.
pipeline_filtered = make_pipeline(
    ce.OrdinalEncoder(),
    SimpleImputer(strategy='median'),
    RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
)

# Fit on train, score on val
pipeline_filtered.fit(X_train_feature_filtered, y_train)
print('Validation Accuracy', pipeline_filtered.score(X_val_feature_filtered, y_val))

Validation Accuracy 0.7134680134680135


### Save your model in .joblib

In [11]:
from joblib import dump
# Persist the fitted reduced-feature pipeline (encoder, imputer and model
# together) to a compressed file in the current working directory.
dump(value=pipeline_filtered, filename='pipeline.joblib', compress=True)

['pipeline.joblib']

### Check the summary statistics of the continuous variables

In [13]:
# Names of the selected columns that are not object (string) dtype.
con_cols = list(X_train_feature_filtered.select_dtypes(exclude='object').columns)

In [14]:
# Print count / mean / spread / quartiles for each non-object column.
for column_name in con_cols:
    column_summary = X_train_feature_filtered[column_name].describe()
    print(column_summary)

count     47520.000000
mean        321.925261
std        3197.240487
min           0.000000
25%           0.000000
50%           0.000000
75%          25.000000
max      350000.000000
Name: amount_tsh, dtype: float64
count    46078.000000
mean        35.149033
std          2.604241
min         29.607122
25%         33.284679
50%         35.008578
75%         37.223501
max         40.344301
Name: longitude, dtype: float64


### Check the unique values in each of the categorical variables

In [15]:
# Object-dtype (categorical/string) columns among the selected features.
cat_cols = X_train_feature_filtered.select_dtypes(include=['object']).columns

In [16]:
# For each categorical column, print its name followed by the distinct
# values observed in the training split.
for column_name in cat_cols:
    distinct_values = X_train_feature_filtered[column_name].unique().tolist()
    print(column_name, distinct_values, sep='\n')

quantity
['insufficient', 'enough', 'dry', 'seasonal', 'unknown']
waterpoint_type
['communal standpipe', 'hand pump', 'other', 'communal standpipe multiple', 'improved spring', 'cattle trough', 'dam']
extraction_type_class
['gravity', 'handpump', 'other', 'motorpump', 'submersible', 'rope pump', 'wind-powered']
