# Pump It Up Challenge - Optimisation

In [82]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import re
import datetime as dt
from scripts import pumpitup

from scipy import stats

from sklearn.neighbors import KNeighborsClassifier
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import KFold

sns.set_style("white")
sns.set_context("talk")

%matplotlib inline

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
# Silence every Python warning for the rest of the session.
# NOTE(review): this is a blanket filter, so deprecation warnings from the
# libraries imported above will not be shown either.
import warnings
warnings.filterwarnings('ignore')

In [4]:
# Show up to 50 columns when a DataFrame is rendered.
pd.set_option('display.max_columns', 50)

# Training Data

## Data

In [5]:
train_data = pd.read_csv('data/training_set.csv')

## Labels

In [6]:
train_labels = pd.read_csv('data/training_labels.csv')

In [7]:
train_labels.head()

Unnamed: 0,id,status_group
0,69572,functional
1,8776,functional
2,34310,functional
3,67743,non functional
4,19728,functional


## Cleaning

In [8]:
# Run the project's cleaning helper; it returns the cleaned frame plus a second
# value kept as `modifiers`.
# NOTE(review): `train_data` is rebound to the cleaned result, so re-running this
# cell without reloading the CSV feeds already-cleaned data back into cleanitup.
train_data, modifiers = pumpitup.cleanitup(train_data)

In [9]:
# A single encoder instance is handed to the encoding helper for every column.
le = preprocessing.LabelEncoder()

columns = train_data.columns
bin_lengths = {}

# Replace each string-valued (object dtype) column with the frame returned as the
# second element of pumpitup.binary_encode, recording pumpitup.binary_count per column.
for column in columns:
    series = train_data[column]
    if series.dtype != 'object':
        # Non-string columns are left exactly as they are.
        continue

    bin_lengths[column] = pumpitup.binary_count(series)
    encoded = pumpitup.binary_encode(series, le)[1]
    # Append the encoded columns, then remove the original column.
    train_data = pd.concat([train_data, encoded], axis=1).drop(column, axis=1)

In [10]:
columns = train_data.columns
# NOTE(review): this resets the counts collected by the previous cell — confirm intended.
bin_lengths = {}

# Overwrite every remaining string-valued (object dtype) column with its integer
# label codes. The shared encoder is refitted on each column in turn.
for column in columns:
    series = train_data[column]
    if series.dtype != 'object':
        continue

    encoded = le.fit_transform(series)
    train_data[column] = encoded

In [11]:
train_data.head()

Unnamed: 0,funder,gps_height,installer,longitude,latitude,wpt_name,basin,region,lga,population,public_meeting,permit,construction_year,extraction_type,extraction_type_class,management,management_group,payment_type,water_quality,quantity,source,source_class,waterpoint_type,year_recorded,month_recorded,operation_years
0,19,1390,15,34.938093,-9.856322,4,1,3,0,109,1,0,1999,1,0,7,4,0,6,1,7,0,1,2011,3,12
1,19,1399,15,34.698766,-2.147466,1,4,9,0,280,1,1,2010,1,0,11,4,2,6,2,4,1,1,2013,3,3
2,19,686,15,37.460664,-3.821329,2,5,8,0,250,1,1,2009,1,0,7,4,5,6,1,0,1,2,2013,2,4
3,15,263,15,38.486161,-11.155298,1,7,12,0,58,1,1,1986,11,5,7,4,2,6,0,3,0,2,2013,1,27
4,19,1328,15,31.130847,-1.825359,6,4,4,0,316,1,1,2003,1,0,1,1,2,6,3,4,1,1,2011,7,8


In [12]:
# NOTE(review): this reads the same file as the earlier "Labels" cell, so
# `train_labels` is simply reloaded here.
train_labels = pd.read_csv('data/training_labels.csv')

In [13]:
train_labels.head()

Unnamed: 0,id,status_group
0,69572,functional
1,8776,functional
2,34310,functional
3,67743,non functional
4,19728,functional


In [57]:
labels = train_labels['status_group']

# Optimisation Function

First, we're going to write a helper function to run the optimisation process for us. This needs to pick the best combination of parameters and return information about all of the cross validation attempts.

Before running the full search, we can try it out with some toy parameters.

We'll try two different values for the maximum number of features considered at each split.

In [76]:
max_features_opt = [7, 8]

We'll leave the maximum tree depth unrestricted.

In [73]:
max_depth_opt = [None]

And we'll pick one value for the minimum number of samples required to make a split.

In [78]:
min_samples_split_opt = [8]

In [65]:
def optimise_forest(X, y, max_features_opt, max_depth_opt, min_samples_split_opt, n_estimators=100):
    """Try every hyper-parameter combination and return the best-scoring run.

    Each combination of ``max_features_opt`` x ``max_depth_opt`` x
    ``min_samples_split_opt`` is passed to ``pumpitup.run_forest``; the
    predictions it returns are scored with ``pumpitup.classification_rate``.

    Parameters
    ----------
    X : passed unchanged to ``pumpitup.run_forest`` as the features.
    y : passed unchanged to ``pumpitup.run_forest`` as the labels. For scoring,
        if ``y`` exposes a ``status_group`` column (as the labels frame in this
        notebook does) that column is used as the ground truth; otherwise
        ``y`` itself is used.
    max_features_opt, max_depth_opt, min_samples_split_opt : iterables of the
        candidate values for each hyper-parameter.
    n_estimators : forwarded to ``pumpitup.run_forest`` (default 100).

    Returns
    -------
    tuple
        ``(best_predictions, best_clf, best_parameters, best_rate, rates)``
        where ``best_parameters`` is a ``(max_features, max_depth,
        min_samples_split)`` tuple and ``rates`` maps every tried tuple to its
        classification rate. On ties the earliest combination wins.

    Raises
    ------
    ValueError
        If any of the option lists is empty, so there is nothing to try.
    """
    parameters_opt = [(a, b, c) for a in max_features_opt for b in max_depth_opt for c in min_samples_split_opt]

    if not parameters_opt:
        raise ValueError("optimise_forest needs at least one value in each option list")

    # Score against the labels that were passed in rather than a notebook global.
    # A frame carrying a 'status_group' column is reduced to that column so the
    # existing call that passes the whole labels frame keeps working.
    if hasattr(y, 'columns') and 'status_group' in y.columns:
        y_true = y['status_group']
    else:
        y_true = y

    # None means "nothing scored yet", so the first combination is always kept,
    # even when its rate is 0.
    best_rate = None
    best_parameters = None
    best_predictions = None
    best_clf = None
    rates = {}

    for parameters in parameters_opt:
        max_features, max_depth, min_samples_split = parameters

        predictions, clf = pumpitup.run_forest(X, y, n_estimators=n_estimators, max_features=max_features,
                                               max_depth=max_depth, min_samples_split=min_samples_split)

        rate_new = pumpitup.classification_rate(y_true, predictions)

        rates[parameters] = rate_new

        if best_rate is None or rate_new > best_rate:
            best_rate = rate_new
            best_parameters = parameters
            best_predictions = predictions
            best_clf = clf

    return best_predictions, best_clf, best_parameters, best_rate, rates

In [79]:
# Smoke-test the helper on the toy grid with a small forest (n_estimators=10).
# NOTE(review): the whole `train_labels` frame, not the `labels` series, is passed
# as y here — confirm that pumpitup.run_forest expects that.
optimise_forest(train_data, train_labels, max_features_opt, max_depth_opt, min_samples_split_opt, n_estimators=10)

(array(['functional', 'functional', 'functional', ..., 'functional',
        'functional', 'functional'], dtype=object),
 RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
             max_depth=None, max_features=7, max_leaf_nodes=None,
             min_samples_leaf=1, min_samples_split=8,
             min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=3,
             oob_score=False, random_state=None, verbose=0,
             warm_start=False),
 (7, None, 8),
 0.80136363636363639,
 {(7, None, 8): 0.80136363636363639, (8, None, 8): 0.80069023569023567})

Seems to work pretty well!

## Optimisation Parameters

We're going to try setting the maximum number of features used at each split from 5 (just below the square root of the number of features) to 11 (around 40% of them).

In [31]:
max_features_opt = [5, 6, 7, 8, 9, 10, 11]

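We'll try different depths from 1 (unrealistic) up to None (the maximum possible). Restricting the depth may prevent overfitting. (The list below only contains `None`, i.e. unrestricted depth; add further values to include them in the search.)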

In [81]:
max_depth_opt = [None]

Finally, we will try some different split requirements.

In [80]:
min_samples_split_opt = [2, 4, 8, 16, 32]

We're also going to try with and without SVD, and with and without weighting. In total we'll have over a thousand combinations to try.

**I ran this optimisation on a virtual machine instance on Microsoft Azure.**