# Pump It Up Challenge - Predictions

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import re
import datetime as dt
from scripts import pumpitup

from scipy import stats

from sklearn.neighbors import KNeighborsClassifier
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import KFold

sns.set_style("white")
sns.set_context("talk")

%matplotlib inline

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import warnings
warnings.filterwarnings('ignore')

In [4]:
pd.options.display.max_columns = 50

# Training Data

## Data

In [5]:
train_data = pd.read_csv('data/training_set.csv')

## Labels

In [6]:
train_labels = pd.read_csv('data/training_labels.csv')

In [7]:
train_labels.head()

Unnamed: 0,id,status_group
0,69572,functional
1,8776,functional
2,34310,functional
3,67743,non functional
4,19728,functional


## Cleaning

In [8]:
train_data, modifiers = pumpitup.cleanitup(train_data)

In [9]:
le = preprocessing.LabelEncoder()

columns = train_data.columns
bin_lengths = {}

for column in columns:
    series = train_data[column]
    if series.dtype == 'object':
        bin_lengths[column] = pumpitup.binary_count(series)
        encoded = pumpitup.binary_encode(series, le)[1]
        train_data = pd.concat([train_data, encoded], axis=1)
        train_data.drop(column, axis=1, inplace=True)

In [10]:
columns = train_data.columns
bin_lengths = {}

for column in columns:
    series = train_data[column]
    if series.dtype == 'object':
        #bin_lengths[column] = pumpitup.binary_count(series)
        encoded = le.fit_transform(series)
        #train_data = pd.concat([train_data, encoded], axis=1)
        train_data[column] = encoded

In [11]:
train_data.head()

Unnamed: 0,funder,gps_height,installer,longitude,latitude,wpt_name,basin,region,lga,population,public_meeting,permit,construction_year,extraction_type,extraction_type_class,management,management_group,payment_type,water_quality,quantity,source,source_class,waterpoint_type,year_recorded,month_recorded,operation_years
0,19,1390,15,34.938093,-9.856322,4,1,3,0,109,1,0,1999,1,0,7,4,0,6,1,7,0,1,2011,3,12
1,19,1399,15,34.698766,-2.147466,1,4,9,0,280,1,1,2010,1,0,11,4,2,6,2,4,1,1,2013,3,3
2,19,686,15,37.460664,-3.821329,2,5,8,0,250,1,1,2009,1,0,7,4,5,6,1,0,1,2,2013,2,4
3,15,263,15,38.486161,-11.155298,1,7,12,0,58,1,1,1986,11,5,7,4,2,6,0,3,0,2,2013,1,27
4,19,1328,15,31.130847,-1.825359,6,4,4,0,316,1,1,2003,1,0,1,1,2,6,3,4,1,1,2011,7,8


In [12]:
train_labels = pd.read_csv('data/training_labels.csv')

In [13]:
train_labels.head()

Unnamed: 0,id,status_group
0,69572,functional
1,8776,functional
2,34310,functional
3,67743,non functional
4,19728,functional


In [28]:
predictions, clf = pumpitup.run_forest(train_data, train_labels)

In [29]:
pumpitup.classification_rate(train_labels['status_group'], predictions)

0.78863636363636369

Train actual classifier:

In [30]:
clf = RandomForestClassifier(n_estimators=3000, n_jobs=3, max_features=7,\
                             class_weight="balanced", min_samples_split=8,\
                            max_depth=500)

In [31]:
clf.fit(train_data, train_labels['status_group'])

RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=500, max_features=7,
            max_leaf_nodes=None, min_samples_leaf=1, min_samples_split=8,
            min_weight_fraction_leaf=0.0, n_estimators=3000, n_jobs=3,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

# Unoptimised Prediction Time

In [32]:
test_data = pd.read_csv('data/test_set.csv')

In [33]:
test_data = pumpitup.cleantestup(test_data, modifiers)

In [34]:
columns = test_data.columns
bin_lengths = {}

for column in columns:
    series = test_data[column]
    if series.dtype == 'object':
        #bin_lengths[column] = pumpitup.binary_count(series)
        encoded = le.fit_transform(series)
        #train_data = pd.concat([train_data, encoded], axis=1)
        test_data[column] = encoded

test_data.drop('year_recorded', axis=1, inplace=True)

In [35]:
submission_predictions = clf.predict(test_data)

In [36]:
submission = pd.read_csv('data/SubmissionFormat.csv')

In [37]:
submission.status_group = submission_predictions

In [38]:
submission.to_csv('submissions/submission15.csv', index=False)

Trying a few things out, the best rate I acheieved was around.