In [78]:
# Importing the libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_absolute_error
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import date
import calendar
%matplotlib inline
sns.set_style("white", {'axes.grid' : True})
sns.set_palette("tab10", 14)
sns.set_context('talk')

In [79]:
# Load the training tickets and the test questions from their csv files
traindf, testdf = (
    pd.read_csv(csv_name) for csv_name in ('train_revised_1.csv', 'test_questions_1.csv')
)

In [80]:
# Add a 'number_of_ticket' column: how many ticket rows share each ride_id.
# transform('count') broadcasts the per-ride count back onto every row of that ride.
# Grouping on ride_id alone replaces the temporary 'rides' helper column, which was
# built with np.NaN - an alias that no longer exists in NumPy 2.0.
traindf['number_of_ticket'] = traindf.groupby('ride_id')['ride_id'].transform('count')
# Remove fully identical rows, so a journey with several identical ticket rows is kept once.
# NOTE(review): this only collapses a journey if its rows have no per-ticket columns - confirm.
traindf = traindf.drop_duplicates()

In [81]:
# Stack train and test rows into one frame so both receive identical preprocessing;
# ignore_index renumbers the combined rows from 0
data = pd.concat((traindf, testdf), sort=False, ignore_index=True)
data.shape

(7360, 7)

In [82]:
# Parse travel_date into datetimes, reading ambiguous dates day-first
data['travel_date'] = pd.to_datetime(data['travel_date'], dayfirst=True)
# Day of the week as a short label; dt.weekday numbers Monday as 0
day_labels = ['Mon', 'Tue', 'Wed', 'Thur', 'Fri', 'Sat', 'Sun']
data['day'] = data['travel_date'].dt.weekday.map(dict(enumerate(day_labels)))
# Turn the 'H:M' travel_time strings into decimal hours (e.g. 7:15 -> 7.25)
clock_parts = data['travel_time'].str.split(':')
data['travel_time'] = clock_parts.apply(lambda parts: int(parts[0]) + int(parts[1])/60)

In [83]:
# Check the column dtypes after the date and time conversions
data.dtypes

ride_id                      int64
travel_date         datetime64[ns]
travel_time                float64
travel_from                 object
car_type                    object
max_capacity                 int64
number_of_ticket           float64
day                         object
dtype: object

In [85]:
# Add empty (NaN) placeholder columns for the uber measures, then order the rows by travel date
header_list = ['daily_mean','daily_min','daily_max', 'tf_mean', 'tf_min', 'tf_max']
data = data.assign(**dict.fromkeys(header_list, np.nan)).sort_values('travel_date')
data.head()

Unnamed: 0,ride_id,travel_date,travel_time,travel_from,car_type,max_capacity,number_of_ticket,day,daily_mean,daily_min,daily_max,tf_mean,tf_min,tf_max
4643,1442,2017-10-17,7.25,Migori,Bus,49,1.0,Tue,,,,,,
4614,14304,2017-11-14,5.166667,Kisii,Bus,49,1.0,Tue,,,,,,
4644,5437,2017-11-19,7.2,Migori,Bus,49,1.0,Sun,,,,,,
714,5710,2017-11-26,7.083333,Keroka,Bus,49,1.0,Sun,,,,,,
4599,13577,2017-11-27,9.0,Kisii,shuttle,11,11.0,Mon,,,,,,


# Adding in the Uber data from notebook 2

In [86]:
# Load uber.csv and parse its travel_date column into datetimes (ambiguous dates read day-first)
uber = pd.read_csv('uber.csv').assign(
    travel_date=lambda frame: pd.to_datetime(frame['travel_date'], dayfirst=True)
)

In [None]:
# Index uber by travel_date so its rows can be looked up by date.
# The guard keeps the cell re-runnable: once travel_date is the index it is no longer a
# column, and calling set_index('travel_date') a second time would raise KeyError.
if uber.index.name != 'travel_date':
    uber = uber.set_index('travel_date')
uber = uber.sort_index()
# The per-date lookups further down need exactly one uber row per date
assert uber.index.is_unique, 'uber.csv has more than one row for the same travel_date'

In [88]:
# Reorder the columns: day first, then mean/min/max for the whole day and for each time frame
uber_periods = ['daily', 'am', 'pm', 'mid', 'eve', 'em']
uber_stats = ['mean', 'min', 'max']
uber = uber[['day'] + [period + '_' + stat for period in uber_periods for stat in uber_stats]]
uber.head()

Unnamed: 0_level_0,day,daily_mean,daily_min,daily_max,am_mean,am_min,am_max,pm_mean,pm_min,pm_max,mid_mean,mid_min,mid_max,eve_mean,eve_min,eve_max,em_mean,em_min,em_max
travel_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2017-10-17,Tue,2734.0,1947.0,4061.0,2898.0,2032.0,4365.0,2978.0,2014.0,4864.0,2715.0,1967.0,3880.0,2496.0,1862.0,3424.0,2467.0,1851.0,3353.0
2017-10-18,Wed,2149.0,1500.0,3245.0,2188.0,1564.0,3141.0,2526.0,1715.0,3881.0,2122.0,1505.0,3108.0,1926.0,1407.0,2714.0,1943.0,1421.0,2728.0
2017-10-19,Thur,2752.0,1757.0,4547.0,2562.0,1722.0,3869.0,3345.0,2013.0,5915.0,2788.0,1813.0,4415.0,2579.0,1685.0,4125.0,2390.0,1635.0,3532.0
2017-10-20,Fri,2464.0,1517.0,4034.0,2454.0,1512.0,4015.0,2467.0,1527.0,4002.0,2515.0,1522.0,4239.0,2433.0,1514.0,3924.0,2448.0,1513.0,3985.0
2017-10-21,Sat,2069.0,1513.0,2923.0,2082.0,1529.0,2910.0,2065.0,1513.0,2909.0,2126.0,1550.0,3002.0,2011.0,1488.0,2793.0,1994.0,1478.0,2767.0


In [89]:

# New column time_frame: the uber time frame in which the ride reaches central Nairobi.
# Per the original notes this assumes a 9 hour journey and uber time frames at the
# destination of mid = 10-16, pm = 16-19, eve = 19-24, em = 00-7.
# NOTE(review): with a 9 hour journey, departures between 15:00 and 17:00 would arrive
# after midnight ('em') yet fall in the 'eve' bin below - confirm the 17 boundary is intended.
departure_edges = [-np.inf, 7, 10, 17, 22, 24]   # left-closed bins [lo, hi) over departure hour
arrival_frames = ['mid', 'pm', 'eve', 'em', 'am']
# Departure times of 24 or more, or missing, fall outside every bin and are left as NaN
data['time_frame'] = pd.cut(
    data.travel_time, bins=departure_edges, labels=arrival_frames, right=False
).astype(object)
data.head()

Unnamed: 0,ride_id,travel_date,travel_time,travel_from,car_type,max_capacity,number_of_ticket,day,daily_mean,daily_min,daily_max,tf_mean,tf_min,tf_max,time_frame
4643,1442,2017-10-17,7.25,Migori,Bus,49,1.0,Tue,,,,,,,pm
4614,14304,2017-11-14,5.166667,Kisii,Bus,49,1.0,Tue,,,,,,,mid
4644,5437,2017-11-19,7.2,Migori,Bus,49,1.0,Sun,,,,,,,pm
714,5710,2017-11-26,7.083333,Keroka,Bus,49,1.0,Sun,,,,,,,pm
4599,13577,2017-11-27,9.0,Kisii,shuttle,11,11.0,Mon,,,,,,,pm


In [93]:
# Copy the whole-day uber measures onto every mobi row, matched on travel date.
# One label-based lookup replaces the original loop, which visited every row of data,
# rebuilt a boolean mask over the whole frame each time (quadratic work) and rebound
# the name `date`, shadowing the class imported from datetime.
dailies = ['daily_mean', 'daily_min', 'daily_max']
# uber is indexed by travel_date, so .loc returns one uber row per data row, in data's
# row order; .values drops the date index so the assignment is positional
data[dailies] = uber.loc[data['travel_date'], dailies].values
data.head()

Unnamed: 0,ride_id,travel_date,travel_time,travel_from,car_type,max_capacity,number_of_ticket,day,daily_mean,daily_min,daily_max,tf_mean,tf_min,tf_max,time_frame
4643,1442,2017-10-17,7.25,Migori,Bus,49,1.0,Tue,2734.0,1947.0,4061.0,,,,pm
4614,14304,2017-11-14,5.166667,Kisii,Bus,49,1.0,Tue,2974.0,1806.0,5122.0,,,,mid
4644,5437,2017-11-19,7.2,Migori,Bus,49,1.0,Sun,1905.0,1437.0,2598.0,,,,pm
714,5710,2017-11-26,7.083333,Keroka,Bus,49,1.0,Sun,2181.0,1474.0,3350.0,,,,pm
4599,13577,2017-11-27,9.0,Kisii,shuttle,11,11.0,Mon,3121.0,1634.0,6066.0,,,,pm


In [95]:

# Copy the uber measures for each ride's arrival time frame into tf_mean / tf_min / tf_max.
# One loop over the five time frames replaces five copy-pasted loops that each visited
# every row of data and rebuilt a full boolean mask per row (quadratic work), and that
# rebound the name `date`, shadowing the class imported from datetime.
measures = ['_mean', '_min', '_max']
for time_frame in ['eve', 'am', 'pm', 'em', 'mid']:
    in_frame = data['time_frame'] == time_frame
    if not in_frame.any():
        continue  # no ride arrives in this time frame
    frame_dates = data.loc[in_frame, 'travel_date']
    for measure in measures:
        # uber is indexed by travel_date; .values makes the assignment positional so the
        # looked-up rows line up with the masked rows of data
        data.loc[in_frame, 'tf' + measure] = uber.loc[frame_dates, time_frame + measure].values

In [96]:
# Inspect the first ten rows now that the daily and time-frame uber columns are filled
data.head(10)

Unnamed: 0,ride_id,travel_date,travel_time,travel_from,car_type,max_capacity,number_of_ticket,day,daily_mean,daily_min,daily_max,tf_mean,tf_min,tf_max,time_frame
4643,1442,2017-10-17,7.25,Migori,Bus,49,1.0,Tue,2734.0,1947.0,4061.0,2978.0,2014.0,4864.0,pm
4614,14304,2017-11-14,5.166667,Kisii,Bus,49,1.0,Tue,2974.0,1806.0,5122.0,3011.0,1859.0,5007.0,mid
4644,5437,2017-11-19,7.2,Migori,Bus,49,1.0,Sun,1905.0,1437.0,2598.0,1921.0,1439.0,2651.0,pm
714,5710,2017-11-26,7.083333,Keroka,Bus,49,1.0,Sun,2181.0,1474.0,3350.0,2115.0,1458.0,3133.0,pm
4599,13577,2017-11-27,9.0,Kisii,shuttle,11,11.0,Mon,3121.0,1634.0,6066.0,3634.0,1840.0,7340.0,pm
4598,13529,2017-11-27,9.166667,Kisii,shuttle,11,10.0,Mon,3121.0,1634.0,6066.0,3634.0,1840.0,7340.0,pm
4597,13528,2017-11-27,9.5,Kisii,shuttle,11,11.0,Mon,3121.0,1634.0,6066.0,3634.0,1840.0,7340.0,pm
4596,13527,2017-11-27,9.333333,Kisii,shuttle,11,9.0,Mon,3121.0,1634.0,6066.0,3634.0,1840.0,7340.0,pm
4595,13480,2017-11-27,9.666667,Kisii,shuttle,11,11.0,Mon,3121.0,1634.0,6066.0,3634.0,1840.0,7340.0,pm
4594,13479,2017-11-27,9.833333,Kisii,shuttle,11,11.0,Mon,3121.0,1634.0,6066.0,3634.0,1840.0,7340.0,pm


In [97]:
# Taking out data for exploration in notebook 1
# Note: no index=False here, so the row index is written as the first csv column
data.to_csv('data_explore.csv')

In [98]:
# Number of journeys arriving in the centre in each time frame, most frequent first
data.time_frame.value_counts()

pm     4915
mid    1133
eve     903
em      267
am      142
Name: time_frame, dtype: int64

# Preprocessing

In [99]:
# travel_date and time_frame are not used as model features, so drop both in one call
# (travel_date may be useful later, e.g. to adjust for holidays)
data = data.drop(['travel_date', 'time_frame'], axis=1)

In [100]:
# One hot encode the categorical columns (car_type, travel_from and day);
# with the default prefix each new column is named <column>_<value>, e.g. day_Fri
data = pd.get_dummies(data)

In [101]:
# Row and column counts after one hot encoding
data.shape

(7360, 36)

In [102]:
# Preview the encoded frame
data.head()

Unnamed: 0,ride_id,travel_time,max_capacity,number_of_ticket,daily_mean,daily_min,daily_max,tf_mean,tf_min,tf_max,...,travel_from_Sori,car_type_Bus,car_type_shuttle,day_Fri,day_Mon,day_Sat,day_Sun,day_Thur,day_Tue,day_Wed
4643,1442,7.25,49,1.0,2734.0,1947.0,4061.0,2978.0,2014.0,4864.0,...,0,1,0,0,0,0,0,0,1,0
4614,14304,5.166667,49,1.0,2974.0,1806.0,5122.0,3011.0,1859.0,5007.0,...,0,1,0,0,0,0,0,0,1,0
4644,5437,7.2,49,1.0,1905.0,1437.0,2598.0,1921.0,1439.0,2651.0,...,0,1,0,0,0,0,1,0,0,0
714,5710,7.083333,49,1.0,2181.0,1474.0,3350.0,2115.0,1458.0,3133.0,...,0,1,0,0,0,0,1,0,0,0
4599,13577,9.0,11,11.0,3121.0,1634.0,6066.0,3634.0,1840.0,7340.0,...,0,0,1,0,1,0,0,0,0,0


In [103]:
# Creating a csv of this dataframe as it will be used as basis for building future models
# index=False: the row index is not written to the file
data.to_csv('data_uber.csv', index=False)

In [104]:
# Split the combined frame back into the original train and test sets:
# rows with a ticket count are training rows, rows where it is null are test rows
has_target = data['number_of_ticket'].notnull()
non_features = ['number_of_ticket', 'ride_id']

# Training target and features (the target and ride_id are not used as features)
train_y = data.loc[has_target, 'number_of_ticket']
train_X = data.loc[has_target].drop(columns=non_features)

# ride_id of the test rows, kept as a series for building the submissions
ride_id_df = data.loc[~has_target, 'ride_id']
# Test features, with the same two columns removed
test_X = data.loc[~has_target].drop(columns=non_features)

In [105]:
# Shapes of the training features and the training target, one per line
print(train_X.shape, train_y.shape, sep='\n')

(6249, 34)
(6249,)


# Building the model - Random forests out of box

In [106]:
# Out-of-the-box random forest using the 'mae' split criterion, fitted on the full training data.
# NOTE(review): newer scikit-learn releases name this criterion 'absolute_error' - confirm
# against the installed version before upgrading.
forest_model = RandomForestRegressor(criterion='mae', random_state=1).fit(train_X, train_y)
# Show the fitted estimator and its parameters
forest_model

RandomForestRegressor(bootstrap=True, criterion='mae', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=1, verbose=0, warm_start=False)

In [107]:
# Sanity check: predictions for the first few rows of the training set
sanity_rows = train_X.head()
print("The predictions of the first five rows are", forest_model.predict(sanity_rows))

The predictions of the first five rows are [17.5   1.2   7.2   2.    9.65]


In [108]:
# Predict the full training set - the same rows the model was fitted on
forest_model_preds = forest_model.predict(train_X)
# Mean absolute error (the competition metric) between those predictions and the actual values
train_mae = mean_absolute_error(forest_model_preds, train_y)
# For a regressor, .score() returns R^2 (the coefficient of determination)
train_r2 = forest_model.score(train_X, train_y)
print('mae: run on full training set', train_mae)
print('train score:', train_r2)

mae: run on full training set 1.569323091694671
train score: 0.9011774775167772


In [109]:
# Running on the final Zindi set
# Predict ticket counts for the test rows with the forest fitted on the full training set
forest_model_pred_y = forest_model.predict(test_X)

In [110]:
# Build the submission frame: each test ride_id next to its predicted ticket count.
# The row index comes from ride_id_df; columns= fixes the column order.
submission_data = pd.DataFrame(
    {'ride_id': ride_id_df, 'number_of_ticket': forest_model_pred_y},
    columns=['ride_id', 'number_of_ticket'],
)
submission_data.head()

Unnamed: 0,ride_id,number_of_ticket
6816,13856,20.0
6817,13857,2.5
6807,13847,5.5
6819,13859,8.7
6815,13855,11.7


In [111]:
# Write the model 0 submission; index=False leaves the row index out of the file
submission_data.to_csv('predictions2_rf0.csv', index=False) #save to csv file

In [112]:
# Results on Zindi 4.063
# Position 44
# Learning points: Overfitting. Need to see benefits parameter tuning will bring

# Creating a validation set from within the training set.

In [114]:
# Hold out 20% of the training rows for validation: train_v_X / train_v_y are fitted on,
# test_v_X / test_v_y are scored on
# http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html
train_v_X, test_v_X, train_v_y, test_v_y = train_test_split(
    train_X, train_y, test_size=.2, random_state=0
)
# Print the shapes of the training part, then of the validation part
for features_part, target_part in ((train_v_X, train_v_y), (test_v_X, test_v_y)):
    print(features_part.shape, target_part.shape)

(4999, 34) (4999,)
(1250, 34) (1250,)


In [115]:
# Create a new model (forest_model1), to be fitted on the validation-training split
# and tested on the validation split.
# Same out-of-the-box settings as forest_model: a random forest with the 'mae' criterion
forest_model1 = RandomForestRegressor(random_state=1, criterion ='mae')

In [116]:
# Fit the random forest model to the validation-training data (80% of the training rows)
forest_model1.fit(train_v_X, train_v_y)

RandomForestRegressor(bootstrap=True, criterion='mae', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=1, verbose=0, warm_start=False)

In [117]:
# Predict the held-out validation rows and show the first five predictions
forest_model1_val_preds = forest_model1.predict(test_v_X)
forest_model1_val_preds[:5]

array([18.9,  3.7,  2.1,  4. ,  2.2])

In [118]:
# Compare the predictions with the held-out validation targets (test_v_y)
validation_mae = mean_absolute_error(forest_model1_val_preds, test_v_y)
# For a regressor, .score() returns R^2 (the coefficient of determination)
validation_r2 = forest_model1.score(test_v_X, test_v_y)
print('mae: validation prediction / test:', validation_mae)
print('validation test score:', validation_r2)

mae: validation prediction / test: 3.62788
validation test score: 0.5292437509665859


In [119]:
# Running on the final Zindi set
# Predict the test rows with forest_model1, which was fitted on the validation-training split only
forest_model1_pred_y = forest_model1.predict(test_X)

In [120]:
# Build the submission frame: each test ride_id next to forest_model1's predicted ticket count.
# The row index comes from ride_id_df; columns= fixes the column order.
submission_data = pd.DataFrame(
    {'ride_id': ride_id_df, 'number_of_ticket': forest_model1_pred_y},
    columns=['ride_id', 'number_of_ticket'],
)
submission_data.head()

Unnamed: 0,ride_id,number_of_ticket
6816,13856,17.7
6817,13857,4.8
6807,13847,4.4
6819,13859,8.6
6815,13855,10.4


In [121]:
# Write the forest_model1 submission; index=False leaves the row index out of the file
submission_data.to_csv('predictions_uber_rf1_validation.csv', index=False) #save to csv file

In [122]:
# Side-by-side summary of model 0 (fitted on all training rows) and model 1 (fitted on train_v only).
# Caution: model 0 was fitted on train_X, which contains the test_v rows, so its "test score"
# below is measured on rows it has already seen and is not a held-out score.
print('Model 0: mae on self', mean_absolute_error(forest_model_preds, train_y))
print ('Model 0: train score / test score', forest_model.score( train_v_X , train_v_y ) , forest_model.score( test_v_X , test_v_y ))
# Model 1 never saw test_v_X / test_v_y, so its second score is a genuine validation score
print('Model 1: mae on validation data', mean_absolute_error(forest_model1_val_preds, test_v_y))
print ('Model 1: train score / test score', forest_model1.score( train_v_X , train_v_y ) , forest_model1.score( test_v_X , test_v_y ))

Model 0: mae on self 1.569323091694671
Model 0: train score / test score 0.8991069106529078 0.9098188406924221
Model 1: mae on validation data 3.62788
Model 1: train score / test score 0.8967692061262742 0.5292437509665859


In [123]:
# Results on Zindi 4.116
# Position low
# Learning points
# Comments: 
# The scores based on validation data were lower than those achieved when tested on itself, suggesting
# overfitting in the first test above. This is to be expected considering it was tested on itself.
# Otherwise results were quite reasonable

# Results of running the model on Zindi were lower than achieved using full training set.

# Selecting max number of leaf nodes manually, using validation set

In [124]:
# Identifying optimum number of leaf nodes by sampling a few key values
# Formula taken from Kaggle tutorial on ML not my own
def get_mae(max_leaf_nodes, train_v_X, test_v_X, train_v_y, test_v_y):
    """Return the validation MAE of one decision tree limited to max_leaf_nodes leaves.

    A DecisionTreeRegressor (criterion 'mae', random_state=1) is fitted on
    train_v_X / train_v_y; its predictions for test_v_X are compared with test_v_y.
    Note that this is a single tree, not a random forest.
    """
    tree = DecisionTreeRegressor(criterion='mae', random_state=1, max_leaf_nodes=max_leaf_nodes)
    validation_preds = tree.fit(train_v_X, train_v_y).predict(test_v_X)
    return mean_absolute_error(test_v_y, validation_preds)

In [125]:
# Coarse sweep: validation MAE for widely spaced values of max_leaf_nodes
validation_split = (train_v_X, test_v_X, train_v_y, test_v_y)
report_line = "Max leaf nodes: {}  \t\t Mean Absolute Error:  {}"
for max_leaf_nodes in (5, 10, 25, 50, 100, 500, 5000):
    my_mae = get_mae(max_leaf_nodes, *validation_split)
    print(report_line.format(max_leaf_nodes, my_mae))

Max leaf nodes: 5  		 Mean Absolute Error:  4.6624
Max leaf nodes: 10  		 Mean Absolute Error:  3.9624
Max leaf nodes: 25  		 Mean Absolute Error:  3.6592
Max leaf nodes: 50  		 Mean Absolute Error:  3.5136
Max leaf nodes: 100  		 Mean Absolute Error:  3.4948
Max leaf nodes: 500  		 Mean Absolute Error:  4.042
Max leaf nodes: 5000  		 Mean Absolute Error:  4.5184


In [126]:
# Fine sweep: repeat the comparison for values of max_leaf_nodes around 100
validation_split = (train_v_X, test_v_X, train_v_y, test_v_y)
report_line = "Max leaf nodes: {}  \t\t Mean Absolute Error:  {}"
for max_leaf_nodes in (60, 70, 75, 80, 90, 100, 110, 120):
    my_mae = get_mae(max_leaf_nodes, *validation_split)
    print(report_line.format(max_leaf_nodes, my_mae))

Max leaf nodes: 60  		 Mean Absolute Error:  3.4756
Max leaf nodes: 70  		 Mean Absolute Error:  3.464
Max leaf nodes: 75  		 Mean Absolute Error:  3.476
Max leaf nodes: 80  		 Mean Absolute Error:  3.4924
Max leaf nodes: 90  		 Mean Absolute Error:  3.5008
Max leaf nodes: 100  		 Mean Absolute Error:  3.4948
Max leaf nodes: 110  		 Mean Absolute Error:  3.5812
Max leaf nodes: 120  		 Mean Absolute Error:  3.5976


In [135]:
# Random forest with each tree capped at 70 leaf nodes, fitted on the validation-training split
forest_model3 = RandomForestRegressor(
    criterion='mae', max_leaf_nodes=70, random_state=1
).fit(train_v_X, train_v_y)
# Show the fitted estimator and its parameters
forest_model3

RandomForestRegressor(bootstrap=True, criterion='mae', max_depth=None,
           max_features='auto', max_leaf_nodes=70,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=1, verbose=0, warm_start=False)

In [136]:
# Predict the held-out validation rows (test_v_X) with the leaf-capped forest
forest_model3_preds = forest_model3.predict(test_v_X)
# Compare the predictions with the validation targets using mae, as specified in the competition
print ('mae:',mean_absolute_error(forest_model3_preds, test_v_y))
# For a regressor, .score() returns R^2. It is computed on the validation split here,
# so the label says 'validation score' (it previously read 'train score').
print ('validation score:', forest_model3.score( test_v_X , test_v_y ))

mae: 3.5170800000000004
train score: 0.5313745729245736


In [137]:
# Side-by-side summary of models 0, 1 and 3.
# Caution: model 0 was fitted on train_X, which contains the test_v rows, so its "test score"
# is measured on rows it has already seen and is not a held-out score.
print('Model 0: mae on self', mean_absolute_error(forest_model_preds, train_y))
print ('Model 0: train score / test score', forest_model.score( train_v_X , train_v_y ) , forest_model.score( test_v_X , test_v_y ))
print('Model 1: mae on validation data', mean_absolute_error(forest_model1_val_preds, test_v_y))
print ('Model 1: train score / test score', forest_model1.score( train_v_X , train_v_y ) , forest_model1.score( test_v_X , test_v_y ))
# Model 2 lines are disabled: forest_model2 only exists as a local inside get_mae
# print('Model 2: mae on self', mean_absolute_error(forest_model2_preds, train_y))
# print ('Model 2: train score / test score', forest_model2.score( train_v_X , train_v_y ) , forest_model2.score( test_v_X , test_v_y ))
print('Model 3: mae on validation data', mean_absolute_error(forest_model3_preds, test_v_y))
print ('Model 3: train score / test score', forest_model3.score( train_v_X , train_v_y ) , forest_model3.score( test_v_X , test_v_y ))


Model 0: mae on self 1.569323091694671
Model 0: train score / test score 0.8991069106529078 0.9098188406924221
Model 1: mae on validation data 3.62788
Model 1: train score / test score 0.8967692061262742 0.5292437509665859
Model 3: mae on validation data 3.5170800000000004
Model 3: train score / test score 0.6047670180724635 0.5313745729245736


# Running the max_leaf model on the test set to generate a submission

In [130]:
# Predict the test rows with forest_model3 (max_leaf_nodes=70, fitted on the validation-training split)
forest_model3_pred = forest_model3.predict(test_X)

In [131]:
# Build the submission frame: each test ride_id next to forest_model3's predicted ticket count.
# The row index comes from ride_id_df; columns= fixes the column order.
submission_data = pd.DataFrame(
    {'ride_id': ride_id_df, 'number_of_ticket': forest_model3_pred},
    columns=['ride_id', 'number_of_ticket'],
)
submission_data.head()

Unnamed: 0,ride_id,number_of_ticket
6816,13856,21.65
6817,13857,1.2
6807,13847,3.6
6819,13859,8.05
6815,13855,6.95


In [132]:
# Write the forest_model3 submission; index=False leaves the row index out of the file
submission_data.to_csv('predictions2_uber_rf2_max_leaf_validation.csv', index=False) #save to csv file

In [133]:
# score 5.42 position 40 out of 56
# Amended to exclude ride id and now 3.811 and 10th out of 56
# Amended to 3.77615 and again 10th out of


# Introducing uber data gives 3.864 and 21st on leaderboard

# Experimenting with feature importance but no improvement found
https://chrisalbon.com/machine_learning/trees_and_forests/feature_selection_using_random_forest/
https://www.kaggle.com/niklasdonges/end-to-end-project-with-python/notebook

In [139]:
# Table of the features and their importance in forest_model3 (the max leaf model),
# rounded to 3 decimals, most important first and indexed by feature name.
# feature_importances_ is what scikit-learn also calls the Gini importance.
importances = (
    pd.DataFrame({
        'feature': train_v_X.columns,
        'importance': np.round(forest_model3.feature_importances_, 3),
    })
    .sort_values('importance', ascending=False)
    .set_index('feature')
)

In [140]:
# Display the importance table
importances

Unnamed: 0_level_0,importance
feature,Unnamed: 1_level_1
travel_time,0.358
travel_from_Kisii,0.118
travel_from_Sirare,0.106
travel_from_Homa Bay,0.084
travel_from_Migori,0.074
travel_from_Kehancha,0.028
tf_min,0.025
daily_min,0.022
tf_max,0.019
daily_mean,0.019


In [141]:
# Selector that keeps the features whose importance is at least 0.01.
# It wraps forest_model3's settings; prefit is left at its default, so the selector is
# fitted separately in the next cell.
sfm = SelectFromModel(forest_model3, threshold=0.01)

In [142]:
# Train the selector on the validation-training data so it can decide which features pass the threshold
sfm.fit(train_v_X, train_v_y)

SelectFromModel(estimator=RandomForestRegressor(bootstrap=True, criterion='mae', max_depth=None,
           max_features='auto', max_leaf_nodes=70,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=1, verbose=0, warm_start=False),
        norm_order=1, prefit=False, threshold=0.01)

In [144]:
# Reduce the validation train and test features to the selected (important) columns only.
# The y values are unchanged because feature selection only removes columns of X.
X_important_v_train = sfm.transform(train_v_X)
X_important_v_test = sfm.transform(test_v_X)

In [145]:
# Random forest regressor with 10000 trees, fitted on the selected features of the
# validation-training split. No criterion is passed, so it uses the library default
# rather than the 'mae' criterion of the earlier models.
rf_important = RandomForestRegressor(
    n_estimators=10000, n_jobs=-1, random_state=0
).fit(X_important_v_train, train_v_y)
# Show the fitted estimator and its parameters
rf_important

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10000, n_jobs=-1,
           oob_score=False, random_state=0, verbose=0, warm_start=False)

In [146]:
# Baseline for comparison: the full-feature regressor (forest_model3) on the validation rows
y_pred = forest_model3.predict(test_v_X)
print ('mae:',mean_absolute_error(y_pred, test_v_y))
# R^2 score of the full-feature model on the validation rows (this is a regressor, so not an accuracy)
forest_model3.score(test_v_X, test_v_y)

mae: 3.5170800000000004


0.5313745729245736

In [148]:
# Apply the reduced-feature regressor (rf_important) to the selected validation features
y_important_pred = rf_important.predict(X_important_v_test)
print ('mae:',mean_absolute_error(y_important_pred, test_v_y))
# R^2 score of the reduced-feature model on the validation rows (this is a regressor, so not an accuracy)
rf_important.score(X_important_v_test, test_v_y)

mae: 3.7373169338470418


0.539348271052106

In [149]:
### Reduce the complete training set and the test set to the selected features
# The same fitted selector (sfm) is reused, so the same columns are kept as in the validation split
X_important_train = sfm.transform(train_X)
X_important_test = sfm.transform(test_X)

In [150]:
# Identifying optimum number of leaf nodes by sampling a few key values, now on the selected features.
# This redefines get_mae from earlier in the notebook; this version uses random_state=0.
def get_mae(max_leaf_nodes, X_important_v_train, X_important_v_test, train_v_y, test_v_y):
    """Return the validation MAE of one decision tree limited to max_leaf_nodes leaves.

    A DecisionTreeRegressor (criterion 'mae', random_state=0) is fitted on
    X_important_v_train / train_v_y; its predictions for X_important_v_test are
    compared with test_v_y.
    """
    tree = DecisionTreeRegressor(criterion='mae', random_state=0, max_leaf_nodes=max_leaf_nodes)
    validation_preds = tree.fit(X_important_v_train, train_v_y).predict(X_important_v_test)
    return mean_absolute_error(test_v_y, validation_preds)

In [151]:
# Validation MAE on the selected features for a range of max_leaf_nodes values
important_split = (X_important_v_train, X_important_v_test, train_v_y, test_v_y)
report_line = "Max leaf nodes: {}  \t\t Mean Absolute Error:  {}"
for max_leaf_nodes in (60, 70, 75, 80, 90, 100, 110, 120):
    my_mae = get_mae(max_leaf_nodes, *important_split)
    print(report_line.format(max_leaf_nodes, my_mae))

Max leaf nodes: 60  		 Mean Absolute Error:  3.5316
Max leaf nodes: 70  		 Mean Absolute Error:  3.5784
Max leaf nodes: 75  		 Mean Absolute Error:  3.574
Max leaf nodes: 80  		 Mean Absolute Error:  3.5884
Max leaf nodes: 90  		 Mean Absolute Error:  3.6436
Max leaf nodes: 100  		 Mean Absolute Error:  3.6832
Max leaf nodes: 110  		 Mean Absolute Error:  3.7152
Max leaf nodes: 120  		 Mean Absolute Error:  3.7724


In [152]:
# Random forest regressor on the selected features of the complete training set,
# with each tree capped at 70 leaf nodes. No criterion is passed, so the library default is used.
rf_important = RandomForestRegressor(
    max_leaf_nodes=70, n_jobs=-1, random_state=0
).fit(X_important_train, train_y)
# Show the fitted estimator and its parameters
rf_important

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=70,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
           oob_score=False, random_state=0, verbose=0, warm_start=False)

In [153]:
# Predict the full test set (selected features only) with the reduced-feature forest
y_important_pred = rf_important.predict(X_important_test)

In [154]:
# Build the submission frame: each test ride_id next to the reduced-feature model's prediction.
# The row index comes from ride_id_df; columns= fixes the column order.
submission_data = pd.DataFrame(
    {'ride_id': ride_id_df, 'number_of_ticket': y_important_pred},
    columns=['ride_id', 'number_of_ticket'],
)
submission_data.head()

Unnamed: 0,ride_id,number_of_ticket
6816,13856,21.633221
6817,13857,11.797062
6807,13847,4.277168
6819,13859,6.695055
6815,13855,6.695055


In [155]:
# Write the reduced-feature submission; index=False leaves the row index out of the file
submission_data.to_csv('predictions2_uber_rf2_max_leaf_validation_important_features.csv', index=False) #save to csv file

In [156]:
# Zindi score 4.3167 but this is without tuning
# Zindi score 4.0807 after max leaf tuning based on the

In [157]:
# These results are higher than the results for the full test so no further testing