In [18]:
## Enable matplotlib inline
%matplotlib inline

## Imports
import pandas as pd
pd.set_option('display.mpl_style', 'default') 
pd.set_option('display.width', 5000) 
pd.set_option('display.max_columns', 60) 

import numpy as np
from sklearn import ensemble

In [19]:
## Read in training data
train = pd.read_csv('train.csv')

In [20]:
## Read in testing data
test = pd.read_csv('test.csv')

In [21]:
## Make a function to prepare the data for training and testing
def process_data(dataset):
    id_groups = dataset.groupby('Id', as_index=False)
    return id_groups.mean()

In [22]:
## Process training and testing data
train = process_data(train)
test  = process_data(test)

In [23]:
## Clean up training data from outliers
train = train[train['Expected'] < 100.0]
train = train.fillna(0.0)

In [24]:
## Extract numpy arrays to pass to scikit-learn for training
train_data   = train[['radardist_km', 'Ref', 'RefComposite', 'RhoHV', 'Zdr', 'Kdp']].values
train_target = train['Expected'].values

In [25]:
## Train the Boosted Regression trees
regressor = ensemble.GradientBoostingRegressor(n_estimators=500)
regressor.fit(train_data, train_target)

GradientBoostingRegressor(alpha=0.9, init=None, learning_rate=0.1, loss='ls',
             max_depth=3, max_features=None, max_leaf_nodes=None,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, n_estimators=500,
             random_state=None, subsample=1.0, verbose=0, warm_start=False)

In [26]:
def predict(radardist_km, Ref, RefComposite, RhoHV, Zdr, Kdp):
    return regressor.predict([radardist_km, Ref, RefComposite, RhoHV, Zdr, Kdp])[0]

In [27]:
## Prepare testing data for evaluation
test = test.fillna(0.0)

In [28]:
test['Expected'] = map(
    predict,
    test['radardist_km'],
    test['Ref'],
    test['RefComposite'],
    test['RhoHV'],
    test['Zdr'],
    test['Kdp']
)

In [29]:
test

Unnamed: 0,Id,minutes_past,radardist_km,Ref,Ref_5x5_10th,Ref_5x5_50th,Ref_5x5_90th,RefComposite,RefComposite_5x5_10th,RefComposite_5x5_50th,RefComposite_5x5_90th,RhoHV,RhoHV_5x5_10th,RhoHV_5x5_50th,RhoHV_5x5_90th,Zdr,Zdr_5x5_10th,Zdr_5x5_50th,Zdr_5x5_90th,Kdp,Kdp_5x5_10th,Kdp_5x5_50th,Kdp_5x5_90th,Expected
0,1,29.823529,8,10.500000,0.000000,8.277778,13.461538,11.375000,0.000000,8.666667,14.153846,1.011667,0.848333,0.990000,1.043571,-0.546875,-1.750000,0.062500,2.598214,-1.523336,0.000000,-1.290003,2.601664,1.826045
1,2,28.937500,15,0.000000,0.000000,0.000000,13.000000,0.000000,0.000000,0.000000,13.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,2.095123
2,3,28.700000,11,23.722222,17.125000,24.111111,34.700000,25.055556,16.400000,25.222222,36.350000,0.927333,0.895333,0.954667,0.981333,0.418750,-0.562500,0.456250,1.825000,0.219994,-4.359003,0.169997,5.382997,2.432072
3,4,28.727273,9,30.812500,28.642857,29.812500,35.625000,32.000000,28.944444,32.666667,35.250000,0.967667,0.931667,0.970556,0.982143,0.100000,-0.537500,0.177083,1.241071,0.911996,-2.890003,-0.140002,5.563329,4.031017
4,5,28.333333,17,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,2.716460
5,6,29.692308,13,15.500000,12.500000,13.857143,17.307692,16.357143,12.500000,14.285714,19.615385,0.975000,0.000000,0.931667,1.051667,0.718750,0.000000,-2.625000,1.000000,5.494995,0.000000,-8.770004,9.959991,1.974296
6,7,29.363636,9,29.318182,24.000000,29.181818,34.181818,29.363636,24.045455,29.227273,34.363636,0.972037,0.966667,0.985417,0.996000,0.076389,-0.351562,0.148438,0.881250,-0.221254,-1.323753,0.174994,2.775550,3.526084
7,8,29.428571,15,21.142857,19.250000,22.750000,26.357143,24.464286,22.892857,25.285714,27.964286,0.900000,0.000000,0.955000,1.044167,3.343750,0.000000,-0.750000,3.125000,0.000000,0.000000,0.000000,4.103327,3.489910
8,9,29.142857,15,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,2.095123
9,10,26.666667,11,29.250000,25.600000,29.000000,34.000000,30.833333,22.833333,29.916667,34.500000,1.002222,0.983000,0.988889,1.002778,0.187500,-0.287500,-0.041667,0.552083,0.000000,0.000000,0.000000,0.209998,4.598955


In [30]:
test[['Id', 'Expected']].to_csv('predictions2.csv', index=False)