In [1]:
import pandas as pd
import numpy as np
from sklearn.cross_validation import LabelKFold

from ml_metrics import rmsle

from sklearn.dummy import DummyRegressor 
from sklearn.ensemble import RandomForestRegressor

## Read Data

In [2]:
# Load the training set. The 'datetime' column is parsed as dates so that the
# `.dt` accessor can be used on it later (the validation helpers read `.dt.month`).
train = pd.read_csv('train.csv', parse_dates=['datetime'])

## Simple Modeling

In [3]:
def select_features(data):
    """Return the names of the columns of ``data`` that are used as model inputs.

    Every column listed in the exclusion tuple below is dropped; all other
    columns are kept in their original order.

    Parameters
    ----------
    data : object exposing a ``columns`` iterable (a pandas DataFrame here).

    Returns
    -------
    list
        Column names that are not on the exclusion list.
    """
    # 'count', 'registered' and 'casual' are used as targets elsewhere in this
    # notebook; the remaining names are presumably helper columns that may or
    # may not be present -- TODO confirm.
    excluded = ('casual', 'registered', 'count', 'is_test', 'datetime', 'count_log')

    kept = []
    for column in data.columns:
        if column in excluded:
            continue
        kept.append(column)
    return kept
    
def get_X_y(data, target_var='count'):
    """Split ``data`` into a feature matrix and a target vector.

    Parameters
    ----------
    data : DataFrame holding both the feature columns and the target column.
    target_var : name of the column to use as the target (default ``'count'``).

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` is ``.values`` of the columns chosen by
        ``select_features`` and ``y`` is ``.values`` of ``data[target_var]``.
    """
    feature_matrix = data[select_features(data)].values
    target_vector = data[target_var].values
    return feature_matrix, target_vector

def _simple_modeling(X_train, X_test, y_train, y_test):
    """Fit every baseline model on the train split and predict the test split.

    The models are two ``DummyRegressor`` baselines (mean and median) and a
    ``RandomForestRegressor`` with ``random_state=0``; they are fitted in
    that order.

    Returns
    -------
    list
        One ``(name, y_test, y_pred)`` tuple per model, so the caller can
        score the predictions with any metric.
    """
    candidates = (
        ('dummy-mean', DummyRegressor(strategy='mean')),
        ('dummy-median', DummyRegressor(strategy='median')),
        ('random-forest', RandomForestRegressor(random_state=0)),
    )

    def _fit_predict(estimator):
        # Train on the train split, then predict the held-out rows.
        estimator.fit(X_train, y_train)
        return estimator.predict(X_test)

    return [(label, y_test, _fit_predict(estimator)) for label, estimator in candidates]


def simple_modeling(X_train, X_test, y_train, y_test):
    """Train the baseline models and score each one with ``rmsle``.

    Returns
    -------
    list
        One ``(name, score)`` tuple per model, in the order produced by
        ``_simple_modeling``; ``score`` is ``rmsle(y_test, y_pred)``.
    """
    scored = []
    for name, y_true, y_pred in _simple_modeling(X_train, X_test, y_train, y_test):
        scored.append((name, rmsle(y_true, y_pred)))
    return scored

In [4]:
# Baseline *without* validation: X and y are passed as both the train and the
# test split, so every model is scored on the same rows it was fitted on.
X, y = get_X_y(train)
simple_modeling(X, X, y, y)

[('dummy-mean', 1.5691983019475926),
 ('dummy-median', 1.4725894242962372),
 ('random-forest', 0.84028175828825968)]

## Validation

In [5]:
def val(data, n_folds=3):
    """Yield train/test splits of ``data`` for cross-validation.

    The month number of the ``datetime`` column is passed to ``LabelKFold`` as
    the label of each row. ``.dt.month`` ignores the year, so the same calendar
    month of different years gets the same label.

    Parameters
    ----------
    data : DataFrame with a datetime-typed ``datetime`` column and a ``count`` column.
    n_folds : number of folds handed to ``LabelKFold`` (default 3).

    Yields
    ------
    tuple
        ``(X_train, X_test, y_train, y_test)`` for each fold.
    """
    # NOTE(review): `sklearn.cross_validation.LabelKFold` appears to be gone
    # from newer scikit-learn releases (GroupKFold in sklearn.model_selection
    # is the documented successor) -- confirm before upgrading.
    X, y = get_X_y(data)
    month_of_row = data['datetime'].dt.month.values

    for fit_rows, holdout_rows in LabelKFold(month_of_row, n_folds=n_folds):
        yield X[fit_rows], X[holdout_rows], y[fit_rows], y[holdout_rows]

In [6]:
# Baseline *with* validation: fit and score every model on each fold produced
# by val(); one list of (name, score) tuples is printed per fold.
for X_train, X_test, y_train, y_test in val(train):
    print(simple_modeling(X_train, X_test, y_train, y_test))

[('dummy-mean', 1.5389717242983016), ('dummy-median', 1.4518285970779792), ('random-forest', 1.3898355462701857)]
[('dummy-mean', 1.5973803133409017), ('dummy-median', 1.4947795712594232), ('random-forest', 1.3868563304622581)]
[('dummy-mean', 1.5718600070726862), ('dummy-median', 1.4729884592466973), ('random-forest', 1.4279380440008111)]


Let's compare the results with and without validation.

| name        |  without validation | with validation  | diff in % |
| ------------|:-------------------:|:----------------:|----------:|
|dummy-mean   |     1.57            |      ~1.57        |   ~0%     |
|dummy-median |     1.47            |      ~1.47        |   ~0%     |
|random-forest|     0.84            |      ~1.40        |   ~40%    |

## Questions
1. Why is the result for dummy mean/median (approximately) the same with and without validation?
2. Why is the result for random-forest so different with and without validation?
3. Which result is better for us?

# For more advanced
## Understanding the Target Variable Better

count = registered + casual

In [7]:
def reg_cas_val(data, n_folds=3):
    """Yield cross-validation splits carrying all three training targets.

    Rows are labelled for ``LabelKFold`` by the month number of the
    ``datetime`` column, the same labelling that ``val`` uses. For the test
    split only the total ``count`` target is returned.

    Parameters
    ----------
    data : DataFrame with ``datetime``, ``count``, ``registered`` and ``casual`` columns.
    n_folds : number of folds handed to ``LabelKFold`` (default 3).

    Yields
    ------
    tuple
        ``(X_train, X_test, y_train_count, y_train_registered,
        y_train_casual, y_test_count)`` for each fold.
    """
    X, total = get_X_y(data)
    registered = data['registered'].values
    casual = data['casual'].values

    month_of_row = data['datetime'].dt.month.values

    for fit_rows, holdout_rows in LabelKFold(month_of_row, n_folds=n_folds):
        yield (
            X[fit_rows],
            X[holdout_rows],
            total[fit_rows],
            registered[fit_rows],
            casual[fit_rows],
            total[holdout_rows],
        )
        
        

# Experiment: compare predicting `count` directly against predicting
# `registered` and `casual` separately and adding the two predictions.
# One rmsle score per fold is collected for each approach.
scores_sum = []  # per-fold rmsle of (registered prediction + casual prediction)
scores_c = []    # per-fold rmsle of the direct `count` prediction

for X_train, X_test, y_train_cnt, y_train_reg, y_train_cas, y_test in reg_cas_val(train): 
    # A single estimator instance is fitted three times in a row, once per
    # target; predictions are taken right after each fit, before the next one.
    model = RandomForestRegressor(random_state=0)
    
    # Approach 1: fit on the total count.
    model.fit(X_train, y_train_cnt)
    y_pred_cnt = model.predict(X_test)
    
    # Approach 2a: fit on the registered count.
    model.fit(X_train, y_train_reg)
    y_pred_reg = model.predict(X_test)

    # Approach 2b: fit on the casual count.
    model.fit(X_train, y_train_cas)
    y_pred_cas = model.predict(X_test)

    # Per the notebook text, count = registered + casual, so the two
    # predictions are summed to obtain a prediction of the total.
    y_pred = y_pred_reg + y_pred_cas
    
    # Both approaches are scored against the same held-out total count.
    scores_c.append( rmsle(y_test, y_pred_cnt) )
    scores_sum.append( rmsle(y_test, y_pred) )
    

# Report mean, standard deviation and the raw per-fold scores of each approach.
print("count: ", np.mean(scores_c), np.std(scores_c), scores_c)
print("registered + casual: ", np.mean(scores_sum), np.std(scores_sum), scores_sum)


('count: ', 1.401543306911085, 0.018703485258713615, [1.3898355462701857, 1.3868563304622581, 1.4279380440008111])
('registered + casual: ', 1.3946435992749022, 0.021842027802280947, [1.4016486462734967, 1.365087126839138, 1.4171950247120721])


## Questions:
1. What does this experiment mean for you?