# Prediction

In [3]:
## imports
import os, sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from globes import taxi_dir, days_dir

from multiprocessing import Pool, Process, cpu_count


In [4]:
## get X, y in numpy arrays from relevant data
# Model inputs. 'pickup_day' and 'pickup_hour' are derived from
# 'pickup_datetime' inside get_Xy_reg; the other two are read from the CSV.
FEATURE_COLS = ['pickup_day', 'pickup_hour', 'pickup_latitude', 'pickup_longitude']
# Regression targets, in the column order the rest of the notebook indexes y by.
Y_COLS = ['dropoff_latitude', 'dropoff_longitude']


def get_Xy_reg(filename):
    """Load one trip CSV and return its feature and target frames for regression.

    Rows with a missing value in any column are dropped before the features
    are built, so the returned frames contain no NaN.

    Parameters
    ----------
    filename : str
        Path to a CSV that has 'pickup_datetime' and 'dropoff_datetime'
        columns plus the raw columns named in FEATURE_COLS and Y_COLS.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        The FEATURE_COLS frame and the Y_COLS frame, row-aligned.
    """
    df = pd.read_csv(filename, parse_dates=['pickup_datetime', 'dropoff_datetime'])
    df = df.dropna()

    # to_datetime is a cheap pass-through when read_csv already parsed the
    # column; it guarantees a datetime dtype so the .dt accessor is usable
    # even if the column was left as object dtype (e.g. a header-only file).
    pickup = pd.to_datetime(df['pickup_datetime'])

    # Vectorised datetime accessors instead of a Python lambda per row.
    # weekday is Monday=0 .. Sunday=6, same as datetime.weekday().
    df = df.assign(pickup_day=pickup.dt.weekday, pickup_hour=pickup.dt.hour)

    # No fillna needed: dropna() above already removed every incomplete row.
    return df[FEATURE_COLS], df[Y_COLS]

def getDataNumpy(get_Xy_func):
    """Load every daily CSV in parallel and stack the results into numpy arrays.

    Parameters
    ----------
    get_Xy_func : callable
        Called with one CSV path; must return an ``(X, y)`` pair of
        DataFrames. It is handed to ``multiprocessing.Pool.map``, so it has
        to be picklable (in practice: a module-level function).

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``X`` and ``y``: the per-file frames concatenated in sorted filename
        order and converted to arrays.

    Raises
    ------
    ValueError
        If the data directory contains no file whose name ends in 'csv'.
    """
    # Floor division keeps the worker count an int on Python 3 too, and
    # max() covers single-core machines where half the cores rounds to 0
    # (Pool rejects processes=0).
    num_cores = max(1, cpu_count() // 2)
    print("using " + str(num_cores) + " cores")

    data_dir = os.path.join(taxi_dir, days_dir)
    # os.listdir returns names in arbitrary order; sorting makes the row
    # order of X and y (and any positional split done later) reproducible.
    filenames = sorted(
        os.path.join(data_dir, f) for f in os.listdir(data_dir) if f.endswith('csv')
    )
    if not filenames:
        raise ValueError("no csv files found in " + data_dir)

    # get X, y dataframes in parallel; always release the workers, even if
    # one of the files fails to load.
    pool = Pool(processes=num_cores)
    try:
        Xy_arr = pool.map(get_Xy_func, filenames)
    finally:
        pool.terminate()
        pool.join()

    # separate out to a list of X dataframes and a list of y dataframes
    dfs_X = [pair[0] for pair in Xy_arr]
    dfs_y = [pair[1] for pair in Xy_arr]

    # concatenate the per-file frames into single frames
    df_X = pd.concat(dfs_X)
    df_y = pd.concat(dfs_y)

    # convert to numpy arrays (.values works on old and new pandas alike;
    # as_matrix() no longer exists in pandas >= 1.0)
    X = df_X.values
    y = df_y.values

    print("%s %s" % (df_X.shape, df_y.shape))
    print("%s %s" % (X.shape, y.shape))

    return X, y

# Build the full feature matrix X and target matrix y from every daily CSV.
# Everything is concatenated in memory (the recorded run below produced
# 77,579,478 rows), so X and y are large module-level arrays.
X, y = getDataNumpy(get_Xy_reg)

using 28 cores
(77579478, 4) (77579478, 2)
(77579478, 4) (77579478, 2)


## Random forest regressor

In [5]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error

def RandomForest(X, y, n_train=60000000, random_state=None):
    """Fit a small random-forest regressor and print train/validation metrics.

    The split is positional: the first ``n_train`` rows are used for fitting
    and all remaining rows for validation. Nothing is shuffled.

    Parameters
    ----------
    X : array-like, indexable by slice
        Feature rows.
    y : array-like, indexable by slice
        Targets aligned with ``X``; may be 1-D (single target) or 2-D.
    n_train : int, optional
        Number of leading rows used for training. The default keeps the
        split this notebook has always used.
    random_state : int or None, optional
        Passed to ``RandomForestRegressor``; set it to make runs repeatable.
        ``None`` keeps the original unseeded behaviour.

    Raises
    ------
    ValueError
        If ``n_train`` would leave the training or the validation set empty.
    """
    n_samples = len(X)
    if not 0 < n_train < n_samples:
        raise ValueError(
            "n_train=%d leaves an empty training or validation set for %d samples"
            % (n_train, n_samples)
        )

    X_train, X_test = X[:n_train], X[n_train:]
    y_train, y_test = y[:n_train], y[n_train:]

    reg = RandomForestRegressor(n_estimators=2, max_depth=20, n_jobs=-1, verbose=4,
                                warm_start=True, random_state=random_state)
    reg.fit(X_train, y_train)

    # For a regressor, score() is the coefficient of determination (R^2),
    # not an accuracy.
    r2_train = reg.score(X_train, y_train)
    r2_valid = reg.score(X_test, y_test)

    # mean_squared_error takes (y_true, y_pred) in that order.
    rmse_train = np.sqrt(mean_squared_error(y_train, reg.predict(X_train)))
    rmse_valid = np.sqrt(mean_squared_error(y_test, reg.predict(X_test)))

    print(" R^2 (train) = %0.3f, R^2 (valid) = %0.3f, RMSE (train) = %0.3f, RMSE (valid) = %0.3f" % (r2_train, r2_valid, rmse_train, rmse_valid))
    

def RandomForestLatLon(X, y):
    """Fit and evaluate a random-forest regressor on both targets at once.

    ``y`` is passed through unchanged, so the model is trained on every
    target column it contains (latitude and longitude together).
    """
    print("predicting latitude, longitude")
    RandomForest(X, y)
    
def RandomForestLon(X, y):
    """Fit and evaluate a random-forest regressor on longitude only.

    ``y`` must be 2-D; column 1 is taken as the longitude target.
    """
    # Column 1 of y is the second target column (longitude).
    y = y[:, 1]
    print("predicting longitude")
    RandomForest(X, y)
    
def RandomForestLat(X, y):
    """Fit and evaluate a random-forest regressor on latitude only.

    ``y`` must be 2-D; column 0 is taken as the latitude target.
    """
    # Column 0 of y is the first target column (latitude).
    y = y[:, 0]
    print("predicting latitude")
    RandomForest(X, y)
    

In [6]:
# Single-target run: fit and evaluate on column 0 of y (latitude) only.
RandomForestLat(X, y)

predicting latitude
building tree 1 of 2building tree 2 of 2



[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  3.8min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  3.8min finished
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:   17.8s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:   17.8s finished
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    5.2s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    5.3s finished
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:   17.2s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:   17.2s finished
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    5.4s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    5.4s finished


 R^2 (train) = 0.465, R^2 (valid) = 0.422, RMSE (train) = 0.027, RMSE (valid) = 0.028


In [7]:
# Joint run: one model fitted on both columns of y (latitude and longitude).
RandomForestLatLon(X, y)

predicting latitude, longitude
building tree 1 of 2
building tree 2 of 2


[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  3.9min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  3.9min finished
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:   17.1s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:   17.1s finished
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    5.2s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    5.2s finished
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:   17.3s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:   17.3s finished
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    5.3s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    5.3s finished


 R^2 (train) = 0.416, R^2 (valid) = 0.374, RMSE (train) = 0.028, RMSE (valid) = 0.029


In [10]:
# Single-target run: fit and evaluate on column 1 of y (longitude) only.
RandomForestLon(X, y)

predicting longitude
building tree 1 of 2
building tree 2 of 2


[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  4.0min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  4.0min finished
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:   15.3s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:   15.3s finished
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    4.4s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    4.4s finished
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:   14.8s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:   14.8s finished
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    4.2s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    4.2s finished


 R^2 (train) = 0.376, R^2 (valid) = 0.325, RMSE (train) = 0.030, RMSE (valid) = 0.031
