In [787]:
ROWS = 150000
NODES = [100,100,50]

## Import Libraries

In [813]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from keras.models import model_from_json
from sklearn.externals import joblib
from keras.layers import Dropout
from keras.constraints import maxnorm


pd.set_option('display.max_columns', 50)

## Data Preparation

In [712]:
def file_to_dataFrame(file_name, subset=True, nrows=ROWS):
    if subset:
        df = pd.read_csv(file_name, nrows=nrows, parse_dates=['pickup_datetime'])
    else:
        df = pd.read_csv(file_name, parse_dates=['pickup_datetime'])
    return df

In [713]:
def make_Xtest_ytest(df):
    y_test = df['key']
    y_test = pd.DataFrame(y_test)
    X_test = df.drop('key', axis=1)
    return X_test, y_test

## Clean Data

In [714]:
def clean_data(df):
    print('dropping nan:', len(df))
    df = df.dropna(axis=0, subset=['dropoff_latitude'])
    df = df.drop('key', axis=1)
    print('nan dropped:', len(df))
    return df

In [715]:
def lat_lon_US(df):
    # Choose cab rides whose pickup and dropoff are the US Mainland
    # Declare constants
    latmin = 5.496100
    latmax = 71.538800
    longmin = -124.482003
    longmax = -66.885417

    # Create dataframe with correct coordinates
    df = df[((((df['pickup_longitude']<=longmax) & (df['pickup_longitude']>=longmin)) & ((df['pickup_latitude']<=latmax) & (df['pickup_latitude']>=latmin)))) & ((((df['dropoff_longitude']<=longmax) & (df['dropoff_longitude']>=longmin)) & ((df['dropoff_latitude']<=latmax) & (df['dropoff_latitude']>=latmin))))]
    
    print('US Mainland Only dropped:', len(df))

    return df

In [716]:
def lat_lon_NYC(df):
    # Find cab rides whose pickup or dropoff are within NYC boundaries
    # Declare constants
    latmin = 40.477399
    latmax = 40.917577
    longmin = -74.259090
    longmax = -73.700272

    # Create dataframe with correct coordinates
    df = df[((((df['pickup_longitude']<=longmax) & (df['pickup_longitude']>=longmin)) & ((df['pickup_latitude']<=latmax) & (df['pickup_latitude']>=latmin)))) | ((((df['dropoff_longitude']<=longmax) % (df['dropoff_longitude']>=longmin)) & ((df['dropoff_latitude']<=latmax) & (df['dropoff_latitude']>=latmin))))]
    
    print('NYC Taxis Only:', len(df))

    return df

In [717]:
def max_Riders(df, num=7):
    # Only choose cabs between 1 and num riders
    df = df[(df['passenger_count'] <= num) & (df['passenger_count'] > 0)]
    print('Max Passengers 7:', len(df))
    return df

In [718]:
def add_distance(df):

    # Define coordinates (x,y)
    x1 = df['pickup_latitude']
    y1 = df['pickup_longitude']
    x2 = df['dropoff_latitude']
    y2 = df['dropoff_longitude']

    # Create Euclidean Distrance column
    df['euclidean_distance'] = np.sqrt((y2-y1)**2 + (x2-x1)**2)

    # Create Taxicab Distance column
    df['taxicab_distance'] = np.abs(y2-y1) + np.abs(x2-x1)

    # Convert to miles
    df['euclidean_distance'] = df['euclidean_distance'] * 69
    df['taxicab_distance'] = df['taxicab_distance'] * 69
    
    print('Distance Columns added...')

    return df

In [719]:
def min_Fare(df):
    # Eliminate unrealistic plots
    df = df[df['fare_amount'] >= (df['euclidean_distance'] * 2 + 2.5)]
    print('Min fares dropped:', len(df))

    return df

In [720]:
def max_Fare(df):
    df = df[(df['fare_amount'] <= (df['taxicab_distance'] * 48 + 16)) | (df['fare_amount'] <= 56)]
    print('Max fares dropped:', len(df))
    return df

In [721]:
def no_distance(df):
    # Elminate fares that traveled no distance
    df = df[df['euclidean_distance']>0]
    print('No distance dropped:', len(df))
    return df

In [722]:
def row_elimination(df):
    df = clean_data(df)
    df = lat_lon_US(df)
    df = lat_lon_NYC(df)
    df = max_Riders(df)
    df = add_distance(df)
    df = min_Fare(df)
    df = max_Fare(df)
    df = no_distance(df)
    return df

## X_train, y_train Columns

In [723]:
def make_X_y(df):
    X = df.drop('fare_amount', axis=1)
    y = df['fare_amount'].copy()
    return X,y

## Garbage Removal

In [808]:
# Get rid of accumulated garbage
import gc
gc.collect()

32698

## Add Attributes

### Time

In [725]:
def add_Time_units(df):
    
    df['month'] = df['pickup_datetime'].dt.month
    df['year'] = df['pickup_datetime'].dt.year
    df['hour'] = df['pickup_datetime'].dt.hour
    df['minute'] = df['pickup_datetime'].dt.minute
    df['second'] = df['pickup_datetime'].dt.second
    df['dayofweek'] = df['pickup_datetime'].dt.dayofweek
    
    from pandas.tseries.holiday import USFederalHolidayCalendar as calendar
    dr = pd.date_range(start='2009-01-01', end='2015-12-31')
    cal = calendar()
    holidays = cal.holidays(start=dr.min(), end=dr.max())
    df['holiday'] = df['pickup_datetime'].dt.date.astype('datetime64').isin(holidays)
    
    df = df.drop('pickup_datetime', axis=1)

    df['total_seconds'] = 3600 * df['hour'] + 60 * df['minute'] + df['second']
        
    return df

In [752]:
def add_Time_columns(df):
    
    def morning_rush(row):
        if ((row['hour'] in [6,7,8,9]) & (row['dayofweek'] in [0,1,2,3,4])) & (not row['holiday']):
            return 1
        else:
            return 0

    df['morning_rush'] = df.apply(morning_rush, axis=1)

    def night_charge(row):
        if row['hour'] in [20,21,22,23,24,1,2,3,4,5,6]:
            return 1
        else:
            return 0

    df['night_charge'] = df.apply(night_charge, axis=1)

    def weekday_surcharge(row):
        if ((row['hour'] in [16,17,18,19,20]) & (row['dayofweek'] in [0,1,2,3,4])) & (not row['holiday']):
            return 1
        else:
            return 0

    df['weekday_surcharge'] = df.apply(weekday_surcharge, axis=1)
        
    return df

In [727]:
def add_Time(df):
    df = add_Time_units(df)
    df = add_Time_columns(df)
    return df

### Manhattan

In [728]:
# Define line from two points and a provided column
def two_points_line(a, b, column):
        
    # Case when y-values are the same
    if b[1]==a[1]:
        
        # Slope defaults to 0
        slope = 0
        
    # Case when x-values are the same
    elif b[0]==a[0]:
        
        # Case when max value is less than 999999999
        if column.max() < 999999999:
            
            # Add 999999999 to max value
            slope = column.max() + 999999999
        
        # All other cases
        else:
            
            # Multiply max value by itself (greater than 999999999)
            slope = column.max() * column.max()
    
    # When x-values and y-values are not 0
    else:
        
        # Use standard slope formula
        slope = (b[1] - a[1])/(b[0]-a[0])
    
    
    # Equation for y-intercept (solving y=mx+b for b)
    y_int = a[1] - slope * a[0]
    
    # Return slope and y-intercept
    return slope, y_int

In [729]:
def manhattan_cols(df):
    
    upper_right = (-73.929224, 40.804328)
    bottom_right = (-73.980036, 40.710706)
    bottom_left = (-74.054880, 40.681292)
    upper_left = (-73.966303, 40.830050)

    m_top, b_top = two_points_line(upper_right, upper_left, df.pickup_latitude)
    m_left, b_left = two_points_line(bottom_left, upper_left, df.pickup_latitude)
    m_right, b_right = two_points_line(bottom_right, upper_right, df.pickup_latitude)
    m_bottom, b_bottom = two_points_line(bottom_right, bottom_left, df.pickup_latitude)

    def manhattan(row):
        if (((row['pickup_latitude'] <= (row['pickup_longitude'] * m_top + b_top)) &
        (row['pickup_latitude'] >= (row['pickup_longitude'] * m_bottom + b_bottom))) &
        ((row['pickup_latitude'] >= (row['pickup_longitude'] * m_right + b_right)) &
        (row['pickup_latitude'] <= (row['pickup_longitude'] * m_left + b_left)))) & (((row['dropoff_latitude'] <= (row['dropoff_longitude'] * m_top + b_top)) &
        (row['dropoff_latitude'] >= (row['dropoff_longitude'] * m_bottom + b_bottom))) &
        ((row['dropoff_latitude'] >= (row['dropoff_longitude'] * m_right + b_right)) &
        (row['dropoff_latitude'] <= (row['dropoff_longitude'] * m_left + b_left)))):
            return 1
        else:
            return 0
        
    df['manhattan'] = df.apply(manhattan, axis=1)
        
    return df

In [730]:
def newark_cols(df):
    
    upper_right = (-74.107867, 40.718282)
    bottom_right = (-74.143665, 40.654673)
    bottom_left = (-74.250524, 40.698436)
    upper_left = (-74.171983, 40.792347)

    m_top, b_top = two_points_line(upper_right, upper_left, df.pickup_latitude)
    m_left, b_left = two_points_line(bottom_left, upper_left, df.pickup_latitude)
    m_right, b_right = two_points_line(bottom_right, upper_right, df.pickup_latitude)
    m_bottom, b_bottom = two_points_line(bottom_right, bottom_left, df.pickup_latitude)

    def newark(row):
        if (((row['pickup_latitude'] <= (row['pickup_longitude'] * m_top + b_top)) &
        (row['pickup_latitude'] >= (row['pickup_longitude'] * m_bottom + b_bottom))) &
        ((row['pickup_latitude'] >= (row['pickup_longitude'] * m_right + b_right)) &
        (row['pickup_latitude'] <= (row['pickup_longitude'] * m_left + b_left)))) | (((row['dropoff_latitude'] <= (row['dropoff_longitude'] * m_top + b_top)) &
        (row['dropoff_latitude'] >= (row['dropoff_longitude'] * m_bottom + b_bottom))) &
        ((row['dropoff_latitude'] >= (row['dropoff_longitude'] * m_right + b_right)) &
        (row['dropoff_latitude'] <= (row['dropoff_longitude'] * m_left + b_left)))):
            return 1
        else:
            return 0
        
    df['newark'] = df.apply(newark, axis=1)
    
    return df

In [731]:
def jkf_cols(df):
    
    upper_right = (-73.789700, 40.663781)
    bottom_right = (-73.762112, 40.633567)
    bottom_left = (-73.818920, 40.642250)
    upper_left = (-73.804656, 40.664858)

    m_top, b_top = two_points_line(upper_right, upper_left, df.pickup_latitude)
    m_left, b_left = two_points_line(bottom_left, upper_left, df.pickup_latitude)
    m_right, b_right = two_points_line(bottom_right, upper_right, df.pickup_latitude)
    m_bottom, b_bottom = two_points_line(bottom_right, bottom_left, df.pickup_latitude)

    def jfk(row):
        if (((row['pickup_latitude'] <= (row['pickup_longitude'] * m_top + b_top)) &
        (row['pickup_latitude'] >= (row['pickup_longitude'] * m_bottom + b_bottom))) &
        ((row['pickup_latitude'] <= (row['pickup_longitude'] * m_right + b_right)) &
        (row['pickup_latitude'] <= (row['pickup_longitude'] * m_left + b_left)))) | (((row['dropoff_latitude'] <= (row['dropoff_longitude'] * m_top + b_top)) &
        (row['dropoff_latitude'] >= (row['dropoff_longitude'] * m_bottom + b_bottom))) &
        ((row['dropoff_latitude'] <= (row['dropoff_longitude'] * m_right + b_right)) &
        (row['dropoff_latitude'] <= (row['dropoff_longitude'] * m_left + b_left)))):
            return 1
        else:
            return 0
        
    df['jfk'] = df.apply(jfk, axis=1)
            
    return df

In [732]:
def add_locations(df):
    df = manhattan_cols(df)
    df = jkf_cols(df)
    df = newark_cols(df)
    return df

In [733]:
def add_cols(df):
    df = add_Time(df)
    df = add_locations(df)
    return df

## Choose Columns

In [734]:
def choose_predictor_cols(df, cols=['month', 'year', 'dayofweek', 'total_seconds', 'morning_rush', 'night_charge', 'weekday_surcharge', 'manhattan', 'jfk', 'newark', 'passenger_count','euclidean_distance', 'pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude']):
    X = df[cols]
    return X

## Min Max Scaler

In [735]:
def min_max_scaler(X):
    scaler = MinMaxScaler()
    X_scaled = scaler.fit_transform(X)
    X_df = pd.DataFrame(X_scaled, columns=X.columns)
    return X_df

## One Hot Encoder

In [736]:
def one_hot_cols(X):
    X = one_Hot_Encoder(X, X['month'])
    del X['month']
    X = one_Hot_Encoder(X, X['dayofweek'], month=False)
    del X['dayofweek']
    return X

In [737]:
def one_Hot_Encoder(X, col, month=True): 
    encoder = OneHotEncoder()
    hot_array = encoder.fit_transform(np.array(col).reshape(-1,1)).toarray()
    hot_df = pd.DataFrame(hot_array)
    if month:
        hot_df.columns = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
    else:
        hot_df.columns = ['Sun', 'Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat']
    new_df = X.join(hot_df)
    return new_df

## Reset Index

In [738]:
def reset_index(X):
    X = X.reset_index(drop=True)
    return X

## Pipeline

In [739]:
def my_pipeline(test_set=False):
    
    if test_set:
        df = file_to_dataFrame('test.csv')
        X, y = make_Xtest_ytest(df)
        X = add_distance(X)
    
    else:
        df = file_to_dataFrame('train.csv')
        df = row_elimination(df)
        X, y = make_X_y(df)
    
    X = reset_index(X)
    X = add_cols(X)
    X = choose_predictor_cols(X)
    X = one_hot_cols(X)
    X = min_max_scaler(X)
    
    return X, y

## ML Tests

### Linear Regression

In [781]:
def linear_regression():
        
    print('Length of X:', len(X_train))
    lr_model = LinearRegression()
    lr_model.fit(X_train, y_train)
    scores = cross_val_score(lr_model, X_train, y_train, scoring='neg_mean_squared_error', cv=5)
    rmse = np.sqrt(-scores)
    print('Lin reg train rmse:', rmse)
    print('Lin reg train mean:', rmse.mean())
    print('Lin reg train std:', rmse.std())
    
    joblib.dump(lr_model, 'lr_model.pkl') 
    
    print('Linear Regression model saved as "lr_model.pkl"')
    
    return lr_model

### Random Forests

In [747]:
def random_random_forest_tuner(X,y):
        
    param_grid = [
        {'n_estimators': [75, 100, 250, 500, 750, 1000, 15000], 'max_features': [5, 10, 15, 20, 25]}, 
    ]
    
    forest_reg = RandomForestRegressor()
    
    forest_reg_tuned = RandomSearchCV(forest_reg, param_grid, n_iter=6, cv=3, 
                                    scoring='neg_mean_squared_error')
    
    forest_reg_tuned.fit(X,y)
    
    # Print the tuned parameters and score
    print("Tuned Random Forest Parameters: {}".format(forest_reg_tuned.best_params_))
    
    scores = cross_val_score(forest_reg_tuned, X, y, scoring='neg_mean_squared_error', cv=3)
    
    display_scores('Random Forest', scores)
    
    return forest_reg_tuned

In [748]:
def random_forest_tuner(X,y):
        
    param_grid = [
        {'n_estimators': [100, 500, 1000], 'max_features': [10]}, 
    ]
    
    forest_reg = RandomForestRegressor()
    
    forest_reg_tuned = GridSearchCV(forest_reg, param_grid, cv=3, 
                                    scoring='neg_mean_squared_error')
    
    forest_reg_tuned.fit(X,y)
    
    # Print the tuned parameters and score
    print("Tuned Random Forest Parameters: {}".format(forest_reg_tuned.best_params_))
    
    scores = cross_val_score(forest_reg_tuned, X, y, scoring='neg_mean_squared_error', cv=3)
    
    display_scores('Random Forest', scores)
    
    return forest_reg_tuned

In [749]:
def display_scores(title, scores):
    rmse = np.sqrt(-scores)
    print(title, ' rmse scores:', rmse)
    print(title, ' mean score:', rmse.mean())
    print(title, ' std:', rmse.std())

In [750]:
def random_forest():
    
    rf_model = RandomForestRegressor(max_features=10, n_estimators=50)
    
    rf_model.fit(X_train, y_train)
    
    scores = cross_val_score(rf_model, X_train, y_train, scoring='neg_mean_squared_error', cv=5)
    
    display_scores('Random Forest', scores)
    
    joblib.dump(rf_model, 'rf_model.pkl') 
    
    print('Random forest model saves as "rf_model.pkl"')
        
    return rf_model

### Deep Learning (Sequential)

In [815]:
# keras_regression_test requires "from sklearn.model_selection import train_test_split"
def deep_learning(nodes=NODES, batch_size=32, activation='relu', optimizer='adam', loss='mean_squared_error'):
        
    X, X_check, y, y_check = train_test_split(X_train, y_train)
    
    # Save the number of columns in predictors: n_cols
    n_cols = X.shape[1]

    # Set up the model: model
    model = Sequential()
    
    # Add the first layer
    model.add(Dense(nodes[0], activation=activation, input_shape=(n_cols,)))
    
    # Add addition layers
    for i in range(len(nodes)-1):
        model.add(Dense(nodes[i+1], activation=activation, kernel_constraint=maxnorm(3)))
        model.add(Dropout(0.2))

    # Add the output layer
    model.add(Dense(1))

    # Compile the model
    model.compile(optimizer=optimizer, loss=loss)

    # Define early_stopping_monitor
    early_stopping_monitor = EarlyStopping(patience=3)

    # Fit the model
    model.fit(X, y, validation_split=0.05, epochs=30, batch_size=batch_size, callbacks=[early_stopping_monitor])

    # Get score for predictions
    score = model.evaluate(X_check, y_check)
    
    # Get root mean squared error
    rmse = np.sqrt(score)
    
    # Return root mean squared error
    print(rmse)
    
    save_keras_model(model)
    
    return model

In [786]:
def save_keras_model(model):
    # serialize model to JSON
    model_json = model.to_json()
    with open("dl_model.json", "w") as json_file:
        json_file.write(model_json)
    # serialize weights to HDF5
    model.save_weights("model.h5")
    print("Saved deep learning model as 'dl_model.json'")
    return model
  
def open_keras_model(file):
    # load json and create model
    json_file = open(file, 'r')
    loaded_model_json = json_file.read()
    json_file.close()
    loaded_model = model_from_json(loaded_model_json)
    # load weights into new model
    loaded_model.load_weights("model.h5")
    print("Loaded model from disk")
    return loaded_model 

In [776]:
def open_model(saved_model, keras=False):
    if keras:
        model = open_keras_model(saved_model)
    else:
        model = joblib.load(saved_model)
    return model

def kaggle_submit(saved_model, keras=False):
    if keras:
        model = open_model(saved_model, keras=True)
    else:
        model = open_model(saved_model)
    y_pred = model.predict(X_test)
    submit_to_kaggle(y_pred)

## Tests

In [753]:
X_train, y_train = my_pipeline()
X_test, y_test = my_pipeline(test_set=True)

dropping nan: 150000
nan dropped: 149999
US Mainland Only dropped: 146911
NYC Taxis Only: 146732
Max Passengers 7: 146192
Distance Columns added...
Min fares dropped: 141891
Max fares dropped: 141807
No distance dropped: 140311
Distance Columns added...


### LR Test

In [782]:
linear_regression()

Length of X: 140311
Lin reg train rmse: [3.56801881 3.75088332 3.76600961 3.56703533 3.63740094]
Lin reg train mean: 3.6578696019310613
Lin reg train std: 0.08612642774919092
Linear Regression model saves as "lr_model.pkl"


LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [818]:
kaggle_submit('lr_model.pkl')

                                key  fare_amount
0       2015-01-27 13:08:24.0000002    10.300781
1       2015-01-27 13:08:24.0000003    10.644531
2       2011-10-08 11:53:44.0000002     5.871094
3       2012-12-01 21:12:12.0000002     7.949219
4       2012-12-01 21:12:12.0000003    13.804688
5       2012-12-01 21:12:12.0000005    10.320312
6       2011-10-06 12:10:20.0000001     7.003906
7       2011-10-06 12:10:20.0000003    48.417969
8       2011-10-06 12:10:20.0000002    11.814453
9       2014-02-18 15:22:20.0000002     8.158203
10      2014-02-18 15:22:20.0000003    10.187500
11      2014-02-18 15:22:20.0000001    15.187500
12      2010-03-29 20:20:32.0000002     4.216797
13      2010-03-29 20:20:32.0000001     6.328125
14      2011-10-06 03:59:12.0000002     7.890625
15      2011-10-06 03:59:12.0000001    12.484375
16      2012-07-15 16:45:04.0000006     5.542969
17      2012-07-15 16:45:04.0000002     9.509766
18      2012-07-15 16:45:04.0000003     5.984375
19      2012-07-15 1

### RF Test

In [820]:
random_forest()

Random Forest  rmse scores: [2.93958553 3.1945351  3.20662355 2.97817472 3.201462  ]
Random Forest  mean score: 3.104076179478646
Random Forest  std: 0.11924020203471307


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features=10, max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=40, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)

In [817]:
kaggle_submit('rf_model.pkl')

                                key  fare_amount
0       2015-01-27 13:08:24.0000002     12.79500
1       2015-01-27 13:08:24.0000003     15.44500
2       2011-10-08 11:53:44.0000002      9.15000
3       2012-12-01 21:12:12.0000002     10.45250
4       2012-12-01 21:12:12.0000003     14.40750
5       2012-12-01 21:12:12.0000005     11.94575
6       2011-10-06 12:10:20.0000001     10.35500
7       2011-10-06 12:10:20.0000003     60.55550
8       2011-10-06 12:10:20.0000002     17.90250
9       2014-02-18 15:22:20.0000002     13.79750
10      2014-02-18 15:22:20.0000003     13.85375
11      2014-02-18 15:22:20.0000001     18.09575
12      2010-03-29 20:20:32.0000002     11.01625
13      2010-03-29 20:20:32.0000001     11.38375
14      2011-10-06 03:59:12.0000002     13.10750
15      2011-10-06 03:59:12.0000001     15.84625
16      2012-07-15 16:45:04.0000006     10.31000
17      2012-07-15 16:45:04.0000002     12.03750
18      2012-07-15 16:45:04.0000003      9.72500
19      2012-07-15 1

In [766]:
feature_importances = rf_model.feature_importances_
sorted(zip(feature_importances, list(X_train)), reverse=True)

[(0.5133332720979047, 'euclidean_distance'),
 (0.13953042368532476, 'manhattan'),
 (0.07363370548489193, 'jfk'),
 (0.06393871525252566, 'dropoff_longitude'),
 (0.053156706641823516, 'pickup_longitude'),
 (0.043079284579644266, 'dropoff_latitude'),
 (0.036569960777515426, 'pickup_latitude'),
 (0.019605323230097702, 'year'),
 (0.012460993949790524, 'total_seconds'),
 (0.00916419075100937, 'newark'),
 (0.008808938852134011, '15_min_intervals'),
 (0.002681150543599505, 'passenger_count'),
 (0.0023097732259236762, 'night_charge'),
 (0.0014802166748075797, 'Sep'),
 (0.0014105295197307307, 'Fri'),
 (0.0013284927157269147, 'Sat'),
 (0.0012319062149000225, 'Sun'),
 (0.0012072193884114255, 'Wed'),
 (0.0012060537037168972, 'Dec'),
 (0.0011829885879903284, 'Tue'),
 (0.0011751225329398938, 'Thu'),
 (0.0010962228693957895, 'Oct'),
 (0.0010345787523343814, 'Mon'),
 (0.0009648258025364915, 'Nov'),
 (0.0009606868604274225, 'May'),
 (0.0009415374192659018, 'weekday_surcharge'),
 (0.0008805622533894549, 

### keras Tests

In [793]:
NODES = [130,13,130]
dl_model = deep_learning()

Train on 103128 samples, validate on 2105 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
3.6373370141014076
Saved deep learning model as 'dl_model.json'


In [794]:
kaggle_submit('dl_model.json', keras=True)

Loaded model from disk
                                key  fare_amount
0       2015-01-27 13:08:24.0000002    10.799384
1       2015-01-27 13:08:24.0000003    11.271863
2       2011-10-08 11:53:44.0000002     7.030448
3       2012-12-01 21:12:12.0000002     9.589872
4       2012-12-01 21:12:12.0000003    15.570109
5       2012-12-01 21:12:12.0000005    11.982323
6       2011-10-06 12:10:20.0000001     8.677094
7       2011-10-06 12:10:20.0000003    42.664478
8       2011-10-06 12:10:20.0000002    13.659761
9       2014-02-18 15:22:20.0000002     7.811248
10      2014-02-18 15:22:20.0000003    10.253995
11      2014-02-18 15:22:20.0000001    16.093931
12      2010-03-29 20:20:32.0000002     5.989366
13      2010-03-29 20:20:32.0000001     7.846018
14      2011-10-06 03:59:12.0000002     8.266769
15      2011-10-06 03:59:12.0000001    11.906550
16      2012-07-15 16:45:04.0000006     7.145779
17      2012-07-15 16:45:04.0000002    10.380497
18      2012-07-15 16:45:04.0000003     7.3445

In [None]:
# cuda
# p42xlarge
# ec2 instance pricing
# make sure you have gpu and optimization
# save as pickle file