In [583]:
ROWS = 6000000
NODES = [100,100,50]

## Import Libraries

In [584]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from keras.models import model_from_json
from sklearn.externals import joblib
from keras.layers import Dropout
from keras.constraints import maxnorm
import keras
from keras.layers import Dense
from keras.models import Sequential
from keras.callbacks import EarlyStopping
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import Ridge

pd.set_option('display.max_columns', 50)

## Data Preparation

In [585]:
def file_to_dataFrame(file_name, subset=True, nrows=ROWS):
    if subset:
        df = pd.read_csv(file_name, nrows=nrows, parse_dates=['pickup_datetime'])
    else:
        df = pd.read_csv(file_name, parse_dates=['pickup_datetime'])
    return df

In [586]:
def make_Xtest_ytest(df, split=False):
    y_test = df['key']
    y_test = pd.DataFrame(y_test)
    X_test = df.drop('key', axis=1)
    return X_test, y_test

## Clean Data

In [587]:
def clean_data(df):
    x = len(df)
    print('Length of df:', x)
    df = df.dropna(axis=0, subset=['dropoff_latitude'])
    df = df.drop('key', axis=1)
    y = len(df)
    print('NaN dropped:', x-y)
    return df

In [588]:
def lat_lon_US(df):
    x = len(df)
    # Choose cab rides whose pickup and dropoff are the US Mainland
    # Declare constants
    latmin = 5.496100
    latmax = 71.538800
    longmin = -124.482003
    longmax = -66.885417

    # Create dataframe with correct coordinates
    df = df[((((df['pickup_longitude']<=longmax) & (df['pickup_longitude']>=longmin)) & ((df['pickup_latitude']<=latmax) & (df['pickup_latitude']>=latmin)))) & ((((df['dropoff_longitude']<=longmax) & (df['dropoff_longitude']>=longmin)) & ((df['dropoff_latitude']<=latmax) & (df['dropoff_latitude']>=latmin))))]
    
    print('US Mainland Only dropped:', x-len(df))

    return df

In [589]:
def lat_lon_NYC(df):
    x = len(df)
    # Find cab rides whose pickup or dropoff are within NYC boundaries
    # Declare constants
    latmin = 40.477399
    latmax = 40.917577
    longmin = -74.259090
    longmax = -73.700272

    # Create dataframe with correct coordinates
    df = df[((((df['pickup_longitude']<=longmax) & (df['pickup_longitude']>=longmin)) & ((df['pickup_latitude']<=latmax) & (df['pickup_latitude']>=latmin)))) | ((((df['dropoff_longitude']<=longmax) % (df['dropoff_longitude']>=longmin)) & ((df['dropoff_latitude']<=latmax) & (df['dropoff_latitude']>=latmin))))]
    
    print('NYC Taxis Only dropped:', x-len(df))

    return df

In [590]:
def max_Riders(df, num=6):
    x = len(df)
    # Only choose cabs between 1 and num riders
    df = df[(df['passenger_count'] <= num) & (df['passenger_count'] > 0)]
    print('Max Passengers 6 dropped:',  x-len(df))
    return df

In [591]:
def add_distance(df):

    # Define coordinates (x,y)
    x1 = df['pickup_latitude']
    y1 = df['pickup_longitude']
    x2 = df['dropoff_latitude']
    y2 = df['dropoff_longitude']

    # Create Euclidean Distrance column
    df['euclidean_distance'] = np.sqrt((y2-y1)**2 + (x2-x1)**2)

    # Create Taxicab Distance column
    df['taxicab_distance'] = np.abs(y2-y1) + np.abs(x2-x1)

    # Convert to miles
    df['euclidean_distance'] = df['euclidean_distance'] * 69
    df['taxicab_distance'] = df['taxicab_distance'] * 69
    
    print('Distance Columns added...')

    return df

In [592]:
def min_Fare(df):
    # Eliminate unrealistic plots
    df = df[df['fare_amount'] >= (df['euclidean_distance'] * 2 + 2.5)]
    print('Min fares dropped:', len(df))

    return df

In [593]:
def max_Fare(df):
    df = df[(df['fare_amount'] <= (df['taxicab_distance'] * 48 + 16)) | (df['fare_amount'] <= 56)]
    print('Max fares dropped:', len(df))
    return df

In [594]:
def no_distance(df):
    # Elminate fares that traveled no distance
    df = df[df['euclidean_distance']>0]
    print('No distance dropped:', len(df))
    return df

In [595]:
def distance_cap(df, cap=75):
    df = df[df['euclidean_distance'] < cap]
    print('Distance cap dropped:', len(df))
    return df

In [596]:
def row_elimination(df):
    df = clean_data(df)
    df = lat_lon_US(df)
    df = lat_lon_NYC(df)
    df = max_Riders(df)
    df = add_distance(df)
    #df = min_Fare(df)
    #df = max_Fare(df)
    #df = no_distance(df)
    return df

## X_train, y_train Columns

In [597]:
def make_X_y(df, split=False):
    X = df.drop('fare_amount', axis=1)
    y = df['fare_amount'].copy()
    return X,y

## Garbage Removal

In [598]:
# Get rid of accumulated garbage
import gc
gc.collect()

264

## Add Attributes

### Time

In [599]:
def add_Time_units(df):
    
    df['month'] = df['pickup_datetime'].dt.month
    df['year'] = df['pickup_datetime'].dt.year
    df['hour'] = df['pickup_datetime'].dt.hour
    df['minute'] = df['pickup_datetime'].dt.minute
    df['second'] = df['pickup_datetime'].dt.second
    df['dayofweek'] = df['pickup_datetime'].dt.dayofweek
    
    from pandas.tseries.holiday import USFederalHolidayCalendar as calendar
    dr = pd.date_range(start='2009-01-01', end='2015-12-31')
    cal = calendar()
    holidays = cal.holidays(start=dr.min(), end=dr.max())
    df['holiday'] = df['pickup_datetime'].dt.date.astype('datetime64').isin(holidays)
    
    df = df.drop('pickup_datetime', axis=1)

    df['total_seconds'] = 3600 * df['hour'] + 60 * df['minute'] + df['second']
        
    return df

In [600]:
def add_Time_columns(df):
    
    def morning_rush(row):
        if ((row['hour'] in [6,7,8,9]) & (row['dayofweek'] in [0,1,2,3,4])) & (not row['holiday']):
            return 1
        else:
            return 0

    df['morning_rush'] = df.apply(morning_rush, axis=1)

    def night_charge(row):
        if row['hour'] in [20,21,22,23,24,1,2,3,4,5,6]:
            return 1
        else:
            return 0

    df['night_charge'] = df.apply(night_charge, axis=1)

    def weekday_surcharge(row):
        if ((row['hour'] in [16,17,18,19,20]) & (row['dayofweek'] in [0,1,2,3,4])) & (not row['holiday']):
            return 1
        else:
            return 0

    df['weekday_surcharge'] = df.apply(weekday_surcharge, axis=1)
        
    return df

In [601]:
def add_Time(df):
    df = add_Time_units(df)
    df = add_Time_columns(df)
    return df

### Manhattan

In [602]:
# Define line from two points and a provided column
def two_points_line(a, b, column):
        
    # Case when y-values are the same
    if b[1]==a[1]:
        
        # Slope defaults to 0
        slope = 0
        
    # Case when x-values are the same
    elif b[0]==a[0]:
        
        # Case when max value is less than 999999999
        if column.max() < 999999999:
            
            # Add 999999999 to max value
            slope = column.max() + 999999999
        
        # All other cases
        else:
            
            # Multiply max value by itself (greater than 999999999)
            slope = column.max() * column.max()
    
    # When x-values and y-values are not 0
    else:
        
        # Use standard slope formula
        slope = (b[1] - a[1])/(b[0]-a[0])
    
    
    # Equation for y-intercept (solving y=mx+b for b)
    y_int = a[1] - slope * a[0]
    
    # Return slope and y-intercept
    return slope, y_int

In [603]:
def manhattan_cols(df):
    
    upper_right = (-73.929224, 40.804328)
    bottom_right = (-73.980036, 40.710706)
    bottom_left = (-74.054880, 40.681292)
    upper_left = (-73.966303, 40.830050)

    m_top, b_top = two_points_line(upper_right, upper_left, df.pickup_latitude)
    m_left, b_left = two_points_line(bottom_left, upper_left, df.pickup_latitude)
    m_right, b_right = two_points_line(bottom_right, upper_right, df.pickup_latitude)
    m_bottom, b_bottom = two_points_line(bottom_right, bottom_left, df.pickup_latitude)

    def manhattan_pickup(row):
        if (((row['pickup_latitude'] <= (row['pickup_longitude'] * m_top + b_top)) &
        (row['pickup_latitude'] >= (row['pickup_longitude'] * m_bottom + b_bottom))) &
        ((row['pickup_latitude'] >= (row['pickup_longitude'] * m_right + b_right)) &
        (row['pickup_latitude'] <= (row['pickup_longitude'] * m_left + b_left)))):
            return 1
        else:
            return 0
    
    df['manhattan_pickup'] = df.apply(manhattan_pickup, axis=1)
    
    
    def manhattan_dropoff(row):
        if (((row['dropoff_latitude'] <= (row['dropoff_longitude'] * m_top + b_top)) &
        (row['dropoff_latitude'] >= (row['dropoff_longitude'] * m_bottom + b_bottom))) &
        ((row['dropoff_latitude'] >= (row['dropoff_longitude'] * m_right + b_right)) &
        (row['dropoff_latitude'] <= (row['dropoff_longitude'] * m_left + b_left)))):
            return 1
        else:
            return 0
        
    df['manhattan_dropoff'] = df.apply(manhattan_dropoff, axis=1)
    
    
    def manhattan(row):
        if (row['manhattan_pickup']) & (row['manhattan_dropoff']):
            return 1
        else:
            return 0
    
    df['manhattan'] = df.apply(manhattan, axis=1)
    
    
    def manhattan_one_way(row):
        if (not row['manhattan']) & (row['manhattan_pickup']) | (row['manhattan_dropoff']):
            return 1
        else: 
            return 0

    df['manhattan_one_way'] = df.apply(manhattan_one_way, axis=1)
     
        
    return df

In [604]:
# def manhattan_cols(df):
    
#     upper_right = (-73.929224, 40.804328)
#     bottom_right = (-73.980036, 40.710706)
#     bottom_left = (-74.054880, 40.681292)
#     upper_left = (-73.966303, 40.830050)

#     m_top, b_top = two_points_line(upper_right, upper_left, df.pickup_latitude)
#     m_left, b_left = two_points_line(bottom_left, upper_left, df.pickup_latitude)
#     m_right, b_right = two_points_line(bottom_right, upper_right, df.pickup_latitude)
#     m_bottom, b_bottom = two_points_line(bottom_right, bottom_left, df.pickup_latitude)

#     def manhattan(row):
#         if (((row['pickup_latitude'] <= (row['pickup_longitude'] * m_top + b_top)) &
#         (row['pickup_latitude'] >= (row['pickup_longitude'] * m_bottom + b_bottom))) &
#         ((row['pickup_latitude'] >= (row['pickup_longitude'] * m_right + b_right)) &
#         (row['pickup_latitude'] <= (row['pickup_longitude'] * m_left + b_left)))) & (((row['dropoff_latitude'] <= (row['dropoff_longitude'] * m_top + b_top)) &
#         (row['dropoff_latitude'] >= (row['dropoff_longitude'] * m_bottom + b_bottom))) &
#         ((row['dropoff_latitude'] >= (row['dropoff_longitude'] * m_right + b_right)) &
#         (row['dropoff_latitude'] <= (row['dropoff_longitude'] * m_left + b_left)))):
#             return 1
#         else:
#             return 0
        
#     df['manhattan'] = df.apply(manhattan, axis=1)
        
#     return df

In [605]:
def newark_cols(df):
    
    upper_right = (-74.107867, 40.718282)
    bottom_right = (-74.143665, 40.654673)
    bottom_left = (-74.250524, 40.698436)
    upper_left = (-74.171983, 40.792347)

    m_top, b_top = two_points_line(upper_right, upper_left, df.pickup_latitude)
    m_left, b_left = two_points_line(bottom_left, upper_left, df.pickup_latitude)
    m_right, b_right = two_points_line(bottom_right, upper_right, df.pickup_latitude)
    m_bottom, b_bottom = two_points_line(bottom_right, bottom_left, df.pickup_latitude)

    def newark(row):
        if (((row['pickup_latitude'] <= (row['pickup_longitude'] * m_top + b_top)) &
        (row['pickup_latitude'] >= (row['pickup_longitude'] * m_bottom + b_bottom))) &
        ((row['pickup_latitude'] >= (row['pickup_longitude'] * m_right + b_right)) &
        (row['pickup_latitude'] <= (row['pickup_longitude'] * m_left + b_left)))) | (((row['dropoff_latitude'] <= (row['dropoff_longitude'] * m_top + b_top)) &
        (row['dropoff_latitude'] >= (row['dropoff_longitude'] * m_bottom + b_bottom))) &
        ((row['dropoff_latitude'] >= (row['dropoff_longitude'] * m_right + b_right)) &
        (row['dropoff_latitude'] <= (row['dropoff_longitude'] * m_left + b_left)))):
            return 1
        else:
            return 0
        
    df['newark'] = df.apply(newark, axis=1)
    
    return df

In [606]:
def jkf_cols(df):
    
    upper_right = (-73.789700, 40.663781)
    bottom_right = (-73.762112, 40.633567)
    bottom_left = (-73.818920, 40.642250)
    upper_left = (-73.804656, 40.664858)

    m_top, b_top = two_points_line(upper_right, upper_left, df.pickup_latitude)
    m_left, b_left = two_points_line(bottom_left, upper_left, df.pickup_latitude)
    m_right, b_right = two_points_line(bottom_right, upper_right, df.pickup_latitude)
    m_bottom, b_bottom = two_points_line(bottom_right, bottom_left, df.pickup_latitude)

    def jfk(row):
        if (((row['pickup_latitude'] <= (row['pickup_longitude'] * m_top + b_top)) &
        (row['pickup_latitude'] >= (row['pickup_longitude'] * m_bottom + b_bottom))) &
        ((row['pickup_latitude'] <= (row['pickup_longitude'] * m_right + b_right)) &
        (row['pickup_latitude'] <= (row['pickup_longitude'] * m_left + b_left)))) | (((row['dropoff_latitude'] <= (row['dropoff_longitude'] * m_top + b_top)) &
        (row['dropoff_latitude'] >= (row['dropoff_longitude'] * m_bottom + b_bottom))) &
        ((row['dropoff_latitude'] <= (row['dropoff_longitude'] * m_right + b_right)) &
        (row['dropoff_latitude'] <= (row['dropoff_longitude'] * m_left + b_left)))):
            return 1
        else:
            return 0
        
    df['jfk'] = df.apply(jfk, axis=1)
            
    return df

In [607]:
def add_locations(df):
    df = manhattan_cols(df)
    df = jkf_cols(df)
    df = newark_cols(df)
    return df

In [608]:
def add_cols(df):
    df = add_Time(df)
    df = add_locations(df)
    return df

## Choose Columns

In [609]:
def choose_predictor_cols(df, no_dist=False): 
    if no_dist:
        cols=['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec', 'year', 'Sun', 'Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'total_seconds', 'morning_rush', 'night_charge', 'weekday_surcharge', 'manhattan', 'manhattan_one_way', 'jfk', 'newark', 'passenger_count', 'pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude']
    else: 
        cols=['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec', 'year', 'Sun', 'Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'total_seconds', 'morning_rush', 'night_charge', 'weekday_surcharge', 'manhattan', 'manhattan_one_way', 'jfk', 'newark', 'passenger_count','euclidean_distance', 'taxicab_distance', 'pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude']
    X = df[cols]
    return X

## Min Max Scaler

In [610]:
def min_max_scaler(X):
    scaler = MinMaxScaler()
    X_scaled = scaler.fit_transform(X)
    X_df = pd.DataFrame(X_scaled, columns=X.columns)
    return X_df

## One Hot Encoder

In [611]:
def one_hot_cols(X):
    X = one_Hot_Encoder(X, X['month'])
    del X['month']
    X = one_Hot_Encoder(X, X['dayofweek'], month=False)
    del X['dayofweek']
    return X

In [612]:
def one_Hot_Encoder(X, col, month=True): 
    encoder = OneHotEncoder()
    hot_array = encoder.fit_transform(np.array(col).reshape(-1,1)).toarray()
    hot_df = pd.DataFrame(hot_array)
    if month:
        hot_df.columns = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
    else:
        hot_df.columns = ['Sun', 'Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat']
    new_df = X.join(hot_df)
    return new_df

## ML Tests

### Linear Regression

In [613]:
def linear_regression_split(X_train, y_train):
            
    y = y_train.median()
    mse = np.sum((y_train-y)**2)
    score = mse/len(y_train)
    rmse = np.sqrt(score)
    print('Lin reg train rmse:', rmse)
    print('Lin reg train mean:', rmse.mean())
    print('Lin reg train std:', rmse.std())
    
    return rmse

In [614]:
def linear_regression(X_train, y_train, distance_none=False, distance_high=False):
        
    print('Length of X:', len(X_train))
    lr_model = LinearRegression(fit_intercept=False)
    lr_model.fit(X_train, y_train)
    scores = cross_val_score(lr_model, X_train, y_train, scoring='neg_mean_squared_error', cv=5)
    rmse = np.sqrt(-scores)
    print('Lin reg train rmse:', rmse)
    print('Lin reg train mean:', rmse.mean())
    print('Lin reg train std:', rmse.std())
    
    if distance_none:
        joblib.dump(lr_model, 'lr_distance_none_model.pkl')
        print('Linear Regression model saved as "lr_distance_none_model.pkl"')
    elif distance_high:
        joblib.dump(lr_model, 'lr_distance_high_model.pkl')
        print('Linear Regression model saved as "lr_distance_high_model.pkl"')
    else:
        joblib.dump(lr_model, 'lr_model.pkl') 
        print('Linear Regression model saved as "lr_model.pkl"')

    return lr_model

### Ridge

In [615]:
def ridge(X_train, y_train, distance_none=False, distance_high=False):
        
    print('Length of X:', len(X_train))
    ri_model = Ridge()
    ri_model.fit(X_train, y_train)
    scores = cross_val_score(ri_model, X_train, y_train, scoring='neg_mean_squared_error', cv=5)
    rmse = np.sqrt(-scores)
    print('Lin reg train rmse:', rmse)
    print('Lin reg train mean:', rmse.mean())
    print('Lin reg train std:', rmse.std())
    
    if distance_none:
        joblib.dump(ri_model, 'ri_distance_none_model.pkl')
        print('Linear Regression model saved as "ri_distance_none_model.pkl"')
    elif distance_high:
        joblib.dump(ri_model, 'ri_distance_high_model.pkl')
        print('Linear Regression model saved as "ri_distance_high_model.pkl"')
    else:
        joblib.dump(ri_model, 'ri_model.pkl') 
        print('Linear Regression model saved as "ri_model.pkl"')

    return ri_model

### Random Forests

In [616]:
def random_random_forest_tuner(X_train, y_train):
        
    param_grid = [
        {'n_estimators': [75, 100, 250, 500, 750, 1000, 15000], 'max_features': [5, 10, 15, 20, 25]}, 
    ]
    
    forest_reg = RandomForestRegressor()
    
    forest_reg_tuned = RandomSearchCV(forest_reg, param_grid, n_iter=6, cv=3, 
                                    scoring='neg_mean_squared_error')
    
    forest_reg_tuned.fit(X,y)
    
    # Print the tuned parameters and score
    print("Tuned Random Forest Parameters: {}".format(forest_reg_tuned.best_params_))
    
    scores = cross_val_score(forest_reg_tuned, X, y, scoring='neg_mean_squared_error', cv=3)
    
    display_scores('Random Forest', scores)
    
    return forest_reg_tuned

In [617]:
def random_forest_tuner(X_train, y_train):
        
    param_grid = [
        {'n_estimators': [100, 500, 1000], 'max_features': [10]}, 
    ]
    
    forest_reg = RandomForestRegressor()
    
    forest_reg_tuned = GridSearchCV(forest_reg, param_grid, cv=3, 
                                    scoring='neg_mean_squared_error')
    
    forest_reg_tuned.fit(X,y)
    
    # Print the tuned parameters and score
    print("Tuned Random Forest Parameters: {}".format(forest_reg_tuned.best_params_))
    
    scores = cross_val_score(forest_reg_tuned, X, y, scoring='neg_mean_squared_error', cv=3)
    
    display_scores('Random Forest', scores)
    
    return forest_reg_tuned

In [618]:
def display_scores(title, scores):
    rmse = np.sqrt(-scores)
    print(title, ' rmse scores:', rmse)
    print(title, ' mean score:', rmse.mean())
    print(title, ' std:', rmse.std())

In [619]:
def random_forest(X_train, y_train, distance_none=False, distance_high=False):
    
    rf_model = RandomForestRegressor(max_features=10, n_estimators=50)
    
    rf_model.fit(X_train, y_train)
    
    scores = cross_val_score(rf_model, X_train, y_train, scoring='neg_mean_squared_error', cv=5)
    
    display_scores('Random Forest', scores)
    
    if distance_none:
        joblib.dump(rf_model, 'rf_distance_none_model.pkl')
        print('Linear Regression model saved as "rf_distance_none_model.pkl"')
    elif distance_high:
        joblib.dump(rf_model, 'rf_distance_high_model.pkl')
        print('Linear Regression model saved as "rf_distance_high_model.pkl"')
    else:
        joblib.dump(rf_model, 'rf_model.pkl') 
        print('Linear Regression model saved as "rf_model.pkl"')
        
    return rf_model

### Deep Learning (Sequential)

In [639]:
# keras_regression_test requires "from sklearn.model_selection import train_test_split"
def deep_learning(X_train, y_train, nodes=NODES, batch_size=32, activation='relu', optimizer='adam', loss='mean_squared_error', keras_distance_high=False, keras_distance_none=False):
        
    X, X_check, y, y_check = train_test_split(X_train, y_train, test_size=0.05)
    
    # Save the number of columns in predictors: n_cols
    n_cols = X.shape[1]

    # Set up the model: model
    model = Sequential()
    
    # Add the first layer
    model.add(Dense(nodes[0], activation=activation, input_shape=(n_cols,)))
    
    # Add addition layers
    for i in range(len(nodes)-1):
        model.add(Dense(nodes[i+1], activation=activation, kernel_constraint=maxnorm(3)))
        model.add(Dropout(0.1))

    # Add the output layer
    model.add(Dense(1))

    # Compile the model
    model.compile(optimizer=optimizer, loss=loss)

    # Define early_stopping_monitor
    early_stopping_monitor = EarlyStopping(patience=3)

    # Fit the model
    model.fit(X, y, validation_split=0.05, epochs=1000, batch_size=batch_size, callbacks=[early_stopping_monitor])

    # Get score for predictions
    score = model.evaluate(X_check, y_check)
    
    # Get root mean squared error
    rmse = np.sqrt(score)
    
    # Return root mean squared error
    print(rmse)
    
    save_keras_model(model, keras_distance_high=keras_distance_high, keras_distance_none=keras_distance_none)
    
    return model

## Reset Index

In [621]:
def reset_index(X):
    X = X.reset_index(drop=True)
    return X

## Pipeline

In [622]:
def data_frame_split(df):
    
    df_distance_none = df[df['euclidean_distance']==0]
    print('New dataframe "df_distance_none" created with length:', len(df_distance_none))
    
    df_distance_high = df[df['euclidean_distance']>30]
    print('New dataframe "df_distance_high" created with length:', len(df_distance_high))

    df = df[df['euclidean_distance']>0]
    df = df[df['euclidean_distance']<=30]

    print('New length of original dataframe:', len(df))
    return df, df_distance_none, df_distance_high

In [623]:
def df_pipeline(df, no_dist=False):
    df = reset_index(df)
    df = add_cols(df)
    df = one_hot_cols(df)
    return df

In [624]:
def X_pipeline(X, no_dist=False):
    X = choose_predictor_cols(X, no_dist=no_dist)
    X = min_max_scaler(X)
    return X

In [625]:
def test_pipeline(test_set=False, max_scaler=True):
    df = file_to_dataFrame('test.csv')
    print('Length of test_df:)', len(df))
    df = add_distance(df)
    df = df_pipeline(df)
    df, df_distance_none, df_distance_high = data_frame_split(df)
    
    X_test, y_test = make_Xtest_ytest(df)
    X_test_distance_none, y_test_distance_none = make_Xtest_ytest(df_distance_none)
    X_test_distance_high, y_test_distance_high = make_Xtest_ytest(df_distance_high)
    
    X_test = X_pipeline(df)
    X_test_distance_none = X_pipeline(X_test_distance_none, no_dist=True)
    X_test_distance_high = X_pipeline(X_test_distance_high)
    
    return X_test, y_test, X_test_distance_none, y_test_distance_none, X_test_distance_high, y_test_distance_high

In [626]:
def pipeline():
    
    df = file_to_dataFrame('train.csv')
    df = row_elimination(df)
    df = df_pipeline(df)
    df, df_distance_none, df_distance_high = data_frame_split(df)
    
    X, y = make_X_y(df)
    X_distance_none, y_distance_none = make_X_y(df_distance_none)
    X_distance_high, y_distance_high = make_X_y(df_distance_high)
    
    X = X_pipeline(X)
    X_distance_none = X_pipeline(X_distance_none, no_dist=True)
    X_distance_high = X_pipeline(X_distance_high)
    
    return X, y, X_distance_none, y_distance_none, X_distance_high, y_distance_high

In [627]:
def save_keras_model(model, keras_distance_none=False, keras_distance_high=False):
    # serialize model to JSON
    model_json = model.to_json()
    
    if keras_distance_none:
        with open("dl_distance_none_model.json", "w") as json_file:
            json_file.write(model_json)
        # serialize weights to HDF5
        model.save_weights("dl_distance_none_model.h5")
        print("Saved deep learning model as 'dl_distance_none_model.json'")
    
    elif keras_distance_high:
        with open("dl_distance_high_model.json", "w") as json_file:
            json_file.write(model_json)
        # serialize weights to HDF5
        model.save_weights("dl_distance_high_model.h5")
        print("Saved deep learning model as 'dl_distance_high_model.json'")
    
    else:
        with open("dl_model.json", "w") as json_file:
            json_file.write(model_json)
        # serialize weights to HDF5
        model.save_weights("dl_model.h5")
        print("Saved deep learning model as 'dl_model.json'")
    return model
  
def open_keras_model(file, keras_distance_none=False, keras_distance_high=False):
    # load json and create model
    json_file = open(file, 'r')
    loaded_model_json = json_file.read()
    json_file.close()
    loaded_model = model_from_json(loaded_model_json)
    # load weights into new model
    if keras_distance_none:
        loaded_model.load_weights("dl_distance_none_model.h5")
    elif keras_distance_high:
        loaded_model.load_weights("dl_distance_high_model.h5")
    else:
        loaded_model.load_weights("dl_model.h5")
    print("Loaded model from disk")
    return loaded_model

In [628]:
def open_model(saved_model, keras=False, keras_distance_none=False, keras_distance_high=False):
    if keras:
        model = open_keras_model(saved_model)
    elif keras_distance_none:
        model = open_keras_model(saved_model, keras_distance_none=keras_distance_none)
    elif keras_distance_high:
        model = open_keras_model(saved_model, keras_distance_high=keras_distance_high)
    else:
        model = joblib.load(saved_model)
    return model

def kaggle_submit(y_test, saved_model, saved_model_distance_none, saved_model_distance_high, keras=False, keras_distance_none=False, keras_distance_high=False):
    saved_model = open_model(saved_model, keras=keras)
    saved_model_distance_none = open_model(saved_model_distance_none, keras_distance_none=keras_distance_none)
    saved_model_distance_high = open_model(saved_model_distance_high, keras_distance_high=keras_distance_high)
        
    y_test['fare_amount'] = saved_model.predict(X_test)
    y_test_distance_none['fare_amount'] = saved_model_distance_none.predict(X_test_distance_none)
    y_test_distance_high['fare_amount'] = saved_model_distance_high.predict(X_test_distance_high)
    
    y_test = pd.concat([y_test,y_test_distance_none, y_test_distance_high])
    
    y_test.to_csv('my_submission.csv', index=False)
    #print(y_test)
    return y_test

In [629]:
X, y, X_distance_none, y_distance_none, X_distance_high, y_distance_high = pipeline()

Length of df: 150000
NaN dropped: 1
US Mainland Only dropped: 3088
NYC Taxis Only dropped: 179
Max Passengers 6 dropped: 540
Distance Columns added...
New dataframe "df_distance_none" created with length: 1524
New dataframe "df_distance_high" created with length: 23
New length of original dataframe: 144645


In [630]:
X_test, y_test, X_test_distance_none, y_test_distance_none, X_test_distance_high, y_test_distance_high = test_pipeline()

Length of test_df:) 9914
Distance Columns added...
New dataframe "df_distance_none" created with length: 85
New dataframe "df_distance_high" created with length: 3
New length of original dataframe: 9826


## Tests

### LR Test

In [631]:
linear_regression(X_distance_high, y_distance_high, distance_high=True)

Length of X: 23
Lin reg train rmse: [ 50.09166181  99.24755498 130.73293799 116.24477776 102.93735937]
Lin reg train mean: 99.85085838115526
Lin reg train std: 27.233551147597566
Linear Regression model saved as "lr_distance_high_model.pkl"


LinearRegression(copy_X=True, fit_intercept=False, n_jobs=1, normalize=False)

In [632]:
linear_regression(X_distance_none, y_distance_none, distance_none=True)

Length of X: 1524
Lin reg train rmse: [10.75368784 14.23409137 12.70426881 32.36891942 11.05286343]
Lin reg train mean: 16.22276617292958
Lin reg train std: 8.169108004208924
Linear Regression model saved as "lr_distance_none_model.pkl"


LinearRegression(copy_X=True, fit_intercept=False, n_jobs=1, normalize=False)

In [633]:
linear_regression(X, y)

Length of X: 144645
Lin reg train rmse: [4.00703791 4.2312971  4.23811799 3.96156074 4.20959174]
Lin reg train mean: 4.1295210972457825
Lin reg train std: 0.11981299089025096
Linear Regression model saved as "lr_model.pkl"


LinearRegression(copy_X=True, fit_intercept=False, n_jobs=1, normalize=False)

In [634]:
ridge(X,y)

Length of X: 144645
Lin reg train rmse: [4.00935679 4.23126438 4.2410177  3.9619879  4.21230153]
Lin reg train mean: 4.1311856599802494
Lin reg train std: 0.12010725998495231
Linear Regression model saved as "ri_model.pkl"


Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [635]:
ridge(X_distance_none, y_distance_none, distance_none=True)

Length of X: 1524
Lin reg train rmse: [10.78924625 14.25718165 12.6087368  32.21592414 11.03182682]
Lin reg train mean: 16.180583132890966
Lin reg train std: 8.113765164910333
Linear Regression model saved as "ri_distance_none_model.pkl"


Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [636]:
ridge(X_distance_high, y_distance_high, distance_high=True)

Length of X: 23
Lin reg train rmse: [ 20.42250476  55.2327055   59.77037005 111.44322998  66.35027968]
Lin reg train mean: 62.64381799387196
Lin reg train std: 29.123670961938718
Linear Regression model saved as "ri_distance_high_model.pkl"


Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [637]:
#kaggle_submit(y_test, 'ri_model.pkl', 'ri_distance_none_model.pkl', 'lr_distance_high_model.pkl')

Unnamed: 0,key,fare_amount
0,2015-01-27 13:08:24.0000002,11.831175
1,2015-01-27 13:08:24.0000003,12.076457
2,2011-10-08 11:53:44.0000002,7.088711
3,2012-12-01 21:12:12.0000002,9.404295
4,2012-12-01 21:12:12.0000003,15.789187
5,2012-12-01 21:12:12.0000005,11.876712
6,2011-10-06 12:10:20.0000001,8.425235
7,2011-10-06 12:10:20.0000003,55.276801
8,2011-10-06 12:10:20.0000002,13.497633
9,2014-02-18 15:22:20.0000002,9.469924


### keras Tests

In [641]:
dl_model_distance = deep_learning(X_distance_none, y_distance_none, keras_distance_none=True)

Train on 1374 samples, validate on 73 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
12.076043737298448
Saved deep learning model as 'dl_distance_none_model.json'


In [642]:
dl_model_distance = deep_learning(X_distance_high, y_distance_high, keras_distance_high=True)

Train on 19 samples, validate on 2 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoc

Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100
29.805227100292324
Saved deep learning model as 'dl_distance_high_model.json'


In [643]:
dl_model = deep_learning(X, y)

Train on 130541 samples, validate on 6871 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
3.810530707506849
Saved deep learning model as 'dl_model.json'


In [644]:
kaggle_submit(y_test, 'dl_model.json', 'dl_distance_none_model.json', 'dl_distance_high_model.json', keras=True, keras_distance_none=True, keras_distance_high=True)

Loaded model from disk
Loaded model from disk
Loaded model from disk


Unnamed: 0,key,fare_amount
0,2015-01-27 13:08:24.0000002,14.872206
1,2015-01-27 13:08:24.0000003,15.555851
2,2011-10-08 11:53:44.0000002,7.803155
3,2012-12-01 21:12:12.0000002,11.386860
4,2012-12-01 21:12:12.0000003,18.263371
5,2012-12-01 21:12:12.0000005,14.453194
6,2011-10-06 12:10:20.0000001,9.671371
7,2011-10-06 12:10:20.0000003,58.028358
8,2011-10-06 12:10:20.0000002,16.559309
9,2014-02-18 15:22:20.0000002,10.938480


### RF Test

In [506]:
random_forest(X_distance_none, y_distance_none, distance_none=True)

Random Forest  rmse scores: [13.95366771 10.9274019  12.08455299 11.87294842 10.76575251]
Random Forest  mean score: 11.920864707032875
Random Forest  std: 1.1386509922108163
Linear Regression model saved as "rf_distance_none_model.pkl"


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features=10, max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=50, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)

In [507]:
random_forest(X_distance_high, y_distance_high, distance_high=True)

Random Forest  rmse scores: [ 52.23445375  50.11460938 103.85570503  37.84936334  39.21845223]
Random Forest  mean score: 56.6545167464469
Random Forest  std: 24.281308288210067
Linear Regression model saved as "rf_distance_high_model.pkl"


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features=10, max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=50, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)

In [508]:
random_forest(X, y)

Random Forest  rmse scores: [3.56023513 3.51308453 3.96715795 3.8646105  3.61284635]
Random Forest  mean score: 3.7035868917605996
Random Forest  std: 0.1791496768716599
Linear Regression model saved as "rf_model.pkl"


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features=10, max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=50, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)

In [443]:
rf = open_model('rf_model.pkl')
feature_importances = rf.feature_importances_
sorted(zip(feature_importances, list(X)), reverse=True)

[(0.34330165502726584, 'euclidean_distance'),
 (0.23897415208929706, 'taxicab_distance'),
 (0.08441907303610358, 'manhattan'),
 (0.06531669279188136, 'jfk'),
 (0.05976043110506831, 'pickup_longitude'),
 (0.055038741636194814, 'dropoff_longitude'),
 (0.03242513721193783, 'pickup_latitude'),
 (0.03219713258788817, 'dropoff_latitude'),
 (0.02303074019291758, 'year'),
 (0.016137066545245386, 'total_seconds'),
 (0.008105294794532845, 'newark'),
 (0.004474721982032309, 'manhattan_one_way'),
 (0.003316634099154293, 'night_charge'),
 (0.0032091701562175657, 'passenger_count'),
 (0.0021644301276920706, 'Sat'),
 (0.0019202045722021012, 'Oct'),
 (0.0018882711467735345, 'Sep'),
 (0.0018503660865519366, 'Fri'),
 (0.0018303735799151438, 'Wed'),
 (0.0018211300914307976, 'Thu'),
 (0.0015782242219240872, 'Tue'),
 (0.0015314676243699836, 'Jul'),
 (0.0015131065067303037, 'Mar'),
 (0.0015025134376371431, 'weekday_surcharge'),
 (0.0013360816663071375, 'Sun'),
 (0.0013082388107069373, 'Dec'),
 (0.0012987176

In [None]:
# cuda
# p42xlarge
# ec2 instance pricing
# make sure you have gpu and optimization
# save as pickle file