## Import Libraries

In [800]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

## Clean Data

In [801]:
def clean_data(df):
    df = df.dropna(axis=0, subset=['dropoff_latitude'])
    df = df.drop('key', axis=1)
    return df

In [802]:
def lat_lon_US(df):
    # Choose cab rides whose pickup and dropoff are the US Mainland
    # Declare constants
    latmin = 5.496100
    latmax = 71.538800
    longmin = -124.482003
    longmax = -66.885417

    # Create dataframe with correct coordinates
    df = df[((((df['pickup_longitude']<=longmax) & (df['pickup_longitude']>=longmin)) & ((df['pickup_latitude']<=latmax) & (df['pickup_latitude']>=latmin)))) & ((((df['dropoff_longitude']<=longmax) & (df['dropoff_longitude']>=longmin)) & ((df['dropoff_latitude']<=latmax) & (df['dropoff_latitude']>=latmin))))]
    
    return df

In [803]:
def lat_lon_NYC(df):
    # Find cab rides whose pickup or dropoff are within NYC boundaries
    # Declare constants
    latmin = 40.477399
    latmax = 40.917577
    longmin = -74.259090
    longmax = -73.700272

    # Create dataframe with correct coordinates
    df = df[((((df['pickup_longitude']<=longmax) & (df['pickup_longitude']>=longmin)) & ((df['pickup_latitude']<=latmax) & (df['pickup_latitude']>=latmin)))) | ((((df['dropoff_longitude']<=longmax) % (df['dropoff_longitude']>=longmin)) & ((df['dropoff_latitude']<=latmax) & (df['dropoff_latitude']>=latmin))))]
    return df

In [804]:
def max_Riders(df, num):
    # Only choose cabs between 1 and num riders
    df = df[(df['passenger_count'] <= num) & (df['passenger_count'] > 0)]
    return df

In [805]:
def add_distance(df):

    # Define coordinates (x,y)
    x1 = df['pickup_latitude']
    y1 = df['pickup_longitude']
    x2 = df['dropoff_latitude']
    y2 = df['dropoff_longitude']

    # Create Euclidean Distrance column
    df['euclidean_distance'] = np.sqrt((y2-y1)**2 + (x2-x1)**2)

    # Create Taxicab Distance column
    df['taxicab_distance'] = np.abs(y2-y1) + np.abs(x2-x1)

    # Convert to miles
    df['euclidean_distance'] = df['euclidean_distance'] * 69
    df['taxicab_distance'] = df['taxicab_distance'] * 69
    
    return df

In [806]:
def min_Fare(df):
    # Eliminate unrealistic plots
    df = df[df['fare_amount'] >= (df['euclidean_distance'] * 2 + 2.5)]
    return df

In [807]:
def max_Fare(df):
    df = df[(df['fare_amount'] <= (df['taxicab_distance'] * 48 + 16)) | (df['fare_amount'] <= 56)]
    return df

In [808]:
def no_distance(df):
    # Elminate fares that traveled no distance
    df = df[df['euclidean_distance']>0]
    return df

In [809]:
def row_elimination(df):
    df = clean_data(df)
    df = lat_lon_US(df)
    df = lat_lon_NYC(df)
    df = max_Riders(df, 8)
    df = add_distance(df)
    df = min_Fare(df)
    df = max_Fare(df)
    df = no_distance(df)
    return df

In [810]:
# Get rid of accumulated garbage
import gc
gc.collect()

18

## Add Attributes

### Time

In [811]:
def add_Time_units(df):
    
    df['month'] = df['pickup_datetime'].dt.month
    df['year'] = df['pickup_datetime'].dt.year
    df['hour'] = df['pickup_datetime'].dt.hour
    df['minute'] = df['pickup_datetime'].dt.minute
    df['second'] = df['pickup_datetime'].dt.second
    df['dayofweek'] = df['pickup_datetime'].dt.dayofweek
    
    df = df.drop('pickup_datetime', axis=1)

    df['15_min_intervals'] = 4 * df['hour'] + (df['minute']/15).astype(int)
    df['total_seconds'] = 3600 * df['hour'] + 60 * df['minute'] + df['second']
    
    return df

In [812]:
def add_Time_columns(df):

    def summer_month(row):
        if row['month'] in [6,7,8]:
            return 1
        else:
            return 0
    
    df['summer_month'] = df.apply(summer_month, axis=1)
    
    
    def cold_month(row):
        if row['month'] in [1,2,3,11,12]:
            return 1
        else:
            return 0
    
    df['cold_month'] = df.apply(cold_month, axis=1)
    
    def weekend(row):
        if row['dayofweek'] in [5,6]:
            return 1
        else:
            return 0

    df['weekend'] = df.apply(weekend, axis=1)

    def rush_hour(row):
        if (row['hour'] in [7,8,9,15,16,17,18,19]) & (row['weekend'] == 0):
            return 1
        else:
            return 0

    df['rush_hour'] = df.apply(rush_hour, axis=1)

    def night_rush(row):
        if (row['hour'] in [19,20,21,22,23,24,1]) & (row['dayofweek'] in [3,4,5]):
            return 1
        else:
            return 0

    df['night_rush'] = df.apply(night_rush, axis=1)

    def night_charge(row):
        if row['hour'] in [20,21,22,23,24,1,2,3,4,5,6]:
            return 1
        else:
            return 0

    df['night_charge'] = df.apply(night_charge, axis=1)

    def weekday_surcharge(row):
        if (row['hour'] in [16,17,18,19,20]) & (row['dayofweek'] in [1,2,3,4,5]):
            return 1
        else:
            return 0

    df['weekday_surcharge'] = df.apply(weekday_surcharge, axis=1)
    
    return df

In [813]:
def add_Time(df):
    df = add_Time_units(df)
    df = add_Time_columns(df)
    return df

### Manhattan

In [814]:
# Define line from two points and a provided column
def two_points_line(a, b, column):
        
    # Case when y-values are the same
    if b[1]==a[1]:
        
        # Slope defaults to 0
        slope = 0
        
    # Case when x-values are the same
    elif b[0]==a[0]:
        
        # Case when max value is less than 999999999
        if column.max() < 999999999:
            
            # Add 999999999 to max value
            slope = column.max() + 999999999
        
        # All other cases
        else:
            
            # Multiply max value by itself (greater than 999999999)
            slope = column.max() * column.max()
    
    # When x-values and y-values are not 0
    else:
        
        # Use standard slope formula
        slope = (b[1] - a[1])/(b[0]-a[0])
    
    
    # Equation for y-intercept (solving y=mx+b for b)
    y_int = a[1] - slope * a[0]
    
    # Return slope and y-intercept
    return slope, y_int

In [815]:
def manhattan_cols(df):
    
    upper_right = (-73.929224, 40.804328)
    bottom_right = (-73.980036, 40.710706)
    bottom_left = (-74.054880, 40.681292)
    upper_left = (-73.966303, 40.830050)

    m_top, b_top = two_points_line(upper_right, upper_left, df.pickup_latitude)
    m_left, b_left = two_points_line(bottom_left, upper_left, df.pickup_latitude)
    m_right, b_right = two_points_line(bottom_right, upper_right, df.pickup_latitude)
    m_bottom, b_bottom = two_points_line(bottom_right, bottom_left, df.pickup_latitude)

    def manhattan(row):
        if (((row['pickup_latitude'] <= (row['pickup_longitude'] * m_top + b_top)) &
        (row['pickup_latitude'] >= (row['pickup_longitude'] * m_bottom + b_bottom))) &
        ((row['pickup_latitude'] >= (row['pickup_longitude'] * m_right + b_right)) &
        (row['pickup_latitude'] <= (row['pickup_longitude'] * m_left + b_left)))) & (((row['dropoff_latitude'] <= (row['dropoff_longitude'] * m_top + b_top)) &
        (row['dropoff_latitude'] >= (row['dropoff_longitude'] * m_bottom + b_bottom))) &
        ((row['dropoff_latitude'] >= (row['dropoff_longitude'] * m_right + b_right)) &
        (row['dropoff_latitude'] <= (row['dropoff_longitude'] * m_left + b_left)))):
            return 1
        else:
            return 0
        
    df['manhattan'] = df.apply(manhattan, axis=1)
        
    return df

In [816]:
def add_cols(df):
    df = add_Time(df)
    df = manhattan_cols(df)
    return df

In [817]:
class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self, new_adds=False):
        self.new_adds = new_adds
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        X = add_Time(X)
        X = manhattan_cols(X)
        if self.new_adds:
            X = lat_lon_correct(X)
        
        return X

In [818]:
atrr_adder = CombinedAttributesAdder()

## Run Tests

In [819]:
def pipeline(num_rows):

    df = pd.read_csv('train.csv', nrows=num_rows, parse_dates=['pickup_datetime'])
    df = row_elimination(df)
    
    y = df['fare_amount'].copy()
    X = df.drop('fare_amount', axis=1)
    
    num_pipeline = Pipeline([('attr_adder', CombinedAttributesAdder()), ('min_max_scaler', MinMaxScaler())])
    X = num_pipeline.fit_transform(X)
    
    lin_reg = LinearRegression()
    lin_reg.fit(X, y)
    scores = cross_val_score(lin_reg, X, y, scoring='mean_squared_error', cv=5)
    rmse = np.sqrt(-scores)
    print('rmse:', rmse)

In [820]:
pipeline(100000)

rmse: [3.55079782 3.68441887 3.81361356 3.81605061 3.71998206]


In [823]:
def test_pipeline(test_file, num_rows=100000):
    
    # TRAIN SET
    
    df = pd.read_csv('train.csv', nrows=num_rows, parse_dates=['pickup_datetime'])
    df = row_elimination(df)
    
    y_train = df['fare_amount'].copy()
    X_train = df.drop('fare_amount', axis=1)
    
    num_pipeline = Pipeline([('attr_adder', CombinedAttributesAdder()), ('min_max_scaler', MinMaxScaler())])
    X_train = num_pipeline.fit_transform(X_train)
    
    lin_reg = LinearRegression()
    lin_reg.fit(X_train, y_train)
    scores = cross_val_score(lin_reg, X_train, y_train, scoring='mean_squared_error', cv=5)
    rmse = np.sqrt(-scores)
    print('rmse:', rmse)
    
    # TEST SET
    
    df_test = pd.read_csv(test_file, parse_dates=['pickup_datetime'])
    X_test = df_test.copy()
    X_test = X_test.drop('key', axis=1)
    X_test = add_distance(X_test)
    
    #num_pipeline = Pipeline([('attr_adder', CombinedAttributesAdder()), ('min_max_scaler', MinMaxScaler())])
    #X_test = num_pipeline.fit_transform(X_test)
    
    X_test = atrr_adder.transform(X_test)    

    fare_predictions = lin_reg.predict(X_test)
    
    df_test['fare_amount'] = fare_predictions
    df_test = df_test[['key', 'fare_amount']]
    
    df_test.to_csv('my_submission.csv', index=False)
    
    return df_test

In [None]:
test_pipeline('test.csv')

In [686]:
test_pipeline('test.csv')

Unnamed: 0,key,fare_amount
0,2015-01-27 13:08:24.0000002,10.141684
1,2015-01-27 13:08:24.0000003,10.405184
2,2011-10-08 11:53:44.0000002,4.904974
3,2012-12-01 21:12:12.0000002,7.693347
4,2012-12-01 21:12:12.0000003,14.695534
5,2012-12-01 21:12:12.0000005,10.226992
6,2011-10-06 12:10:20.0000001,6.457528
7,2011-10-06 12:10:20.0000003,56.158365
8,2011-10-06 12:10:20.0000002,12.069480
9,2014-02-18 15:22:20.0000002,7.545644


In [655]:
pipeline('test.csv')

Unnamed: 0,key,fare_amount
0,2015-01-27 13:08:24.0000002,10.118120
1,2015-01-27 13:08:24.0000003,10.335703
2,2011-10-08 11:53:44.0000002,5.040636
3,2012-12-01 21:12:12.0000002,7.682327
4,2012-12-01 21:12:12.0000003,14.670165
5,2012-12-01 21:12:12.0000005,10.216998
6,2011-10-06 12:10:20.0000001,6.490524
7,2011-10-06 12:10:20.0000003,55.810411
8,2011-10-06 12:10:20.0000002,11.989694
9,2014-02-18 15:22:20.0000002,7.479397


In [650]:
df_tester.head()

Unnamed: 0,key,fare_amount
0,2015-01-27 13:08:24.0000002,10.11812
1,2015-01-27 13:08:24.0000003,10.335703
2,2011-10-08 11:53:44.0000002,5.040636
3,2012-12-01 21:12:12.0000002,7.682327
4,2012-12-01 21:12:12.0000003,14.670165


In [651]:
# cuda
# p42xlarge
# ec2 instance pricing
# make sure you have gpu and optimization
# save as pickle file