In [None]:
# Imports
# Packages for numerics + dataframes
import pandas as pd
import numpy as np

# Packages for visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Packages for date conversions for calculating trip durations
from datetime import datetime
from datetime import date
from datetime import timedelta

# Packages for OLS, MLR, confusion matrix
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import sklearn.metrics as metrics # For confusion matrix
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error,r2_score,mean_squared_error

In [None]:
# Colab-only step: mount Google Drive so the dataset CSV under /content/drive is readable.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Location of the 2017 Yellow Taxi trip CSV inside the mounted Drive.
file_path = '/content/drive/My Drive/Advanced Data Analytics Certificate/Activity Datasets/2017_Yellow_Taxi_Trip_Data.csv'
# df0 holds the raw data exactly as read; later cells work on a copy.
df0=pd.read_csv(file_path)

In [None]:
# Work on a copy so the raw frame `df0` stays unmodified.
df = df0.copy()

In [None]:
# Target: fare_amount, kept as a one-column DataFrame
y = df[['fare_amount']]

# Features: every remaining column
X = df.drop(columns=['fare_amount'])

In [None]:
# Hold out 20% of the rows as the simulated "future" trips; random_state=0 makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [None]:
X_test.columns

Index(['Unnamed: 0', 'VendorID', 'tpep_pickup_datetime',
       'tpep_dropoff_datetime', 'passenger_count', 'trip_distance',
       'RatecodeID', 'store_and_fwd_flag', 'PULocationID', 'DOLocationID',
       'payment_type', 'extra', 'mta_tax', 'tip_amount', 'tolls_amount',
       'improvement_surcharge', 'total_amount'],
      dtype='object')

We treat X_test and y_test as future use cases: a user calls a taxi through the app, enters some necessary information (such as passenger count and destination), and the app returns an estimate of the fare amount. Columns of X_test that are unknown at the time the app is used must therefore be removed. For example, 'trip_distance' and 'tpep_dropoff_datetime' need to be removed.

Assumption 1: We can use 'tpep_pickup_datetime' as the time when the app is used to book the trip. This time can be used to determine if it is in rush hour.

In [None]:
# Keep only the columns assumed to be available when the app is used
# (VendorID, pickup datetime, passenger count, RatecodeID, PU/DO location IDs).
X_test = X_test.drop(columns=[
    'Unnamed: 0', 'tpep_dropoff_datetime', 'trip_distance', 'store_and_fwd_flag',
    'payment_type', 'extra', 'mta_tax', 'tip_amount', 'tolls_amount',
    'improvement_surcharge', 'total_amount',
])

In [None]:
X_test.columns

Index(['VendorID', 'tpep_pickup_datetime', 'passenger_count', 'RatecodeID',
       'PULocationID', 'DOLocationID'],
      dtype='object')

Assumption 2: Since X_test is used to simulate the actual use-cases, only the following columns are deemed to be available.
* 'VendorID',
* 'tpep_pickup_datetime',
* 'passenger_count',
* 'RatecodeID',
* 'PULocationID',
* 'DOLocationID'




Assumption 3: (Time line)

At this moment, we know X_train and y_train. We will know X_test after we build the model.

Note: the model includes the lr model, all assumptions, all necessary tables, all parameters, and all rules used in the prediction process.

We will not know y_test until the future trips are done. This means that we will know y_test after we computed y_test_pred.


In [None]:
X_train.columns

Index(['Unnamed: 0', 'VendorID', 'tpep_pickup_datetime',
       'tpep_dropoff_datetime', 'passenger_count', 'trip_distance',
       'RatecodeID', 'store_and_fwd_flag', 'PULocationID', 'DOLocationID',
       'payment_type', 'extra', 'mta_tax', 'tip_amount', 'tolls_amount',
       'improvement_surcharge', 'total_amount'],
      dtype='object')

In [None]:
y_train.columns

Index(['fare_amount'], dtype='object')

In [None]:
# Parse both trip timestamps (12-hour clock with AM/PM) into datetime64 columns
X_train = X_train.assign(
    tpep_pickup_datetime=pd.to_datetime(X_train['tpep_pickup_datetime'], format='%m/%d/%Y %I:%M:%S %p'),
    tpep_dropoff_datetime=pd.to_datetime(X_train['tpep_dropoff_datetime'], format='%m/%d/%Y %I:%M:%S %p'),
)

In [None]:
# `duration` = dropoff minus pickup, expressed in minutes (float)
X_train['duration'] = X_train['tpep_dropoff_datetime'].sub(X_train['tpep_pickup_datetime']) / np.timedelta64(1, 'm')

In [None]:
# Floor the training fares at $0: any negative fare becomes 0
y_train = y_train.assign(fare_amount=y_train['fare_amount'].clip(lower=0))

The above is a rule for the model.

Rule 1: the fare amount cannot be less than zero. This rule is not useful for estimating fare amount. However, it can be useful when comparing the predicted fare amount with the true amount.

In [None]:
def compute_upper_threshold(df, col, iqr_factor):
    """Return the upper outlier cutoff for ``df[col]``.

    The cutoff is Q3 + iqr_factor * (Q3 - Q1), where Q1 and Q3 are the
    25th and 75th percentiles of the column.
    """
    first_quartile, third_quartile = (df[col].quantile(q) for q in (0.25, 0.75))
    return third_quartile + (iqr_factor * (third_quartile - first_quartile))

In [None]:
def outlier_imputer(df, col, upper_threshold):
    """Cap ``df[col]`` at ``upper_threshold`` in place and print its summary.

    Values strictly greater than the threshold are overwritten with the
    threshold itself; the column's ``describe()`` output is then printed,
    followed by a blank line. Nothing is returned.
    """
    above_cap = df[col] > upper_threshold
    df.loc[above_cap, col] = upper_threshold
    print(df[col].describe(), end='\n\n')

In [None]:
# Upper fare cutoff = Q3 + 6 * IQR of the training fares (displayed below).
fare_am_up_threshold = compute_upper_threshold(y_train, 'fare_amount', 6)
fare_am_up_threshold

62.5

Rule 2: fare amount is less than or equal to fare_am_up_threshold (62.5). This rule is not useful for estimating fare amount. However, it can be useful when comparing the predicted fare amount with the true amount.

In [None]:
# Cap training fares at the threshold (modifies y_train in place and prints the summary).
outlier_imputer(y_train, 'fare_amount', fare_am_up_threshold)

count    18159.000000
mean        12.891746
std         10.569186
min          0.000000
25%          6.500000
50%          9.500000
75%         14.500000
max         62.500000
Name: fare_amount, dtype: float64



In [None]:
# Negative durations are floored at 0
X_train['duration'] = X_train['duration'].clip(lower=0)

Rule 3: 'duration' >= 0. This rule does not apply to real-cases, since so-called 'duration' is not available when user is using the app to get an estimate for the fare amount.

In [None]:
# Upper duration cutoff = Q3 + 6 * IQR of the training durations (displayed below).
duration_threshold = compute_upper_threshold(X_train, 'duration', 6)
duration_threshold

88.10833333333332

Rule 4: 'duration' <= duration_threshold. This rule does not apply to real-cases, since so-called 'duration' is not available when user is using the app to get an estimate for the fare amount.

In [None]:
# Cap training durations at the threshold (modifies X_train in place and prints the summary).
outlier_imputer(X_train, 'duration', duration_threshold)

count    18159.000000
mean        14.412935
std         11.908594
min          0.000000
25%          6.616667
50%         11.150000
75%         18.258333
max         88.108333
Name: duration, dtype: float64



In [None]:
X_train.columns

Index(['Unnamed: 0', 'VendorID', 'tpep_pickup_datetime',
       'tpep_dropoff_datetime', 'passenger_count', 'trip_distance',
       'RatecodeID', 'store_and_fwd_flag', 'PULocationID', 'DOLocationID',
       'payment_type', 'extra', 'mta_tax', 'tip_amount', 'tolls_amount',
       'improvement_surcharge', 'total_amount', 'duration'],
      dtype='object')

In [None]:
# `pickup_dropoff` key: "<PULocationID> <DOLocationID>" as a single string
X_train['pickup_dropoff'] = X_train['PULocationID'].astype(str).str.cat(X_train['DOLocationID'].astype(str), sep=' ')

Rule 5: Create a concatenated column "pickup_dropoff" with PULocationID and DOLocationID. The column will be used to map into two values, one is related to distance and one is related to duration.

In [None]:
def create_dict_pu_do_to_distance(df):
    """Map each ``pickup_dropoff`` key to the mean ``trip_distance`` of its rows.

    Parameters
    ----------
    df : DataFrame with ``pickup_dropoff`` and ``trip_distance`` columns.

    Returns
    -------
    dict
        ``{pickup_dropoff value: mean trip_distance}``.
    """
    # Select the column before aggregating so only trip_distance is averaged,
    # instead of averaging every numeric column and discarding the rest.
    return df.groupby('pickup_dropoff')['trip_distance'].mean().to_dict()

In [None]:
def create_dict_pu_do_to_duration(df):
    """Map each ``pickup_dropoff`` key to the mean ``duration`` of its rows.

    Parameters
    ----------
    df : DataFrame with ``pickup_dropoff`` and ``duration`` columns.

    Returns
    -------
    dict
        ``{pickup_dropoff value: mean duration}``.
    """
    # Select the column before aggregating so only duration is averaged,
    # instead of averaging every numeric column and discarding the rest.
    return df.groupby('pickup_dropoff')['duration'].mean().to_dict()

In [None]:
def apply_dict(df, new_col, dict_name):
    """Add ``new_col`` to ``df`` by looking up each ``pickup_dropoff`` in ``dict_name``.

    The frame is modified in place. Keys absent from ``dict_name`` yield NaN.
    """
    df[new_col] = df['pickup_dropoff'].map(dict_name)

In [None]:
# Build both lookup tables from the training rows only; they are reused later on X_test.
dict_pu_do_to_distance = create_dict_pu_do_to_distance(X_train)
dict_pu_do_to_duration = create_dict_pu_do_to_duration(X_train)

Dictionary 1: apply dict_pu_do_to_distance to get a distance estimate from pickup_dropoff.

Dictionary 2: apply dict_pu_do_to_duration to get a duration estimate from pickup_dropoff.

Note: if a dictionary does not contain a particular pickup_dropoff value, the lookup returns NaN.

    * when this case happens, we’ll use a pair of default values: default_distance and default_duration


In [None]:
# Fallbacks for pickup_dropoff pairs missing from the lookup tables: overall training means.
# Note: `duration` has already been floored at 0 and capped at duration_threshold by earlier cells.
default_distance = X_train['trip_distance'].mean()
default_duration = X_train['duration'].mean()

In [None]:
# Add the per-route estimates to the training frame (in place).
apply_dict(X_train, 'distance_estimate', dict_pu_do_to_distance)
apply_dict(X_train, 'duration_estimate', dict_pu_do_to_duration)

In [None]:
X_train.columns

Index(['Unnamed: 0', 'VendorID', 'tpep_pickup_datetime',
       'tpep_dropoff_datetime', 'passenger_count', 'trip_distance',
       'RatecodeID', 'store_and_fwd_flag', 'PULocationID', 'DOLocationID',
       'payment_type', 'extra', 'mta_tax', 'tip_amount', 'tolls_amount',
       'improvement_surcharge', 'total_amount', 'duration', 'pickup_dropoff',
       'distance_estimate', 'duration_estimate'],
      dtype='object')

In [None]:
#def mean_calculator(df, target_column, mean_column):
#    grouped = df.groupby('pickup_dropoff').mean(numeric_only=True)[[target_column]]
#    grouped_dict = grouped.to_dict()
#    grouped_dict = grouped_dict[target_column]
#    df[mean_column] = df['pickup_dropoff']
#    df[mean_column] = df[mean_column].map(grouped_dict)

In [None]:
#mean_calculator(X_train, 'trip_distance', 'mean_distance')
#mean_calculator(X_train, 'duration', 'mean_duration')

In [None]:
#grouped = df.groupby('pickup_dropoff').mean(numeric_only=True)[['trip_distance']]

In [None]:
# 1. Convert `grouped` to a dictionary
#grouped_dict = grouped.to_dict()

# 2. Reassign to only contain the inner dictionary
#grouped_dict = grouped_dict['trip_distance']

In [None]:
# 1. Create a mean_distance column that is a copy of the pickup_dropoff helper column
#df['mean_distance'] = df['pickup_dropoff']

# 2. Map `grouped_dict` to the `mean_distance` column
#df['mean_distance'] = df['mean_distance'].map(grouped_dict)

In [None]:
#grouped = df.groupby('pickup_dropoff').mean(numeric_only=True)[['duration']]

# Create a dictionary where keys are unique pickup_dropoffs and values are
# mean trip duration for all trips with those pickup_dropoff combos
#grouped_dict = grouped.to_dict()
#grouped_dict = grouped_dict['duration']

#df['mean_duration'] = df['pickup_dropoff']
#df['mean_duration'] = df['mean_duration'].map(grouped_dict)

In [None]:
# Create 'day' col: lowercase weekday name of the pickup, e.g. 'monday'
X_train['day'] = X_train['tpep_pickup_datetime'].dt.day_name().str.lower()

# Disabled: 'month' col (not used by the model)
#df['month'] = df['tpep_pickup_datetime'].dt.strftime('%b').str.lower()

In [None]:
# Create 'rush_hour' col, temporarily holding the pickup hour (0-23);
# it is converted to a 0/1 flag by rush_hourizer() in a later cell.
X_train['rush_hour'] = X_train['tpep_pickup_datetime'].dt.hour

# If day is Saturday or Sunday, impute 0 in `rush_hour` column
X_train.loc[X_train['day'].isin(['saturday', 'sunday']), 'rush_hour'] = 0

In [None]:
def rush_hourizer(hour):
    """Return 1 if the row's hour falls in a rush-hour window, else 0.

    ``hour`` is a row (or any mapping) whose ``'rush_hour'`` entry holds the
    pickup hour. The windows are half-open: 6 <= h < 10 and 16 <= h < 20.
    """
    pickup_hour = hour['rush_hour']
    in_morning_peak = 6 <= pickup_hour < 10
    in_evening_peak = 16 <= pickup_hour < 20
    return 1 if (in_morning_peak or in_evening_peak) else 0

In [None]:
# Apply the `rush_hourizer()` function to the weekday rows of the new column.
# `rush_hourizer` reads the hour currently stored in `rush_hour` and returns 1 or 0.
# The flags are cast to the column's existing dtype before assignment: `.dt.hour`
# typically yields int32 while `apply` yields int64, and writing a wider dtype
# through `.loc` triggers pandas' incompatible-dtype FutureWarning.
is_weekday = ~X_train['day'].isin(['saturday', 'sunday'])
rush_flags = X_train.apply(rush_hourizer, axis=1).astype(X_train['rush_hour'].dtype)
X_train.loc[is_weekday, 'rush_hour'] = rush_flags[is_weekday]

  X_train.loc[(X_train.day != 'saturday') & (X_train.day != 'sunday'), 'rush_hour'] = X_train.apply(rush_hourizer, axis=1)


Computation 1: Compute a rush_hour column based on the pickup datetime (or reservation datetime) in the four steps:

* create a column "day" based on the reservation datetime
* create a column "rush_hour" filled with the hour field of the reservation datetime
* set rush_hour to zero for saturday and sunday
* for weekdays, set rush_hour to 1 when hour is in \[6am, 10am) or \[4pm, 8pm), and to 0 otherwise -- function rush_hourizer()
  * use apply (see above code block)

In [None]:
X_train.columns

Index(['Unnamed: 0', 'VendorID', 'tpep_pickup_datetime',
       'tpep_dropoff_datetime', 'passenger_count', 'trip_distance',
       'RatecodeID', 'store_and_fwd_flag', 'PULocationID', 'DOLocationID',
       'payment_type', 'extra', 'mta_tax', 'tip_amount', 'tolls_amount',
       'improvement_surcharge', 'total_amount', 'duration', 'pickup_dropoff',
       'distance_estimate', 'duration_estimate', 'day', 'rush_hour'],
      dtype='object')

In [None]:
# Discard raw trip fields and helper columns so that only the model inputs remain.
X_train = X_train.drop(columns=[
    'Unnamed: 0', 'tpep_dropoff_datetime', 'tpep_pickup_datetime', 'trip_distance',
    'RatecodeID', 'store_and_fwd_flag', 'PULocationID', 'DOLocationID',
    'payment_type', 'extra', 'mta_tax', 'tip_amount', 'tolls_amount',
    'improvement_surcharge', 'duration', 'pickup_dropoff', 'total_amount', 'day',
])

In [None]:
X_train.columns

Index(['VendorID', 'passenger_count', 'distance_estimate', 'duration_estimate',
       'rush_hour'],
      dtype='object')

Rule 6: The model uses five columns to compute the estimated fare amount.
  * 'VendorID', (one-hot encoded as the column VendorID_2: true for vendor "2", false for vendor "1")
  * 'passenger_count',
  * 'distance_estimate', (created with dict_pu_do_to_distance)
  * 'duration_estimate', (created with dict_pu_do_to_duration)
  * 'rush_hour'

In [None]:
# Remove the target column from the features
#X = df2.drop(columns=['fare_amount'])

# Set y variable
#y = df2[['fare_amount']]

In [None]:
#X.columns

In [None]:
# Convert VendorID to string so get_dummies treats it as categorical
X_train['VendorID'] = X_train['VendorID'].astype(str)

# Get dummies; drop_first=True drops the first category's column, leaving `VendorID_2`
X_train = pd.get_dummies(X_train, drop_first=True)

In [None]:
X_train.head()

Unnamed: 0,passenger_count,distance_estimate,duration_estimate,rush_hour,VendorID_2
252,1,6.02,16.216667,0,True
16220,1,0.807273,7.781818,1,True
9795,2,1.184615,8.839744,0,False
1972,1,0.854444,5.762963,1,False
21092,1,0.898125,6.4125,1,False


In [None]:
# Fit the scaler on the training features; the same fitted scaler is reused on X_test later.
scaler_X = StandardScaler().fit(X_train)

Computation 2: use scaler_X to scale each of the five variables as listed in Rule 6. For example, X_train_scaled = scaler_X.transform(X_train)

In [None]:
# Scale the training features with the already-fitted `scaler_X`, then fit the linear model.
#scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler_X.transform(X_train)
lr=LinearRegression()
lr.fit(X_train_scaled, y_train)

In [None]:
lr.coef_

array([[ 0.02409519,  7.13845388,  2.82308513,  0.10343309, -0.05312336]])

In [None]:
# Display each model coefficient next to its feature name.

# Get the coefficients of the linear regression model
# (2-D here because y_train is a one-column DataFrame, hence the [0] below)
coefficients = lr.coef_

# Get the names of the features (same column order that was used to fit the scaler and model)
feature_names = X_train.columns

# Print the coefficients with their corresponding feature names
for feature, coef in zip(feature_names, coefficients[0]):
    print(f"{feature}: {coef:.2f}")


passenger_count: 0.02
distance_estimate: 7.14
duration_estimate: 2.82
rush_hour: 0.10
VendorID_2: -0.05


In [None]:
# prompt: display the intercept for model lr

print(lr.intercept_)


[12.89174624]


Computation 3: Use lr.predict() to compute predictions. For example, y_pred_train = lr.predict(X_train_scaled), where X_train_scaled is computed with Computation 2.

In [None]:
# In-sample fit: predictions on the scaled training features vs. the (floored and capped) y_train.
y_pred_train = lr.predict(X_train_scaled)
print('R^2:', r2_score(y_train, y_pred_train))
print('MAE:', mean_absolute_error(y_train, y_pred_train))
print('MSE:', mean_squared_error(y_train, y_pred_train))
print('RMSE:',np.sqrt(mean_squared_error(y_train, y_pred_train)))

R^2: 0.8401767050159512
MAE: 2.1569991441623726
MSE: 17.852508813635207
RMSE: 4.225222930643448


In [None]:
X_train.columns

Index(['passenger_count', 'distance_estimate', 'duration_estimate',
       'rush_hour', 'VendorID_2'],
      dtype='object')

In [None]:
X_test.columns

Index(['VendorID', 'tpep_pickup_datetime', 'passenger_count', 'RatecodeID',
       'PULocationID', 'DOLocationID'],
      dtype='object')

In [None]:
# Rule 5 on X_test: build the "<PULocationID> <DOLocationID>" lookup key.
X_test['pickup_dropoff'] = X_test['PULocationID'].astype(str).str.cat(X_test['DOLocationID'].astype(str), sep=' ')

In [None]:
X_test.head(3)

Unnamed: 0,VendorID,tpep_pickup_datetime,passenger_count,RatecodeID,PULocationID,DOLocationID,pickup_dropoff
5818,2,12/03/2017 1:57:53 AM,1,1,148,162,148 162
18134,2,06/15/2017 10:05:54 AM,1,1,100,231,100 231
4655,2,02/24/2017 11:14:19 AM,2,1,262,236,262 236


In [None]:
# Apply the training-derived dictionaries ("Dictionaries" in the model information).
# Routes not present in the training data come back as NaN and are filled in a later cell.
apply_dict(X_test, 'distance_estimate', dict_pu_do_to_distance)
apply_dict(X_test, 'duration_estimate', dict_pu_do_to_duration)

In [None]:
X_test.head(3)

Unnamed: 0,VendorID,tpep_pickup_datetime,passenger_count,RatecodeID,PULocationID,DOLocationID,pickup_dropoff,distance_estimate,duration_estimate
5818,2,12/03/2017 1:57:53 AM,1,1,148,162,148 162,2.8125,12.141667
18134,2,06/15/2017 10:05:54 AM,1,1,100,231,100 231,3.41,19.138889
4655,2,02/24/2017 11:14:19 AM,2,1,262,236,262 236,0.901429,7.55


In [None]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4540 entries, 5818 to 19601
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   VendorID              4540 non-null   int64  
 1   tpep_pickup_datetime  4540 non-null   object 
 2   passenger_count       4540 non-null   int64  
 3   RatecodeID            4540 non-null   int64  
 4   PULocationID          4540 non-null   int64  
 5   DOLocationID          4540 non-null   int64  
 6   pickup_dropoff        4540 non-null   object 
 7   distance_estimate     4150 non-null   float64
 8   duration_estimate     4150 non-null   float64
dtypes: float64(2), int64(5), object(2)
memory usage: 483.7+ KB


In [None]:
# Routes missing from the lookup tables produced NaN estimates; substitute the
# training-wide defaults (the note in the "Dictionaries" section of the model information).
X_test = X_test.fillna({
    'distance_estimate': default_distance,
    'duration_estimate': default_duration,
})


In [None]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4540 entries, 5818 to 19601
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   VendorID              4540 non-null   int64  
 1   tpep_pickup_datetime  4540 non-null   object 
 2   passenger_count       4540 non-null   int64  
 3   RatecodeID            4540 non-null   int64  
 4   PULocationID          4540 non-null   int64  
 5   DOLocationID          4540 non-null   int64  
 6   pickup_dropoff        4540 non-null   object 
 7   distance_estimate     4540 non-null   float64
 8   duration_estimate     4540 non-null   float64
dtypes: float64(2), int64(5), object(2)
memory usage: 483.7+ KB


In [None]:
# Computation 1 (rush_hour), replayed on X_test.
X_test['tpep_pickup_datetime'] = pd.to_datetime(X_test['tpep_pickup_datetime'], format='%m/%d/%Y %I:%M:%S %p')

X_test['day'] = X_test['tpep_pickup_datetime'].dt.day_name().str.lower()

# Start from the pickup hour; weekend rows are set to 0.
X_test['rush_hour'] = X_test['tpep_pickup_datetime'].dt.hour
is_weekend = X_test['day'].isin(['saturday', 'sunday'])
X_test.loc[is_weekend, 'rush_hour'] = 0

# Weekday rows get the 0/1 flag from rush_hourizer(). The flags are cast to the
# column's existing dtype first: `.dt.hour` typically yields int32 while `apply`
# yields int64, and writing a wider dtype through `.loc` triggers pandas'
# incompatible-dtype FutureWarning.
rush_flags = X_test.apply(rush_hourizer, axis=1).astype(X_test['rush_hour'].dtype)
X_test.loc[~is_weekend, 'rush_hour'] = rush_flags[~is_weekend]

  X_test.loc[(X_test.day != 'saturday') & (X_test.day != 'sunday'), 'rush_hour'] = X_test.apply(rush_hourizer, axis=1)


In [None]:
X_test.head(3)

Unnamed: 0,VendorID,tpep_pickup_datetime,passenger_count,RatecodeID,PULocationID,DOLocationID,pickup_dropoff,distance_estimate,duration_estimate,day,rush_hour
5818,2,2017-12-03 01:57:53,1,1,148,162,148 162,2.8125,12.141667,sunday,0
18134,2,2017-06-15 10:05:54,1,1,100,231,100 231,3.41,19.138889,thursday,0
4655,2,2017-02-24 11:14:19,2,1,262,236,262 236,0.901429,7.55,friday,0


In [None]:
# Part of Rule 6: remove the raw and helper columns, leaving only the model inputs.
X_test = X_test.drop(columns=[
    'tpep_pickup_datetime', 'RatecodeID', 'PULocationID', 'DOLocationID',
    'pickup_dropoff', 'day',
])

In [None]:
X_test.head(3)

Unnamed: 0,VendorID,passenger_count,distance_estimate,duration_estimate,rush_hour
5818,2,1,2.8125,12.141667,0
18134,2,1,3.41,19.138889,0
4655,2,2,0.901429,7.55,0


In [None]:
# This is for the rest of rule 6
# Convert VendorID to string so it is treated as categorical
X_test['VendorID'] = X_test['VendorID'].astype(str)

# One-hot encode, then align to the training feature columns (names and order).
# `drop_first=True` is deliberately not used here: which dummy it drops depends on the
# vendors present in *this* batch, so a batch containing a single vendor would lose or
# mis-encode `VendorID_2`. Encoding every category and reindexing to the training
# columns keeps the layout identical to what `scaler_X` and `lr` were fitted on;
# a dummy column absent from the batch is filled with 0.
X_test = pd.get_dummies(X_test).reindex(columns=X_train.columns, fill_value=0)

In [None]:
X_test.columns

Index(['passenger_count', 'distance_estimate', 'duration_estimate',
       'rush_hour', 'VendorID_2'],
      dtype='object')

In [None]:
# this is for Computation 2: transform only — the scaler was fitted on X_train and is not refitted here.
X_test_scaled = scaler_X.transform(X_test)

In [None]:
# this is for Computation 3: predicted fares for the simulated future trips
y_pred_test = lr.predict(X_test_scaled)


In [None]:
# Out-of-sample metrics. Note: y_test is used exactly as split — the zero floor and
# upper cap (Rules 1 and 2) were applied to y_train only.
print('R^2:', r2_score(y_test, y_pred_test))
print('MAE:', mean_absolute_error(y_test, y_pred_test))
print('MSE:', mean_squared_error(y_test, y_pred_test))
print('RMSE:',np.sqrt(mean_squared_error(y_test, y_pred_test)))

R^2: 0.6165667257089236
MAE: 3.2740080075261395
MSE: 45.135223546740725
RMSE: 6.718275340200097
