# Refactor Taxi Fare Prediction Problem with a Pipeline

Refactor the model you built for the Taxi Fare Prediction Problem using:
- Custom encoders you have to write for distance and time features
- OneHotEncoder to encode the hour and day-of-week features
- SimpleImputer to fill missing values
- A simple linear regression
- A pipeline to put it all together


Then: 
- train this pipeline
- apply the pipeline on test data
- generate predictions and submit these new predictions to Kaggle

## First pipeline

In [1]:
# import the dataset from s3 bucket 
import pandas as pd
url = "s3://wagon-public-datasets/taxi-fare-train.csv"

# Select only 10 000 rows while creating the DataFrame
df = pd.read_csv(url, nrows=10_000)

In [2]:
df.head()


Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2009-06-15 17:26:21.0000001,4.5,2009-06-15 17:26:21 UTC,-73.844311,40.721319,-73.84161,40.712278,1
1,2010-01-05 16:52:16.0000002,16.9,2010-01-05 16:52:16 UTC,-74.016048,40.711303,-73.979268,40.782004,1
2,2011-08-18 00:35:00.00000049,5.7,2011-08-18 00:35:00 UTC,-73.982738,40.76127,-73.991242,40.750562,2
3,2012-04-21 04:30:42.0000001,7.7,2012-04-21 04:30:42 UTC,-73.98713,40.733143,-73.991567,40.758092,1
4,2010-03-09 07:51:00.000000135,5.3,2010-03-09 07:51:00 UTC,-73.968095,40.768008,-73.956655,40.783762,1


In [3]:
df.drop(columns=['fare_amount'])

Unnamed: 0,key,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2009-06-15 17:26:21.0000001,2009-06-15 17:26:21 UTC,-73.844311,40.721319,-73.841610,40.712278,1
1,2010-01-05 16:52:16.0000002,2010-01-05 16:52:16 UTC,-74.016048,40.711303,-73.979268,40.782004,1
2,2011-08-18 00:35:00.00000049,2011-08-18 00:35:00 UTC,-73.982738,40.761270,-73.991242,40.750562,2
3,2012-04-21 04:30:42.0000001,2012-04-21 04:30:42 UTC,-73.987130,40.733143,-73.991567,40.758092,1
4,2010-03-09 07:51:00.000000135,2010-03-09 07:51:00 UTC,-73.968095,40.768008,-73.956655,40.783762,1
...,...,...,...,...,...,...,...
9995,2011-10-26 10:44:00.00000086,2011-10-26 10:44:00 UTC,-73.988277,40.748970,-73.963712,40.773958,2
9996,2011-12-16 15:37:00.000000179,2011-12-16 15:37:00 UTC,-74.002112,40.748727,-73.992467,40.756252,1
9997,2013-11-16 22:47:17.0000001,2013-11-16 22:47:17 UTC,-73.992093,40.729071,-73.974470,40.763050,2
9998,2010-01-28 11:38:00.00000022,2010-01-28 11:38:00 UTC,-73.992548,40.735652,-73.998802,40.723085,1


In [4]:
# Features are every column except the target; the target is `fare_amount`
X = df.drop(columns=['fare_amount'])
y = df['fare_amount']

# Hold out 20% of the rows as a test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
)

### Custom transformers

With the Taxi Fare Prediction Challenge data, using `BaseEstimator` and `TransformerMixin`, implement:

- a transformer that computes haversine distance between pickup and dropoff location
- a custom encoder that extracts time features from `pickup_datetime`

In [6]:
import numpy as np

def haversine_vectorized(df, 
         start_lat="pickup_latitude",
         start_lon="pickup_longitude",
         end_lat="dropoff_latitude",
         end_lon="dropoff_longitude"):
    """
        Vectorized haversine (great-circle) distance for a pandas DataFrame.

        Each row holds a start point and an end point expressed in decimal
        degrees; the four keyword arguments name the columns to read.
        Returns one distance per row, in kilometres.
    """
    earth_radius_km = 6371

    def column_in_radians(name):
        # Cast to float first so numeric strings are accepted as well
        return np.radians(df[name].astype(float))

    phi_start = column_in_radians(start_lat)
    lam_start = column_in_radians(start_lon)
    phi_end = column_in_radians(end_lat)
    lam_end = column_in_radians(end_lon)

    delta_lam = lam_end - lam_start
    delta_phi = phi_end - phi_start

    # Haversine formula: squared half-chord length, then the central angle
    half_chord_sq = np.sin(delta_phi / 2.0) ** 2 + np.cos(phi_start) * np.cos(phi_end) * np.sin(delta_lam / 2.0) ** 2
    central_angle = 2 * np.arcsin(np.sqrt(half_chord_sq))
    return earth_radius_km * central_angle

In [7]:
# Implement the `transform` method of the DistanceTransformer
from sklearn.base import BaseEstimator,  TransformerMixin 

class DistanceTransformer(BaseEstimator, TransformerMixin):
    """Compute the haversine distance between two GPS points.

    Parameters
    ----------
    start_lat, start_lon : str
        Names of the columns holding the start point latitude / longitude.
    end_lat, end_lon : str
        Names of the columns holding the end point latitude / longitude.
    """

    def __init__(self, 
                 start_lat="pickup_latitude",
                 start_lon="pickup_longitude", 
                 end_lat="dropoff_latitude", 
                 end_lon="dropoff_longitude"):
        self.start_lat = start_lat
        self.start_lon = start_lon
        self.end_lat = end_lat
        self.end_lon = end_lon

    def fit(self, X, y=None):
        """Nothing to learn: return the transformer itself."""
        return self
    
    def transform(self, X, y=None):
        """Return a new DataFrame with a single column, 'Distance'.

        The distance is computed by `haversine_vectorized` from the columns
        configured on this transformer, so non-default column names passed
        to the constructor are honoured. The result keeps the index of X.
        """
        distance = haversine_vectorized(
            X,
            start_lat=self.start_lat,
            start_lon=self.start_lon,
            end_lat=self.end_lat,
            end_lon=self.end_lon,
        )
        # Naming the column explicitly keeps the output schema fixed
        # regardless of the name carried by the computed Series.
        return pd.DataFrame({'Distance': distance})

In [8]:
# test the DistanceTransformer
dist_trans = DistanceTransformer()
distance = dist_trans.fit_transform(X_train, y_train)
distance.head(10)

Unnamed: 0,Distance
7985,3.157168
8216,1.487855
944,1.501492
4545,0.719094
8779,1.84991
6473,2.305177
8365,1.813182
5544,2.584241
5240,1.812177
4426,1.370125


In [9]:
def extract_time_features(df, time_column="pickup_datetime",
                          timezone_name='America/New_York'):
    """Return a copy of `df` with 'dow', 'hour', 'month' and 'year' columns added.

    Parameters
    ----------
    df : pandas.DataFrame
        Input rows. It is left untouched: neither its index nor its columns
        are modified.
    time_column : str
        Column holding timezone-aware timestamps (or strings that
        `pd.to_datetime` parses as timezone-aware).
    timezone_name : str
        Timezone the timestamps are converted to before the features are
        extracted.

    Returns
    -------
    pandas.DataFrame
        All columns of `df` plus the four time features, with a fresh
        0..n-1 index.
    """
    local_time = pd.to_datetime(df[time_column]).dt.tz_convert(timezone_name)

    # reset_index returns a new frame, so the assignments below never
    # reach the caller's DataFrame.
    features = df.reset_index(drop=True)
    # to_numpy() makes the assignment positional; the freshly reset index
    # no longer matches the index carried by `local_time`.
    features["dow"] = local_time.dt.weekday.to_numpy()
    features["hour"] = local_time.dt.hour.to_numpy()
    features["month"] = local_time.dt.month.to_numpy()
    features["year"] = local_time.dt.year.to_numpy()
    return features

In [10]:
# Implement the `transform` method of the TimeFeaturesEncoder
class TimeFeaturesEncoder(BaseEstimator, TransformerMixin):
    """Extract the day of week (dow), the hour, the month and the year from a time column.

    Parameters
    ----------
    time_column : str
        Name of the column holding the timestamps.
    time_zone_name : str
        Timezone the timestamps are converted to before extraction.
    """

    def __init__(self, time_column, time_zone_name='America/New_York'):
        self.time_column = time_column
        self.time_zone_name = time_zone_name
        
    def fit(self, X, y=None):
        """Nothing to learn: return the encoder itself."""
        return self

    def transform(self, X, y=None):
        """Return a new DataFrame with only four columns: 'dow', 'hour', 'month', 'year'.

        The configured `time_column` and `time_zone_name` are used, so an
        encoder built with non-default settings behaves as requested.
        X is not modified and the result carries a fresh 0..n-1 index.
        """
        local_time = pd.to_datetime(X[self.time_column]).dt.tz_convert(self.time_zone_name)
        # Plain arrays (not Series) so the output gets a fresh RangeIndex
        # instead of inheriting the index of X.
        return pd.DataFrame({
            'dow': local_time.dt.weekday.to_numpy(),
            'hour': local_time.dt.hour.to_numpy(),
            'month': local_time.dt.month.to_numpy(),
            'year': local_time.dt.year.to_numpy(),
        })

In [11]:
# test the TimeFeaturesEncoder
time_enc = TimeFeaturesEncoder('pickup_datetime')
time_features = time_enc.fit_transform(X_train, y_train)
time_features.head()

Unnamed: 0,dow,hour,month,year
0,3,9,6,2009
1,3,18,10,2014
2,0,12,6,2013
3,5,17,6,2012
4,5,6,8,2011


### Preprocessing pipeline

In [12]:
# visualizing pipelines in HTML
from sklearn import set_config; set_config(display='diagram')

#### Distance pipeline

Create a pipeline for distances:
- convert pickup and dropoff coordinates into distances with the DistanceTransformer
- standardize these distances

In [13]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
# Distance pipeline: haversine distance first, then standardization
pipe_distance = Pipeline(
    steps=[
        ('distance_transformer', DistanceTransformer()),
        ('standard_scaler', StandardScaler()),
    ]
)
# display distance pipeline
pipe_distance

#### Time features pipeline

Create a pipeline for time features
- extract time features from pickup datetime with the TimeFeaturesEncoder
- encode these categorical time features with the OneHotEncoder

In [14]:
from sklearn.preprocessing import OneHotEncoder
# create time pipeline
# handle_unknown='ignore': a time value that never appeared in the training
# rows (e.g. a year or hour missing from a small sample) is encoded as all
# zeros at predict time instead of raising an error.
pipe_time = Pipeline(
    steps=[
        ('time_encoder', TimeFeaturesEncoder('pickup_datetime')),
        ('onehotencoder', OneHotEncoder(handle_unknown='ignore')),
    ]
)
# display time pipeline
pipe_time

#### Preprocessing pipeline

Wrap up the distance pipeline and the time pipeline into a preprocessing pipeline.

In [15]:
from sklearn.compose import ColumnTransformer
# Columns consumed by each sub-pipeline
time_cols = ['pickup_datetime']
dist_cols = ['pickup_latitude', 'pickup_longitude', 'dropoff_latitude', 'dropoff_longitude']
# Route each group of columns through its own pipeline
# (`remainder` is left at its default value)
preprocessing_pipe = ColumnTransformer(
    [
        ('time', pipe_time, time_cols),
        ('distance', pipe_distance, dist_cols),
    ]
)
# display preprocessing pipeline
preprocessing_pipe

### Model pipeline

Create a pipeline containing the preprocessing and the regression model of your choice.

In [16]:
# Add the model of your choice to the pipeline
from sklearn.linear_model import LinearRegression
# Full model: preprocessing followed by a linear regression
pipeline = Pipeline(
    steps=[
        ('preproc', preprocessing_pipe),
        ('regressor', LinearRegression()),
    ]
)

# display the pipeline with model
pipeline

<details>
    <summary>
       💡 Hint
    </summary>
The pipeline should look like
<img src='img/pipeline.png'>
</details>

### Training and performance

Train the pipelined model and compute predictions on the test set:

In [17]:
# Train the pipelined model
fitted_pipe = pipeline.fit(X_train, y_train)

In [27]:
# compute y_pred on the test set
y_pred = fitted_pipe.predict(X_test)

Use the RMSE to evaluate the model's performance:

def compute_rmse(y_pred, y_true):
    """Return the root mean squared error between predictions and true values."""
    residuals = y_pred - y_true
    squared_errors = residuals ** 2
    return np.sqrt(squared_errors.mean())

# call compute_rmse
compute_rmse(y_pred, y_test)

## Complete workflow with a pipeline

Here we will implement the whole workflow for our Taxi Fare Kaggle challenge.  

To do that, we will refactor the code into functions for clarity.  

Implement the following functions:  
- `get_data()` to fetch the data from the S3 bucket
- `clean_data()` to clean the data
- `set_pipeline()` to build the pipeline defined earlier
- `train()` to train our model
- `evaluate()` to evaluate our model on test data

In [39]:
# implement get_data() function
def get_data(nrows=10000, url="s3://wagon-public-datasets/taxi-fare-train.csv"):
    '''Return a DataFrame holding the first `nrows` rows of the dataset.

    Parameters
    ----------
    nrows : int
        Number of rows to read.
    url : str, path or file-like
        Where to read the CSV from; anything `pd.read_csv` accepts.
        Defaults to the training set stored in the public S3 bucket.
    '''
    return pd.read_csv(url, nrows=nrows)

In [30]:
#implement clean_data() function
def clean_data(df, test=False):
    '''Return the rows of `df` that pass basic sanity filters.

    Rows are dropped when they contain a missing value, when the pickup or
    the dropoff point is at (0, 0), when the passenger count is outside
    [0, 8), or when a coordinate falls outside the bounding box below.
    The fare filter (0 to 4000, inclusive) is applied only when a
    `fare_amount` column is present, so frames without the target column
    (such as the test set) can be cleaned too.

    `test` is kept for backward compatibility; the presence of the
    `fare_amount` column is what decides whether the fare filter runs.
    '''
    df = df.dropna(how='any', axis='rows')

    # A point at exactly (0, 0) is treated as an invalid GPS reading
    keep = (df.dropoff_latitude != 0) | (df.dropoff_longitude != 0)
    keep &= (df.pickup_latitude != 0) | (df.pickup_longitude != 0)

    if "fare_amount" in df.columns:
        keep &= df.fare_amount.between(0, 4000)

    keep &= (df.passenger_count >= 0) & (df.passenger_count < 8)

    keep &= df["pickup_latitude"].between(40, 42)
    keep &= df["pickup_longitude"].between(-74.3, -72.9)
    keep &= df["dropoff_latitude"].between(40, 42)
    # NOTE(review): the lower bound here is -74 while pickup uses -74.3;
    # kept as-is to preserve existing results - confirm it is intended.
    keep &= df["dropoff_longitude"].between(-74, -72.9)
    return df[keep]


In [54]:
# implement set_pipeline() function
def set_pipeline():
    '''Build and return the (unfitted) pipelined model.

    The pipeline chains a preprocessing step and a linear regression:
    - distance: haversine distance between pickup and dropoff, standardized
    - time: day of week / hour / month / year of the pickup, one-hot encoded
    '''
    pipe_distance = Pipeline(steps=[
        ('distance_transformer', DistanceTransformer()),
        ('standard_scaler', StandardScaler()),
    ])
    # handle_unknown='ignore': a time value never seen during fit is encoded
    # as all zeros at predict time instead of raising an error.
    pipe_time = Pipeline(steps=[
        ('time_encoder', TimeFeaturesEncoder('pickup_datetime')),
        ('onehotencoder', OneHotEncoder(handle_unknown='ignore')),
    ])
    dist_cols = ['pickup_latitude', 'pickup_longitude', 'dropoff_latitude', 'dropoff_longitude']
    time_cols = ['pickup_datetime']
    preprocessing_pipe = ColumnTransformer([
        ('time', pipe_time, time_cols),
        ('distance', pipe_distance, dist_cols),
    ])
    pipeline = Pipeline(steps=[
        ('preproc', preprocessing_pipe),
        ('regressor', LinearRegression()),
    ])
    return pipeline

In [55]:
#implement train() function
def train(X_train, y_train, pipeline):
    '''Fit `pipeline` on the training data and return whatever `fit` returns.'''
    return pipeline.fit(X_train, y_train)

In [56]:
#implement evaluate() function
def evaluate(X_test, y_test, pipeline):
    '''Print and return the RMSE of `pipeline` on the given test data.

    Parameters
    ----------
    X_test : features passed to `pipeline.predict`
    y_test : true target values
    pipeline : fitted model exposing `predict`
    '''
    # Predict with the pipeline that was passed in, not with a
    # notebook-level global.
    y_pred = pipeline.predict(X_test)
    rmse = compute_rmse(y_pred, y_test)
    print('RMSE = ', rmse)
    return rmse

### Test the complete workflow

Use the above functions to test the complete workflow.

In [71]:
# store the data in a DataFrame
df = clean_data(get_data(nrows=10000))
# Features are every column except the target; the target is `fare_amount`
X = df.drop(columns=['fare_amount'])
y = df['fare_amount']
# Hold out 20% of the rows as a test set
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
)


In [72]:
# build pipeline
pipeline = set_pipeline()
# train the pipeline
fitted_pipe = train(X_train, y_train, pipeline)
fitted_pipe

In [73]:
# evaluate the pipeline
evaluate(X_test, y_test, fitted_pipe)

RMSE =  6.24542313887469


6.24542313887469

### Congrats!

Now we are ready to convert this complete workflow into packaged code 🚀