In [6]:
# List the installed packages and keep only the scikit-learn line, i.e. show its version.
# NOTE(review): presumably recorded because the pickled model must be loaded with a
# compatible scikit-learn version — confirm.
!pip freeze | grep scikit-learn

scikit-learn==1.1.1


In [1]:
import os
import pickle
import pandas as pd
import argparse

In [2]:
# Path of the pickled model artifact; apply_model reads this global and passes it to load_model.
model_path = "model.bin"
# Dataset prefix that get_paths interpolates into both the input URL and the output directory.
taxi_type = "fhv"

In [3]:
def read_data(filename: str):
    """Load trip records from a parquet file and keep rides of 1 to 60 minutes.

    Args:
        filename: Location handed unchanged to ``pd.read_parquet``.

    Returns:
        A copy of the filtered frame. It carries every column that was read
        plus ``duration`` (``dropOff_datetime - pickup_datetime`` expressed in
        minutes) and only the rows where ``1 <= duration <= 60``.
    """
    trips = pd.read_parquet(filename)

    # Timedelta between drop-off and pick-up, converted from seconds to minutes.
    elapsed = trips.dropOff_datetime - trips.pickup_datetime
    trips['duration'] = elapsed.dt.total_seconds() / 60

    # ``between`` is inclusive on both ends by default; rows with a missing
    # duration evaluate to False and are dropped.
    within_bounds = trips.duration.between(1, 60)

    return trips[within_bounds].copy()

def prepare_dictionaries(df: pd.DataFrame):
    """Turn the pickup/drop-off location columns into a list of feature dicts.

    Missing location IDs are replaced with -1, then every value is cast to
    ``int`` and finally to ``str``.

    The input frame is not modified: the conversion happens on a derived
    frame, so calling this function has no side effect on ``df``.

    Args:
        df: Frame containing the ``PUlocationID`` and ``DOlocationID`` columns.

    Returns:
        One dict per row, in row order, with the keys ``PUlocationID`` and
        ``DOlocationID`` mapped to string values.
    """
    categorical = ['PUlocationID', 'DOlocationID']

    # Selecting the columns and chaining fillna/astype produces a new frame,
    # which keeps the caller's columns at their original dtype and values.
    features = df[categorical].fillna(-1).astype('int').astype('str')

    return features.to_dict(orient='records')

In [4]:
def load_model(model_path: str):
    """Unpickle the two objects stored in the model file.

    Args:
        model_path: Path of a pickle file whose payload unpacks into exactly
            two items.

    Returns:
        The two unpickled objects as a tuple, in stored order.
    """
    # NOTE: pickle can execute arbitrary code while loading; only point this
    # at model files from a trusted source.
    with open(model_path, 'rb') as handle:
        artifacts = pickle.load(handle)

    first, second = artifacts
    return first, second

def apply_model(input_file: str, output_file: str, year: int, month: int):
    """Score one file of trips and write the predictions to parquet.

    The model is loaded from the module-level ``model_path`` via
    ``load_model``; the first loaded object must provide ``transform`` and the
    second ``predict``.

    Args:
        input_file: Parquet source passed to ``read_data``.
        output_file: Destination passed to ``DataFrame.to_parquet``.
        year: Used, zero-padded to four digits, in the ``ride_id`` prefix.
        month: Used, zero-padded to two digits, in the ``ride_id`` prefix.

    Returns:
        The frame that was written, with the columns ``ride_id`` and
        ``predictions``.
    """
    df = read_data(input_file)

    # ride_id has the form "<YYYY>/<MM>_<row index>", using the index that
    # survives the filtering done in read_data.
    id_prefix = f'{year:04d}/{month:02d}_'
    df['ride_id'] = id_prefix + df.index.astype('str')

    feature_dicts = prepare_dictionaries(df)
    transformer, predictor = load_model(model_path)

    y_pred = predictor.predict(transformer.transform(feature_dicts))

    # Start from the ride_id column (keeping df's index) and attach the
    # predictions positionally.
    df_result = df[['ride_id']].copy()
    df_result["predictions"] = y_pred

    df_result.to_parquet(output_file, engine='pyarrow', compression=None, index=False)

    return df_result

In [5]:
def get_paths(year, month):
    """Build the input URL and the output directory for one month of data.

    Both values embed the module-level ``taxi_type``.

    Args:
        year: Integer year, zero-padded to four digits in the file name.
        month: Integer month, zero-padded to two digits in the file name.

    Returns:
        A tuple ``(input_file, output_path)``. The first item is the URL of
        the monthly parquet file, the second is the ``output/<taxi_type>``
        directory string (a directory, not a file name).
    """
    return (
        f"https://nyc-tlc.s3.amazonaws.com/trip+data/{taxi_type}_tripdata_{year:04d}-{month:02d}.parquet",
        f"output/{taxi_type}",
    )