In [None]:
import os
from pathlib import Path

import aiobotocore
import pandas as pd
from dotenv import load_dotenv
from numpy import dtype
from s3fs import S3FileSystem
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import train_test_split


In [None]:
load_dotenv()
[os.getenv("AWS_PROFILE"), os.getenv("TRAINING_DIR")]

In [None]:
tmpdir = Path.cwd().parent / "tmp"
s3 = S3FileSystem(session=aiobotocore.session.AioSession(profile=os.getenv("AWS_PROFILE")))
bucket_root = f"s3://{os.getenv("BUCKET_NAME")}/ny_taxi_trip_prediction"

if os.getenv("TRAINING_DIR"):
    training_base_dir = os.getenv("TRAINING_DIR")
else:
    training_base_dir = s3.read_text(f"{bucket_root}/current")

training_root = f"{bucket_root}/training/{training_base_dir}"

train_path = tmpdir / "train.parquet"
if not train_path.is_file():
    s3.get_file(training_root + "/train.parquet", train_path)
df_train_val_all = pd.read_parquet(train_path)

test_path = tmpdir / "test.parquet"
if not test_path.is_file():
    s3.get_file(training_root + "/test.parquet", test_path)
df_test_all = pd.read_parquet(test_path)


In [None]:
assert df_train_val_all['PULocationID'].dtypes is dtype("int32")
assert df_test_all['PULocationID'].dtypes is dtype("int32")

In [None]:
df_train_val_all.head()

In [None]:
df_train_val_all[df_train_val_all.index.duplicated()]

In [None]:
# def preprocess(df: pd.DataFrame, dict_vec: DictVectorizer | None) -> [any, any, DictVectorizer]:
#     """ Returns x, y, and dict vectorizer. If dict vectorizer was supplied in arguments, does not fit, only transforms.
#     """
#     dict_vec_fit = False
#     if dict_vec is None:
#         dict_vec = DictVectorizer()
#         dict_vec_fit = True
#
#     dfp = pd.DataFrame(index=df.index)
#     dfp["duration_min"] = (df["tpep_dropoff_datetime"] - df["tpep_pickup_datetime"]).apply(lambda timediff: timediff.total_seconds())
#     dfp = dfp[(dfp.duration_min >= 1) & (dfp.duration_min <= 60)]
#     dfp["loc_id"] = df["PULocationID"].astype(str) + "-" + df["DOLocationID"].astype(str)
#     dfp["trip_distance"] = df["trip_distance"]
#
#     dfp.reset_index(inplace=True, drop=True)
#
#     return dfp, dict_vec

In [None]:
def preprocess(df: pd.DataFrame, dict_vec: DictVectorizer | None = None) -> [any, any, DictVectorizer]:
    """ Returns x, y, and dict vectorizer. If dict vectorizer was supplied in arguments, does not fit, only transforms.
    """
    dict_vec_fit = False
    if dict_vec is None:
        dict_vec = DictVectorizer()
        dict_vec_fit = True

    tmpdf = pd.DataFrame(index=df.index)
    tmpdf["duration_min"] = (df["tpep_dropoff_datetime"] - df["tpep_pickup_datetime"]).apply(
        lambda timediff: timediff.total_seconds())
    tmpdf = tmpdf[(tmpdf["duration_min"] >= 1) & (tmpdf["duration_min"] <= 60)]
    tmpdf["loc_id"] = df["PULocationID"].astype(str) + "-" + df["DOLocationID"].astype(str)
    tmpdf["trip_distance"] = df["trip_distance"]
    print("tmpdf")
    display(tmpdf)

    x_dicts = tmpdf[["loc_id", "trip_distance"]].to_dict(orient="records")
    print("x_dicts")
    display(x_dicts)
    if dict_vec_fit:
        x = dict_vec.fit_transform(x_dicts)
    else:
        x = dict_vec.transform(x_dicts)
    print("x")
    display(x)
    y = tmpdf["duration_min"].values
    print("y")
    display(y)

    return x, y, dict_vec

In [None]:
# Take first 80% for training set, last 20% for validation set. Order DOES matter because we want to use later data for validation as it is in theory closer to reality
xy_train_all, xy_val_all = train_test_split(df_train_val_all, test_size=0.2, shuffle=False)
display(xy_train_all.head())
display(xy_val_all.head())

In [None]:
x_train, y_train, dv = preprocess(xy_train_all)

In [None]:
x_val, y_val, _ = preprocess(xy_val_all, dv)

In [None]:
x_test, y_test, _ = preprocess(df_test_all, dv)


In [39]:
model = RandomForestRegressor(max_depth=10, random_state=0)
model.fit(x_train, y_train)
y_pred_val = model.predict(x_val)
y_pred_test = model.predict(x_test)

print(f"RMSE of validation set: {float(root_mean_squared_error(y_val, y_pred_val))}")
print(f"RMSE of test set: {float(root_mean_squared_error(y_test, y_pred_test))}")

RMSE of validation set: 13.783918857799998
RMSE of test set: 13.382332322020392


In [None]:
print("Pick up location IDs")
display(xy_train_all["PULocationID"].sort_values().unique())
print("Drop off location IDs")
display(xy_train_all["DOLocationID"].sort_values().unique())