In [1]:
%load_ext autoreload
%autoreload 2
from sklearn import set_config; set_config(display='diagram')

In [4]:
from TaxiFareModel.trainer import Trainer
from TaxiFareModel.data import get_data, clean_data
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV
from xgboost import XGBRegressor
from catboost import CatBoostRegressor


# Number of rows requested from the data loader.
N = 10_000
df = get_data(nrows=N)
# clean data
df = clean_data(df)
# Drop every ride whose key mentions 1 January 2009.
# `regex=False` matches the date as a plain substring; `na=True` counts a
# missing key as a match, so such rows are still dropped exactly as the
# previous `== False` comparison dropped them; `~` inverts the boolean mask.
is_new_year_2009 = df['key'].str.contains("2009-01-01", regex=False, na=True)
df = df[~is_new_year_2009]
# set X and y
y = df["fare_amount"]
X = df.drop(columns="fare_amount")
# hold out 30% of the rows for evaluation; fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.3,
)
# train
# Candidate regressors; this list is not iterated in this cell.
models = [
    CatBoostRegressor(verbose=False),
    AdaBoostRegressor(),
    LassoCV(),
    RidgeCV(),
    LinearRegression(),
    RandomForestRegressor(),
    KNeighborsRegressor(),
]
# Only the estimators listed here go through the loop below.
best_models = [CatBoostRegressor(verbose=False)]

for model in best_models:
    print(model)
    # build a trainer around the training split and run it
    trainer = Trainer(X_train, y_train, model)
    trainer.run()
    # evaluate on the hold-out split
    res = trainer.evaluate(X_test, y_test)
    # save_model
    trainer.save_model()

    print(res)

<catboost.core.CatBoostRegressor object at 0x12975a6d0>
[0.80859214 0.83037448 0.78061464 0.76618677 0.72694741]
5.16558197777153


In [19]:
from sklearn.compose import ColumnTransformer
from TaxiFareModel.encoders import TimeFeaturesEncoder, DistanceTransformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Raw input columns routed to each branch of the ColumnTransformer.
time_col = ['pickup_datetime']
dist_cols = ['pickup_latitude', 'pickup_longitude', 'dropoff_latitude', 'dropoff_longitude']

# Time branch: derive time features from the pickup timestamp, then one-hot
# encode them. `handle_unknown='ignore'` makes transform emit an all-zero
# group for a category that was absent from the fitted data instead of
# raising, so predicting on rows with an unseen time value does not crash.
pipe_time = make_pipeline(
    TimeFeaturesEncoder(time_column=time_col[0]),
    OneHotEncoder(handle_unknown='ignore'),
)

# Distance branch: DistanceTransformer output is standardised.
# NOTE(review): presumably it computes a trip distance from the four
# coordinate columns — confirm in TaxiFareModel.encoders.
pipe_distance = make_pipeline(DistanceTransformer(), StandardScaler())

# Combine both branches; columns not listed above are dropped
# (ColumnTransformer's default `remainder='drop'`).
feat_eng_pipeline = ColumnTransformer([
    ('time', pipe_time, time_col),
    ('distance', pipe_distance, dist_cols),
])

In [24]:
# Fit the time branch on the training split and keep its encoded output.
X_transformed = pipe_time.fit_transform(X_train, y=y_train)

In [25]:
# Display the encoded matrix; its repr reports shape and stored-element count.
X_transformed

<5971x50 sparse matrix of type '<class 'numpy.float64'>'
	with 23884 stored elements in Compressed Sparse Row format>

In [23]:
# Run the time encoder on its own to inspect the columns it derives
# (before any one-hot encoding is applied).
time_enc = TimeFeaturesEncoder('pickup_datetime')
time_features = time_enc.fit_transform(X_train, y_train)
# Preview the first five rows.
time_features.head(5)

Unnamed: 0,dow,hour,month,year
0,2,9,3,2014
1,2,13,8,2013
2,0,20,6,2010
3,4,20,4,2012
4,0,17,9,2009


In [18]:
# Re-fit the time pipeline on the training split and display the result.
pipe_time.fit_transform(X_train, y=y_train)

<5971x50 sparse matrix of type '<class 'numpy.float64'>'
	with 23884 stored elements in Compressed Sparse Row format>