In [1]:
%load_ext autoreload
%autoreload 2
from sklearn import set_config; set_config(display='diagram')

In [4]:
from TaxiFareModel.trainer import Trainer
from TaxiFareModel.data import get_data, clean_data
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV
from xgboost import XGBRegressor
from catboost import CatBoostRegressor


N = 10_000
df = get_data(nrows=N)
# clean data
df = clean_data(df)
# Delete 1.January 2009
df = df[df['key'].str.contains("2009-01-01") == False]
# set X and y
y = df["fare_amount"]
X = df.drop("fare_amount", axis=1)
# hold out
X_train, X_test, y_train, y_test = train_test_split(X, y,
    random_state=42,
    test_size=0.3)
# train
models = [CatBoostRegressor(verbose=False),
AdaBoostRegressor(), LassoCV(), RidgeCV(),LinearRegression(), RandomForestRegressor(), KNeighborsRegressor()]
best_models=[CatBoostRegressor(verbose=False)]

for model in best_models:
    print(model)
    trainer = Trainer(X_train, y_train, model)
    trainer.run()
    # evaluate
    res = trainer.evaluate(X_test, y_test)
    #save_model
    trainer.save_model()

    print(res)

<catboost.core.CatBoostRegressor object at 0x12975a6d0>
[0.80859214 0.83037448 0.78061464 0.76618677 0.72694741]
5.16558197777153


In [19]:
from sklearn.compose import ColumnTransformer
from TaxiFareModel.encoders import TimeFeaturesEncoder, DistanceTransformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

pipe_time = make_pipeline(TimeFeaturesEncoder(time_column='pickup_datetime'), OneHotEncoder())
pipe_distance = make_pipeline(DistanceTransformer(),StandardScaler())

time_col = ['pickup_datetime']
dist_cols = ['pickup_latitude', 'pickup_longitude', 'dropoff_latitude', 'dropoff_longitude']
feat_eng_pipeline = ColumnTransformer([
            ('time', pipe_time, time_col),
            ('distance', pipe_distance, dist_cols)
            ])

In [24]:
X_transformed = pipe_time.fit_transform(X_train,y_train)

In [25]:
X_transformed

<5971x50 sparse matrix of type '<class 'numpy.float64'>'
	with 23884 stored elements in Compressed Sparse Row format>

In [23]:
time_enc = TimeFeaturesEncoder('pickup_datetime')
time_features = time_enc.fit_transform(X_train, y_train)
time_features.head()

Unnamed: 0,dow,hour,month,year
0,2,9,3,2014
1,2,13,8,2013
2,0,20,6,2010
3,4,20,4,2012
4,0,17,9,2009


In [18]:
pipe_time.fit_transform(X_train, y_train)

<5971x50 sparse matrix of type '<class 'numpy.float64'>'
	with 23884 stored elements in Compressed Sparse Row format>

In [3]:
import joblib
import pandas as pd


# load the model from disk
loaded_model = joblib.load('../pipeline.joblib')
test_sample = pd.read_csv('../raw_data/test.csv')

In [4]:
test_sample

Unnamed: 0,key,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2015-01-27 13:08:24.0000002,2015-01-27 13:08:24 UTC,-73.973320,40.763805,-73.981430,40.743835,1
1,2015-01-27 13:08:24.0000003,2015-01-27 13:08:24 UTC,-73.986862,40.719383,-73.998886,40.739201,1
2,2011-10-08 11:53:44.0000002,2011-10-08 11:53:44 UTC,-73.982524,40.751260,-73.979654,40.746139,1
3,2012-12-01 21:12:12.0000002,2012-12-01 21:12:12 UTC,-73.981160,40.767807,-73.990448,40.751635,1
4,2012-12-01 21:12:12.0000003,2012-12-01 21:12:12 UTC,-73.966046,40.789775,-73.988565,40.744427,1
...,...,...,...,...,...,...,...
9909,2015-05-10 12:37:51.0000002,2015-05-10 12:37:51 UTC,-73.968124,40.796997,-73.955643,40.780388,6
9910,2015-01-12 17:05:51.0000001,2015-01-12 17:05:51 UTC,-73.945511,40.803600,-73.960213,40.776371,6
9911,2015-04-19 20:44:15.0000001,2015-04-19 20:44:15 UTC,-73.991600,40.726608,-73.789742,40.647011,6
9912,2015-01-31 01:05:19.0000005,2015-01-31 01:05:19 UTC,-73.985573,40.735432,-73.939178,40.801731,6


In [5]:
y_pred = loaded_model.predict(test_sample)

In [6]:
y_pred.shape

(9914,)

In [22]:
test_sample['fare_amount']=pd.Series(y_pred)

In [23]:
test_sample.head(5)

Unnamed: 0,key,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,fare_amount
0,2015-01-27 13:08:24.0000002,2015-01-27 13:08:24 UTC,-73.97332,40.763805,-73.98143,40.743835,1,10.161814
1,2015-01-27 13:08:24.0000003,2015-01-27 13:08:24 UTC,-73.986862,40.719383,-73.998886,40.739201,1,11.430199
2,2011-10-08 11:53:44.0000002,2011-10-08 11:53:44 UTC,-73.982524,40.75126,-73.979654,40.746139,1,4.298393
3,2012-12-01 21:12:12.0000002,2012-12-01 21:12:12 UTC,-73.98116,40.767807,-73.990448,40.751635,1,8.476216
4,2012-12-01 21:12:12.0000003,2012-12-01 21:12:12 UTC,-73.966046,40.789775,-73.988565,40.744427,1,16.802706


In [25]:
eval_df=test_sample[['key','fare_amount']]

In [26]:
eval_df.head()

Unnamed: 0,key,fare_amount
0,2015-01-27 13:08:24.0000002,10.161814
1,2015-01-27 13:08:24.0000003,11.430199
2,2011-10-08 11:53:44.0000002,4.298393
3,2012-12-01 21:12:12.0000002,8.476216
4,2012-12-01 21:12:12.0000003,16.802706


In [7]:
def compute_rmse(y_pred, y_true):
    return np.sqrt(((y_pred - y_true) ** 2).mean())

In [8]:
eval_res = compute_rmse(y_pred,t)
#self.mlflow_log_metric('rmse', eval_res)

NameError: name 'y_test' is not defined