In [1]:
%load_ext autoreload
%autoreload 2
from sklearn import set_config; set_config(display='diagram')

In [4]:
from TaxiFareModel.trainer import Trainer
from TaxiFareModel.data import get_data, clean_data
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV
from xgboost import XGBRegressor
from catboost import CatBoostRegressor


N = 10_000
df = get_data(nrows=N)
# clean data
df = clean_data(df)
# Delete 1.January 2009
df = df[df['key'].str.contains("2009-01-01") == False]
# set X and y
y = df["fare_amount"]
X = df.drop("fare_amount", axis=1)
# hold out
X_train, X_test, y_train, y_test = train_test_split(X, y,
    random_state=42,
    test_size=0.3)
# train
models = [CatBoostRegressor(verbose=False),
AdaBoostRegressor(), LassoCV(), RidgeCV(),LinearRegression(), RandomForestRegressor(), KNeighborsRegressor()]
best_models=[CatBoostRegressor(verbose=False)]

for model in best_models:
    print(model)
    trainer = Trainer(X_train, y_train, model)
    trainer.run()
    # evaluate
    res = trainer.evaluate(X_test, y_test)
    #save_model
    trainer.save_model()

    print(res)

<catboost.core.CatBoostRegressor object at 0x12975a6d0>
[0.80859214 0.83037448 0.78061464 0.76618677 0.72694741]
5.16558197777153


In [19]:
from sklearn.compose import ColumnTransformer
from TaxiFareModel.encoders import TimeFeaturesEncoder, DistanceTransformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

pipe_time = make_pipeline(TimeFeaturesEncoder(time_column='pickup_datetime'), OneHotEncoder())
pipe_distance = make_pipeline(DistanceTransformer(),StandardScaler())

time_col = ['pickup_datetime']
dist_cols = ['pickup_latitude', 'pickup_longitude', 'dropoff_latitude', 'dropoff_longitude']
feat_eng_pipeline = ColumnTransformer([
            ('time', pipe_time, time_col),
            ('distance', pipe_distance, dist_cols)
            ])

In [24]:
X_transformed = pipe_time.fit_transform(X_train,y_train)

In [25]:
X_transformed

<5971x50 sparse matrix of type '<class 'numpy.float64'>'
	with 23884 stored elements in Compressed Sparse Row format>

In [23]:
time_enc = TimeFeaturesEncoder('pickup_datetime')
time_features = time_enc.fit_transform(X_train, y_train)
time_features.head()

Unnamed: 0,dow,hour,month,year
0,2,9,3,2014
1,2,13,8,2013
2,0,20,6,2010
3,4,20,4,2012
4,0,17,9,2009


In [18]:
pipe_time.fit_transform(X_train, y_train)

<5971x50 sparse matrix of type '<class 'numpy.float64'>'
	with 23884 stored elements in Compressed Sparse Row format>

In [2]:
import pandas
from TaxiFareModel.data import get_data, clean_data
from sklearn.model_selection import train_test_split, cross_validate
df = get_data()
# clean data
df = clean_data(df)
df = df[df['key'].str.contains("2009-01-01") == False]
# set X and y
y = df["fare_amount"]
X = df.drop("fare_amount", axis=1)
# hold out
X_train, X_test, y_train, y_test = train_test_split(X, y,random_state=42, test_size=0.2)
# train

In [3]:
X_train.head()

Unnamed: 0,key,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
662,2011-06-30 08:08:14.0000003,2011-06-30 08:08:14 UTC,-73.959055,40.814999,-73.957878,40.778607,1
3175,2012-06-23 17:10:29.0000001,2012-06-23 17:10:29 UTC,-73.978019,40.766622,-73.97753,40.753604,3
913,2014-04-08 14:11:00.00000085,2014-04-08 14:11:00 UTC,-73.966193,40.758177,-73.991642,40.74908,1
8083,2011-05-05 18:45:45.0000005,2011-05-05 18:45:45 UTC,-73.873067,40.774135,-73.974239,40.756599,1
5538,2014-01-05 13:40:30.0000001,2014-01-05 13:40:30 UTC,-73.958982,40.780754,-73.789114,40.641712,1


In [7]:
from sklearn.base import BaseEstimator, TransformerMixin
from TaxiFareModel.utils import haversine_vectorized, calculate_direction, minkowski_distance_gps
from TaxiFareModel.encoders import MinkowskiDistance
from sklearn.preprocessing import StandardScaler

In [8]:
encoder = MinkowskiDistance()

In [9]:
encoder.fit_transform(X_train)

Unnamed: 0,MinkowskiDistance_1
0,14941.737197
1,14940.622614
2,14942.422545
3,14940.725708
4,14903.351750
...,...
6819,14896.736404
6820,14939.174714
6821,14943.179508
6822,14939.761383


In [10]:
scaler =StandardScaler()
scaler.fit_transform(encoder.fit_transform(X_train))

array([[0.52521057],
       [0.37690647],
       [0.61640156],
       ...,
       [0.71712159],
       [0.26231292],
       [1.21192538]])

In [4]:
import requests
url = "http://127.0.0.1:8000/predict_fare?key=2013-07-06 17:18:00.0000001&pickup_datetime=2013-07-06 17:18:00 UTC&pickup_longitude=-73.950655&pickup_latitude=40.783282&dropoff_longitude=-73.984365&dropoff_latitude=40.769802&passenger_count=1"
response=requests.get(url).json()
response

{'prediction': '12.539891'}

In [70]:
import joblib
loaded_model = joblib.load('../model_xgboost.joblib')

In [71]:
array={'key': 1, 'pickup_datetime': '2013-07-06 17:18:00 UTC', 'pickup_longitude': '-73.950655', 'pickup_latitude': '40.783282', 'dropoff_longitude': '-73.984365', 'dropoff_latitude': '40.769802', 'passenger_count': '1'}

In [72]:
array

{'key': 1,
 'pickup_datetime': '2013-07-06 17:18:00 UTC',
 'pickup_longitude': '-73.950655',
 'pickup_latitude': '40.783282',
 'dropoff_longitude': '-73.984365',
 'dropoff_latitude': '40.769802',
 'passenger_count': '1'}

In [73]:
X_pred = pd.DataFrame({k: [v] for k, v in array.items()})

In [74]:
X_pred.iloc[:,2:6] = X_pred.iloc[:,2:6].astype('float64')
X_pred.iloc[:,6] = X_pred.iloc[:,6].astype('int64')

In [75]:
X_pred.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1 entries, 0 to 0
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   key                1 non-null      int64  
 1   pickup_datetime    1 non-null      object 
 2   pickup_longitude   1 non-null      float64
 3   pickup_latitude    1 non-null      float64
 4   dropoff_longitude  1 non-null      float64
 5   dropoff_latitude   1 non-null      float64
 6   passenger_count    1 non-null      int64  
dtypes: float64(4), int64(2), object(1)
memory usage: 184.0+ bytes


In [76]:
import pandas as pd
y_pred = loaded_model.predict(X_pred)

In [77]:
y_pred

array([12.539891], dtype=float32)

In [47]:
import requests
url = "http://localhost:8000/"
response=requests.get(url).json()
response

{'greeting': 'Hello world Felix!!!!!!'}

In [48]:
import requests
url = "http://localhost:8000/predict_fare?key=2014-08-06 10:18:00.0000001&pickup_datetime=2013-08-06 17:18:00 UTC&pickup_longitude=-73.950655&pickup_latitude=40.783282&dropoff_longitude=-73.984365&dropoff_latitude=40.769802&passenger_count=1"
response=requests.get(url).json()
response

JSONDecodeError: Expecting value: line 1 column 1 (char 0)