<a href="https://colab.research.google.com/github/edcalderin/course-mlops-zoomcamp/blob/master/homeworks/01_intro_homework.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Experiment tracking

## Downloading the data

In [22]:
%%capture
# Create the local data folder and fetch the January and February 2022
# yellow-taxi trip files (parquet) with the `wget` Python module.
# %%capture silences the cell's output, so a failing command (for example
# `mkdir` when the folder already exists) is not shown here.
!mkdir data
!python -m wget -o data https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2022-01.parquet
!python -m wget -o data https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2022-02.parquet

In [1]:
import pandas as pd
import pickle
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [2]:
import mlflow

# Point MLflow at a SQLite database file (mlflow.db, relative path) as the tracking store.
mlflow.set_tracking_uri('sqlite:///mlflow.db')
# Select the experiment used by the runs started later in this notebook.
mlflow.set_experiment('my-experiment')

<Experiment: artifact_location='file:///c:/Users/Erick/Projects/course-mlops-zoomcamp/02-experiment-tracking/mlruns/1', creation_time=1684721319647, experiment_id='1', last_update_time=1684721319647, lifecycle_stage='active', name='my-experiment', tags={}>

## Reading data

In [5]:
# Preview the green-taxi data for January 2022.
# NOTE(review): the download cell in this notebook only fetches the yellow-taxi
# files, so this parquet must already exist under ./data/green — confirm, or
# this cell fails on a fresh checkout. The frame is not used by the cells below.
green_tripdata_2022 = pd.read_parquet('./data/green/green_tripdata_2022-01.parquet')
green_tripdata_2022.head()

Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge
0,2,2022-01-01 00:14:21,2022-01-01 00:15:33,N,1.0,42,42,1.0,0.44,3.5,0.5,0.5,0.0,0.0,,0.3,4.8,2.0,1.0,0.0
1,1,2022-01-01 00:20:55,2022-01-01 00:29:38,N,1.0,116,41,1.0,2.1,9.5,0.5,0.5,0.0,0.0,,0.3,10.8,2.0,1.0,0.0
2,1,2022-01-01 00:57:02,2022-01-01 01:13:14,N,1.0,41,140,1.0,3.7,14.5,3.25,0.5,4.6,0.0,,0.3,23.15,1.0,1.0,2.75
3,2,2022-01-01 00:07:42,2022-01-01 00:15:57,N,1.0,181,181,1.0,1.69,8.0,0.5,0.5,0.0,0.0,,0.3,9.3,2.0,1.0,0.0
4,2,2022-01-01 00:07:50,2022-01-01 00:28:52,N,1.0,33,170,1.0,6.26,22.0,0.5,0.5,5.21,0.0,,0.3,31.26,1.0,1.0,2.75


In [3]:
# Load the January 2022 yellow-taxi trips; this frame becomes the training set.
data_january = pd.read_parquet('data/yellow_tripdata_2022-01.parquet')
data_january.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
0,1,2022-01-01 00:35:40,2022-01-01 00:53:29,2.0,3.8,1.0,N,142,236,1,14.5,3.0,0.5,3.65,0.0,0.3,21.95,2.5,0.0
1,1,2022-01-01 00:33:43,2022-01-01 00:42:07,1.0,2.1,1.0,N,236,42,1,8.0,0.5,0.5,4.0,0.0,0.3,13.3,0.0,0.0
2,2,2022-01-01 00:53:21,2022-01-01 01:02:19,1.0,0.97,1.0,N,166,166,1,7.5,0.5,0.5,1.76,0.0,0.3,10.56,0.0,0.0
3,2,2022-01-01 00:25:21,2022-01-01 00:35:23,1.0,1.09,1.0,N,114,68,2,8.0,0.5,0.5,0.0,0.0,0.3,11.8,2.5,0.0
4,2,2022-01-01 00:36:48,2022-01-01 01:14:20,1.0,4.3,1.0,N,68,163,1,23.5,0.5,0.5,3.0,0.0,0.3,30.3,2.5,0.0


## Computing duration
Now let's compute the duration variable. It should contain the duration of a ride in minutes.

In [6]:
def calculate_duration(
    df: pd.DataFrame,
    pickup_col: str = 'tpep_pickup_datetime',
    dropoff_col: str = 'tpep_dropoff_datetime',
) -> pd.Series:
    """Return each ride's duration in whole minutes.

    Parameters
    ----------
    df : pd.DataFrame
        Trip records containing the pickup and drop-off timestamp columns.
    pickup_col : str
        Name of the pickup timestamp column. Defaults to the yellow-taxi
        name; pass ``'lpep_pickup_datetime'`` for the green-taxi files.
    dropoff_col : str
        Name of the drop-off timestamp column. Defaults to the yellow-taxi
        name; pass ``'lpep_dropoff_datetime'`` for the green-taxi files.

    Returns
    -------
    pd.Series
        Integer minutes per ride, aligned with ``df``'s index.

    Raises
    ------
    KeyError
        If either timestamp column is missing from ``df``.
    """
    duration = df[dropoff_col] - df[pickup_col]
    # The fractional part is discarded by the integer cast (e.g. 5.5 -> 5),
    # so any filter applied afterwards works on whole minutes.
    return duration.dt.total_seconds().div(60).astype(int)

# Store the ride duration (whole minutes) as a new column on the January frame.
data_january['duration'] = calculate_duration(data_january)

## Dropping outliers
Next, we need to check the distribution of the duration variable. There are some outliers. Let's remove them and keep only the records where the duration was between 1 and 60 minutes (inclusive).

In [7]:
# Keep only rides lasting from 1 to 60 minutes, both bounds included.
data_january = data_january[data_january['duration'].between(1, 60)]
data_january.shape

(2423325, 20)

## One-hot encoding

In [8]:
def convert_to_dicts(df:pd.DataFrame, category_names=['PULocationID', 'DOLocationID']):
    """Turn the categorical columns of ``df`` into a list of per-row dicts.

    The selected columns are cast to strings, so a vectorizer fed with the
    result sees one category per distinct value rather than a number.

    The input frame is left untouched: the cast is applied to a selection of
    the columns, not written back into ``df``. This avoids changing the
    caller's dtypes and the SettingWithCopyWarning that assignment raises
    when ``df`` is itself a filtered slice.

    Parameters
    ----------
    df : pd.DataFrame
        Source records.
    category_names : list of str, or str
        Column(s) to export. A single column name is also accepted.

    Returns
    -------
    list of dict
        One ``{column: str_value}`` dict per row, in row order.

    Raises
    ------
    KeyError
        If any requested column is missing from ``df``.
    """
    # Build a fresh list so the shared default argument is never aliased and
    # tuples or a bare column name work as well.
    if isinstance(category_names, str):
        columns = [category_names]
    else:
        columns = list(category_names)

    return df[columns].astype('str').to_dict(orient='records')

In [9]:
# Fit the vectorizer on January's location records and build the sparse
# training matrix in one pass.
dv = DictVectorizer()
X_train = dv.fit_transform(convert_to_dicts(data_january))

# Show the first ten feature names the vectorizer learned.
dv.feature_names_[:10]

['DOLocationID=1',
 'DOLocationID=10',
 'DOLocationID=100',
 'DOLocationID=101',
 'DOLocationID=102',
 'DOLocationID=105',
 'DOLocationID=106',
 'DOLocationID=107',
 'DOLocationID=108',
 'DOLocationID=109']

## Training a model

In [10]:
# Target variable: the ride duration column of the filtered January data.
y_train = data_january['duration']

# Create the linear model and fit it on the January features
# (fit() returns the estimator itself).
lr_model = LinearRegression().fit(X_train, y_train)

# Evaluation

def rmse(model, feature_matrix, y):
    """Return the root mean squared error of ``model`` on the given data.

    Parameters
    ----------
    model
        Any fitted object exposing ``predict(feature_matrix)``.
    feature_matrix
        Features passed unchanged to ``model.predict``.
    y
        True target values, compared against the predictions.

    Returns
    -------
    float
        Square root of the mean squared error.
    """
    predicted = model.predict(feature_matrix)

    # Take the square root explicitly instead of passing ``squared=False``:
    # that keyword was deprecated in scikit-learn 1.4 and removed in 1.6, so
    # the old call fails on current releases. For the single-target
    # regression done in this notebook both forms give the same value.
    return mean_squared_error(y, predicted) ** 0.5

# Training-set error of the fitted model (y_train is the January duration column).
print(rmse(lr_model, X_train, y_train))

7.0167202695956865


## Evaluating the model

In [19]:
# Load February's trips and derive the duration column exactly as for January.
data_february = pd.read_parquet('data/yellow_tripdata_2022-02.parquet')
data_february['duration'] = calculate_duration(data_february)

# Keep only rides lasting from 1 to 60 minutes, both bounds included.
data_february = data_february[data_february.duration.between(1, 60)]

# Encode February with the vectorizer already fitted on January
# (transform only, no refit), then take the matching targets.
X_test = dv.transform(convert_to_dicts(data_february))
y_test = data_february.duration

# Score the January-trained model on the February data.
rmse_value = rmse(lr_model, X_test, y_test)
print(rmse_value)

7.827338705733996


## Persisting model

In [None]:
# Create the folder that receives the pickled model file written below.
!mkdir models

In [None]:
# Destination of the serialized artifacts (relative to the notebook's working directory).
FILE_NAME = "models/lin_reg.bin"

# The fitted vectorizer and the model are pickled together as one tuple, so
# loading the file yields both objects in the order (dv, lr_model).
with open(FILE_NAME, 'wb') as file:
    pickle.dump((dv, lr_model), file)

In [12]:
# Record the linear-regression baseline as one MLflow run.
with mlflow.start_run():
    mlflow.set_tag('developer', 'erick')

    # Data provenance: which files were used for training and evaluation.
    mlflow.log_params({
        'train_path': 'data/yellow_tripdata_2022-01.parquet',
        'test_path': 'data/yellow_tripdata_2022-02.parquet',
    })

    # RMSE measured on the February data in the evaluation cell.
    mlflow.log_metric('rmse', rmse_value)

## XGBoost

In [13]:
import xgboost as xgb
from hyperopt import hp, STATUS_OK, fmin, Trials, tpe
from hyperopt.pyll import scope

In [27]:
# Wrap the January features/targets and the February features/targets in
# xgboost DMatrix objects. The February matrix is the one passed as the
# 'validation' set inside objective().
train_matrix = xgb.DMatrix(X_train, label=y_train)
test_matrix = xgb.DMatrix(X_test, label=y_test)

In [45]:
def objective(params):
    """Train one XGBoost model for ``params`` and report its validation RMSE.

    Each call is logged as its own MLflow run: the run is tagged with the
    model family, the hyperparameters are stored as run parameters and the
    resulting RMSE as a metric.

    Parameters
    ----------
    params : dict
        XGBoost training parameters for this trial.

    Returns
    -------
    dict
        ``{'loss': <rmse>, 'status': STATUS_OK}`` in the form consumed by
        hyperopt's ``fmin``.
    """
    with mlflow.start_run():

        mlflow.set_tag("model", "xgboost")
        mlflow.log_params(params)

        # At most 10 boosting rounds; training stops early when the
        # validation metric has not improved for 5 consecutive rounds.
        booster = xgb.train(
            params=params,
            dtrain=train_matrix,
            num_boost_round=10,
            evals=[(test_matrix, 'validation')],
            early_stopping_rounds=5
        )

        y_pred = booster.predict(test_matrix)

        # Take the square root explicitly: ``squared=False`` was deprecated
        # in scikit-learn 1.4 and removed in 1.6. The local name also no
        # longer shadows the notebook's ``rmse`` helper.
        validation_rmse = mean_squared_error(y_test, y_pred) ** 0.5

        mlflow.log_metric("rmse", validation_rmse)

    return {'loss': validation_rmse, 'status': STATUS_OK}

In [46]:
# Hyperparameter space sampled by hyperopt for each trial:
#   max_depth     - quantized uniform between 4 and 100 in steps of 1, cast to int
#   learning_rate - log-uniform with exponent range [-3, 0], i.e. values in (exp(-3), 1)
#   seed          - constant passed through to every trial unchanged
search_space = {
    'max_depth': scope.int(hp.quniform('max_depth', 4, 100, 1)),
    'learning_rate': hp.loguniform('learning_rate', -3, 0),
    'seed': 42
}

# Run 5 trials of TPE search, minimizing the 'loss' returned by objective().
# NOTE(review): no random state is passed to fmin, so the sampled
# configurations presumably differ between runs — confirm if reproducibility matters.
best_result = fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=5,
    trials=Trials()
)

[0]	validation-rmse:12.49952                         
[1]	validation-rmse:10.04046                         
[2]	validation-rmse:8.68651                          
[3]	validation-rmse:7.44227                          
[4]	validation-rmse:6.95380                          
[5]	validation-rmse:6.62929                          
[6]	validation-rmse:6.41935                          
[7]	validation-rmse:6.30227                          
[8]	validation-rmse:5.97770                          
[9]	validation-rmse:5.92859                          
[0]	validation-rmse:10.49337                                                  
[1]	validation-rmse:7.56423                                                   
[2]	validation-rmse:6.70396                                                   
[3]	validation-rmse:6.36022                                                   
[4]	validation-rmse:6.07183                                                   
[5]	validation-rmse:5.98620                                      