# Week-01: Homework

In [30]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.metrics import mean_squared_error

In [31]:
import mlflow

# Use a SQLite file as the MLflow tracking backend and make
# "nyc-taxi-experiment" the experiment that later runs are recorded under.
mlflow.set_tracking_uri("sqlite:///mlflow.db")
mlflow.set_experiment("nyc-taxi-experiment")

<Experiment: artifact_location='/workspaces/mlops-zoomcamp/02-experiment_tracking/mlruns/1', creation_time=1716290291829, experiment_id='1', last_update_time=1716290291829, lifecycle_stage='active', name='nyc-taxi-experiment', tags={}>

## 1. Downloading and loading the data
We'll use the NYC taxi dataset, specifically the "Green Taxi Trip Records" (the `green_tripdata` files).

Load the (previously downloaded) data for January and February 2023.

In [32]:
# Both monthly parquet files live in the same local data directory.
DATA_DIR = "/workspaces/mlops-zoomcamp/data"

jan_filepath = f"{DATA_DIR}/green_tripdata_2023-01.parquet"
feb_filepath = f"{DATA_DIR}/green_tripdata_2023-02.parquet"

jan_df = pd.read_parquet(jan_filepath)
feb_df = pd.read_parquet(feb_filepath)

In [None]:
# Show how many columns the January frame has, followed by their names.
print(len(jan_df.columns), jan_df.columns)

## 2. Computing duration
Let's compute the duration variable. It should contain the duration of a ride in minutes.
`duration = lpep_dropoff_datetime - lpep_pickup_datetime`

In [None]:
# Inspect the column dtypes of the January frame before deriving new columns.
jan_df.dtypes

In [33]:
def calculateDuration(df):
    """Add a ``duration`` column holding each trip's length in minutes.

    The duration is ``lpep_dropoff_datetime - lpep_pickup_datetime`` converted
    from a timedelta to fractional minutes.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the datetime columns ``lpep_pickup_datetime`` and
        ``lpep_dropoff_datetime``.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in; the column is added in place.
    """
    elapsed = df["lpep_dropoff_datetime"] - df["lpep_pickup_datetime"]
    # Vectorized conversion through the .dt accessor instead of a per-row
    # Python lambda: same values, far less interpreter overhead.
    df["duration"] = elapsed.dt.total_seconds() / 60
    return df

In [34]:
# Derive the `duration` column for both months (January first, then February).
jan_df, feb_df = calculateDuration(jan_df), calculateDuration(feb_df)

In [None]:
# Summary statistics of the January trip durations (minutes).
jan_df.duration.describe()

In [None]:
# Summary statistics of the February trip durations (minutes).
feb_df.duration.describe()

## 3. Dropping outliers
Next, let's check the distribution of the duration variable. There are some outliers. Let's remove them and keep only the records where the `duration` was between 1 and 60 minutes (inclusive).

In [35]:
def drop_outliers_by_duration(df, minD, maxD):
    """Keep only the rows whose ``duration`` lies in ``[minD, maxD]``.

    Prints the fraction of rows that survive the filter.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a numeric ``duration`` column.
    minD, maxD : number
        Inclusive lower and upper bounds on ``duration``.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the surviving rows (original index labels are
        kept), so later column assignments on the result neither touch ``df``
        nor trigger pandas' SettingWithCopyWarning.
    """
    # Build the mask once; `between` is inclusive on both ends by default.
    in_range = df["duration"].between(minD, maxD)
    print("Fraction left: ", in_range.mean())
    return df[in_range].copy()

In [36]:
# Restrict both months to rides lasting between 1 and 60 minutes (inclusive).
jan_df, feb_df = (
    drop_outliers_by_duration(jan_df, 1, 60),
    drop_outliers_by_duration(feb_df, 1, 60),
)

Fraction left:  0.9667942120772309
Fraction left:  0.9655140489746794


## 4. One-hot encoding
Let's apply one-hot encoding to the pickup and dropoff location IDs. We'll use only these two features for our model.

- Turn the dataframe into a list of dictionaries (remember to re-cast the ids to strings - otherwise it will label encode them)
- Fit a dictionary vectorizer
- Get a feature matrix from it

In [37]:
def transform_locationID_toStr(df):
    """Return a copy of ``df`` with the pickup/dropoff location IDs cast to ``str``.

    String-valued IDs are what makes ``DictVectorizer`` one-hot encode them
    instead of treating them as numeric features.

    The input frame is left untouched: the previous implementation assigned the
    columns in place, which modified the caller's frame and raised
    SettingWithCopyWarning when ``df`` was a filtered slice.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``PULocationID`` and ``DOLocationID`` columns.

    Returns
    -------
    pandas.DataFrame
        New frame with both ID columns converted to strings; all other columns
        are carried over unchanged.
    """
    return df.astype({"PULocationID": str, "DOLocationID": str})

In [38]:
# Cast the location IDs to strings for both months so they get one-hot encoded.
jan_df, feb_df = transform_locationID_toStr(jan_df), transform_locationID_toStr(feb_df)

In [39]:
# Feature / target configuration shared by the cells below.
categorical = ["PULocationID", "DOLocationID"]
numerical = []
target = "duration"


def train_val_data(train_df, val_df):
    """Vectorize the configured feature columns and extract the targets.

    A ``DictVectorizer`` is fitted on the training records only and then
    applied to the validation records, so both matrices share one feature
    space. The shape of the training matrix is printed.

    Returns
    -------
    tuple
        ``(x_train, y_train, x_val, y_val, dv)`` where ``dv`` is the fitted
        vectorizer.
    """
    feature_columns = categorical + numerical
    train_records = train_df[feature_columns].to_dict(orient="records")
    val_records = val_df[feature_columns].to_dict(orient="records")

    dv = DictVectorizer()
    x_train = dv.fit_transform(train_records)
    print("Dimensionality of feature matrix:", x_train.shape)
    x_val = dv.transform(val_records)

    y_train, y_val = train_df[target].values, val_df[target].values
    return x_train, y_train, x_val, y_val, dv

In [40]:
# January is used as the training month and February as the validation month.
x_train, y_train, x_val, y_val, dv = train_val_data(train_df=jan_df, val_df=feb_df)

Dimensionality of feature matrix: (65946, 467)


## 5. Training a model
Let's use the feature matrix from the previous step to train a model.

- Train a plain linear regression model with default parameters
- Calculate the RMSE of the model on the training data

In [None]:
# Baseline: linear regression with default parameters, fit on the training matrix.
model = LinearRegression()
model.fit(x_train,y_train)

In [None]:
y_pred_on_train = model.predict(x_train)
# `squared=False` is deprecated in scikit-learn 1.4 and removed in later
# releases; taking the square root of the MSE works on every version.
print("RMSE on train: ", mean_squared_error(y_true=y_train, y_pred=y_pred_on_train) ** 0.5)

In [None]:
y_pred_on_val = model.predict(x_val)
# `squared=False` is deprecated in scikit-learn 1.4 and removed in later
# releases; taking the square root of the MSE works on every version.
print("RMSE on validation: ", mean_squared_error(y_true=y_val, y_pred=y_pred_on_val) ** 0.5)

## 6. Saving the model
Finally, let's use pickle to store our trained model.

In [None]:
import pickle

# Serialize the fitted model to the shared models directory.
lin_reg_path = "/workspaces/mlops-zoomcamp/models/lin_reg.bin"
with open(lin_reg_path, mode="wb") as f_out:
    pickle.dump(model, f_out)

In [None]:
# Track one Lasso training run: data provenance, hyperparameter, metric and artifact.
with mlflow.start_run():
    mlflow.set_tag("developer", "denil")
    mlflow.log_param("train-data-path", jan_filepath)
    mlflow.log_param("valid-data-path", feb_filepath)

    alpha = 0.1
    mlflow.log_param("alpha", alpha)

    model = Lasso(alpha=alpha)
    model.fit(x_train, y_train)

    y_pred_on_train = model.predict(x_train)
    # `squared=False` is deprecated in scikit-learn 1.4 and removed in later
    # releases; taking the square root of the MSE works on every version.
    rmse = mean_squared_error(y_true=y_train, y_pred=y_pred_on_train) ** 0.5

    # The key must be exactly "rmse" (it used to carry a trailing space) so the
    # `metrics.rmse` filters and `metrics['rmse']` lookups used when searching
    # runs can find it.
    # NOTE(review): this is the RMSE on the *training* data, while the XGBoost
    # runs in this notebook log validation RMSE under the same key — confirm
    # that comparing them is intended.
    mlflow.log_metric("rmse", rmse)

    # NOTE(review): this file is written by the pickle cell above and appears to
    # hold the LinearRegression model, not the Lasso fitted in this run — confirm.
    mlflow.log_artifact(local_path="../models/lin_reg.bin", artifact_path="models_pickle")

## Trying XGBoost

In [None]:
import xgboost as xgb

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

In [None]:
# Wrap the feature matrices and targets in XGBoost's DMatrix container.
train = xgb.DMatrix(data=x_train, label=y_train)
valid = xgb.DMatrix(data=x_val, label=y_val)

In [None]:
def objective(params):
    """Train one XGBoost model for ``params`` and report its validation RMSE.

    Every call opens its own MLflow run, tags it with ``model=xgboost`` and
    logs the parameters together with the resulting ``rmse`` metric. Training
    uses the module-level ``train`` / ``valid`` DMatrix objects, at most 100
    boosting rounds and early stopping after 5 rounds without improvement on
    the validation set.

    Parameters
    ----------
    params : dict
        XGBoost training parameters (one sample from the hyperopt search space).

    Returns
    -------
    dict
        ``{'loss': <validation RMSE>, 'status': STATUS_OK}`` as expected by
        ``hyperopt.fmin``.
    """
    with mlflow.start_run():
        mlflow.set_tag("model", "xgboost")
        mlflow.log_params(params)
        booster = xgb.train(
            params=params,
            dtrain=train,
            num_boost_round=100,
            evals=[(valid, "validation")],
            early_stopping_rounds=5
        )
        y_pred = booster.predict(valid)
        # `squared=False` is deprecated in scikit-learn 1.4 and removed in later
        # releases; taking the square root of the MSE works on every version.
        rmse = mean_squared_error(y_val, y_pred) ** 0.5
        mlflow.log_metric("rmse", rmse)

    return {'loss': rmse, 'status': STATUS_OK}

In [None]:
# Hyperparameter search space sampled by hyperopt. The loguniform bounds are
# exponents, i.e. the sampled value lies in [exp(low), exp(high)].
search_space = {
    'max_depth': scope.int(hp.quniform('max_depth', 4, 100, 1)),
    'learning_rate': hp.loguniform('learning_rate', -3, 0),
    'reg_alpha': hp.loguniform('reg_alpha', -5, -1),
    'reg_lambda': hp.loguniform('reg_lambda', -6, -1),
    'min_child_weight': hp.loguniform('min_child_weight', -1, 3),
    # 'reg:linear' is a deprecated alias; 'reg:squarederror' is the current
    # name of the same squared-error objective.
    'objective': 'reg:squarederror',
    'seed': 42
}

# Minimize the validation RMSE returned by `objective` with the TPE algorithm;
# each of the 50 evaluations is recorded as its own MLflow run.
best_result = fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=50,
    trials=Trials()
)

In [None]:
# Hard-coded hyperparameters to retrain with (presumably the best values found
# by the hyperopt search — confirm).
params = {
    "learning_rate": 0.3911485255644647,
    "max_depth": 96,
    "min_child_weight": 2.438960793020336,
    # 'reg:linear' is a deprecated alias; 'reg:squarederror' is the current
    # name of the same squared-error objective.
    "objective": "reg:squarederror",
    "reg_alpha": 0.2924504415949103,
    "reg_lambda": 0.27323946210032624,
    "seed": 42
}

# Let MLflow record the training below automatically.
mlflow.xgboost.autolog()

booster = xgb.train(
    params=params,
    dtrain=train,
    num_boost_round=100,
    evals=[(valid, "validation")],
    early_stopping_rounds=5
)



In [43]:
import os
import pickle

# This run logs its parameters, metric and model explicitly; switch autologging
# off so an earlier `mlflow.xgboost.autolog()` call does not record them twice.
mlflow.xgboost.autolog(disable=True)

with mlflow.start_run():
    params = {
        "learning_rate": 0.3911485255644647,
        "max_depth": 96,
        "min_child_weight": 2.438960793020336,
        # 'reg:linear' is a deprecated alias; 'reg:squarederror' is the current
        # name of the same squared-error objective.
        "objective": "reg:squarederror",
        "reg_alpha": 0.2924504415949103,
        "reg_lambda": 0.27323946210032624,
        "seed": 42
    }

    mlflow.log_params(params)

    booster = xgb.train(
        params=params,
        dtrain=train,
        num_boost_round=100,
        evals=[(valid, "validation")],
        early_stopping_rounds=5
    )
    y_pred = booster.predict(valid)
    # `squared=False` is deprecated in scikit-learn 1.4 and removed in later
    # releases; taking the square root of the MSE works on every version.
    rmse = mean_squared_error(y_val, y_pred) ** 0.5
    mlflow.log_metric("rmse", rmse)

    # Persist the fitted DictVectorizer next to the model: it is required to
    # turn raw records into the feature matrix at inference time.
    # The target directory is relative to the working directory, so make sure
    # it exists before writing.
    os.makedirs("models", exist_ok=True)
    with open("models/preprocessor.b", 'wb') as f_out:
        pickle.dump(dv, f_out)

    mlflow.log_artifact(local_path="models/preprocessor.b", artifact_path="preprocessor")
    mlflow.xgboost.log_model(booster, artifact_path="models_mlflow")


[0]	validation-rmse:7.53941




[1]	validation-rmse:6.76976
[2]	validation-rmse:6.38174
[3]	validation-rmse:6.20876
[4]	validation-rmse:6.09826
[5]	validation-rmse:6.05714
[6]	validation-rmse:6.03301
[7]	validation-rmse:6.01642
[8]	validation-rmse:6.00375
[9]	validation-rmse:5.98911
[10]	validation-rmse:5.98233
[11]	validation-rmse:5.96332
[12]	validation-rmse:5.96114
[13]	validation-rmse:5.95414
[14]	validation-rmse:5.95319
[15]	validation-rmse:5.94962
[16]	validation-rmse:5.94734
[17]	validation-rmse:5.94816
[18]	validation-rmse:5.94836
[19]	validation-rmse:5.94544
[20]	validation-rmse:5.94587
[21]	validation-rmse:5.94363
[22]	validation-rmse:5.94250
[23]	validation-rmse:5.94111
[24]	validation-rmse:5.93724
[25]	validation-rmse:5.93672
[26]	validation-rmse:5.93496
[27]	validation-rmse:5.93406
[28]	validation-rmse:5.93299
[29]	validation-rmse:5.93292
[30]	validation-rmse:5.93018
[31]	validation-rmse:5.93052
[32]	validation-rmse:5.93069
[33]	validation-rmse:5.92902
[34]	validation-rmse:5.92751
[35]	validation-rmse:5.



In [None]:
# The class is spelled `MlflowClient`; importing `MLflowClient` raises ImportError.
from mlflow.tracking import MlflowClient

MLFLOW_TRACKING_URI = "sqlite:///mlflow.db"

client = MlflowClient(tracking_uri=MLFLOW_TRACKING_URI)

# `list_experiments` was removed from the client in MLflow 2.x;
# `search_experiments` is its replacement.
client.search_experiments()

In [None]:
# `create_experiment` raises when the name is already taken, which breaks
# re-running the notebook; reuse the existing experiment if there is one.
experiment_name = "my-cool-experiment"
existing_experiment = client.get_experiment_by_name(experiment_name)
if existing_experiment is None:
    experiment_id = client.create_experiment(name=experiment_name)
else:
    experiment_id = existing_experiment.experiment_id
experiment_id

In [None]:
from mlflow.entities import ViewType

# Fetch the five best active runs of experiment "1" whose RMSE is below 6.8,
# sorted from lowest to highest RMSE.
runs = client.search_runs(
    experiment_ids="1",
    run_view_type=ViewType.ACTIVE_ONLY,
    filter_string="metrics.rmse < 6.8",
    order_by=["metrics.rmse ASC"],
    max_results=5,
)

# One line per matching run.
for run in runs:
    print(f"run id: {run.info.run_id}, rmse: {run.data.metrics['rmse']}")

In [None]:
import mlflow
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)

# NOTE(review): run_id is an empty placeholder, so model_uri evaluates to
# "runs://model"; fill in a real run id (e.g. one printed by the search_runs
# cell) before running this cell.
# NOTE(review): the "/model" suffix has to match the artifact_path the model was
# logged under; the manually logged XGBoost run in this notebook uses
# "models_mlflow" — confirm which run/path is meant.
run_id = ""
model_uri = f"runs:/{run_id}/model"
mlflow.register_model(model_uri=model_uri, name="nyc-taxi-regressor")

In [None]:
from datetime import datetime

# Must match the name used in `mlflow.register_model`; it was previously left
# empty, which makes every registry call below fail.
model_name = "nyc-taxi-regressor"
latest_versions = client.get_latest_versions(name=model_name)

for version in latest_versions:
    print(f"version: {version.version}, stage: {version.current_stage}")

# Move one specific version into the target stage without archiving the
# versions that are currently there.
new_stage = "Staging"
# NOTE(review): hard-coded version number — confirm that this version exists in
# the registry (see the versions printed above).
model_version = 4
client.transition_model_version_stage(
    name=model_name,
    version=model_version,
    stage=new_stage,
    archive_existing_versions=False
)

# Record on the model version when the transition happened.
date = datetime.today().date()
client.update_model_version(
    name=model_name,
    version=model_version,
    description=f"the model version {model_version} was transitioned to {new_stage} on {date}"
)