In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
!pip install wandb
import wandb

import pickle
from sklearn.feature_extraction import DictVectorizer
import click
import os

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting wandb
  Downloading wandb-0.15.3-py3-none-any.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m16.9 MB/s[0m eta [36m0:00:00[0m
Collecting GitPython!=3.1.29,>=1.0.0 (from wandb)
  Downloading GitPython-3.1.31-py3-none-any.whl (184 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m184.3/184.3 kB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
Collecting sentry-sdk>=1.0.0 (from wandb)
  Downloading sentry_sdk-1.25.0-py2.py3-none-any.whl (206 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m206.5/206.5 kB[0m [31m14.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting docker-pycreds>=0.4.0 (from wandb)
  Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl (9.0 kB)
Collecting pathtools (from wandb)
  Downloading pathtools-0.1.2.tar.gz (11 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting

In [25]:
# Print the version of the wandb package that is imported in this kernel.
print(wandb.__version__)



0.15.3


In [4]:
!wget https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2022-01.parquet
!wget https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2022-02.parquet
!wget https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2022-03.parquet

--2023-06-06 17:34:09--  https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2022-01.parquet
Resolving d37ci6vzurychx.cloudfront.net (d37ci6vzurychx.cloudfront.net)... 65.8.245.51, 65.8.245.178, 65.8.245.171, ...
Connecting to d37ci6vzurychx.cloudfront.net (d37ci6vzurychx.cloudfront.net)|65.8.245.51|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1254291 (1.2M) [binary/octet-stream]
Saving to: ‘green_tripdata_2022-01.parquet.1’


2023-06-06 17:34:09 (9.87 MB/s) - ‘green_tripdata_2022-01.parquet.1’ saved [1254291/1254291]

--2023-06-06 17:34:09--  https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2022-02.parquet
Resolving d37ci6vzurychx.cloudfront.net (d37ci6vzurychx.cloudfront.net)... 65.8.245.51, 65.8.245.178, 65.8.245.171, ...
Connecting to d37ci6vzurychx.cloudfront.net (d37ci6vzurychx.cloudfront.net)|65.8.245.51|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1428262 (1.4M) [binary/octet-stream]
Saving to: ‘

In [16]:
def dump_pickle(obj, file_name):
    """Pickle ``obj`` into the file at ``file_name``.

    The file is opened in binary write mode, so any existing content at that
    path is replaced. Nothing is returned.
    """
    with open(file_name, mode="wb") as sink:
        pickle.dump(obj, sink)

def read_dataframe(filename: str):
    """Load one parquet file of trip records and derive the modelling columns.

    Adds a ``duration`` column with the trip length in minutes (drop-off minus
    pick-up), keeps only trips lasting between 1 and 60 minutes inclusive, and
    converts the pick-up / drop-off location IDs to strings.

    Args:
        filename: Path of the parquet file to read. The file must provide the
            ``lpep_pickup_datetime``, ``lpep_dropoff_datetime``,
            ``PULocationID`` and ``DOLocationID`` columns.

    Returns:
        A new ``pandas.DataFrame`` holding the filtered rows.
    """
    df = pd.read_parquet(filename)

    # Vectorised timedelta -> minutes conversion (no per-row Python lambda).
    elapsed = df["lpep_dropoff_datetime"] - df["lpep_pickup_datetime"]
    df["duration"] = elapsed.dt.total_seconds() / 60

    # Take an explicit copy of the filtered rows so the column assignment
    # below writes to an independent frame rather than to a slice of `df`
    # (which would trigger SettingWithCopyWarning).
    in_range = (df["duration"] >= 1) & (df["duration"] <= 60)
    df = df[in_range].copy()

    categorical = ["PULocationID", "DOLocationID"]
    df[categorical] = df[categorical].astype(str)

    return df


def preprocess(df: pd.DataFrame, dv: DictVectorizer, fit_dv: bool = False):
    """Turn trip records into a feature matrix using ``dv``.

    Two features are used: ``PU_DO``, the pick-up and drop-off location IDs
    joined with an underscore, and ``trip_distance``. The input frame is left
    untouched; the combined column is built on a separate feature frame.

    Args:
        df: Trip records with ``PULocationID``, ``DOLocationID`` and
            ``trip_distance`` columns. The two ID columns must support ``+``
            with a string (i.e. already be strings).
        dv: Vectorizer applied to the per-row feature dictionaries.
        fit_dv: When true, call ``dv.fit_transform``; otherwise call
            ``dv.transform`` on an already fitted ``dv``.

    Returns:
        A ``(X, dv)`` tuple: the vectorizer's output and the vectorizer itself.
    """
    categorical = ["PU_DO"]
    numerical = ["trip_distance"]

    # Build the features on a fresh frame instead of adding PU_DO to the
    # caller's DataFrame.
    pickup_dropoff = df["PULocationID"] + "_" + df["DOLocationID"]
    features = df[numerical].assign(PU_DO=pickup_dropoff)[categorical + numerical]

    dicts = features.to_dict(orient="records")
    if fit_dv:
        X = dv.fit_transform(dicts)
    else:
        X = dv.transform(dicts)
    return X, dv

def run_data_prep(
    wandb_project: str,
    wandb_entity: str,
    raw_data_path: str,
    dest_path: str,
    dataset: str = "green"):
    """Preprocess three months of trip data and log the result to W&B.

    Reads the 2022-01 (train), 2022-02 (validation) and 2022-03 (test) parquet
    files for ``dataset`` from ``raw_data_path``, fits a ``DictVectorizer`` on
    the training month, pickles the vectorizer and the three ``(X, y)`` pairs
    into ``dest_path`` and logs that directory as the ``NYC-Taxi`` artifact of
    type ``preprocessed_dataset``.

    Args:
        wandb_project: W&B project the run is created in.
        wandb_entity: W&B entity that owns the project.
        raw_data_path: Directory holding ``<dataset>_tripdata_2022-0X.parquet``.
        dest_path: Output directory; created when it does not exist.
        dataset: File-name prefix of the parquet files.
    """
    # Using the run as a context manager finishes it when the block exits,
    # including when a step below raises, instead of leaving it open in the
    # kernel until the next wandb.init call.
    with wandb.init(
        project=wandb_project, entity=wandb_entity, job_type="preprocess"
    ) as run:
        # Load parquet files
        df_train = read_dataframe(
            os.path.join(raw_data_path, f"{dataset}_tripdata_2022-01.parquet")
        )
        df_val = read_dataframe(
            os.path.join(raw_data_path, f"{dataset}_tripdata_2022-02.parquet")
        )
        df_test = read_dataframe(
            os.path.join(raw_data_path, f"{dataset}_tripdata_2022-03.parquet")
        )

        # Extract the target
        target = "tip_amount"
        y_train = df_train[target].values
        y_val = df_val[target].values
        y_test = df_test[target].values

        # Fit the DictVectorizer on the training month only, then reuse the
        # fitted vectorizer for validation and test.
        dv = DictVectorizer()
        X_train, dv = preprocess(df_train, dv, fit_dv=True)
        X_val, _ = preprocess(df_val, dv, fit_dv=False)
        X_test, _ = preprocess(df_test, dv, fit_dv=False)

        # Create dest_path folder unless it already exists
        os.makedirs(dest_path, exist_ok=True)

        # Save DictVectorizer and datasets
        dump_pickle(dv, os.path.join(dest_path, "dv.pkl"))
        dump_pickle((X_train, y_train), os.path.join(dest_path, "train.pkl"))
        dump_pickle((X_val, y_val), os.path.join(dest_path, "val.pkl"))
        dump_pickle((X_test, y_test), os.path.join(dest_path, "test.pkl"))

        # Log everything written to dest_path as one dataset artifact
        artifact = wandb.Artifact("NYC-Taxi", type="preprocessed_dataset")
        artifact.add_dir(dest_path)
        run.log_artifact(artifact)

In [17]:
# Settings for the preprocessing run; wandb_project and wandb_entity are
# reused by the training and sweep cells below.
wandb_project = "mlops_zoomcamp_wandb"
wandb_entity = "camaganu"
raw_data_path = "data"
dest_path = "output"
dataset = "green"

run_data_prep(
    wandb_project=wandb_project,
    wandb_entity=wandb_entity,
    raw_data_path=raw_data_path,
    dest_path=dest_path,
    dataset=dataset,
)



[34m[1mwandb[0m: Adding directory to artifact (./output)... Done. 0.0s


# Train

In [18]:
import os
import pickle
import click

import wandb

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error


def load_pickle(filename: str):
    """Return the object stored in the pickle file at ``filename``.

    Unpickling can execute arbitrary code, so only use this on files that
    come from a trusted source.
    """
    with open(filename, mode="rb") as source:
        loaded = pickle.load(source)
    return loaded

        
def run_train(
    wandb_project: str,
    wandb_entity: str,
    data_artifact: str,
    max_depth: int,
    random_state: int,
):
    """Train a random-forest regressor on a logged dataset and track it in W&B.

    Downloads ``data_artifact``, fits a ``RandomForestRegressor`` on
    ``train.pkl``, scores it on ``val.pkl``, logs the score under the key
    ``"MSE"`` and uploads the pickled model as the ``regressor_model``
    artifact of type ``model``.

    Args:
        wandb_project: W&B project the run is created in.
        wandb_entity: W&B entity that owns the project.
        data_artifact: Name of the ``preprocessed_dataset`` artifact to use.
        max_depth: ``max_depth`` passed to the regressor.
        random_state: ``random_state`` passed to the regressor.
    """
    # Using the run as a context manager finishes it when the block exits,
    # including when a step below raises.
    with wandb.init(
        project=wandb_project,
        entity=wandb_entity,
        job_type="train",
        config={"max_depth": max_depth, "random_state": random_state},
    ) as run:
        # Fetch the preprocessed dataset from artifacts
        artifact = run.use_artifact(data_artifact, type="preprocessed_dataset")
        data_path = artifact.download()

        X_train, y_train = load_pickle(os.path.join(data_path, "train.pkl"))
        X_val, y_val = load_pickle(os.path.join(data_path, "val.pkl"))

        # Define the random forest regressor, train it and predict on validation
        rf = RandomForestRegressor(max_depth=max_depth, random_state=random_state)
        rf.fit(X_train, y_train)
        y_pred = rf.predict(X_val)

        # Root of the mean squared error. Taking the root explicitly avoids the
        # `squared=False` argument, which newer scikit-learn releases deprecate
        # and then remove. The value is still logged under the key "MSE"
        # because that is the metric name used elsewhere in this notebook.
        rmse = mean_squared_error(y_val, y_pred) ** 0.5
        run.log({"MSE": rmse})

        with open("regressor.pkl", "wb") as f:
            pickle.dump(rf, f)

        # Upload the pickled model as an artifact of type `model`
        model_artifact = wandb.Artifact("regressor_model", type="model")
        model_artifact.add_file("regressor.pkl")
        run.log_artifact(model_artifact)



In [19]:
# Reference to version v0 of the NYC-Taxi dataset artifact; the sweep cell
# further down reuses `name` as well.
name = "camaganu/mlops_zoomcamp_wandb/NYC-Taxi:v0"

run_train(wandb_project, wandb_entity, data_artifact=name, max_depth=10, random_state=22)



VBox(children=(Label(value='6.933 MB of 6.941 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.998828…

[34m[1mwandb[0m:   4 of 4 files downloaded.  


# Sweep

In [20]:
import os
import pickle
import click
from functools import partial

import wandb

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error


def load_pickle(filename: str):
    """Read the pickle file at ``filename`` and return the stored object.

    Only call this on trusted files: unpickling can run arbitrary code.
    """
    with open(filename, mode="rb") as stream:
        restored = pickle.load(stream)
    return restored


def run_train(data_artifact: str):
    """Train and log one random-forest model using the active sweep config.

    Hyper-parameters ``max_depth``, ``n_estimators``, ``min_samples_split``
    and ``min_samples_leaf`` are read from the run's config; ``random_state``
    is fixed at 0. The validation score is logged under the key ``"MSE"`` and
    the pickled model is uploaded as the ``<run id>-model`` artifact.

    NOTE(review): this definition reuses the name of the ``run_train`` defined
    in the training cell and replaces it once this cell has been executed.

    Args:
        data_artifact: Name of the ``preprocessed_dataset`` artifact to use.
    """
    # Using the run as a context manager finishes it when the block exits,
    # including when a step below raises.
    with wandb.init() as run:
        config = run.config

        # Fetch the preprocessed dataset from artifacts
        artifact = run.use_artifact(data_artifact, type="preprocessed_dataset")
        data_path = artifact.download()

        X_train, y_train = load_pickle(os.path.join(data_path, "train.pkl"))
        X_val, y_val = load_pickle(os.path.join(data_path, "val.pkl"))

        # Define the random forest regressor from the sweep parameters,
        # train it and predict on the validation split
        rf = RandomForestRegressor(
            max_depth=config.max_depth,
            n_estimators=config.n_estimators,
            min_samples_split=config.min_samples_split,
            min_samples_leaf=config.min_samples_leaf,
            random_state=0,
        )
        rf.fit(X_train, y_train)
        y_pred = rf.predict(X_val)

        # Root of the mean squared error. Taking the root explicitly avoids the
        # `squared=False` argument, which newer scikit-learn releases deprecate
        # and then remove. The key stays "MSE" because the sweep configuration
        # optimises the metric of that name.
        rmse = mean_squared_error(y_val, y_pred) ** 0.5
        run.log({"MSE": rmse})

        with open("regressor.pkl", "wb") as f:
            pickle.dump(rf, f)

        model_artifact = wandb.Artifact(f"{run.id}-model", type="model")
        model_artifact.add_file("regressor.pkl")
        run.log_artifact(model_artifact)


# Sweep definition: Bayesian search that minimises the metric logged as "MSE".
# Every tuned parameter uses the "int_uniform" distribution; the bounds are
# listed below as ``name: (min, max)``.
SWEEP_CONFIG = {
    "method": "bayes",
    "metric": {"name": "MSE", "goal": "minimize"},
    "parameters": {
        param: {"distribution": "int_uniform", "min": low, "max": high}
        for param, (low, high) in {
            "max_depth": (1, 20),
            "n_estimators": (10, 50),
            "min_samples_split": (2, 10),
            "min_samples_leaf": (1, 4),
        }.items()
    },
}


def run_sweep(wandb_project: str, wandb_entity: str, data_artifact: str, count: int):
    """Create a sweep from ``SWEEP_CONFIG`` and run an agent for it.

    The agent calls ``run_train`` with ``data_artifact`` bound as its
    argument; ``count`` is handed to ``wandb.agent`` unchanged.
    """
    train_on_artifact = partial(run_train, data_artifact)
    sweep_id = wandb.sweep(
        SWEEP_CONFIG,
        project=wandb_project,
        entity=wandb_entity,
    )
    wandb.agent(sweep_id, train_on_artifact, count=count)



In [21]:
# Launch the sweep on the dataset artifact referenced by `name`, with count=5.
run_sweep(
    wandb_project=wandb_project,
    wandb_entity=wandb_entity,
    data_artifact=name,
    count=5,
)



Create sweep with ID: yvu163tq
Sweep URL: https://wandb.ai/camaganu/mlops_zoomcamp_wandb/sweeps/yvu163tq


[34m[1mwandb[0m: Agent Starting Run: lkuky08g with config:
[34m[1mwandb[0m: 	max_depth: 13
[34m[1mwandb[0m: 	min_samples_leaf: 3
[34m[1mwandb[0m: 	min_samples_split: 5
[34m[1mwandb[0m: 	n_estimators: 22


[34m[1mwandb[0m:   4 of 4 files downloaded.  


VBox(children=(Label(value='1.268 MB of 1.268 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
MSE,▁

0,1
MSE,2.44883


[34m[1mwandb[0m: Agent Starting Run: fgvtk6gc with config:
[34m[1mwandb[0m: 	max_depth: 8
[34m[1mwandb[0m: 	min_samples_leaf: 3
[34m[1mwandb[0m: 	min_samples_split: 2
[34m[1mwandb[0m: 	n_estimators: 20


[34m[1mwandb[0m:   4 of 4 files downloaded.  


VBox(children=(Label(value='0.257 MB of 0.257 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
MSE,▁

0,1
MSE,2.45283


[34m[1mwandb[0m: Agent Starting Run: d09vi9pl with config:
[34m[1mwandb[0m: 	max_depth: 14
[34m[1mwandb[0m: 	min_samples_leaf: 2
[34m[1mwandb[0m: 	min_samples_split: 6
[34m[1mwandb[0m: 	n_estimators: 28


[34m[1mwandb[0m:   4 of 4 files downloaded.  


VBox(children=(Label(value='1.616 MB of 1.616 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
MSE,▁

0,1
MSE,2.45123


[34m[1mwandb[0m: Agent Starting Run: o639p7bt with config:
[34m[1mwandb[0m: 	max_depth: 8
[34m[1mwandb[0m: 	min_samples_leaf: 1
[34m[1mwandb[0m: 	min_samples_split: 6
[34m[1mwandb[0m: 	n_estimators: 47


[34m[1mwandb[0m:   4 of 4 files downloaded.  


VBox(children=(Label(value='0.310 MB of 0.310 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
MSE,▁

0,1
MSE,2.46456


[34m[1mwandb[0m: Agent Starting Run: k7mtg200 with config:
[34m[1mwandb[0m: 	max_depth: 18
[34m[1mwandb[0m: 	min_samples_leaf: 4
[34m[1mwandb[0m: 	min_samples_split: 7
[34m[1mwandb[0m: 	n_estimators: 16


[34m[1mwandb[0m:   4 of 4 files downloaded.  


VBox(children=(Label(value='1.736 MB of 1.736 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
MSE,▁

0,1
MSE,2.45419
