# Diving Deeper into Weights & Biases

<!--- @wandbcode{mlops-zoomcamp} -->

In this notebook, we will explore the following:

* Versioning datasets using [Artifacts](https://docs.wandb.ai/guides/artifacts).
* Exploring and visualizing our datasets with [Tables](https://docs.wandb.ai/guides/data-vis).
* Baseline Experiment with a Random Forest Classification Model.

## Import the Libraries

In [None]:
import os
import pickle
#pip install wandb
import wandb
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

## Logging Dataset to Artifacts

Download the `train.csv` and `test.csv` files from [Titanic - Machine Learning from Disaster](https://www.kaggle.com/competitions/titanic/data) and place them in the `data` directory.

In [None]:
# Start a W&B run dedicated to uploading the raw dataset
wandb.init(project='mlops_zoomcamp_wandb', job_type='log_data')

# Package everything under `data/` as a dataset artifact and record
# where the files were originally obtained from
artifact = wandb.Artifact(
    'Titanic',
    type='dataset',
    metadata={"Source": "https://www.kaggle.com/competitions/titanic/data"},
)
artifact.add_dir('data')
wandb.log_artifact(artifact)

# Close the run
wandb.finish()

[34m[1mwandb[0m: Currently logged in as: [33mchinmaymaganur[0m ([33mcamaganu[0m). Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m: Adding directory to artifact (./data)... Done. 0.0s


In [None]:
# Initialize a WandB Run.
# The project name is spelled with underscores, matching the other cells:
# later cells fetch the dataset through the fully qualified path
# 'camaganu/mlops_zoomcamp_wandb/Titanic:<version>', so logging to a
# differently spelled project would create an unrelated artifact.
wandb.init(project='mlops_zoomcamp_wandb', job_type='log_data')

# Log the `data` directory as a dataset artifact
artifact = wandb.Artifact(
    'Titanic',
    type='dataset',
    metadata={"Source": "https://www.kaggle.com/competitions/titanic/data"},
)
artifact.add_dir('data')
wandb.log_artifact(artifact)

# End the WandB Run
wandb.finish()

[34m[1mwandb[0m: Adding directory to artifact (./data)... Done. 0.0s


VBox(children=(Label(value='0.092 MB of 0.092 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

## Versioning the Data

In [None]:

# Open a new run, declare dataset version v0 as its input and
# download the files to a local directory
run = wandb.init(project='mlops_zoomcamp_wandb', job_type='log_data')
artifact = run.use_artifact(
    'camaganu/mlops_zoomcamp_wandb/Titanic:v0',
    type='dataset',
)
artifact_dir = artifact.download()

[34m[1mwandb[0m:   2 of 2 files downloaded.  


Read the dataset files

In [None]:
# `os` and `pd` are imported in the first cell; no re-import is needed here.
# Load both CSV files from the directory the artifact was downloaded into.
train = pd.read_csv(os.path.join(artifact_dir, 'train.csv'))
test = pd.read_csv(os.path.join(artifact_dir, 'test.csv'))

In [None]:
# Row counts for an 80/20 train/validation partition of the training file
total_rows = len(train)
train_example_n = int(0.8 * total_rows)
val_example_n = total_rows - train_example_n

print(train_example_n, val_example_n)

712 179


In [None]:
# Tag rows by position: the first `train_example_n` rows become 'train',
# the remaining `val_example_n` rows become 'val'
split_labels = ['train'] * train_example_n + ['val'] * val_example_n
train['split'] = split_labels

# Write the tagged frame over the CSV inside `data/` (without the index column)
train.to_csv('data/train.csv', index=False, encoding='utf-8')

In [None]:
# Re-log the `data/` directory under the same artifact name, then close the run
artifact = wandb.Artifact(
    'Titanic',
    type='dataset',
    metadata={"Source": "https://www.kaggle.com/competitions/titanic/data"},
)
artifact.add_dir('data')
wandb.log_artifact(artifact)
wandb.finish()

[34m[1mwandb[0m: Adding directory to artifact (./data)... Done. 0.0s


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

## Explore the Dataset

In [None]:
# Start an exploration run and download dataset version v1
run = wandb.init(project='mlops_zoomcamp_wandb', job_type='explore_data')
artifact = run.use_artifact(
    'camaganu/mlops_zoomcamp_wandb/Titanic:v1',
    type='dataset',
)
artifact_dir = artifact.download()

# Load both CSVs from the download directory
train_csv_path = os.path.join(artifact_dir, "train.csv")
test_csv_path = os.path.join(artifact_dir, "test.csv")
train_val_df = pd.read_csv(train_csv_path)
test_df = pd.read_csv(test_csv_path)

[34m[1mwandb[0m:   2 of 2 files downloaded.  


In [None]:
# Wrap each dataframe in a W&B Table
train_val_table = wandb.Table(dataframe=train_val_df)
test_table = wandb.Table(dataframe=test_df)

# Log both tables in a single call, then close the exploration run
tables_to_log = {
    'Train_Val_Table': train_val_table,
    'Test_Table': test_table,
}
wandb.log(tables_to_log)
wandb.finish()


## Fit a Baseline Model

In [None]:
# Start the baseline training run in 'mlops_zoomcamp_wandb', the same project
# that holds the dataset artifact and the hyperparameter sweep, so the
# baseline and the sweep runs live in one project.
run = wandb.init(
    project='mlops_zoomcamp_wandb',
    name='baseline_experinemt-2',
    job_type='train',
)

# Declare dataset version v1 as this run's input and download it
artifact = run.use_artifact('camaganu/mlops_zoomcamp_wandb/Titanic:v1', type='dataset')
artifact_dir = artifact.download()

train_val_df = pd.read_csv(os.path.join(artifact_dir, "train.csv"))
test_df = pd.read_csv(os.path.join(artifact_dir, "test.csv"))

[34m[1mwandb[0m:   2 of 2 files downloaded.  


In [None]:
# Preview the first five rows of the combined train/validation frame
train_val_df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,split
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,train
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,train
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,train
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,train
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,train


In [None]:
features = ['Pclass', 'Sex', 'SibSp', 'Parch']

# Boolean row masks for the two partitions
is_train = train_val_df["split"] == "train"
is_val = train_val_df["split"] == "val"

# One-hot encode ONCE on the full frame and split afterwards. Encoding each
# partition separately yields mismatched columns whenever a category value
# is missing from one of the partitions.
X_all = pd.get_dummies(train_val_df[features])
X_train = X_all.loc[is_train]
X_val = X_all.loc[is_val]

# `.loc[mask, column]` avoids chained indexing
y_train = train_val_df.loc[is_train, "Survived"]
y_val = train_val_df.loc[is_val, "Survived"]

In [None]:
model_params = {'n_estimators': 100, "max_depth": 10, "random_state": 1}

# Record the hyperparameters on the active run. Assigning
# `wandb.config = model_params` only rebinds the module attribute, so the
# run never receives the values; `update` writes them into the run's config.
wandb.config.update(model_params)

# Fit the baseline model on the training partition
model = RandomForestClassifier(**model_params)
model.fit(X_train, y_train)

# Class predictions and class probabilities for both partitions
y_pred_train = model.predict(X_train)
y_probas_train = model.predict_proba(X_train)
y_pred_val = model.predict(X_val)
y_probas_val = model.predict_proba(X_val)

In [None]:
# Metric name -> scoring function. The 'Presicion' spelling is kept exactly
# as-is so the logged keys do not change.
scorers = {
    "Accuracy": accuracy_score,
    "Presicion": precision_score,
    "Recall": recall_score,
    "F1-Score": f1_score,
}

# Partition name -> (true labels, predicted labels)
predictions = {
    "Train": (y_train, y_pred_train),
    "Validation": (y_val, y_pred_val),
}

# Log every metric for every partition under "<partition>/<metric>"
wandb.log({
    f"{split_name}/{metric_name}": scorer(y_true, y_pred)
    for metric_name, scorer in scorers.items()
    for split_name, (y_true, y_pred) in predictions.items()
})

In [None]:
# Display names passed to the plotting helpers below
label_names = ["Not-Survived", "Survived"]

# Short alias for the scikit-learn plotting helpers shipped with wandb
sk_plots = wandb.sklearn

sk_plots.plot_class_proportions(y_train, y_val, label_names)
sk_plots.plot_summary_metrics(model, X_train, y_train, X_val, y_val)
sk_plots.plot_roc(y_val, y_probas_val, labels=label_names)
sk_plots.plot_precision_recall(y_val, y_probas_val, labels=label_names)
sk_plots.plot_confusion_matrix(y_val, y_pred_val, labels=label_names)



In [None]:
# Serialize the fitted model to disk
with open("random_forest_classifier.pkl", "wb") as f:
    pickle.dump(model, f)

# Log the pickle to Weights & Biases as a model artifact.
# The artifact name is a plain string: the previous f-string had no placeholders.
artifact = wandb.Artifact("titanic-random-forest-model", type="model")
artifact.add_file("random_forest_classifier.pkl")
wandb.log_artifact(artifact)

# End the WandB Run
wandb.finish()

0,1
Train/Accuracy,▁
Train/F1-Score,▁
Train/Presicion,▁
Train/Recall,▁
Validation/Accuracy,▁
Validation/F1-Score,▁
Validation/Presicion,▁
Validation/Recall,▁

0,1
Train/Accuracy,0.8118
Train/F1-Score,0.73307
Train/Presicion,0.82143
Train/Recall,0.66187
Validation/Accuracy,0.82123
Validation/F1-Score,0.72881
Validation/Presicion,0.7963
Validation/Recall,0.67188


# Hyperparameter Optimization

In [None]:
def run_train():
    """Train and evaluate one Random Forest with hyperparameters from `wandb.config`.

    Written to be passed to `wandb.agent` as the sweep function. Starts a run,
    downloads the `Titanic:v1` dataset artifact, fits a
    `RandomForestClassifier` on the rows whose `split` column is 'train',
    evaluates on the rows marked 'val', and logs metrics, diagnostic plots
    and the pickled model to Weights & Biases.

    Takes no arguments and returns None.
    """
    wandb.init()
    config = wandb.config

    # Fetch the dataset; the code below expects a `split` column in train.csv
    artifact = wandb.use_artifact('camaganu/mlops_zoomcamp_wandb/Titanic:v1', type='dataset')
    artifact_dir = artifact.download()
    train_val_df = pd.read_csv(os.path.join(artifact_dir, "train.csv"))

    features = ['Pclass', 'Sex', 'SibSp', 'Parch']
    is_train = train_val_df["split"] == "train"
    is_val = train_val_df["split"] == "val"

    # One-hot encode once on the full frame, then split, so both partitions
    # are guaranteed to have identical columns
    X_all = pd.get_dummies(train_val_df[features])
    X_train = X_all.loc[is_train]
    X_val = X_all.loc[is_val]
    y_train = train_val_df.loc[is_train, "Survived"]
    y_val = train_val_df.loc[is_val, "Survived"]

    model = RandomForestClassifier(
        # Previously commented out: the sweep sampled `n_estimators` but the
        # sampled value was never passed to the model.
        n_estimators=config.n_estimators,
        max_depth=config.max_depth,
        min_samples_split=config.min_samples_split,
        min_samples_leaf=config.min_samples_leaf,
        bootstrap=config.bootstrap,
        warm_start=config.warm_start,
        class_weight=config.class_weight,
    )
    model.fit(X_train, y_train)

    # Make Predictions
    y_pred_train = model.predict(X_train)
    y_pred_val = model.predict(X_val)
    y_probas_val = model.predict_proba(X_val)

    # Log Metrics to Weights & Biases ("Validation/Accuracy" is the metric
    # the sweep configuration optimizes)
    wandb.log({
        "Train/Accuracy": accuracy_score(y_train, y_pred_train),
        "Validation/Accuracy": accuracy_score(y_val, y_pred_val),
        "Train/Presicion": precision_score(y_train, y_pred_train),
        "Validation/Presicion": precision_score(y_val, y_pred_val),
        "Train/Recall": recall_score(y_train, y_pred_train),
        "Validation/Recall": recall_score(y_val, y_pred_val),
        "Train/F1-Score": f1_score(y_train, y_pred_train),
        "Validation/F1-Score": f1_score(y_val, y_pred_val),
    })

    # Log diagnostic plots to Weights & Biases
    label_names = ["Not-Survived", "Survived"]
    wandb.sklearn.plot_class_proportions(y_train, y_val, label_names)
    wandb.sklearn.plot_summary_metrics(model, X_train, y_train, X_val, y_val)
    wandb.sklearn.plot_roc(y_val, y_probas_val, labels=label_names)
    wandb.sklearn.plot_precision_recall(y_val, y_probas_val, labels=label_names)
    wandb.sklearn.plot_confusion_matrix(y_val, y_pred_val, labels=label_names)

    # Save the model to disk
    with open("random_forest_classifier.pkl", "wb") as f:
        pickle.dump(model, f)

    # Log the model as a versioned file to a Weights & Biases Artifact
    artifact = wandb.Artifact("titanic-random-forest-model", type="model")
    artifact.add_file("random_forest_classifier.pkl")
    wandb.log_artifact(artifact)


# Sweep definition: Bayesian search that maximizes the logged
# "Validation/Accuracy" metric over the parameters listed below
SWEEP_CONFIG = {
    'method': 'bayes',
    'metric': {
        'name': "Validation/Accuracy",
        'goal': 'maximize',
    },
    'parameters': {
        # Integer-valued ranges
        'max_depth': {'max': 20, 'min': 1, 'distribution': 'int_uniform'},
        'n_estimators': {'distribution': 'int_uniform', 'min': 10, 'max': 100},
        'min_samples_split': {'distribution': 'int_uniform', 'min': 2, 'max': 10},
        'min_samples_leaf': {'distribution': 'int_uniform', 'min': 1, 'max': 4},
        # Categorical choices
        "bootstrap": {"values": [True, False]},
        "warm_start": {"values": [True, False]},
        "class_weight": {"values": ["balanced", "balanced_subsample"]},
    },
}

# Register the sweep in the project, then let an agent execute `run_train` 4 times
sweep_id = wandb.sweep(sweep=SWEEP_CONFIG, project='mlops_zoomcamp_wandb')
wandb.agent(sweep_id, function=run_train, count=4)


Create sweep with ID: pauex9kb
Sweep URL: https://wandb.ai/camaganu/mlops_zoomcamp_wandb/sweeps/pauex9kb


[34m[1mwandb[0m: Agent Starting Run: m2k3g30l with config:
[34m[1mwandb[0m: 	bootstrap: True
[34m[1mwandb[0m: 	class_weight: balanced
[34m[1mwandb[0m: 	max_depth: 13
[34m[1mwandb[0m: 	min_samples_leaf: 3
[34m[1mwandb[0m: 	min_samples_split: 3
[34m[1mwandb[0m: 	n_estimators: 43
[34m[1mwandb[0m: 	warm_start: True


[34m[1mwandb[0m:   2 of 2 files downloaded.  
  warn(


0,1
Train/Accuracy,▁
Train/F1-Score,▁
Train/Presicion,▁
Train/Recall,▁
Validation/Accuracy,▁
Validation/F1-Score,▁
Validation/Presicion,▁
Validation/Recall,▁

0,1
Train/Accuracy,0.80337
Train/F1-Score,0.74453
Train/Presicion,0.75556
Train/Recall,0.73381
Validation/Accuracy,0.82682
Validation/F1-Score,0.75591
Validation/Presicion,0.7619
Validation/Recall,0.75


[34m[1mwandb[0m: Agent Starting Run: 9hko9od3 with config:
[34m[1mwandb[0m: 	bootstrap: False
[34m[1mwandb[0m: 	class_weight: balanced
[34m[1mwandb[0m: 	max_depth: 7
[34m[1mwandb[0m: 	min_samples_leaf: 3
[34m[1mwandb[0m: 	min_samples_split: 7
[34m[1mwandb[0m: 	n_estimators: 44
[34m[1mwandb[0m: 	warm_start: True


[34m[1mwandb[0m:   2 of 2 files downloaded.  
  warn(


0,1
Train/Accuracy,▁
Train/F1-Score,▁
Train/Presicion,▁
Train/Recall,▁
Validation/Accuracy,▁
Validation/F1-Score,▁
Validation/Presicion,▁
Validation/Recall,▁

0,1
Train/Accuracy,0.80337
Train/F1-Score,0.74638
Train/Presicion,0.75182
Train/Recall,0.74101
Validation/Accuracy,0.82682
Validation/F1-Score,0.75591
Validation/Presicion,0.7619
Validation/Recall,0.75


[34m[1mwandb[0m: Agent Starting Run: w82bx47x with config:
[34m[1mwandb[0m: 	bootstrap: True
[34m[1mwandb[0m: 	class_weight: balanced_subsample
[34m[1mwandb[0m: 	max_depth: 7
[34m[1mwandb[0m: 	min_samples_leaf: 2
[34m[1mwandb[0m: 	min_samples_split: 4
[34m[1mwandb[0m: 	n_estimators: 30
[34m[1mwandb[0m: 	warm_start: False


[34m[1mwandb[0m:   2 of 2 files downloaded.  


0,1
Train/Accuracy,▁
Train/F1-Score,▁
Train/Presicion,▁
Train/Recall,▁
Validation/Accuracy,▁
Validation/F1-Score,▁
Validation/Presicion,▁
Validation/Recall,▁

0,1
Train/Accuracy,0.80337
Train/F1-Score,0.74453
Train/Presicion,0.75556
Train/Recall,0.73381
Validation/Accuracy,0.82682
Validation/F1-Score,0.75591
Validation/Presicion,0.7619
Validation/Recall,0.75


[34m[1mwandb[0m: Agent Starting Run: 1ikre6is with config:
[34m[1mwandb[0m: 	bootstrap: True
[34m[1mwandb[0m: 	class_weight: balanced_subsample
[34m[1mwandb[0m: 	max_depth: 18
[34m[1mwandb[0m: 	min_samples_leaf: 1
[34m[1mwandb[0m: 	min_samples_split: 7
[34m[1mwandb[0m: 	n_estimators: 16
[34m[1mwandb[0m: 	warm_start: False


[34m[1mwandb[0m:   2 of 2 files downloaded.  


0,1
Train/Accuracy,▁
Train/F1-Score,▁
Train/Presicion,▁
Train/Recall,▁
Validation/Accuracy,▁
Validation/F1-Score,▁
Validation/Presicion,▁
Validation/Recall,▁

0,1
Train/Accuracy,0.80758
Train/F1-Score,0.74954
Train/Presicion,0.76208
Train/Recall,0.73741
Validation/Accuracy,0.81564
Validation/F1-Score,0.736
Validation/Presicion,0.7541
Validation/Recall,0.71875
