In [1]:
import os
import sys
import json

import pandas as pd
import numpy as np
import pickle

from sklearn.datasets import load_wine
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [2]:
# Resolve the project root (two levels above the working directory) once and
# derive the artifact folders from it. Each path keeps its trailing slash so
# that file names can be appended to it directly.
PROJECT_ROOT = os.path.abspath("../..")
DATA_PATH, MODEL_PATH, METRICS_PATH = (
    os.path.join(PROJECT_ROOT, folder) for folder in ("data/", "models/", "metrics/")
)

In [3]:
# Fetch the scikit-learn wine dataset
wine = load_wine()

# Assemble a single DataFrame: the feature matrix with the class label
# appended as a final "target" column (stacking both into one array means
# the label shares the features' dtype)
column_names = list(wine["feature_names"]) + ["target"]
data = pd.DataFrame(
    np.column_stack((wine["data"], wine["target"])),
    columns=column_names,
)


In [4]:
# Preview the first rows of the assembled frame
data.head()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,target
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0,0.0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0,0.0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0,0.0
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0,0.0
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0,0.0


In [5]:
# (rows, columns) of the full dataset, label column included
data.shape

(178, 14)

In [6]:
# Hold out 30% of the rows for testing. The split is seeded so that the saved
# parquet files and the reported accuracy are reproducible across re-runs
# (without a seed every execution shuffles the rows differently).
train_df, test_df = train_test_split(data, test_size=0.3, random_state=42)

In [7]:
# Save the train/test splits locally.
# Create the data folder on first use: writing the parquet files fails when
# the parent directory does not exist yet.
os.makedirs(DATA_PATH, exist_ok=True)
train_df.to_parquet(os.path.join(DATA_PATH, "df_train.parquet"))
test_df.to_parquet(os.path.join(DATA_PATH, "df_test.parquet"))

# Setting Up DVC

After saving the data (locally or in the cloud) for the first time, we initialize DVC the same way we would initialize **git**, add a remote repository, and finally start tracking the data.

1. Initialize DVC: *dvc init* (creates a .dvcignore and a config file)
2. Setup remote repository (local, GCS, s3, Azure): *dvc remote add -d [name] <path/to/storage>* (e.g., dvc remote add -d localremote ./data-remote)
3. Commit Initialization of DVC: *git commit -m "DVC initialized"*

# Let's train a model

In [8]:
# Separate the feature columns from the "target" label for both splits
x_train = train_df.drop(columns="target")
y_train = train_df["target"]
x_test = test_df.drop(columns="target")
y_test = test_df["target"]

In [9]:
# Fit a random forest with default hyperparameters and predict on the test
# features. The forest is seeded so the trained model, its predictions and the
# tracked accuracy are reproducible on re-run.
RFC = RandomForestClassifier(random_state=42)
RFC.fit(x_train, y_train)
predictions = RFC.predict(x_test)

In [10]:
# Score the predictions against the true test labels
acc = accuracy_score(y_true=y_test, y_pred=predictions)

In [12]:
# Display the accuracy of the model on the test split
acc

0.9814814814814815

In [13]:
# Collect the metrics to be written to disk and tracked
metrics_dict = dict(accuracy=acc)

In [14]:
# Persist the fitted classifier. The models folder is created first because
# open() raises FileNotFoundError when the parent directory is missing.
os.makedirs(MODEL_PATH, exist_ok=True)
with open(os.path.join(MODEL_PATH, "model.pkl"), "wb") as model_file:
    pickle.dump(RFC, model_file)

In [15]:
# Write the metrics as JSON. The metrics folder is created first because
# open() raises FileNotFoundError when the parent directory is missing.
os.makedirs(METRICS_PATH, exist_ok=True)
with open(os.path.join(METRICS_PATH, "accuracy.json"), "w") as metrics_file:
    json.dump(metrics_dict, metrics_file)

# Track data, model, and corresponding metrics

1. Start tracking the data, model, and metrics: *dvc add <path/to/data> <path/to/model> <path/to/metrics>* (metadata is generated as a .dvc file for each). **Warning:** the arguments of *dvc add* should be the actual files, not the .dvc metadata files.
2. Commit the autogenerated .dvc and .gitignore files to git: *git add <path/to/file>.dvc <path/to/.gitignore>* and then *git commit -m "Original Dataset"*
3. (Optional) Set up a tag for the commit: *git tag -a "<version>" -m "<model version>, <dataset-version>"* (e.g., *git tag -a "v1.0" -m "model v1.0, all-features"*). You can check the tag list with ***git tag -n***.
4. Push changes: *git push && dvc push*

# Model and Data Changes

In [77]:
# Drop three features to produce the second dataset version and save it.
# errors="ignore" keeps the cell re-runnable: `data` is overwritten here, so a
# second execution would otherwise raise KeyError for the already-removed
# columns.
data = data.drop(columns=["ash", "magnesium", "flavanoids"], errors="ignore")
# Make sure the data folder exists before writing the parquet file.
os.makedirs(DATA_PATH, exist_ok=True)
data.to_parquet(os.path.join(DATA_PATH, "data.parquet"))

In [78]:
# Re-split the current `data` frame (30% held out). The split is seeded so the
# result is reproducible across re-runs instead of changing with every shuffle.
train_df, test_df = train_test_split(data, test_size=0.3, random_state=42)

# Separate the feature columns from the "target" label for both splits
x_train, y_train = train_df.drop("target", axis=1), train_df["target"]
x_test, y_test = test_df.drop("target", axis=1), test_df["target"]

In [79]:
# Retrain the random forest on the current training split and predict on the
# test features. The forest is seeded so the model and its accuracy are
# reproducible on re-run.
RFC = RandomForestClassifier(random_state=42)
RFC.fit(x_train, y_train)
predictions = RFC.predict(x_test)

In [80]:
# Score the new predictions and collect the metric for saving
acc = accuracy_score(y_true=y_test, y_pred=predictions)
metrics_dict = dict(accuracy=acc)

In [89]:
# Overwrite the saved model and metrics with the retrained versions. Both
# output folders are created first because open() raises FileNotFoundError
# when the parent directory is missing.
os.makedirs(MODEL_PATH, exist_ok=True)
os.makedirs(METRICS_PATH, exist_ok=True)

with open(os.path.join(MODEL_PATH, "model.pkl"), "wb") as model_file:
    pickle.dump(RFC, model_file)

with open(os.path.join(METRICS_PATH, "accuracy.json"), "w") as metrics_file:
    json.dump(metrics_dict, metrics_file)

In [88]:
# Display the metrics dictionary that is written to accuracy.json
metrics_dict

{'accuracy': 0.9814814814814815}

In [85]:
# Write the current `data` frame to data.parquet in the data folder
data_file = DATA_PATH + "data.parquet"
data.to_parquet(data_file)

# Repeat the DVC Workflow

After changing the data and retraining the model, DVC is already initialized and the remote is already configured, so we only have to track the changed files again, commit the updated metadata, and push.

1. Track Data: *dvc add <path/to/data>* (metadata is generated as .dvc file)
2. Commit the updated .dvc file to git: *git commit -m "Reduced-Feature Dataset"*
3. (Optional) Set a tag to git commit for easier retrieval: *git tag <tagname>*
4. Push the changes to GitHub and to the remote DVC storage: *git push && dvc push*

# Revert to Original Version

1. *git checkout [tag-name]*
2. *dvc checkout*

# Losing Data

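1. Restore the .dvc metadata files from git: *git checkout metrics/accuracy.json.dvc models/model.pkl.dvc data/data.parquet.dvc*
2. Download the corresponding files from the DVC remote: *dvc pull metrics/accuracy.json.dvc models/model.pkl.dvc data/data.parquet.dvc*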