<img src="./images/logo.svg" alt="lakeFS logo" width="300"/>

# ML Experimentation 02 (Wine Quality)

In this tutorial, you will learn how to version your ML training data, model artifacts, metrics, and training code together with lakeFS. We will use the [Wine-Quality-Dataset](https://archive.ics.uci.edu/ml/datasets/wine+quality) for multi-class classification.

To learn more about how lakeFS can be used for ML experimentation and reproducibility, check out the [published blog](https://lakefs.io/blog/building-an-ml-experimentation-platform-for-easy-reproducibility-using-lakefs/).

## Config

**_If you're not using the provided lakeFS server and MinIO storage then change these values to match your environment_**

### lakeFS endpoint and credentials

In [None]:
# Endpoint and key pair used by every lakeFS / S3 client created below.
# Change these values when not using the lakeFS server provided with this notebook.
lakefsEndPoint = 'http://lakefs:8000' # e.g. 'https://username.aws_region_name.lakefscloud.io' 
lakefsAccessKey = 'AKIAIOSFODNN7EXAMPLE'
lakefsSecretKey = 'wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY'

### Object Storage

In [None]:
storageNamespace = 's3://example' # e.g. "s3://bucket"

---

## Setup

**(you shouldn't need to change anything in this section, just run it)**

In [None]:
repo_name = "ml-experimentation-wine-quality"

### Create lakeFSClient

In [None]:
import lakefs_client
from lakefs_client.models import *
from lakefs_client.client import LakeFSClient

# Build the client configuration in a single call: the endpoint is the host,
# and the lakeFS access key / secret key are the username / password.
configuration = lakefs_client.Configuration(
    host=lakefsEndPoint,
    username=lakefsAccessKey,
    password=lakefsSecretKey,
)

lakefs = LakeFSClient(configuration)

#### Verify lakeFS credentials by getting lakeFS version

In [None]:
# Call the version endpoint as a cheap check that the endpoint and key pair work.
print("Verifying lakeFS credentials…")
try:
    v=lakefs.config.get_lake_fs_version()
except Exception as e:
    # Catch Exception instead of a bare `except:` so KeyboardInterrupt and
    # SystemExit still propagate, and report the cause of the failure.
    print(f"🛑 failed to get lakeFS version: {e}")
else:
    print(f"…✅lakeFS credentials verified\n\nℹ️lakeFS version {v.version}")

### Define lakeFS Repository

In [None]:
# `os` is needed for os._exit below; the notebook's general import cell only
# runs later, so import it here to avoid a NameError on the error paths.
import os

from lakefs_client.exceptions import NotFoundException

# Reuse the repository when it already exists; create it under storageNamespace
# when the lookup reports "not found". Any other API error ends the process.
try:
    repo=lakefs.repositories.get_repository(repo_name)
    print(f"Found existing repo {repo.id} using storage namespace {repo.storage_namespace}")
except NotFoundException:
    print(f"Repository {repo_name} does not exist, so going to try and create it now.")
    try:
        repo=lakefs.repositories.create_repository(
            repository_creation=RepositoryCreation(
                name=repo_name,
                storage_namespace=f"{storageNamespace}/{repo_name}"))
        print(f"Created new repo {repo.id} using storage namespace {repo.storage_namespace}")
    except lakefs_client.ApiException as e:
        print(f"Error creating repo {repo_name}. Error is {e}")
        os._exit(00)
except lakefs_client.ApiException as e:
    print(f"Error getting repo {repo_name}: {e}")
    os._exit(00)

### Imports

In [None]:
from datetime import date, time, datetime

In [None]:
import boto3
import io
import csv
import duckdb
import s3fs
import json
import tempfile
import joblib
import os

In [None]:
import numpy as np
import pandas as pd
from collections import Counter

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier 
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, f1_score
from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.decomposition import PCA

%matplotlib inline


### Configure boto3 client

In [None]:
# Both boto3 handles use the lakeFS endpoint and the lakeFS key pair.
_lakefs_s3_settings = dict(
    endpoint_url=lakefsEndPoint,
    aws_access_key_id=lakefsAccessKey,
    aws_secret_access_key=lakefsSecretKey,
)

s3_client = boto3.client('s3', **_lakefs_s3_settings)

s3_resource = boto3.resource('s3', **_lakefs_s3_settings)

In [None]:
# File-system style handle used by the `s3.open(...)` calls below; it is
# authenticated with the lakeFS key pair and pointed at the lakeFS endpoint.
s3 = s3fs.S3FileSystem(anon=False,
                      key=lakefsAccessKey,
                      secret=lakefsSecretKey,
                      client_kwargs={'endpoint_url': lakefsEndPoint})


---

# Main Tutorial starts here 🚦 👇🏻

# Creating Ingest branch

In [None]:
ingest_branch = "ingest-data"
prod_branch = "main"

In [None]:
# Create the ingest branch from the production branch.
_ingest_branch_spec = BranchCreation(name=ingest_branch, source=prod_branch)
lakefs.branches.create_branch(repository=repo_name, branch_creation=_ingest_branch_spec)

# Upload wine-quality-dataset to ingest branch

In [None]:
ingest_data = "wine-quality-white-and-red.csv"
# Object path inside the branch: dt=<today's date>/raw/<file name>.
ingest_path = "/".join([f"dt={date.today()}", "raw", ingest_data])
ingest_path

In [None]:
# Stream the local CSV into the ingest branch at ingest_path.
_local_csv = f'/data/{ingest_data}'
with open(_local_csv, 'rb') as raw_file:
    lakefs.objects.upload_object(
        repository=repo_name,
        branch=ingest_branch,
        path=ingest_path,
        content=raw_file,
    )


In [None]:
lakefs.branches.diff_branch(repository=repo_name, 
                            branch=ingest_branch).results


In [None]:
# Commit the uploaded dataset on the ingest branch.
_commit_details = CommitCreation(message="wine quality data uploaded to ingest branch")
lakefs.commits.commit(repository=repo_name, branch=ingest_branch, commit_creation=_commit_details)

# Data Exploration

In [None]:
_ingest_key = f'{ingest_branch}/{ingest_path}'
filepath = f"s3://{repo_name}/{_ingest_key}"
print(filepath)

# Read the committed CSV back through boto3 (bucket = repository, key = branch/path).
obj = s3_client.get_object(Bucket=repo_name, Key=_ingest_key)
_csv_bytes = obj['Body'].read()
wine_df = pd.read_csv(io.BytesIO(_csv_bytes), header='infer')
wine_df.info()

In [None]:
wine_df.head()

In [None]:
sns.pairplot(wine_df)

In [None]:
Counter(wine_df['quality'])

In [None]:
sns.countplot(x='quality', data=wine_df)

## Utils

In [None]:
def scale_input(x):
    """Return ``x`` transformed by a ``StandardScaler`` freshly fitted on ``x`` itself."""
    return StandardScaler().fit_transform(x)
    

In [None]:
def plot_pca(pca):
    """Plot the cumulative sum of ``pca.explained_variance_ratio_`` on a 10x10 figure with a grid."""
    cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
    plt.figure(figsize=(10, 10))
    plt.plot(cumulative_variance, 'ro-')
    plt.grid()


# Experimentation Begins

# Experiment 1

- Preprocess - Standard Scaler, PCA
- Training - RandomForestClassifier
- Evaluation - F1 score
- Labels - Multiclass classification (quality: 1 to 10)

In [None]:
# Settings for experiment 1. They are saved to the experiment branch as
# config.csv so the run can be traced back to its parameters.
config = {
    'branch_name': 'exp-1',
    'drop_columns': ['type'],
    'f1_average': 'micro',  # classes are imbalanced
    'is_scale_input': True,
    'is_pca': True,
    # Kept as a number (not the string '0.25') so it can be passed straight
    # to train_test_split.
    'test_size': 0.25,
}
params1 = config

filepath = f"s3://{repo_name}/{params1['branch_name']}/{ingest_path}"

### Create new branch

In [None]:

# Create the experiment branch from the ingest branch.
_exp1_branch_spec = BranchCreation(name=params1['branch_name'], source=ingest_branch)
lakefs.branches.create_branch(repository=repo_name, branch_creation=_exp1_branch_spec)


### Save configs

In [None]:
config_df = pd.DataFrame.from_dict(params1)
config_df

In [None]:
# Write the config table to the experiment branch under today's date partition.
_config_key = f"/{repo_name}/{params1['branch_name']}/dt={date.today()}/config/config.csv"
with s3.open(_config_key, 'w') as config_file:
    config_df.to_csv(config_file)

In [None]:
# Commit the config file on the experiment branch.
_commit_details = CommitCreation(message="Uploaded training configs")
lakefs.commits.commit(repository=repo_name, branch=params1['branch_name'], commit_creation=_commit_details)

### Create model features

In [None]:
# Read the raw dataset from the experiment branch.
_exp1_key = f"{params1['branch_name']}/{ingest_path}"
obj = s3_client.get_object(Bucket=repo_name, Key=_exp1_key)
_csv_bytes = obj['Body'].read()
wine_df = pd.read_csv(io.BytesIO(_csv_bytes), header='infer')

In [None]:
wine_df.head()

In [None]:
# Drop the columns named in the experiment config rather than a hard-coded
# list, so the saved config.csv matches what was actually done.
wine_df.drop(columns=params1['drop_columns'], inplace=True)

In [None]:
# Features are the first 11 columns; the label is the 'quality' column.
y_col = ['quality']
x = wine_df.iloc[:, :11]
y = wine_df[y_col[0]]
x_cols = list(x.columns)

In [None]:
if params1['is_scale_input']:
    x = scale_input(x)

In [None]:
x_cols

In [None]:
# When PCA is enabled, fit a PCA with default settings on the features and
# plot the cumulative explained-variance ratio of its components.
if params1['is_pca']:
    pca = PCA()
    x_pca = pca.fit_transform(x)
    plot_pca(pca)

In [None]:
# When PCA is enabled, replace the features with their projection onto
# 6 principal components and rename the feature columns pca_0 .. pca_5.
if params1['is_pca']:
    n_comp = 6
    pca_new = PCA(n_components=n_comp)
    x = pca_new.fit_transform(x)
    x_cols = [f"pca_{i}" for i in range(n_comp)]

In [None]:
x_cols

### Save features

In [None]:
# Wrap features and label in DataFrames with explicit column names.
features = pd.DataFrame(data=x, columns=x_cols)
label = pd.DataFrame(data=y, columns=y_col)

features.head()

In [None]:
label.head()

In [None]:
# Write the feature table to the experiment branch.
_features_key = f"/{repo_name}/{params1['branch_name']}/dt={date.today()}/features/features.csv"
with s3.open(_features_key, 'w') as features_file:
    features.to_csv(features_file)

In [None]:
# Write the label table to the experiment branch.
_label_key = f"/{repo_name}/{params1['branch_name']}/dt={date.today()}/features/label.csv"
with s3.open(_label_key, 'w') as label_file:
    label.to_csv(label_file)

In [None]:
# Commit the feature and label files on the experiment branch.
_commit_details = CommitCreation(message="Uploaded features")
lakefs.commits.commit(repository=repo_name, branch=params1['branch_name'], commit_creation=_commit_details)

### Train and evaluate

In [None]:
# Use the split ratio recorded in the experiment config instead of a
# hard-coded 0.25; float() accepts the value stored as a number or a string.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=float(params1['test_size']))
for _split in (x_train, y_train, x_test, y_test):
    print(_split.shape)
# Wrap every split in a DataFrame with explicit column names so it can be
# written to CSV below.
x_train = pd.DataFrame(x_train, columns = x_cols)
x_test = pd.DataFrame(x_test, columns = x_cols)
y_train = pd.DataFrame(y_train, columns = y_col)
y_test = pd.DataFrame(y_test, columns = y_col)

In [None]:
type(x_train)

In [None]:
# Write the four train/test splits to <branch>/dt=<today>/preprocessed/<name>.csv.
_preprocessed_dir = f"/{repo_name}/{params1['branch_name']}/dt={str(date.today())}/preprocessed"
_splits = (
    ("x_train", x_train),
    ("x_test", x_test),
    ("y_train", y_train),
    ("y_test", y_test),
)
for _split_name, _split_frame in _splits:
    with s3.open(f"{_preprocessed_dir}/{_split_name}.csv",'w') as f:
        _split_frame.to_csv(f)

In [None]:
rf = RandomForestClassifier()
# y_train is a single-column DataFrame; flatten it to 1-D, the shape the
# classifier expects for a single target, to avoid the column-vector warning.
rf.fit(x_train, np.ravel(y_train))
rf_predict=rf.predict(x_test)

# Evaluate on the held-out split with the averaging mode from the config.
rf_conf_matrix = confusion_matrix(y_test, rf_predict)
rf_f1_score = f1_score(y_test, rf_predict, average=params1['f1_average'])
print(rf_conf_matrix)
print("\nF1-score: \t", round(rf_f1_score*100,2))


### Save model artifacts

In [None]:
# Location of the serialized model on the experiment branch.
output_file = f"s3://{repo_name}/{params1['branch_name']}/dt={date.today()}/artifacts/model.joblib"
print(output_file)

with s3.open(output_file, 'wb') as model_file:
    joblib.dump(rf, model_file)

# To load the model back, open output_file with s3.open(output_file, 'rb')
# and pass the handle to joblib.load.

In [None]:
# Commit the model artifact on the experiment branch.
_commit_details = CommitCreation(message="Uploaded model artifacts")
lakefs.commits.commit(repository=repo_name, branch=params1['branch_name'], commit_creation=_commit_details)

### Save model metrics

In [None]:
# Store the F1 score as a one-row table on the experiment branch.
metrics_df = pd.DataFrame({'f1': [rf_f1_score]})
_metrics_key = f"/{repo_name}/{params1['branch_name']}/dt={date.today()}/metrics/scores.csv"
with s3.open(_metrics_key, 'w') as metrics_file:
    metrics_df.to_csv(metrics_file)

In [None]:
# Commit the metrics file on the experiment branch.
_commit_details = CommitCreation(message="Uploaded training metrics")
lakefs.commits.commit(repository=repo_name, branch=params1['branch_name'], commit_creation=_commit_details)

# Experiment 2

- Preprocess - Regroup labels
- Training - RandomForestClassifier
- Evaluation - F1 score
- Labels - Multiclass classification (quality: Bad, Okay, Good)

In [None]:
# Settings for experiment 2. They are saved to the experiment branch as
# config.csv so the run can be traced back to its parameters.
config = {
    'branch_name': 'exp-2',
    'drop_columns': ['type'],
    'f1_average': 'weighted',  # classes are imbalanced
    'is_scale_input': False,
    'is_pca': False,
    # Kept as a number (not the string '0.25') so it can be passed straight
    # to train_test_split.
    'test_size': 0.25,
}
params2 = config

filepath = f"s3://{repo_name}/{params2['branch_name']}/{ingest_path}"

### Create new branch

In [None]:
# Create the experiment branch from the ingest branch.
_exp2_branch_spec = BranchCreation(name=params2['branch_name'], source=ingest_branch)
lakefs.branches.create_branch(repository=repo_name, branch_creation=_exp2_branch_spec)


### Save configs

In [None]:
config_df = pd.DataFrame.from_dict(params2)
config_df

In [None]:
# Write the config table to the experiment branch under today's date partition.
_config_key = f"/{repo_name}/{params2['branch_name']}/dt={date.today()}/config/config.csv"
with s3.open(_config_key, 'w') as config_file:
    config_df.to_csv(config_file)

In [None]:
# Commit the config file on the experiment branch.
_commit_details = CommitCreation(message="Uploaded training configs")
lakefs.commits.commit(repository=repo_name, branch=params2['branch_name'], commit_creation=_commit_details)

### Create model features

In [None]:
# Read the raw dataset from the experiment branch.
_exp2_key = f"{params2['branch_name']}/{ingest_path}"
obj = s3_client.get_object(Bucket=repo_name, Key=_exp2_key)
_csv_bytes = obj['Body'].read()
wine_df = pd.read_csv(io.BytesIO(_csv_bytes), header='infer')

In [None]:
wine_df.head()

In [None]:
# Drop the columns named in the experiment config rather than a hard-coded
# list, so the saved config.csv matches what was actually done.
wine_df.drop(columns=params2['drop_columns'], inplace=True)

In [None]:
def _quality_to_review(quality):
    """Map a quality score to a review class: '1' for 1-3, '2' for 4-6, '3' for 7-10.

    Raises:
        ValueError: if ``quality`` falls in none of the three ranges (for
            example NaN, 0 or a fractional score). Failing here gives a clear
            message instead of a row-count mismatch when the column is assigned.
    """
    if 1 <= quality <= 3:
        return '1'
    if 4 <= quality <= 6:
        return '2'
    if 7 <= quality <= 10:
        return '3'
    raise ValueError(f"quality score {quality!r} is outside the expected 1-10 integer range")


# One review class per row, in row order.
reviews = [_quality_to_review(q) for q in wine_df['quality']]
wine_df['reviews'] = reviews

In [None]:
# Features are the first 11 columns; the label is the regrouped 'reviews' column.
y_col = ['reviews']
x = wine_df.iloc[:, :11]
y = wine_df[y_col[0]]
x_cols = list(x.columns)

In [None]:
x.head()

In [None]:
Counter(y)

In [None]:
sns.countplot(x=y, data=wine_df)

### Save features

In [None]:
# Wrap features and label in DataFrames with explicit column names.
features = pd.DataFrame(data=x, columns=x_cols)
label = pd.DataFrame(data=y, columns=y_col)

features.head()

In [None]:
label.head()

In [None]:
# Write the feature table to the experiment branch.
_features_key = f"/{repo_name}/{params2['branch_name']}/dt={date.today()}/features/features.csv"
with s3.open(_features_key, 'w') as features_file:
    features.to_csv(features_file)

In [None]:
# Write the label table to the experiment branch.
_label_key = f"/{repo_name}/{params2['branch_name']}/dt={date.today()}/features/label.csv"
with s3.open(_label_key, 'w') as label_file:
    label.to_csv(label_file)

In [None]:
# Commit the feature and label files on the experiment branch.
_commit_details = CommitCreation(message="Uploaded features")
lakefs.commits.commit(repository=repo_name, branch=params2['branch_name'], commit_creation=_commit_details)

### Train and evaluate

In [None]:
# Use the split ratio recorded in the experiment config instead of a
# hard-coded 0.25; float() accepts the value stored as a number or a string.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=float(params2['test_size']))
for _split in (x_train, y_train, x_test, y_test):
    print(_split.shape)

# Wrap every split in a DataFrame with explicit column names so it can be
# written to CSV below.
x_train = pd.DataFrame(x_train, columns = x_cols)
x_test = pd.DataFrame(x_test, columns = x_cols)
y_train = pd.DataFrame(y_train, columns = y_col)
y_test = pd.DataFrame(y_test, columns = y_col)

In [None]:
# Write the four train/test splits to <branch>/dt=<today>/preprocessed/<name>.csv.
_preprocessed_dir = f"/{repo_name}/{params2['branch_name']}/dt={str(date.today())}/preprocessed"
_splits = (
    ("x_train", x_train),
    ("x_test", x_test),
    ("y_train", y_train),
    ("y_test", y_test),
)
for _split_name, _split_frame in _splits:
    with s3.open(f"{_preprocessed_dir}/{_split_name}.csv",'w') as f:
        _split_frame.to_csv(f)

In [None]:
rf = RandomForestClassifier()
# y_train is a single-column DataFrame; flatten it to 1-D, the shape the
# classifier expects for a single target, to avoid the column-vector warning.
rf.fit(x_train, np.ravel(y_train))
rf_predict=rf.predict(x_test)

# Take the averaging mode from the experiment config (as experiment 1 does)
# rather than hard-coding it, so the saved config describes the metric.
rf_conf_matrix = confusion_matrix(y_test, rf_predict)
rf_f1_score = f1_score(y_test, rf_predict, average=params2['f1_average'])
print(rf_conf_matrix)
print("F1-score: \t", round(rf_f1_score*100,2))

### Save model artifacts

In [None]:
# Location of the serialized model on the experiment branch.
output_file = f"s3://{repo_name}/{params2['branch_name']}/dt={date.today()}/artifacts/model.joblib"
print(output_file)

with s3.open(output_file, 'wb') as model_file:
    joblib.dump(rf, model_file)

# To load the model back, open output_file with s3.open(output_file, 'rb')
# and pass the handle to joblib.load.

In [None]:
# Commit the model artifact on the experiment branch.
_commit_details = CommitCreation(message="Uploaded model artifacts")
lakefs.commits.commit(repository=repo_name, branch=params2['branch_name'], commit_creation=_commit_details)

### Save model metrics

In [None]:
# Store the F1 score as a one-row table on the experiment branch.
metrics_df = pd.DataFrame({'f1': [rf_f1_score]})
_metrics_key = f"/{repo_name}/{params2['branch_name']}/dt={date.today()}/metrics/scores.csv"
with s3.open(_metrics_key, 'w') as metrics_file:
    metrics_df.to_csv(metrics_file)

In [None]:
# Commit the metrics file on the experiment branch.
_commit_details = CommitCreation(message="Uploaded training metrics")
lakefs.commits.commit(repository=repo_name, branch=params2['branch_name'], commit_creation=_commit_details)

### Reproduce an experiment with a lakeFS tag

In [None]:
tag_branch = "exp-1"
# Tag id: current timestamp (YYYY_MM_DD_HH_MM_SS) followed by the branch name.
_timestamp = datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
tag = "_".join((_timestamp, tag_branch))
tag

In [None]:
# Create the tag, pointing it at the experiment branch.
_tag_spec = TagCreation(id=tag, ref=tag_branch)
lakefs.tags.create_tag(repository=repo_name, tag_creation=_tag_spec)

In [None]:
# Object keys for the feature files, addressed through the tag instead of a branch.
_features_dir = f"{tag}/dt={date.today()}/features"
features_path = f"{_features_dir}/features.csv"
label_path = f"{_features_dir}/label.csv"
print(features_path,"\n",label_path)

In [None]:
# Object keys for the saved train/test splits, addressed through the tag.
_preprocessed_dir = f"{tag}/dt={date.today()}/preprocessed"
x_train_path = f"{_preprocessed_dir}/x_train.csv"
x_test_path = f"{_preprocessed_dir}/x_test.csv"
y_train_path = f"{_preprocessed_dir}/y_train.csv"
y_test_path = f"{_preprocessed_dir}/y_test.csv"
print(x_train_path)

In [None]:
obj = s3_client.get_object(Bucket=repo_name, Key=x_train_path)
# Discard the 'Unnamed: 0' column that comes back from the saved CSV.
x_train = pd.read_csv(io.BytesIO(obj['Body'].read()), header='infer').drop(columns=['Unnamed: 0'])
x_train.head()

In [None]:
obj = s3_client.get_object(Bucket=repo_name, Key=x_test_path)
# Discard the 'Unnamed: 0' column that comes back from the saved CSV.
x_test = pd.read_csv(io.BytesIO(obj['Body'].read()), header='infer').drop(columns=['Unnamed: 0'])
x_test.head()

In [None]:
obj = s3_client.get_object(Bucket=repo_name, Key=y_train_path)
# Discard the 'Unnamed: 0' column that comes back from the saved CSV.
y_train = pd.read_csv(io.BytesIO(obj['Body'].read()), header='infer').drop(columns=['Unnamed: 0'])
y_train.head()

In [None]:
obj = s3_client.get_object(Bucket=repo_name, Key=y_test_path)
# Discard the 'Unnamed: 0' column that comes back from the saved CSV.
y_test = pd.read_csv(io.BytesIO(obj['Body'].read()), header='infer').drop(columns=['Unnamed: 0'])
y_test.head()

In [None]:
obj = s3_client.get_object(Bucket=repo_name, Key=features_path)
# Discard the 'Unnamed: 0' column that comes back from the saved CSV.
features = pd.read_csv(io.BytesIO(obj['Body'].read()), header='infer').drop(columns=['Unnamed: 0'])
features.head()

In [None]:
obj = s3_client.get_object(Bucket=repo_name, Key=label_path)
# Discard the 'Unnamed: 0' column that comes back from the saved CSV.
label = pd.read_csv(io.BytesIO(obj['Body'].read()), header='infer').drop(columns=['Unnamed: 0'])
label.head()

In [None]:
# Location of the serialized model, addressed through the tag.
output_file = f"s3://{repo_name}/{tag}/dt={date.today()}/artifacts/model.joblib"
print(output_file)

In [None]:
# Read
with s3.open(output_file, 'rb') as f:
    rf = joblib.load(f)

In [None]:
# Score the reloaded model on the reloaded test split, using experiment 1's
# averaging mode.
rf_predict = rf.predict(x_test)

rf_conf_matrix = confusion_matrix(y_test, rf_predict)
rf_f1_score = f1_score(y_test, rf_predict, average=params1['f1_average'])
_f1_percent = round(rf_f1_score*100,2)
print(rf_conf_matrix)
print("\nF1-score: \t", _f1_percent)

In [None]:
#END