In [None]:
# !apt-get update && apt-get install -y git-all

In [None]:
# !git init && dvc init

In [None]:
# import git 
# g = git.Git(os.getcwd()) 
# hexshas = g.log('--pretty=%H','--follow','--',"Y_data.npy.dvc").split('\n') 

In [None]:
# hexshas

In [None]:
# !git config --global user.email "danyail@mail.com"
# !git config --global user.name "danzz006"

## Imports

In [None]:
from sklearn.datasets import make_classification
from sklearn.datasets import make_circles
import numpy as np
import os
import matplotlib.pyplot as plt
from pathlib import Path
from PIL import Image
import mlflow

## Initialize MLflow Experiment

**mlflow.create_experiment()** creates a new experiment and returns its ID. Runs can be launched under the experiment by passing the experiment ID to mlflow.start_run.

**mlflow.set_tracking_uri()** connects to a tracking URI. The URI can be an HTTP/HTTPS URI for a remote server, a database connection string, or a local path to log data to a directory.

In [None]:
'''When creating new experiment'''
# experiment_id = mlflow.create_experiment(
#     "Linear classifier", 
#     artifact_location=Path.cwd().joinpath("./mlruns").as_uri(),
#     tags={"version":"v1", "priority":"P1"}
# )

In [None]:
# The SQLite tracking database lives inside ./mlruns. Create that directory
# first: on a fresh checkout it does not exist yet and SQLite cannot create a
# database file inside a missing directory.
Path("mlruns").mkdir(parents=True, exist_ok=True)
mlflow.set_tracking_uri("sqlite:///mlruns/mlruns.db")

In [None]:
# Make "Linear classifier" the active experiment and keep the returned object:
# its experiment_id is handed to LinearClassifier further down.
experiment = mlflow.set_experiment("Linear classifier")

**mlflow.start_run()** returns the currently active run (if one exists), or starts a new run and returns a mlflow.ActiveRun object.

**mlflow.log_params()** logs a batch of key-value params (passed as a dict) in the currently active run. In the following example we use it to log the learning rate, batch size and number of epochs.

**mlflow.log_metric()** logs a single key-value metric. The value must always be a number.

In [None]:
class LinearClassifier:
    """Binary linear classifier trained with NumPy and tracked with MLflow.

    Two training procedures are provided: mini-batch gradient descent on the
    logistic (cross-entropy) loss (``train``) and the classic perceptron update
    rule (``percep_train``). Both standardize the features before fitting, so
    the returned weights and bias live in standardized feature space;
    ``predict`` standardizes its input as well.

    Note: ``normalize`` always uses the statistics of the array it is given,
    not statistics remembered from training.
    """

    def __init__(self, mlflow_exp_id):
        """Store the MLflow experiment ID under which runs are started."""
        self.mlflow_exp_id = mlflow_exp_id

    def sigmoid(self, z):
        """Element-wise logistic function 1 / (1 + exp(-z))."""
        return 1.0/(1. + np.exp(-z))

    def loss(self, y, y_ht):
        """Mean binary cross-entropy between labels ``y`` and probabilities ``y_ht``.

        Probabilities are clipped away from 0 and 1 so that saturated
        predictions give a large finite loss instead of NaN/inf.
        """
        eps = 1e-12
        y_ht = np.clip(y_ht, eps, 1.0 - eps)
        # Both terms are added; subtracting the second one is not cross-entropy.
        return -np.mean(y*np.log(y_ht) + (1 - y)*np.log(1 - y_ht))

    def gradient(self, x, y, y_ht):
        """Gradients of the loss w.r.t. the weights and the bias for one batch.

        Returns:
            (dw, db): ``dw`` has the shape of ``x.T @ (y_ht - y)``, ``db`` is a scalar.
        """
        m = x.shape[0]
        residual = y_ht - y
        dw = (1/m)*np.dot(x.T, residual)  # w.r.t. w
        db = (1/m)*np.sum(residual)       # w.r.t. bias
        return dw, db

    def plot_dec_boundry(self, x, w, b, y, debug, save):
        """Scatter the two classes and draw the line w0*x0 + w1*x1 + b = 0.

        Args:
            x: feature array; only the first two columns are plotted.
            w: weights; the first two entries are used.
            b: bias.
            y: labels (0/1); flattened before masking.
            debug: if truthy, call ``plt.show()``.
            save: if truthy, write the current figure to ``tmp.png``.
        """
        y = np.ravel(y)
        w0, w1 = np.ravel(w)[:2]
        x1 = np.array([x[:, 0].min(), x[:, 0].max()])
        x2 = (-w0/w1)*x1 + (-b/w1)
        plt.plot(x[:, 0][y == 0], x[:, 1][y == 0], 'r^')
        plt.plot(x[:, 0][y == 1], x[:, 1][y == 1], 'bs')
        plt.plot(x1, x2, 'y-')
        # Save before showing: after plt.show() the figure may already be
        # gone, which would leave an empty image on disk.
        if save:
            plt.savefig("tmp.png")
        if debug:
            plt.show()

    def normalize(self, x):
        """Standardize every column of ``x`` to zero mean and unit variance.

        Columns with zero standard deviation are only centered (they become
        all zeros) instead of producing a division by zero.
        """
        mean = x.mean(axis=0)
        std = x.std(axis=0)
        std = np.where(std == 0, 1.0, std)
        return (x - mean)/std

    def train(self, x, y, bs, epochs, lr):
        """Fit logistic regression with mini-batch gradient descent.

        Logs the hyperparameters, the per-epoch loss, the final training
        accuracy and a decision-boundary image to a new MLflow run.

        Args:
            x: (m, n) feature array.
            y: m binary labels.
            bs: batch size.
            epochs: number of passes over the data.
            lr: learning rate.

        Returns:
            (w, b, losses): ``w`` is (n, 1), ``b`` is the bias and ``losses``
            holds one loss value per epoch.
        """
        m, n = x.shape
        w = np.zeros((n, 1))
        b = 0
        y_flat = np.ravel(y)        # 1-D labels for plotting
        y = y_flat.reshape(m, 1)    # column vector for the matrix math
        x = self.normalize(x)

        losses = []
        params = {"learning_rate": lr, "batch_size": bs, "epochs": epochs}
        with mlflow.start_run(experiment_id=self.mlflow_exp_id, description="Linear classifier for data points", run_name=f"LC-{epochs}-{bs}-{lr}"):
            mlflow.log_params(params)
            for epoch in range(epochs):
                for start_i in range(0, m, bs):
                    xb = x[start_i:start_i + bs]
                    yb = y[start_i:start_i + bs]

                    y_ht = self.sigmoid(np.dot(xb, w) + b)
                    dw, db = self.gradient(xb, yb, y_ht)

                    w -= lr*dw
                    b -= lr*db

                l = self.loss(y, self.sigmoid(np.dot(x, w) + b))
                # Pass the epoch as the step so the metric forms a curve.
                mlflow.log_metric("loss", l, step=epoch)
                losses.append(l)

            y_ht = self.predict(x, w, b)
            acc = self.accuracy(y, y_ht)
            mlflow.log_metric("Acc", acc)
            self.plot_dec_boundry(x, w, b, y_flat, False, True)
            with Image.open("tmp.png") as boundary_img:
                mlflow.log_image(boundary_img, "dec_boundry.png")

        return w, b, losses

    def step_func(self, z):
        """Heaviside step: 1.0 for z > 0, otherwise 0.0."""
        return 1.0 if (z > 0) else 0.0

    def percep_train(self, x, y, epochs, lr):
        """Fit a perceptron with the classic mistake-driven update rule.

        The features are standardized first (as in ``train``) so that the
        learned weights are valid for ``predict``, which standardizes too.
        Logs hyperparameters, per-epoch misclassification counts, the final
        training accuracy and a decision-boundary image to a new MLflow run.

        Args:
            x: (m, n) feature array.
            y: m binary labels.
            epochs: number of passes over the data.
            lr: learning rate.

        Returns:
            (w, b, losses): ``w`` is (n, 1), ``b`` is the bias and ``losses``
            holds the number of misclassified samples per epoch.
        """
        m, n = x.shape
        y = np.ravel(y)
        x = self.normalize(x)

        w = np.zeros((n + 1, 1))   # row 0 is the bias, rows 1..n the weights

        losses = []
        params = {"learning_rate": lr, "epochs": epochs}
        with mlflow.start_run(experiment_id=self.mlflow_exp_id, description="Linear classifier for data points", run_name=f"LC-percep-{epochs}-{lr}"):
            mlflow.log_params(params)
            for epoch in range(epochs):
                n_miss = 0
                for idx, x_i in enumerate(x):
                    x_i = np.insert(x_i, 0, 1).reshape(-1, 1)  # prepend the bias input
                    y_hat = self.step_func(np.dot(x_i.T, w))

                    if y_hat != y[idx]:
                        w += lr*((y[idx] - y_hat)*x_i)
                        n_miss += 1

                mlflow.log_metric("loss", n_miss, step=epoch)
                losses.append(n_miss)

            # Read the bias BEFORE dropping row 0; otherwise the first
            # feature weight would be returned as the bias.
            b = float(w[0][0])
            w = w[1:].copy()
            y_ht = self.predict(x, w, b)
            acc = self.accuracy(y, y_ht)
            mlflow.log_metric("Acc", acc)
            self.plot_dec_boundry(x, w, b, y, False, True)
            with Image.open("tmp.png") as boundary_img:
                mlflow.log_image(boundary_img, "dec_boundry.png")

        return w, b, losses

    def predict(self, x, w, b):
        """Return 0/1 class labels for ``x`` as a 1-D integer array.

        ``x`` is standardized with its own statistics, then a sample is
        assigned class 1 when its sigmoid output exceeds 0.5.
        """
        x = self.normalize(x)
        probs = self.sigmoid(np.dot(x, w) + b)
        return (np.ravel(probs) > 0.5).astype(int)

    def accuracy(self, y, y_ht):
        """Fraction of entries of ``y_ht`` that equal ``y``.

        Both inputs are flattened first so that a column vector and a 1-D
        array are compared element by element rather than broadcast into an
        (m, m) matrix.

        Raises:
            ValueError: if the two inputs hold a different number of labels.
        """
        y = np.ravel(y)
        y_ht = np.ravel(y_ht)
        if y.shape != y_ht.shape:
            raise ValueError(
                f"label count mismatch: {y.shape[0]} true vs {y_ht.shape[0]} predicted"
            )
        return float(np.mean(y == y_ht))


In [None]:
# Build the classifier; the MLflow runs it starts are filed under this experiment ID.
p = LinearClassifier(experiment.experiment_id)

In [None]:
# First dataset: 100 samples, 2 informative features, 2 classes.
# random_state is fixed so that "version 1.0.0" of the data is identical on every re-run.
x,Y = make_classification(n_features=2,n_classes=2,n_samples=100,n_redundant=0,n_clusters_per_class=1,random_state=42)

In [None]:
# Mini-batch training: batches of 10 samples, 40 epochs, learning rate 0.001.
# Returns the weights, the bias and the per-epoch loss history.
w,b,loss = p.train(x,Y,bs=10,epochs=40,lr=0.001)

In [None]:
# Predicted class labels (0/1) for the training samples.
class_pred = p.predict(x,w,b)

In [None]:
# Share of training samples whose predicted label matches Y, as a percentage.
print('Accuracy= ',p.accuracy(Y,class_pred) * 100,'%')

In [None]:
# predict() standardizes its input, so the points are plotted standardized too:
# the figure then shows the same classifier the accuracy figure refers to.
accuracy_pct = p.accuracy(Y,class_pred) * 100

plt.subplot(1,2,1)
p.plot_dec_boundry(p.normalize(x),w,b,Y,False, False)
plt.title("Decision boundary")
plt.subplot(1,2,2)
plt.plot(loss)
plt.title("Training loss")
plt.xlabel("epoch")
# suptitle expects a string; passing a list renders the brackets and quotes.
plt.suptitle(f"Accuracy: {accuracy_pct:.1f}%")
plt.show()

## Tracking changes in data with DVC

**DVC** is a free, open-source command line tool that doesn't require databases, servers, or any other special services.

It helps keep your projects readable with stable file names — they don't need to change because they represent variable data.

In [None]:
# Write the current features and labels to the two .npy files tracked by DVC.
np.save("X_data.npy", x)
np.save("Y_data.npy", Y)

1) **dvc add** captures the current state of the dataset. **.dvc** files are created by this command as data placeholders that can be versioned with Git. They contain the information needed to track the target data over time.

2) Next, with **git** we commit the current state.

In [None]:
# Snapshot the two data files with DVC (creates/updates the .dvc placeholder files).
!dvc add X_data.npy Y_data.npy
# Version the placeholders and .gitignore in Git, then commit them as data version 1.0.0.
!git add X_data.npy.dvc Y_data.npy.dvc .gitignore
!git commit -m "version 1.0.0 of data"
# Show the history so the commit can be identified later.
!git log

In [None]:
# Second dataset: same setup with 200 samples. This rebinds x and Y.
# random_state is fixed so that "version 1.0.1" of the data is identical on every re-run.
x,Y = make_classification(n_features=2,n_classes=2,n_samples=200,n_redundant=0,n_clusters_per_class=1,random_state=42)

In [None]:
# Perceptron training on the 200-sample dataset: 100 epochs, learning rate 0.01.
# loss1 holds the number of misclassified samples per epoch.
w1,b1,loss1 = p.percep_train(x,Y,epochs=100,lr=0.01)

In [None]:
# Predict labels for the 200-sample dataset with the perceptron weights and report the accuracy in percent.
class_pred = p.predict(x,w1,b1)
print('Accuracy= ',p.accuracy(Y,class_pred) * 100,'%')

In [None]:
# predict() standardizes its input, so the points are plotted standardized too:
# the figure then shows the same classifier the accuracy figure refers to.
accuracy_pct = p.accuracy(Y,class_pred) * 100

plt.subplot(1,2,1)
p.plot_dec_boundry(p.normalize(x),w1,b1,Y,False, False)
plt.title("Decision boundary")
plt.subplot(1,2,2)
plt.plot(loss1)
plt.title("Misclassified samples")
plt.xlabel("epoch")
# suptitle expects a string; passing a list renders the brackets and quotes.
plt.suptitle(f"Accuracy: {accuracy_pct:.1f}%")
plt.show()

In [None]:
# Overwrite the two tracked .npy files with the 200-sample dataset.
np.save("X_data.npy", x)
np.save("Y_data.npy", Y)

Saving and committing another version of data with increased samples

In [None]:
# Snapshot the changed data files with DVC (updates the .dvc placeholder files).
!dvc add X_data.npy Y_data.npy
# Commit the updated placeholders as data version 1.0.1.
!git add X_data.npy.dvc Y_data.npy.dvc .gitignore
!git commit -m "version 1.0.1 of data with increased samples"
# Show the history, which now contains both data versions.
!git log

## Reverting back data

After a long series of experiments we want to recreate the results of our first experiment. We can retrieve the hyperparameters from **mlflow** and the data from **dvc**.

In [None]:
# List the commits to find the one that holds the first data version.
!git log

First we checkout with **git** to the previous version (of .dvc files), then we run **dvc checkout** to get the right data into the workspace.

In [None]:
!git checkout 059ce205b4be748d31fab6b18aa4d5028ebc48c5
!dvc checkout

In [None]:
# Read the restored features and labels back from the two .npy files.
X = np.load("X_data.npy")
Y = np.load("Y_data.npy")

In [None]:
# Sanity check: shapes of the arrays just loaded from disk.
X.shape, Y.shape

In [None]:
# Re-run the perceptron training on the restored data with the same hyperparameters (100 epochs, lr 0.01).
w1,b1,loss1 = p.percep_train(X,Y,epochs=100,lr=0.01)

In [None]:
# Predict on the restored features X. The lowercase x still holds the later
# 200-sample dataset and does not match the reloaded labels Y.
class_pred = p.predict(X,w1,b1)
print('Accuracy= ',p.accuracy(Y,class_pred) * 100,'%')