### Import Libraries

In [1]:
import pickle
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, cross_val_score

import pandas as pd
import datetime

### Code

The following functions are used to build the binary classifier:<br/>
1. **transform_data** - Separates the features from the label, updates a saved MinMaxScaler with the new data (creating the scaler if none exists yet), scales the features and saves the scaler again.
2. **train_model** - Performs a train/test split, fits the given classifier and returns its evaluation metrics.
3. **scale_data** - Scales the given input with the previously saved scaler.

In [2]:
def transform_data(data, scaler_name):
    """Scale the feature columns of ``data`` and persist the updated scaler.

    The scaler is read from ``./<scaler_name>.pkl``; when that file does not
    exist a fresh ``MinMaxScaler`` is created instead. The scaler is then
    updated with ``partial_fit`` on the features, used to transform them, and
    written back to the same pickle file.

    Parameters
    ----------
    data : DataFrame containing a ``"label"`` column; every other column is
        treated as a feature.
    scaler_name : str
        Base name (without extension) of the pickle file holding the scaler.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` is the output of ``scaler.transform`` on the
        feature columns and ``y`` is the ``"label"`` column.
    """
    features = data.drop(columns=["label"])
    labels = data["label"].copy()

    pickle_path = "./" + scaler_name + ".pkl"

    # NOTE(review): pickle.load executes whatever the file contains; only
    # point scaler_name at pickle files produced by this notebook.
    try:
        with open(pickle_path, "rb") as source:
            fitted_scaler = pickle.load(source)
    except FileNotFoundError:
        print("Scaler Not Found! Will create a new one.")
        fitted_scaler = MinMaxScaler()

    # partial_fit extends the stored min/max with this batch instead of
    # refitting from scratch, so the range accumulates across calls.
    fitted_scaler.partial_fit(features)
    scaled_features = fitted_scaler.transform(features)

    observed_range = {
        "min": fitted_scaler.data_min_,
        "max": fitted_scaler.data_max_,
    }

    with open(pickle_path, "wb") as sink:
        print('Saving Scaler...', observed_range)
        pickle.dump(fitted_scaler, sink)

    return scaled_features, labels

In [3]:
def train_model(X, y, model):
    """Fit ``model`` on an 80/20 split of ``(X, y)`` and report its metrics.

    ``model`` is fitted in place on the training part of the split; it is
    not returned. Cross-validation is run separately on the full ``X``/``y``
    with 10 folds.

    Returns
    -------
    dict
        ``"accuracy"`` and ``"confusion_matrix"`` computed on the held-out
        20 %, ``"class_report"`` as a dict for the same predictions, and
        ``"cv_score"`` as returned by ``cross_val_score``.
    """
    # random_state is fixed so every call produces the same partition.
    split = train_test_split(X, y, test_size=0.2, random_state=1)
    X_train, X_test, y_train, y_test = split

    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    results = {}
    results["accuracy"] = accuracy_score(y_test, predictions)
    results["cv_score"] = cross_val_score(model, X, y, cv=10)
    results["class_report"] = classification_report(
        y_test, predictions, output_dict=True
    )
    results["confusion_matrix"] = confusion_matrix(y_test, predictions)
    return results

In [4]:
def scale_data(scaler_name,datadf):
    """Transform ``datadf`` with the scaler stored in ``./<scaler_name>.pkl``.

    The scaler is only loaded and applied; it is neither refitted nor saved.
    Raises ``FileNotFoundError`` if the pickle file does not exist.
    """
    pickle_path = "./" + scaler_name + ".pkl"
    # NOTE(review): pickle.load runs whatever the file contains; only load
    # scalers written by this notebook.
    with open(pickle_path, "rb") as source:
        fitted_scaler = pickle.load(source)
    return fitted_scaler.transform(datadf)

---

### Training and Tracking  - KNN

Train the model on each sample file individually, then test a logged model on the test data.

In [5]:
import mlflow
from mlflow.models.signature import infer_signature

# Address of the MLflow tracking server and the experiment all runs go to.
# set_experiment issues an API request to this server, so the server has to
# be running before this cell is executed.
TRACKING_URI = "http://127.0.0.1:5000/"
EXPERIMENT_NAME = "Red-Blue-Experiment"

mlflow.set_tracking_uri(TRACKING_URI)
mlflow.set_experiment(EXPERIMENT_NAME)

MlflowException: API request to http://127.0.0.1:5000/api/2.0/mlflow/experiments/get-by-name failed with exception HTTPConnectionPool(host='127.0.0.1', port=5000): Max retries exceeded with url: /api/2.0/mlflow/experiments/get-by-name?experiment_name=Red-Blue-Experiment (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x000002EAB70611F0>: Failed to establish a new connection: [WinError 10061] No connection could be made because the target machine actively refused it'))

In [6]:
# Paths of the ten training samples: ./data/sample1.csv ... ./data/sample10.csv
datas = [f"./data/sample{sample_no}.csv" for sample_no in range(1, 11)]
print(datas)

['./data/sample1.csv', './data/sample2.csv', './data/sample3.csv', './data/sample4.csv', './data/sample5.csv', './data/sample6.csv', './data/sample7.csv', './data/sample8.csv', './data/sample9.csv', './data/sample10.csv']


In [7]:
# Hyperparameters are named once so the classifier and the values logged to
# MLflow cannot drift apart. The dictionary keys are the parameter names
# that appear in the tracking UI.
KNN_NEIGHBORS = 7
KNN_LEAF_SIZE = 40

model = KNeighborsClassifier(n_neighbors=KNN_NEIGHBORS, leaf_size=KNN_LEAF_SIZE)
params = {"neighbours": KNN_NEIGHBORS, "leaf_size": KNN_LEAF_SIZE}

In [8]:
# Train the KNN model on every sample file and record one MLflow run per file.
#
# NOTE(review): transform_data() keeps widening the shared "scaler-knn"
# scaler with each file (see the min/max printed per iteration), so models
# logged by earlier iterations were fitted on features scaled with a narrower
# range than the scaler left on disk at the end. Confirm this is intended
# before scoring an early run with the final scaler.
#
# enumerate starts at 1 so that the "data" tag matches the file being read
# (sample1.csv -> "sample1"); counting from 0 labelled every run one too low.
for i, data in enumerate(datas, start=1):
    print("Training on: ",data)

    df = pd.read_csv(data)
    X, y = transform_data(df,"scaler-knn")

    tag = {"data":"sample"+str(i), "model": "KNN"}
    runname = "knn-test-run-" + str(datetime.datetime.now()).replace(" ","T")
    # The context manager ends the run on exit, so no explicit
    # mlflow.end_run() is needed afterwards.
    with mlflow.start_run(run_name=runname) as run:
        mlflow.set_tags(tag)                                    # Tags to help in tracking

        metrics = train_model(X, y, model)                      # Training the model
        mlflow.log_params(params)                               # Log params/hyperparameters used in experiment

        # Average over however many folds were actually scored instead of
        # assuming there are exactly 10.
        mlflow.log_metric("Avg CV", float(metrics["cv_score"].mean()))
        mlflow.log_metric("Accuracy", metrics["accuracy"])

        signature = infer_signature(X, model.predict(X))
        mlflow.sklearn.log_model(model, artifact_path="models", signature=signature) # Log model created
print("Training Complete.")

Training on:  ./data/sample1.csv
Scaler Not Found! Will create a new one.
Saving Scaler... {'min': array([-14.11956939, -10.66666815]), 'max': array([13.06496848,  8.44817363])}
Training on:  ./data/sample2.csv
Saving Scaler... {'min': array([-15.65758148, -12.20721314]), 'max': array([13.06496848,  8.87081208])}
Training on:  ./data/sample3.csv
Saving Scaler... {'min': array([-15.65758148, -12.20721314]), 'max': array([13.06496848, 14.50541762])}
Training on:  ./data/sample4.csv
Saving Scaler... {'min': array([-16.26008744, -12.20721314]), 'max': array([13.06496848, 14.95510978])}
Training on:  ./data/sample5.csv
Saving Scaler... {'min': array([-16.26008744, -12.20721314]), 'max': array([13.06496848, 14.95510978])}
Training on:  ./data/sample6.csv
Saving Scaler... {'min': array([-16.26008744, -12.20721314]), 'max': array([13.06496848, 14.95510978])}
Training on:  ./data/sample7.csv
Saving Scaler... {'min': array([-16.26008744, -12.20721314]), 'max': array([13.06496848, 14.95510978])}


### Test Prediction

In [9]:
# Load the test data and keep the first ten rows, without the label column,
# as input for the model. The slice is built without inplace mutation so the
# cell can be re-run safely.
testdata = pd.read_csv("./data/testdata.csv")
testdata1 = testdata.iloc[:10].drop(columns=["label"])

In [10]:
# Show the unlabeled rows that will be scaled and passed to the model.
testdata1

Unnamed: 0,Coord_X,Coord_Y
0,-12.118184,-4.776587
1,-9.893147,0.342965
2,-10.483853,-9.362718
3,-9.926457,-5.337223
4,6.855357,9.917814
5,-7.032794,-2.027453
6,-8.412141,-3.723905
7,-10.343032,-5.507503
8,-11.132471,-3.226087
9,9.028458,4.500849


In [11]:
# Same ten rows with their true labels, for comparison with the predictions.
testdata.head(10)

Unnamed: 0,Coord_X,Coord_Y,label
0,-12.118184,-4.776587,Blue
1,-9.893147,0.342965,Blue
2,-10.483853,-9.362718,Blue
3,-9.926457,-5.337223,Blue
4,6.855357,9.917814,Red
5,-7.032794,-2.027453,Blue
6,-8.412141,-3.723905,Blue
7,-10.343032,-5.507503,Blue
8,-11.132471,-3.226087,Blue
9,9.028458,4.500849,Red


In [12]:
# Apply the scaler saved during KNN training to the unlabeled test rows.
scaleddata = scale_data(scaler_name="scaler-knn", datadf=testdata1)

In [13]:
# Show the scaled feature array returned by scale_data.
scaleddata

array([[0.14124111, 0.33147147],
       [0.21711604, 0.50240871],
       [0.19697265, 0.17834464],
       [0.21598018, 0.31275232],
       [0.78824895, 0.82210437],
       [0.31465563, 0.42326257],
       [0.26761915, 0.36661955],
       [0.20177475, 0.30706681],
       [0.17485446, 0.38324128],
       [0.86235285, 0.64123676]])

In [18]:
# NOTE(review): the run ID is hard-coded and only exists on the tracking
# server this notebook was originally run against; replace it with the ID of
# a run produced by the KNN training loop when re-running.
KNN_RUN_ID = "ede4cc6ace71468ab2e881dc14d25f6b"

logged_model = f"runs:/{KNN_RUN_ID}/models"
loaded_model = mlflow.pyfunc.load_model(logged_model)
loaded_model.predict(scaleddata)

array(['Blue', 'Blue', 'Blue', 'Blue', 'Red', 'Blue', 'Blue', 'Blue',
       'Blue', 'Red'], dtype=object)

---

### Training and Tracking - Random Forest Classifier

In [14]:
from sklearn.ensemble import RandomForestClassifier

In [15]:
# A random forest is stochastic; without a fixed seed every execution builds
# a different forest and the logged metrics cannot be reproduced. The seed is
# logged with the other hyperparameters so each run records it.
RFC_RANDOM_STATE = 1

model = RandomForestClassifier(
    n_estimators=250, max_depth=7, random_state=RFC_RANDOM_STATE
)
params = {"estimators":250,"max_depth":7, "random_state": RFC_RANDOM_STATE}

In [17]:
# Train the random forest on every sample file and record one MLflow run per
# file.
#
# enumerate starts at 1 so that the "data" tag and the run name carry the
# number of the file being read (sample1.csv -> "sample1"); counting from 0
# labelled every run one too low.
for i, data in enumerate(datas, start=1):
    print("Training on: ",data)
    df = pd.read_csv(data)
    # Each call also updates and re-saves the shared "scaler-rfc" scaler.
    X, y = transform_data(df,"scaler-rfc")
    tag = {"data":"sample"+str(i), "model": "RandomForestClassifier", }
    # The context manager ends the run on exit, so no explicit
    # mlflow.end_run() is needed afterwards.
    with mlflow.start_run(run_name="random-forest-test-run"+str(i)):
        mlflow.set_tags(tag)
        metrics = train_model(X, y, model)
        mlflow.log_params(params)

        # Average over however many folds were actually scored instead of
        # assuming there are exactly 10.
        mlflow.log_metric("Avg CV", float(metrics["cv_score"].mean()))
        mlflow.log_metric("Accuracy", metrics["accuracy"])
        mlflow.sklearn.log_model(model, artifact_path="models")

Training on:  ./data/sample1.csv
Saving Scaler... {'min': array([-14.11956939, -10.66666815]), 'max': array([13.06496848,  8.44817363])}
Training on:  ./data/sample2.csv
Saving Scaler... {'min': array([-15.65758148, -12.20721314]), 'max': array([13.06496848,  8.87081208])}
Training on:  ./data/sample3.csv
Saving Scaler... {'min': array([-15.65758148, -12.20721314]), 'max': array([13.06496848, 14.50541762])}
Training on:  ./data/sample4.csv
Saving Scaler... {'min': array([-16.26008744, -12.20721314]), 'max': array([13.06496848, 14.95510978])}
Training on:  ./data/sample5.csv
Saving Scaler... {'min': array([-16.26008744, -12.20721314]), 'max': array([13.06496848, 14.95510978])}
Training on:  ./data/sample6.csv
Saving Scaler... {'min': array([-16.26008744, -12.20721314]), 'max': array([13.06496848, 14.95510978])}
Training on:  ./data/sample7.csv
Saving Scaler... {'min': array([-16.26008744, -12.20721314]), 'max': array([13.06496848, 14.95510978])}
Training on:  ./data/sample8.csv
Saving S

### Test Prediction

In [None]:
# Load the test data and keep rows 30-39, without the label column, as input
# for the model.
# NOTE(review): this reads from ./data-acc/ while the training samples and the
# KNN test cell read from ./data/ - confirm the directory is intended.
testdata = pd.read_csv("./data-acc/testdata.csv")
testdata1 = testdata.iloc[30:40].drop(columns=["label"])

In [None]:
# Show the unlabeled rows that will be scaled and passed to the model.
testdata1

In [None]:
# Same rows with their true labels, for comparison with the predictions.
testdata.iloc[30:40]

In [None]:
# Apply the scaler saved during random-forest training to the unlabeled rows.
scaleddata = scale_data(scaler_name="scaler-rfc", datadf=testdata1)

In [None]:
# NOTE(review): the run ID is hard-coded and only exists on the tracking
# server this notebook was originally run against; replace it with the ID of
# a run produced by the random-forest training loop when re-running.
RFC_RUN_ID = "717a5c0404fd481b9b2089914b6f14f7"

logged_model = f"runs:/{RFC_RUN_ID}/models"
loaded_model = mlflow.pyfunc.load_model(logged_model)
loaded_model.predict(scaleddata)