In [1]:
from joblib import load
import numpy as np

In [2]:
# load the model
knn = load('../app/static/model/iris_knn.joblib')

In [3]:
new_data = np.array([[5.0, 3.4, 1.5, 0.2], [7.8, 3.1 , 6.2, 2.4], [7., 3.1, 5.5, 2.2]])

In [4]:
type(new_data)

numpy.ndarray

In [5]:
isinstance(new_data, np.ndarray)

True

In [6]:
one_pred = np.array([[7.8, 3.1 , 6.2, 2.4]])

In [7]:
new_data.shape

(3, 4)

In [8]:
new_data.shape[1]

4

In [9]:
one_pred.shape

(1, 4)

In [10]:
np.array([7.8, 3.1 , 6.2, 2.4]).shape == (4,)

True

In [11]:
knn.predict(one_pred)

array([2])

In [12]:
knn.predict(np.array([7.8, 3.1 , 6.2, 2.4]).reshape(1, -1))

array([2])

In [13]:
target_names = ['setosa', 'versicolor', 'virginica']

In [14]:
[target_names[i] for i in knn.predict(np.array([7.8, 3.1 , 6.2, 2.4]).reshape(1, -1))]

['virginica']

In [15]:
[target_names[i] for i in knn.predict(one_pred)]

['virginica']

In [16]:
[target_names[i] for i in knn.predict(new_data)]

['setosa', 'virginica', 'virginica']

In [17]:
import os
from pymongo import MongoClient
from dotenv import load_dotenv

In [18]:
load_dotenv(os.getenv('DOTENV'))

True

In [19]:
client = MongoClient(os.getenv('MONGO_URI'))
db = client['pocMLModelMonitoring']
col = db['iris_knn']

In [20]:
raw_predictions = knn.predict(new_data)
target_predictions = [target_names[i] for i in raw_predictions]

mongo_write_data = [
    {
        'data': list(new_data[idx]),
        'prediction': value
    } for idx, value in enumerate(target_predictions)
]

In [21]:
mongo_write_data

[{'data': [5.0, 3.4, 1.5, 0.2], 'prediction': 'setosa'},
 {'data': [7.8, 3.1, 6.2, 2.4], 'prediction': 'virginica'},
 {'data': [7.0, 3.1, 5.5, 2.2], 'prediction': 'virginica'}]

In [22]:
result = col.insert_many(mongo_write_data)
inserted_ids = result.inserted_ids

In [23]:
inserted_ids

[ObjectId('60b772c94bc6858738fedd2e'),
 ObjectId('60b772c94bc6858738fedd2f'),
 ObjectId('60b772c94bc6858738fedd30')]

In [24]:
docs = col.find({'_id': {'$in': inserted_ids}})
docs = [doc for doc in docs]

In [25]:
docs

[{'_id': ObjectId('60b772c94bc6858738fedd2e'),
  'data': [5.0, 3.4, 1.5, 0.2],
  'prediction': 'setosa'},
 {'_id': ObjectId('60b772c94bc6858738fedd2f'),
  'data': [7.8, 3.1, 6.2, 2.4],
  'prediction': 'virginica'},
 {'_id': ObjectId('60b772c94bc6858738fedd30'),
  'data': [7.0, 3.1, 5.5, 2.2],
  'prediction': 'virginica'}]

In [26]:
ids = ['60ad62d59ca321faf69e1aba', '60ad62d59ca321faf69e1abb', '60ad62d59ca321faf69e1abc']

In [27]:
docs = col.find({'_id': {'$in': ids}})
docs = [doc for doc in docs]

In [28]:
docs

[]

In [29]:
from bson.objectid import ObjectId

In [30]:
ids = [ObjectId(i) for i in ids]

In [31]:
docs = col.find({'_id': {'$in': ids}})
docs = [doc for doc in docs]

In [32]:
docs

[{'_id': ObjectId('60ad62d59ca321faf69e1aba'),
  'data': [5.0, 3.4, 1.5, 0.2],
  'prediction': 'setosa',
  'label': 'setosa',
  'prediction_timestamp': datetime.datetime(2020, 1, 1, 0, 0)},
 {'_id': ObjectId('60ad62d59ca321faf69e1abb'),
  'data': [7.8, 3.1, 6.2, 2.4],
  'prediction': 'virginica',
  'label': 'virginica',
  'prediction_timestamp': datetime.datetime(2020, 1, 1, 0, 0)},
 {'_id': ObjectId('60ad62d59ca321faf69e1abc'),
  'data': [7.0, 3.1, 5.5, 2.2],
  'prediction': 'virginica',
  'label': 'virginica',
  'prediction_timestamp': datetime.datetime(2020, 1, 1, 0, 0)}]

In [33]:
help(docs[0]['_id'])

Help on ObjectId in module bson.objectid object:

class ObjectId(builtins.object)
 |  ObjectId(oid=None)
 |  
 |  A MongoDB ObjectId.
 |  
 |  Methods defined here:
 |  
 |  __eq__(self, other)
 |      Return self==value.
 |  
 |  __ge__(self, other)
 |      Return self>=value.
 |  
 |  __getstate__(self)
 |      return value of object for pickling.
 |      needed explicitly because __slots__() defined.
 |  
 |  __gt__(self, other)
 |      Return self>value.
 |  
 |  __hash__(self)
 |      Get a hash value for this :class:`ObjectId`.
 |  
 |  __init__(self, oid=None)
 |      Initialize a new ObjectId.
 |      
 |      An ObjectId is a 12-byte unique identifier consisting of:
 |      
 |        - a 4-byte value representing the seconds since the Unix epoch,
 |        - a 5-byte random value,
 |        - a 3-byte counter, starting with a random value.
 |      
 |      By default, ``ObjectId()`` creates a new unique identifier. The
 |      optional parameter `oid` can be an :class:`Object

In [34]:
str(docs[0]['_id'])

'60ad62d59ca321faf69e1aba'

In [35]:
for doc in docs:
    doc['_id'] = str(doc['_id'])

In [36]:
docs

[{'_id': '60ad62d59ca321faf69e1aba',
  'data': [5.0, 3.4, 1.5, 0.2],
  'prediction': 'setosa',
  'label': 'setosa',
  'prediction_timestamp': datetime.datetime(2020, 1, 1, 0, 0)},
 {'_id': '60ad62d59ca321faf69e1abb',
  'data': [7.8, 3.1, 6.2, 2.4],
  'prediction': 'virginica',
  'label': 'virginica',
  'prediction_timestamp': datetime.datetime(2020, 1, 1, 0, 0)},
 {'_id': '60ad62d59ca321faf69e1abc',
  'data': [7.0, 3.1, 5.5, 2.2],
  'prediction': 'virginica',
  'label': 'virginica',
  'prediction_timestamp': datetime.datetime(2020, 1, 1, 0, 0)}]

In [37]:
from copy import deepcopy

In [38]:
data = deepcopy(docs)

In [39]:
data

[{'_id': '60ad62d59ca321faf69e1aba',
  'data': [5.0, 3.4, 1.5, 0.2],
  'prediction': 'setosa',
  'label': 'setosa',
  'prediction_timestamp': datetime.datetime(2020, 1, 1, 0, 0)},
 {'_id': '60ad62d59ca321faf69e1abb',
  'data': [7.8, 3.1, 6.2, 2.4],
  'prediction': 'virginica',
  'label': 'virginica',
  'prediction_timestamp': datetime.datetime(2020, 1, 1, 0, 0)},
 {'_id': '60ad62d59ca321faf69e1abc',
  'data': [7.0, 3.1, 5.5, 2.2],
  'prediction': 'virginica',
  'label': 'virginica',
  'prediction_timestamp': datetime.datetime(2020, 1, 1, 0, 0)}]

In [40]:
data = [{'_id': i['_id']} for i in data]

In [41]:
data

[{'_id': '60ad62d59ca321faf69e1aba'},
 {'_id': '60ad62d59ca321faf69e1abb'},
 {'_id': '60ad62d59ca321faf69e1abc'}]

In [42]:
data[0]['label'] = 'setosa'

In [43]:
data[1]['label'] = 'virginica'

In [44]:
data[2]['label'] = 'virginica'

In [45]:
data

[{'_id': '60ad62d59ca321faf69e1aba', 'label': 'setosa'},
 {'_id': '60ad62d59ca321faf69e1abb', 'label': 'virginica'},
 {'_id': '60ad62d59ca321faf69e1abc', 'label': 'virginica'}]

In [46]:
updated_ids = []
invalid_ids = []

In [47]:
for record in data:
    result = col.update_one({'_id': ObjectId(record['_id'])}, {'$set': {'label': record['label']}})
    if result.matched_count != 1:
        invalid_ids.append(record['_id'])
    else:
        updated_ids.append(record['_id'])

In [48]:
if invalid_ids:
    label_result = {'result': f'only {len(updated_ids)} labeled. invalid ids supplied: {[i for i in invalid_ids]}'}
else:
    label_result = {'result': f'{len(updated_ids)} records labeled'}

In [49]:
label_result

{'result': '3 records labeled'}

# Simulate Data and Predict

In [50]:
from sklearn import datasets
import pandas as pd

In [51]:
iris = datasets.load_iris()
data = iris['data']
target = iris['target']
target_names = iris['target_names']
feature_names = iris['feature_names']

In [52]:
df = pd.DataFrame(data, columns=feature_names)
df['species'] = pd.Series(target).apply(lambda x: target_names[x])
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [53]:
def simulate_iris_data(df):
    """Take a dataframe of data from the iris dataset. Use labeled data to simulate additional data."""
    
    sim_data = pd.DataFrame(columns=feature_names+['species'])
    
    for species in target_names:
        dff = deepcopy(df[df['species'] == species])
        sepal_length_mean = dff['sepal length (cm)'].mean()
        sepal_width_mean = dff['sepal width (cm)'].mean()
        petal_length_mean = dff['petal length (cm)'].mean()
        petal_width_mean = dff['petal width (cm)'].mean()
        
        sepal_length_std = dff['sepal length (cm)'].std()
        sepal_width_std = dff['sepal width (cm)'].std()
        petal_length_std = dff['petal length (cm)'].std()
        petal_width_std = dff['petal width (cm)'].std()
        
        sepal_length_dist = np.random.normal(sepal_length_mean, sepal_length_std, 10000)
        sepal_width_dist = np.random.normal(sepal_width_mean, sepal_width_std, 10000)
        petal_length_dist = np.random.normal(petal_length_mean, petal_length_std, 10000)
        petal_width_dist = np.random.normal(petal_width_mean, petal_width_std, 10000)
        
        tdf = pd.DataFrame(
            {
                'sepal length (cm)': sepal_length_dist,
                'sepal width (cm)': sepal_width_dist,
                'petal length (cm)': petal_length_dist,
                'petal width (cm)': petal_width_dist
            }
        )
        
        tdf['species'] = species
        
        sim_data = pd.concat([sim_data, tdf], ignore_index=True)
        
    return sim_data

In [54]:
sim_data = simulate_iris_data(df)

In [55]:
sim_data.shape

(30000, 5)

In [56]:
sim_data.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species
0,5.194198,3.215587,1.377881,0.317558,setosa
1,4.631614,3.27797,1.621401,0.261498,setosa
2,5.225635,2.906936,1.662291,0.081985,setosa
3,5.117965,3.314229,1.626484,0.303116,setosa
4,5.700167,3.055684,1.587151,0.32975,setosa


In [57]:
import plotly.graph_objects as go

In [58]:
fig = go.Figure()

for species in target_names:
    tdf = sim_data[sim_data['species'] == species]
    fig.add_trace(
        go.Scatter(
            x=tdf['sepal length (cm)'],
            y=tdf['petal width (cm)'],
            mode='markers',
            name=species
        )
    )

fig.show()

In [59]:
sim_data.loc[:, feature_names].head().to_numpy()

array([[5.19419758, 3.2155865 , 1.37788096, 0.31755802],
       [4.63161411, 3.27796963, 1.62140058, 0.26149811],
       [5.22563543, 2.90693649, 1.66229126, 0.0819846 ],
       [5.11796508, 3.31422933, 1.62648425, 0.30311619],
       [5.7001673 , 3.05568362, 1.58715079, 0.32975005]])

In [60]:
sim_data.head().to_numpy()

array([[5.1941975787022905, 3.2155865005845117, 1.3778809560307121,
        0.31755802026777, 'setosa'],
       [4.6316141090193845, 3.2779696250847787, 1.6214005811013126,
        0.2614981052052524, 'setosa'],
       [5.225635434667655, 2.906936491993764, 1.6622912567021029,
        0.08198460173621322, 'setosa'],
       [5.117965080644364, 3.3142293266902643, 1.6264842481007853,
        0.3031161927024961, 'setosa'],
       [5.700167304754544, 3.055683621041775, 1.5871507850996553,
        0.3297500477349448, 'setosa']], dtype=object)

In [61]:
def generate_predictions(df):
    """This function will add the predictions column to an iris dataframe as long
    as the column names are correct for the input df."""
    
    data = sim_data.loc[:, feature_names].to_numpy()
    df['prediction'] = [target_names[value] for value in knn.predict(data)]
    
    return df

In [62]:
sim_data = generate_predictions(sim_data)

In [63]:
sim_data.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species,prediction
0,5.194198,3.215587,1.377881,0.317558,setosa,setosa
1,4.631614,3.27797,1.621401,0.261498,setosa,setosa
2,5.225635,2.906936,1.662291,0.081985,setosa,setosa
3,5.117965,3.314229,1.626484,0.303116,setosa,setosa
4,5.700167,3.055684,1.587151,0.32975,setosa,setosa


In [72]:
from datetime import datetime as dt

In [71]:
num_rows = sim_data.shape[0]

In [74]:
load_data = [{'data': list(sim_data.loc[i, :][:4].values), 'prediction': sim_data.loc[i, 'species'], 'label': sim_data.loc[i, 'prediction'], 'timestamp': dt.now()} for i in range(num_rows)]

In [75]:
load_data[0]

{'data': [5.1941975787022905,
  3.2155865005845117,
  1.3778809560307121,
  0.31755802026777],
 'prediction': 'setosa',
 'label': 'setosa',
 'timestamp': datetime.datetime(2021, 6, 2, 8, 6, 35, 130551)}

In [77]:
from pymongo import MongoClient
from dotenv import load_dotenv
import os

In [78]:
load_dotenv(os.getenv('DOTENV'))

True

In [79]:
client = MongoClient(os.getenv('MONGO_URI'))
db = client['pocMLModelMonitoring']
col = db['iris_knn']

In [80]:
result = col.insert_many(load_data)
len(result.inserted_ids)

30000

In [85]:
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, roc_auc_score

In [111]:
class_report = classification_report(sim_data['species'], sim_data['prediction'])
print(class_report)

              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00     10000
  versicolor       0.94      0.97      0.96     10000
   virginica       0.97      0.94      0.95     10000

    accuracy                           0.97     30000
   macro avg       0.97      0.97      0.97     30000
weighted avg       0.97      0.97      0.97     30000



In [130]:
conf_matrix = confusion_matrix(sim_data['species'], sim_data['prediction'], labels=['setosa', 'versicolor', 'virginica'])
print(conf_matrix)

[[10000     0     0]
 [    1  9674   325]
 [    0   575  9425]]


In [88]:
help(knn)

Help on KNeighborsClassifier in module sklearn.neighbors._classification object:

class KNeighborsClassifier(sklearn.neighbors._base.KNeighborsMixin, sklearn.base.ClassifierMixin, sklearn.neighbors._base.NeighborsBase)
 |  KNeighborsClassifier(n_neighbors=5, *, weights='uniform', algorithm='auto', leaf_size=30, p=2, metric='minkowski', metric_params=None, n_jobs=None, **kwargs)
 |  
 |  Classifier implementing the k-nearest neighbors vote.
 |  
 |  Read more in the :ref:`User Guide <classification>`.
 |  
 |  Parameters
 |  ----------
 |  n_neighbors : int, default=5
 |      Number of neighbors to use by default for :meth:`kneighbors` queries.
 |  
 |  weights : {'uniform', 'distance'} or callable, default='uniform'
 |      weight function used in prediction.  Possible values:
 |  
 |      - 'uniform' : uniform weights.  All points in each neighborhood
 |        are weighted equally.
 |      - 'distance' : weight points by the inverse of their distance.
 |        in this case, closer n

In [100]:
target_names

array(['setosa', 'versicolor', 'virginica'], dtype='<U10')

In [102]:
conf_matrix

array([[10000,     0,     0],
       [    1,  9674,   325],
       [    0,   575,  9425]], dtype=int64)

In [114]:
conf_matrix[:,0]

array([10000,     1,     0], dtype=int64)

In [108]:
tpr = {}
fpr = {}
tpr['setosa'] = conf_matrix[0][0]/np.sum(conf_matrix[0])
tpr['versicolor'] = conf_matrix[1][1]/np.sum(conf_matrix[1])
tpr['virginica'] = conf_matrix[2][2]/np.sum(conf_matrix[2])
fpr['setosa'] = np.sum(conf_matrix[0][1:])/np.sum(conf_matrix[0])
fpr['versicolor'] = (np.sum(conf_matrix[1]) - conf_matrix[1][1])/np.sum(conf_matrix[1])
fpr['virginica'] = np.sum(conf_matrix[2][:2])/np.sum(conf_matrix[2])

In [109]:
tpr

{'setosa': 1.0, 'versicolor': 0.9674, 'virginica': 0.9425}

In [110]:
fpr

{'setosa': 0.0, 'versicolor': 0.0326, 'virginica': 0.0575}

In [115]:
# precision, recall, and f1 scores
precision = {}
recall = {}
f1 = {}
precision['setosa'] = conf_matrix[0][0]/np.sum(conf_matrix[:,0])
precision['versicolor'] = conf_matrix[1][1]/np.sum(conf_matrix[:,1])
precision['virginica'] = conf_matrix[2][2]/np.sum(conf_matrix[:,2])
recall['setosa'] = tpr['setosa']
recall['versicolor'] = tpr['versicolor']
recall['virginica'] = tpr['virginica']
f1['setosa'] = 2*(precision['setosa'] * recall['setosa'])/(precision['setosa'] + recall['setosa'])
f1['versicolor'] = 2*(precision['versicolor'] * recall['versicolor'])/(precision['versicolor'] + recall['versicolor'])
f1['virginica'] = 2*(precision['virginica'] * recall['virginica'])/(precision['virginica'] + recall['virginica'])

In [116]:
precision

{'setosa': 0.9999000099990001,
 'versicolor': 0.9438969655576154,
 'virginica': 0.9666666666666667}

In [117]:
recall

{'setosa': 1.0, 'versicolor': 0.9674, 'virginica': 0.9425}

In [118]:
f1

{'setosa': 0.999950002499875,
 'versicolor': 0.9555039755049634,
 'virginica': 0.9544303797468354}

In [121]:
# accuracy
accuracy = np.sum([conf_matrix[i][i] for i in range(conf_matrix.shape[0])])/np.sum(conf_matrix)

In [122]:
accuracy

0.9699666666666666

In [123]:
sample_size = np.sum(conf_matrix)
sample_size

30000

In [127]:
class_balance = {}
for species in target_names:
    class_balance[species] = sim_data[sim_data['species'] == species]['species'].count()/sim_data['species'].count()

In [128]:
class_balance

{'setosa': 0.3333333333333333,
 'versicolor': 0.3333333333333333,
 'virginica': 0.3333333333333333}

In [132]:
def get_mongo_collection(database, collection):
    """Gets and returns a mongo collection."""

    client = MongoClient(os.getenv('MONGO_URI'))
    db = client[database]
    col = db[collection]
    return col

In [131]:
from datetime import datetime as dt

In [144]:
# put it all into a function
def load_model_performance(df):
    """This function generates model performance metrics. It assumes a dataframe is supplied with feature names,
    species (actual label) and prediction (predicted label).
    
    CONSIDER REQUIRING THE MONGO _id FOR ALL OBSERVATIONS, AND RECORDING THAT LIST IN THE MODEL PERFORMANCE
    IF SO, ONLY INCLUDE _id VALUES THAT HAVEN'T ALREADY BEEN ANALYZED
    
    ALSO LOAD CONFUSION MATRIX ALONE INTO SEPARATE COLLECTION, ALLOWING FOR AGGREGATE METRIC EVALUATION
    
    RETHINKING THIS:
    PULL DATA FROM MONGO TO DO THE METRICS
    USE THE 'prediction_timestamp' FIELD OR THE NUMBER OF DOCS (sorted by prediction_timestamp) TO GENERATE
    PERFORMANCE METRICS
    ONLY PULL RECORDS THAT HAVE LABELS FOR THE METRICS
    EITHER USE PYTHON DEPLOYED ON APPFILE OR USE PDI FOR THIS, LEANING TOWARDS PYTHON ON APPFILE, EASIER TO CODE
    
    Column names should be ["sepal length (cm)", "sepal width (cm)", "petal length (cm)",
    "petal width (cm)", "species", "prediction"].
    
    species and prediction values should be ["setosa", "versicolor", "virginica"]."""
    
    # calculate the confusion matrix
    # adding the labels ensures the correct index order for the species
    conf_matrix = confusion_matrix(df['species'], df['prediction'], labels=['setosa', 'versicolor', 'virginica'])
    
    # model accuracy
    accuracy = np.sum([conf_matrix[i][i] for i in range(conf_matrix.shape[0])])/np.sum(conf_matrix)
    
    # precision, recall, and f1 scores
    precision = {}
    recall = {}
    f1 = {}
    precision['setosa'] = conf_matrix[0][0]/np.sum(conf_matrix[:,0])
    precision['versicolor'] = conf_matrix[1][1]/np.sum(conf_matrix[:,1])
    precision['virginica'] = conf_matrix[2][2]/np.sum(conf_matrix[:,2])
    recall['setosa'] = tpr['setosa']
    recall['versicolor'] = tpr['versicolor']
    recall['virginica'] = tpr['virginica']
    f1['setosa'] = 2*(precision['setosa'] * recall['setosa'])/(precision['setosa'] + recall['setosa'])
    f1['versicolor'] = 2*(precision['versicolor'] * recall['versicolor'])/(precision['versicolor'] + recall['versicolor'])
    f1['virginica'] = 2*(precision['virginica'] * recall['virginica'])/(precision['virginica'] + recall['virginica'])
    
    sample_size = int(np.sum(conf_matrix))
    
    class_balance = {}
    for species in target_names:
        class_balance[species] = sim_data[sim_data['species'] == species]['species'].count()/sim_data['species'].count()
        
    # get the mongo collection
    col = get_mongo_collection(database='pocMLModelMonitoring', collection='irisKnnPerformance')
    
    # synthesize the observation    
    document = {}
    document['timestamp'] = dt.now()
    document['model_accuracy'] = accuracy
    document['precision'] = precision
    document['recall'] = recall
    document['F1'] = f1
    document['class_balance'] = class_balance
    document['sample_size'] = sample_size
    
    # write the data
    result = col.insert_one(document)
    
    # return the result object
    return result

In [145]:
result = load_model_performance(sim_data)
print(result.inserted_id)

60ae7f5891302ff675141df2


In [146]:
col = get_mongo_collection(database='pocMLModelMonitoring', collection='irisKnnPerformance')
doc = col.find_one({'sample_size': 40000})
doc

In [150]:
pred_col = get_mongo_collection(database='pocMLModelMonitoring', collection='iris_knn')
pred_docs = pred_col.find({'label': {'$exists': True, '$ne': None}}, {'prediction': 1, 'label': 1})
pred_docs = [doc for doc in pred_docs]

In [152]:
pd.DataFrame(pred_docs)

Unnamed: 0,_id,prediction,label
0,60ad62d59ca321faf69e1aba,setosa,setosa
1,60ad62d59ca321faf69e1abb,virginica,virginica
2,60ad62d59ca321faf69e1abc,virginica,virginica


In [177]:
import pymongo

In [149]:
from datetime import date
from datetime import datetime as dt

In [153]:
import math

In [197]:
# edit the function to query labeled records from mongo for performance metrics
def get_model_performance(mode='all', start_date=None, end_date=None, pct=None, count=None):
    """Query labeled data only from the iris_knn predictions collection in MongoDB. 
    
    mode: str: either 'all', 'dates', 'pct', or 'count'. if mode=='dates', specify a date range using start_date
               and end date.
    start_date: datetime: required if mode=='dates' else not used (inclusive of start_date)
    end_date: datetime: required if mode=='dates' else not used (inclusive of end_date)
    pct: float: decimal percent for the percent of labeled records to use for model performance metrics
    count: int: number of labeled documents to use for model performance metrics
    """
    
    if mode == 'all':
        params = {}
        criteria = 'all records'
    elif mode == 'dates':
        if not start_date or not end_date:
            raise ValueError('start_date and end_date are required when mode="dates"')
        if not isinstance(start_date, str) or not isinstance(end_date, str):
            raise TypeError('start_date and end_date must be supplied as strings in format "YYYY-MM-DD"')
        start_date = dt.strptime(start_date, '%Y-%m-%d')
        end_date = dt.strptime(end_date, '%Y-%m-%d')
        params = {
            '$and': 
            [
                {'prediction_timestamp': {'$gte': start_date}}, 
                {'prediction_timestamp': {'$lte': end_date}}
            ]
        }
        criteria = {'start_date': start_date, 'end_date': end_date}
    elif mode == 'pct':
        if not pct:
            raise ValueError('pct is required when mode="pct"')
        if not isinstance(pct, int) and not isinstance(pct, float):
            raise TypeError('pct must be a decimal number between 0-1 (30% should be supplied as 0.3)')
        if 1 < pct < 0:
            raise ValueError('pct must be a decimal number between 0-1 (30% should be supplied as 0.3)')
        params = {}
        criteria = {'pct': pct}
    elif mode == 'count':
        if not count:
            raise ValueError('count is required when mode="count"')
        if not isinstance(count, int):
            raise TypeError('count must be supplied as an integer')
        params = {}
        criteria = {'count': count}
    else:
        raise ValueError('mode must be either "all", "dates", "pct", or "count"')
        
    params['label'] = {'$exists': True, '$ne': None}
    
    pred_col = get_mongo_collection(database='pocMLModelMonitoring', collection='iris_knn')
    
    pred_docs = pred_col.find(params, {'prediction': 1, 'label': 1}).sort('prediction_timestamp', pymongo.DESCENDING)
    pred_docs = [doc for doc in pred_docs]
    
    if mode == 'pct':
        num_docs = math.ceil(len(pred_docs) * pct)
        pred_docs = pred_docs[:num_docs]
    elif mode == 'count':
        if count < len(pred_docs):
            pred_docs = pred_docs[:count]
        else:
            # if count is greater than the number of docs, include this in the criteria
            criteria = {'count': count, 'found': len(pred_docs)}
    
    if not pred_docs:
        # exit the function if nothing is found
        return
    
    dff = pd.DataFrame(pred_docs)
    
    target_names = ['setosa', 'versicolor', 'virginica']
    
    # calculate the confusion matrix
    # adding the labels ensures the correct index order for the species
    conf_matrix = confusion_matrix(dff['label'], dff['prediction'], labels=target_names)
    
    # model accuracy
    accuracy = np.sum([conf_matrix[i][i] for i in range(conf_matrix.shape[0])])/np.sum(conf_matrix)
    
    # precision, recall, and f1 scores
    precision = {}
    recall = {}
    f1 = {}
    
    targets_found = sorted(list(dff['label'].unique()))
    
    if 'setosa' in targets_found:
        precision['setosa'] = conf_matrix[0][0]/np.sum(conf_matrix[:,0])
        recall['setosa'] = conf_matrix[0][0]/np.sum(conf_matrix[0])
        f1['setosa'] = 2*(precision['setosa'] * recall['setosa'])/(precision['setosa'] + recall['setosa'])
    else:
        precision['setosa'] = None
        recall['setosa'] = None
        f1['setosa'] = None
        
    if 'versicolor' in targets_found:
        precision['versicolor'] = conf_matrix[1][1]/np.sum(conf_matrix[:,1])
        recall['versicolor'] = conf_matrix[1][1]/np.sum(conf_matrix[1])
        f1['versicolor'] = 2*(precision['versicolor'] * recall['versicolor'])/(precision['versicolor'] + recall['versicolor'])
    else:
        precision['versicolor'] = None
        recall['versicolor'] = None
        f1['versicolor'] = None
    
    if 'virginica' in targets_found:
        precision['virginica'] = conf_matrix[2][2]/np.sum(conf_matrix[:,2])
        recall['virginica'] = conf_matrix[2][2]/np.sum(conf_matrix[2])
        f1['virginica'] = 2*(precision['virginica'] * recall['virginica'])/(precision['virginica'] + recall['virginica'])
    else:
        precision['virginica'] = None
        recall['virginica'] = None
        f1['virginica'] = None
    
    sample_size = int(np.sum(conf_matrix))
    
    class_balance = {}
    for species in target_names:
        if species not in targets_found:
            class_balance[species] = 0.0
        else:
            class_balance[species] = dff['label'].value_counts()[species]/dff.shape[0]
        
#     # get the mongo collection
#     col = get_mongo_collection(database='pocMLModelMonitoring', collection='irisKnnPerformance')
    
    # synthesize the observation    
    document = {}
    document['performance_timestamp'] = dt.now()
    document['measurement_mode'] = mode
    document['query_criteria'] = criteria
    document['model_accuracy'] = accuracy
    document['precision'] = precision
    document['recall'] = recall
    document['F1'] = f1
    document['class_balance'] = class_balance
    document['sample_size'] = sample_size
    
#     # write the data
#     result = col.insert_one(document)
    
    # return the result object
    # return result
    
    return document

In [198]:
document = get_model_performance(mode='all', start_date=None, end_date=None, pct=None, count=None)
document

{'performance_timestamp': datetime.datetime(2021, 5, 26, 16, 46, 32, 97315),
 'measurement_mode': 'all',
 'query_criteria': 'all records',
 'model_accuracy': 1.0,
 'precision': {'setosa': 1.0, 'versicolor': None, 'virginica': 1.0},
 'recall': {'setosa': 1.0, 'versicolor': None, 'virginica': 1.0},
 'F1': {'setosa': 1.0, 'versicolor': None, 'virginica': 1.0},
 'class_balance': {'setosa': 0.3333333333333333,
  'versicolor': 0.0,
  'virginica': 0.6666666666666666},
 'sample_size': 6}

In [199]:
document = get_model_performance(mode='pct', start_date=None, end_date=None, pct=0.8, count=None)
document

{'performance_timestamp': datetime.datetime(2021, 5, 26, 16, 46, 32, 443314),
 'measurement_mode': 'pct',
 'query_criteria': {'pct': 0.8},
 'model_accuracy': 1.0,
 'precision': {'setosa': 1.0, 'versicolor': None, 'virginica': 1.0},
 'recall': {'setosa': 1.0, 'versicolor': None, 'virginica': 1.0},
 'F1': {'setosa': 1.0, 'versicolor': None, 'virginica': 1.0},
 'class_balance': {'setosa': 0.4, 'versicolor': 0.0, 'virginica': 0.6},
 'sample_size': 5}

In [201]:
document = get_model_performance(mode='count', start_date=None, end_date=None, pct=None, count=5)
document

{'performance_timestamp': datetime.datetime(2021, 5, 26, 16, 46, 42, 12318),
 'measurement_mode': 'count',
 'query_criteria': {'count': 5},
 'model_accuracy': 1.0,
 'precision': {'setosa': 1.0, 'versicolor': None, 'virginica': 1.0},
 'recall': {'setosa': 1.0, 'versicolor': None, 'virginica': 1.0},
 'F1': {'setosa': 1.0, 'versicolor': None, 'virginica': 1.0},
 'class_balance': {'setosa': 0.4, 'versicolor': 0.0, 'virginica': 0.6},
 'sample_size': 5}