# Edit Prediction Dates
- This notebook's purpose is to take the predictions loaded into Mongo and spread out the date ranges for testing the model performance metrics with different options

In [1]:
import pymongo
import pandas as pd
from datetime import datetime as dt
from datetime import timedelta as td
import os
from dotenv import load_dotenv

In [2]:
load_dotenv(os.getenv('DOTENV'))

True

In [3]:
client = pymongo.MongoClient(os.getenv('MONGO_URI'))
db = client['pocMLModelMonitoring']
col = db['iris_knn']

In [4]:
docs = col.find({}).sort('prediction_timestamp', pymongo.ASCENDING)
docs = [doc for doc in docs]
len(docs)

30027

In [5]:
docs[0]

{'_id': ObjectId('60b772c94bc6858738fedd2e'),
 'data': [5.0, 3.4, 1.5, 0.2],
 'prediction': 'setosa'}

In [6]:
docs[-1]

{'_id': ObjectId('60b14dc372051a271b6e28d4'),
 'data': [7.0, 3.1, 5.5, 2.2],
 'prediction': 'virginica',
 'prediction_timestamp': datetime.datetime(2021, 5, 28, 16, 8, 35, 94000)}

In [11]:
no_timestamp = [doc for doc in docs if not doc.get('timestamp')]

In [12]:
len(no_timestamp)

27

In [13]:
no_timestamp[0]

{'_id': ObjectId('60b772c94bc6858738fedd2e'),
 'data': [5.0, 3.4, 1.5, 0.2],
 'prediction': 'setosa'}

In [14]:
no_timestamp[-1]

{'_id': ObjectId('60b14dc372051a271b6e28d4'),
 'data': [7.0, 3.1, 5.5, 2.2],
 'prediction': 'virginica',
 'prediction_timestamp': datetime.datetime(2021, 5, 28, 16, 8, 35, 94000)}

In [15]:
for doc in no_timestamp:
    if 'prediction_timestamp' in doc.keys():
        ts = doc['prediction_timestamp']
        del(doc['prediction_timestamp'])
        doc['timestamp'] = ts
    else:
        doc['timestamp'] = dt.now()

In [16]:
no_timestamp

[{'_id': ObjectId('60b772c94bc6858738fedd2e'),
  'data': [5.0, 3.4, 1.5, 0.2],
  'prediction': 'setosa',
  'timestamp': datetime.datetime(2021, 6, 8, 8, 21, 36, 432738)},
 {'_id': ObjectId('60b772c94bc6858738fedd2f'),
  'data': [7.8, 3.1, 6.2, 2.4],
  'prediction': 'virginica',
  'timestamp': datetime.datetime(2021, 6, 8, 8, 21, 36, 432738)},
 {'_id': ObjectId('60b772c94bc6858738fedd30'),
  'data': [7.0, 3.1, 5.5, 2.2],
  'prediction': 'virginica',
  'timestamp': datetime.datetime(2021, 6, 8, 8, 21, 36, 432738)},
 {'_id': ObjectId('60ad62d59ca321faf69e1aba'),
  'data': [5.0, 3.4, 1.5, 0.2],
  'prediction': 'setosa',
  'label': 'setosa',
  'timestamp': datetime.datetime(2020, 1, 1, 0, 0)},
 {'_id': ObjectId('60ad62d59ca321faf69e1abb'),
  'data': [7.8, 3.1, 6.2, 2.4],
  'prediction': 'virginica',
  'label': 'virginica',
  'timestamp': datetime.datetime(2020, 1, 1, 0, 0)},
 {'_id': ObjectId('60ad62d59ca321faf69e1abc'),
  'data': [7.0, 3.1, 5.5, 2.2],
  'prediction': 'virginica',
  'label'

In [19]:
modified = 0
for doc in no_timestamp:
    result = col.update_one({'_id': doc['_id']}, {'$set': {'timestamp': doc['timestamp']}})
    modified += result.modified_count
modified

27

In [20]:
result = col.update_many({'prediction_timestamp': {'$exists': 1}}, {'$unset': {'prediction_timestamp': ''}})
result.modified_count

24

In [22]:
docs = col.find({'timestamp': {'$exists': False}})
docs = [doc for doc in docs]
len(docs)

0

In [23]:
import numpy as np

In [33]:
spread = np.arange(start=0, stop=(365*5), dtype=np.dtype(int))

In [34]:
spread

array([   0,    1,    2, ..., 1822, 1823, 1824])

In [35]:
arr = np.random.choice(spread, size=30027, replace=True)

In [36]:
arr.dtype

dtype('int32')

In [37]:
start = dt.now()

In [38]:
arr = start - arr * td(days=1)

In [41]:
arr.sort()

In [42]:
arr

array([datetime.datetime(2016, 6, 10, 8, 42, 28, 358475),
       datetime.datetime(2016, 6, 10, 8, 42, 28, 358475),
       datetime.datetime(2016, 6, 10, 8, 42, 28, 358475), ...,
       datetime.datetime(2021, 6, 8, 8, 42, 28, 358475),
       datetime.datetime(2021, 6, 8, 8, 42, 28, 358475),
       datetime.datetime(2021, 6, 8, 8, 42, 28, 358475)], dtype=object)

In [43]:
len(arr)

30027

In [44]:
docs = col.find({})
docs = [doc for doc in docs]
len(docs)

30027

In [45]:
modified = 0
for idx, doc in enumerate(docs):
    result = col.update_one({'_id': doc['_id']}, {'$set': {'timestamp': arr[idx]}})
    modified += result.modified_count
modified

30027

In [46]:
query_date = '2021-01-01'
query_date = dt.strptime(query_date, '%Y-%m-%d')

In [49]:
edit_docs = col.find({'timestamp': {'$gte': query_date}})
edit_docs = [doc for doc in edit_docs]
len(edit_docs)

2697

In [50]:
edit_docs[0]

{'_id': ObjectId('60b774ee4bc6858738ff47d9'),
 'data': [4.953994340401033,
  2.381260105023531,
  5.245934967068404,
  2.374900405078157],
 'prediction': 'virginica',
 'label': 'virginica',
 'timestamp': datetime.datetime(2021, 1, 1, 8, 42, 28, 358000)}

In [51]:
edit_docs[-1]

{'_id': ObjectId('60b774ee4bc6858738ff5261'),
 'data': [7.009314553501851,
  2.665991758622333,
  5.060582608452697,
  1.9850569105876488],
 'prediction': 'virginica',
 'label': 'virginica',
 'timestamp': datetime.datetime(2021, 6, 8, 8, 42, 28, 358000)}

In [53]:
for doc in edit_docs:
    if 'label' in doc.keys():
        data = doc['data']
        if doc['label'] == 'virginica':
            for idx, val in enumerate(data):
                data[idx] += abs(np.random.normal(data[idx], (data[idx]*0.25), 1))
        elif doc['label'] == 'versicolor':
            for idx, val in enumerate(data):
                data[idx] -=  abs(np.random.normal(data[idx], (data[idx]*0.25), 1))
        else:
            for idx, val in enumerate(data):
                data[idx] += abs(np.random.normal(data[idx], (data[idx]*0.75), 1))
        try:
            del(doc['prediction'])
        except:
            pass
    else:
        continue

In [54]:
edit_docs[-1]

{'_id': ObjectId('60b774ee4bc6858738ff5261'),
 'data': [array([13.31436574]),
  array([4.78882988]),
  array([11.68193805]),
  array([3.51173616])],
 'label': 'virginica',
 'timestamp': datetime.datetime(2021, 6, 8, 8, 42, 28, 358000)}

In [58]:
for doc in edit_docs:
    doc['data'] = [float(i[0]) for i in doc['data']]

In [59]:
edit_docs[-1]

{'_id': ObjectId('60b774ee4bc6858738ff5261'),
 'data': [13.314365743453312,
  4.788829879228439,
  11.68193805412978,
  3.5117361552696185],
 'label': 'virginica',
 'timestamp': datetime.datetime(2021, 6, 8, 8, 42, 28, 358000)}

In [60]:
modified = 0
for doc in edit_docs:
    result = col.update_one({'_id': doc['_id']}, {'$set': {'data': list(doc['data'])}, '$unset': {'prediction': ''}})
    modified += result.modified_count
modified

2697

In [61]:
from joblib import load

In [62]:
knn = load('C:/users/CRTUCKER/documents/notes_and_resources/python_model_deployment/app/static/model/iris_knn.joblib')

In [67]:
np.array(edit_docs[0]['data']).shape

(4,)

In [72]:
knn.predict(np.array(edit_docs[0]['data']).reshape(1, -1))[0]

2

In [70]:
target_names = ['setosa', 'versicolor', 'virginica']

In [73]:
for doc in edit_docs:
    pred = knn.predict(np.array(doc['data']).reshape(1, -1))
    doc['prediction'] = target_names[pred[0]]

In [75]:
edit_docs[-1]

{'_id': ObjectId('60b774ee4bc6858738ff5261'),
 'data': [13.314365743453312,
  4.788829879228439,
  11.68193805412978,
  3.5117361552696185],
 'label': 'virginica',
 'timestamp': datetime.datetime(2021, 6, 8, 8, 42, 28, 358000),
 'prediction': 'virginica'}

In [76]:
modified = 0
for doc in edit_docs:
    result = col.update_one({'_id': doc['_id']}, {'$set': {'prediction': doc['prediction']}})
    modified += result.modified_count
modified

2697

## Need to redistribute (evenly) predictions based on dates
- Modify Setosa's after 1-1-2021
- Modify a few more Versicolors after 1-1-2021

In [88]:
setosas = col.find({'label': 'setosa'})
setosas = [doc for doc in setosas]

In [89]:
len(setosas)

10003

In [90]:
max([i['timestamp'] for i in setosas])

datetime.datetime(2018, 2, 9, 8, 42, 28, 358000)

In [93]:
timestamps = np.array([i['timestamp'] for i in setosas])

In [98]:
diffs = dt.now() - timestamps
min_diff = diffs.min()

In [100]:
min_diff.days

1215

In [101]:
1 % 2

1

In [102]:
4 % 2

0

In [103]:
def is_odd(x):
    rem = x % 2
    if rem == 1:
        return True

In [107]:
days = [(i.days - min_diff.days) if is_odd(i.days) else 0 for i in list(diffs)]

In [119]:
days

[0,
 0,
 0,
 608,
 608,
 608,
 608,
 608,
 608,
 608,
 608,
 608,
 608,
 608,
 608,
 608,
 608,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 606,
 606,
 606,
 606,
 606,
 606,
 606,
 606,
 606,
 606,
 606,
 606,
 606,
 606,
 606,
 606,
 606,
 606,
 606,
 606,
 606,
 606,
 606,
 606,
 606,
 606,
 606,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 604,
 604,
 604,
 604,
 604,
 604,
 604,
 604,
 604,
 604,
 604,
 604,
 604,
 604,
 604,
 604,
 604,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 602,
 602,
 602,
 602,
 602,
 602,
 602,
 602,
 602,
 602,
 602,
 602,
 602,
 602,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 600,
 600,
 600,
 600,
 600,
 600,
 600,
 600,
 600,
 600,
 600,
 600,
 600,
 600,
 600,
 600,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 598,
 598,
 598,
 598,
 598,
 598,
 598,
 598,
 598,
 598,
 598,
 598,
 598

In [122]:
for idx, delta in enumerate(days):
    timestamps[idx] += td(days=days[idx])

In [123]:
timestamps.max()

datetime.datetime(2021, 6, 9, 8, 42, 28, 358000)

In [129]:
len([i for i in timestamps if i.year==2020])

1453

In [130]:
for idx, doc in enumerate(setosas):
    doc['timestamp'] = timestamps[idx]

In [131]:
modified = 0
for doc in setosas:
    result = col.update_one({'_id': doc['_id']}, {'$set': {'timestamp': doc['timestamp']}})
    modified += result.modified_count
modified

4931

In [138]:
to_edit = col.find({'label': 'setosa', 'timestamp': {'$gte': dt.strptime('2021-01-01', '%Y-%m-%d')}})
to_edit = [doc for doc in to_edit]

In [139]:
len(to_edit)

656

In [141]:
for doc in to_edit:
    data = doc['data']
    for idx, val in enumerate(data):
        data[idx] += abs(np.random.normal(abs(data[idx]), abs(data[idx]*0.75), 1))
    try:
        del(doc['prediction'])
    except:
        pass

In [143]:
for doc in to_edit:
    doc['data'] = [i[0] for i in doc['data']]

In [144]:
to_edit[0]

{'_id': ObjectId('60b774ee4bc6858738fedd32'),
 'data': [11.882539136048447,
  9.955657080987251,
  4.657151484641796,
  1.020305552407145],
 'label': 'setosa',
 'timestamp': datetime.datetime(2021, 6, 9, 8, 42, 28, 358000)}

In [145]:
modified = 0
for doc in to_edit:
    result = col.update_one({'_id': doc['_id']}, {'$set': {'data': doc['data']}})
    modified += result.modified_count
modified

656

In [146]:
virgs = col.find({'label': 'virginica'})
virgs = [doc for doc in virgs]
len(virgs)

9723

In [147]:
timestamps = [i['timestamp'] for i in virgs]

In [149]:
timestamps = np.array(timestamps)

In [150]:
timestamps.max()

datetime.datetime(2021, 6, 8, 8, 42, 28, 358000)

In [151]:
timestamps.min()

datetime.datetime(2016, 6, 10, 8, 42, 28, 358000)

In [158]:
9723/2

4861.5

In [159]:
timestamps[4861]

datetime.datetime(2020, 8, 4, 8, 42, 28, 358000)

In [160]:
timestamps.max() - timestamps.min()

datetime.timedelta(days=1824)

In [161]:
1824/2

912.0

In [162]:
samples = np.arange(0, 1824, dtype=int, step=1)

In [164]:
adjustment = np.random.choice(samples, size=9723, replace=True)

In [172]:
adjustment.sort()
adjustment

array([   0,    0,    0, ..., 1823, 1823, 1823])

In [174]:
baseline = timestamps.min()
for idx, doc in enumerate(virgs):
    doc['timestamp'] = baseline + td(days=int(adjustment[idx]))

In [175]:
modified = 0
for doc in virgs:
    result = col.update_one({'_id': doc['_id']}, {'$set': {'timestamp': doc['timestamp']}})
    modified += result.modified_count
modified

9717

In [176]:
vers = col.find({'label': 'versicolor'})
vers = [doc for doc in vers]
len(vers)

10283

In [178]:
timestamps = np.array([i['timestamp'] for i in vers])

In [179]:
timestamps.min()

datetime.datetime(2018, 2, 9, 8, 42, 28, 358000)

In [180]:
timestamps.max()

datetime.datetime(2021, 6, 8, 8, 42, 28, 358000)

In [181]:
baseline

datetime.datetime(2016, 6, 10, 8, 42, 28, 358000)

In [182]:
adjustment = np.random.choice(samples, size=10283, replace=True)

In [183]:
for idx, doc in enumerate(vers):
    doc['timestamp'] = baseline + td(days=int(adjustment[idx]))

In [184]:
modified = 0
for doc in vers:
    result = col.update_one({'_id': doc['_id']}, {'$set': {'timestamp': doc['timestamp']}})
    modified += result.modified_count
modified

10276

In [77]:
from sklearn.metrics import confusion_matrix
import math

In [78]:
# custom date string format exception
class IncorrectDateFormat(ValueError):
    def __init__(self, date_string):
        self.date_string = date_string
        self.message = f'date string {date_string} must be in format "YYYY-MM-DD"'
        super().__init__(self.message)

In [196]:
def get_model_performance(collection, mode='all', start_date=None, end_date=None, pct=None, count=None):
    """Query labeled data only from the iris_knn predictions collection in MongoDB.

    collection: Mongo collection: the mongo collection with the model performance metrics
    mode: str: either 'all', 'dates', 'pct', or 'count'. if mode=='dates', specify a date range using start_date
               and end date.
    start_date: datetime: required if mode=='dates' else not used (inclusive of start_date)
    end_date: datetime: required if mode=='dates' else not used (exclusive of end_date)
    pct: float: decimal percent for the percent of labeled records to use for model performance metrics
    count: int: number of labeled documents to use for model performance metrics
    """

    if mode == 'all':
        params = {}
        criteria = 'all records'
    elif mode == 'dates':
        if not start_date or not end_date:
            raise ValueError('start_date and end_date are required when mode="dates"')
        if not isinstance(start_date, str) or not isinstance(end_date, str):
            raise TypeError('start_date and end_date must be supplied as strings in format "YYYY-MM-DD"')
        try:
            start_date = dt.strptime(start_date, '%Y-%m-%d')
            end_date = dt.strptime(end_date, '%Y-%m-%d')
        except ValueError:
            raise IncorrectDateFormat

        params = {
            '$and':
                [
                    {'timestamp': {'$gte': start_date}},
                    {'timestamp': {'$lt': end_date}}
                ]
        }
        criteria = {'start_date': start_date, 'end_date': end_date}
    elif mode == 'pct':
        if not pct:
            raise ValueError('pct is required when mode="pct"')
        if not isinstance(pct, int) and not isinstance(pct, float):
            raise TypeError('pct must be a decimal number between 0-1 (30% should be supplied as 0.3)')
        if 1 < pct < 0:
            raise ValueError('pct must be a decimal number between 0-1 (30% should be supplied as 0.3)')
        params = {}
        criteria = {'pct': pct}
    elif mode == 'count':
        if not count:
            raise ValueError('count is required when mode="count"')
        if not isinstance(count, int):
            raise TypeError('count must be supplied as an integer')
        params = {}
        criteria = {'count': count}
    else:
        raise ValueError('mode must be either "all", "dates", "pct", or "count"')

    params['label'] = {'$exists': True, '$ne': None}

    # pred_col = get_mongo_collection(database='pocMLModelMonitoring', collection='iris_knn')

    pred_docs = collection.find(params, {'prediction': 1, 'label': 1}).sort('prediction_timestamp', pymongo.DESCENDING)
    pred_docs = [doc for doc in pred_docs]

    if mode == 'pct':
        num_docs = math.ceil(len(pred_docs) * pct)
        pred_docs = pred_docs[:num_docs]
    elif mode == 'count':
        if count < len(pred_docs):
            pred_docs = pred_docs[:count]
        else:
            # if count is greater than the number of docs, include this in the criteria
            criteria = {'count': count, 'found': len(pred_docs)}

    if not pred_docs:
        # exit the function if nothing is found
        return

    dff = pd.DataFrame(pred_docs)

    target_names = ['setosa', 'versicolor', 'virginica']

    # calculate the confusion matrix
    # adding the labels ensures the correct index order for the species
    conf_matrix = confusion_matrix(dff['label'], dff['prediction'], labels=target_names)
    
    print(conf_matrix)

    # model accuracy
    accuracy = np.sum([conf_matrix[i][i] for i in range(conf_matrix.shape[0])]) / np.sum(conf_matrix)

    # precision, recall, and f1 scores
    precision = {}
    recall = {}
    f1 = {}

    targets_found = sorted(list(dff['label'].unique()))

    if 'setosa' in targets_found:
        precision['setosa'] = conf_matrix[0][0] / np.sum(conf_matrix[:, 0])
        recall['setosa'] = conf_matrix[0][0] / np.sum(conf_matrix[0])
        f1['setosa'] = 2 * (precision['setosa'] * recall['setosa']) / (precision['setosa'] + recall['setosa'])
    else:
        precision['setosa'] = None
        recall['setosa'] = None
        f1['setosa'] = None

    if 'versicolor' in targets_found:
        precision['versicolor'] = conf_matrix[1][1] / np.sum(conf_matrix[:, 1])
        recall['versicolor'] = conf_matrix[1][1] / np.sum(conf_matrix[1])
        f1['versicolor'] = 2 * (precision['versicolor'] * recall['versicolor']) / (
                    precision['versicolor'] + recall['versicolor'])
    else:
        precision['versicolor'] = None
        recall['versicolor'] = None
        f1['versicolor'] = None

    if 'virginica' in targets_found:
        precision['virginica'] = conf_matrix[2][2] / np.sum(conf_matrix[:, 2])
        recall['virginica'] = conf_matrix[2][2] / np.sum(conf_matrix[2])
        f1['virginica'] = 2 * (precision['virginica'] * recall['virginica']) / (
                    precision['virginica'] + recall['virginica'])
    else:
        precision['virginica'] = None
        recall['virginica'] = None
        f1['virginica'] = None

    sample_size = int(np.sum(conf_matrix))

    class_balance = {}
    for species in target_names:
        if species not in targets_found:
            class_balance[species] = 0.0
        else:
            class_balance[species] = dff['label'].value_counts()[species] / dff.shape[0]

    #     # get the mongo collection
    #     col = get_mongo_collection(database='pocMLModelMonitoring', collection='irisKnnPerformance')

    # synthesize the observation
    document = {
        'timestamp': dt.now(),
        'measurement_mode': mode,
        'query_criteria': criteria,
        'model_accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'F1': f1,
        'class_balance': class_balance,
        'sample_size': sample_size
    }

    #     # write the data
    #     result = col.insert_one(document)

    # return the result object
    # return result

    return document

In [185]:
document = get_model_performance(col, mode='all', start_date=None, end_date=None, pct=None, count=None)
document

[[10003     0     0]
 [  144  9687   452]
 [    0   313  9410]]


{'timestamp': datetime.datetime(2021, 6, 8, 12, 14, 49, 822772),
 'measurement_mode': 'all',
 'query_criteria': 'all records',
 'model_accuracy': 0.9697090872738179,
 'precision': {'setosa': 0.985808613383266,
  'versicolor': 0.9687,
  'virginica': 0.9541675116609207},
 'recall': {'setosa': 1.0,
  'versicolor': 0.9420402606243314,
  'virginica': 0.9678082896225445},
 'F1': {'setosa': 0.9928535980148884,
  'versicolor': 0.9551841443573434,
  'virginica': 0.9609394945111056},
 'class_balance': {'setosa': 0.3333333333333333,
  'versicolor': 0.34266386750641475,
  'virginica': 0.32400279916025193},
 'sample_size': 30009}

In [186]:
document = get_model_performance(col, mode='dates', start_date='2020-01-01', end_date='2021-12-31', pct=None, count=None)
document

[[2109    0    0]
 [  49 2786  133]
 [   0    0 2755]]


{'timestamp': datetime.datetime(2021, 6, 8, 12, 14, 55, 365303),
 'measurement_mode': 'dates',
 'query_criteria': {'start_date': datetime.datetime(2020, 1, 1, 0, 0),
  'end_date': datetime.datetime(2021, 12, 31, 0, 0)},
 'model_accuracy': 0.9767620020429009,
 'precision': {'setosa': 0.9772937905468025,
  'versicolor': 1.0,
  'virginica': 0.9539473684210527},
 'recall': {'setosa': 1.0, 'versicolor': 0.9386792452830188, 'virginica': 1.0},
 'F1': {'setosa': 0.9885165221467072,
  'versicolor': 0.9683698296836982,
  'virginica': 0.9764309764309764},
 'class_balance': {'setosa': 0.26927987742594484,
  'versicolor': 0.37895812053115424,
  'virginica': 0.3517620020429009},
 'sample_size': 7832}

In [188]:
document = get_model_performance(col, mode='dates', start_date='2019-01-01', end_date='2021-12-31', pct=None, count=None)
document

[[3631    0    0]
 [  80 4737  224]
 [   0    0 4703]]


{'timestamp': datetime.datetime(2021, 6, 8, 12, 15, 49, 902424),
 'measurement_mode': 'dates',
 'query_criteria': {'start_date': datetime.datetime(2019, 1, 1, 0, 0),
  'end_date': datetime.datetime(2021, 12, 31, 0, 0)},
 'model_accuracy': 0.9772710280373832,
 'precision': {'setosa': 0.9784424683373754,
  'versicolor': 1.0,
  'virginica': 0.9545362289425614},
 'recall': {'setosa': 1.0, 'versicolor': 0.9396945050585201, 'virginica': 1.0},
 'F1': {'setosa': 0.9891037864342141,
  'versicolor': 0.9689097975046022,
  'virginica': 0.9767393561786085},
 'class_balance': {'setosa': 0.2714766355140187,
  'versicolor': 0.3768971962616822,
  'virginica': 0.35162616822429904},
 'sample_size': 13375}

In [189]:
document = get_model_performance(col, mode='dates', start_date='2018-01-01', end_date='2021-12-31', pct=None, count=None)
document

[[5291    0    0]
 [ 111 6686  313]
 [   0    0 6640]]


{'timestamp': datetime.datetime(2021, 6, 8, 12, 15, 50, 892425),
 'measurement_mode': 'dates',
 'query_criteria': {'start_date': datetime.datetime(2018, 1, 1, 0, 0),
  'end_date': datetime.datetime(2021, 12, 31, 0, 0)},
 'model_accuracy': 0.9777322619610315,
 'precision': {'setosa': 0.9794520547945206,
  'versicolor': 1.0,
  'virginica': 0.9549834603768158},
 'recall': {'setosa': 1.0, 'versicolor': 0.940365682137834, 'virginica': 1.0},
 'F1': {'setosa': 0.9896193771626297,
  'versicolor': 0.9692664540446505,
  'virginica': 0.9769734422129038},
 'class_balance': {'setosa': 0.2778740612362796,
  'versicolor': 0.3734047581534583,
  'virginica': 0.3487211806102621},
 'sample_size': 19041}

In [190]:
document = get_model_performance(col, mode='dates', start_date='2015-01-01', end_date='2018-12-31', pct=None, count=None)
document

[[6372    0    0]
 [  64 4948  227]
 [   0  313 4704]]


{'timestamp': datetime.datetime(2021, 6, 8, 12, 15, 54, 144429),
 'measurement_mode': 'dates',
 'query_criteria': {'start_date': datetime.datetime(2015, 1, 1, 0, 0),
  'end_date': datetime.datetime(2018, 12, 31, 0, 0)},
 'model_accuracy': 0.9636757276882367,
 'precision': {'setosa': 0.9900559353635798,
  'versicolor': 0.9405056072989926,
  'virginica': 0.9539647130399513},
 'recall': {'setosa': 1.0,
  'versicolor': 0.9444550486734109,
  'virginica': 0.9376121187960933},
 'F1': {'setosa': 0.9950031230480949,
  'versicolor': 0.9424761904761905,
  'virginica': 0.9457177322074789},
 'class_balance': {'setosa': 0.3832090449843637,
  'versicolor': 0.3150709646379601,
  'virginica': 0.3017199903776762},
 'sample_size': 16628}

In [191]:
document = get_model_performance(col, mode='dates', start_date='2021-01-01', end_date='2021-12-31', pct=None, count=None)
document

[[656   0   0]
 [ 15 820  36]
 [  0   0 854]]


{'timestamp': datetime.datetime(2021, 6, 8, 12, 16, 10, 514602),
 'measurement_mode': 'dates',
 'query_criteria': {'start_date': datetime.datetime(2021, 1, 1, 0, 0),
  'end_date': datetime.datetime(2021, 12, 31, 0, 0)},
 'model_accuracy': 0.9785804283914322,
 'precision': {'setosa': 0.977645305514158,
  'versicolor': 1.0,
  'virginica': 0.9595505617977528},
 'recall': {'setosa': 1.0, 'versicolor': 0.9414466130884042, 'virginica': 1.0},
 'F1': {'setosa': 0.9886963074604371,
  'versicolor': 0.9698403311649912,
  'virginica': 0.9793577981651376},
 'class_balance': {'setosa': 0.2755144897102058,
  'versicolor': 0.3658126837463251,
  'virginica': 0.3586728265434691},
 'sample_size': 2381}

In [197]:
years = [i for i in range(2016, 2022)]
months = [i for i in range(1, 13)]

In [201]:
results = []
for year in years:
    for month in months:
        if month != 12:
            if len(str(month+1)) == 1:
                str_month = f'0{month+1}'
            else:
                str_month = month+1
            end_date = f'{year}-{str_month}-01'
        else:
            end_date = f'{year+1}-01-01'
        
        document = get_model_performance(col, mode='dates', start_date=f'{year}-{month}-01', end_date=end_date, pct=None, count=None)
        results.append(document)

[[174   0   0]
 [  1 108   9]
 [  0 109   6]]
[[234   0   0]
 [  2 154   7]
 [  0 175   0]]
[[254   0   0]
 [  0 159   9]
 [  0  29 135]]
[[260   0   0]
 [  0 153   9]
 [  0   0 160]]
[[218   0   0]
 [  0 163   4]
 [  0   0 171]]
[[289   0   0]
 [  5 138   9]
 [  0   0 161]]

  f1['virginica'] = 2 * (precision['virginica'] * recall['virginica']) / (



[[271   0   0]
 [  0 138   8]
 [  0   0 176]]
[[255   0   0]
 [  1 158   7]
 [  0   0 173]]
[[224   0   0]
 [  1 142   4]
 [  0   0 147]]
[[286   0   0]
 [  7 173  10]
 [  0   0 166]]
[[237   0   0]
 [  1 167  13]
 [  0   0 155]]
[[256   0   0]
 [  1 167   9]
 [  0   0 172]]
[[235   0   0]
 [  2 160   7]
 [  0   0 163]]
[[280   0   0]
 [  3 151   8]
 [  0   0 144]]
[[255   0   0]
 [  1 175   4]
 [  0   0 181]]
[[286   0   0]
 [  3 184   4]
 [  0   0 141]]
[[250   0   0]
 [  3 178   5]
 [  0   0 190]]
[[238   0   0]
 [  2 164   6]
 [  0   0 164]]
[[210   0   0]
 [  0 169   7]
 [  0   0 165]]
[[269   0   0]
 [  3 172   9]
 [  0   0 158]]
[[157   0   0]
 [  3 150   8]
 [  0   0 149]]
[[125   0   0]
 [  3 159   8]
 [  0   0 159]]
[[125   0   0]
 [  3 146   8]
 [  0   0 166]]
[[118   0   0]
 [  2 157  10]
 [  0   0 157]]
[[133   0   0]
 [  4 160   7]
 [  0   0 157]]
[[116   0   0]
 [  2 164   7]
 [  0   0 198]]
[[122   0   0]
 [  3 157  12]
 [  0   0 172]]
[[132   0   0]
 [  2 183   4]
 [ 

In [202]:
results

[None,
 None,
 None,
 None,
 None,
 {'timestamp': datetime.datetime(2021, 6, 8, 12, 32, 18, 973297),
  'measurement_mode': 'dates',
  'query_criteria': {'start_date': datetime.datetime(2016, 6, 1, 0, 0),
   'end_date': datetime.datetime(2016, 7, 1, 0, 0)},
  'model_accuracy': 0.7076167076167076,
  'precision': {'setosa': 0.9942857142857143,
   'versicolor': 0.4976958525345622,
   'virginica': 0.4},
  'recall': {'setosa': 1.0,
   'versicolor': 0.9152542372881356,
   'virginica': 0.05217391304347826},
  'F1': {'setosa': 0.9971346704871061,
   'versicolor': 0.6447761194029851,
   'virginica': 0.09230769230769231},
  'class_balance': {'setosa': 0.4275184275184275,
   'versicolor': 0.28992628992628994,
   'virginica': 0.28255528255528256},
  'sample_size': 407},
 {'timestamp': datetime.datetime(2021, 6, 8, 12, 32, 19, 8287),
  'measurement_mode': 'dates',
  'query_criteria': {'start_date': datetime.datetime(2016, 7, 1, 0, 0),
   'end_date': datetime.datetime(2016, 8, 1, 0, 0)},
  'model_acc

In [206]:
results = [i for i in results if i]

In [207]:
results_col = db['testPerfResults2']

In [208]:
result = results_col.insert_many(results)
len(result.inserted_ids)

61