# Chart of training/test sets over time
For a given set of models (identified by the model comment), plot the time periods covered by each training/test split.

In [None]:
%load_ext autoreload
%autoreload 2

%pylab inline

In [None]:
import pandas as pd
import seaborn as sns
from sqlalchemy import create_engine
import os
from matplotlib import pyplot as plt
import matplotlib.dates as mdates
import matplotlib.patches as mpatches
import itertools
import logging

# Configure the root logger at DEBUG level for this notebook session.
logging.basicConfig(level=logging.DEBUG)

from IPython.display import Image
from IPython.core.interactiveshell import InteractiveShell
# Make IPython echo the value of every top-level expression in a cell, not
# only the last one. This is why plotting calls elsewhere are assigned to `_`:
# an assignment is not an expression statement, so its result is not echoed.
InteractiveShell.ast_node_interactivity = "all"

In [None]:
# The database URL is read from the DBURL environment variable; a missing
# variable raises KeyError here, before any connection is attempted.
dburl = os.environ['DBURL']
# SQLAlchemy engine used by the queries in this notebook.
engine = create_engine(dburl)

------

In [None]:
# Comment string identifying the set of models to chart.
# NOTE(review): the time-split query that is actually executed below filters by
# model_group_id rather than by this comment - confirm which selection is intended.
model_comment = 'ucm_access_6months_production_models'

---- 
<a id='time_chop'></a>

# Checking time splits

In [None]:
# Build the query that lists every distinct train/test time split of one model group.
#
# Each result row carries the training evaluation window, the test evaluation
# window, and the label timespans used for training and for the test matrix.
#
# The SQL is defined inline. The template in analysis/sql/time_splits.sql is
# deliberately not read here: its contents were overwritten by the assignment
# below before ever being sent to the database, so loading it only added a hard
# dependency on a relative file path.
# NOTE(review): the filter is on model_group_id, not on model_comment - confirm
# that this is the intended way to select the models.
model_group_id = 20854  # model group whose splits are charted

q = f"""
    select
        distinct
        a.evaluation_start_time as train_start_time,
        a.evaluation_end_time as train_end_time,
        b.evaluation_start_time as test_start_time,
        b.evaluation_end_time as test_end_time,
        m_id.training_label_timespan as train_label_timespan,
        mat.labeling_window as test_label_timespan
    from model_metadata.models as m_id
    left join train_results.evaluations as a using (model_id)
    left join test_results.evaluations as b using (model_id)
    left join model_metadata.matrices as mat
        on mat.matrix_uuid = b.matrix_uuid
    where
        a.metric = 'precision@'
        and a.parameter ='100.0_pct'
        and model_group_id = {model_group_id}
    order by train_start_time, train_end_time, test_start_time, test_end_time desc
;
    """
# Echo the final SQL so the executed query is visible in the notebook output.
q

In [None]:
# Execute the time-split query and load the result into a DataFrame, parsing
# the four period-boundary columns as datetimes.
timestamp_columns = [
    'train_start_time',
    'train_end_time',
    'test_start_time',
    'test_end_time',
]
df_time_split = pd.read_sql(sql=q, con=engine, parse_dates=timestamp_columns)

In [None]:
# Display the loaded time splits for a visual sanity check before plotting.
df_time_split

In [None]:
# Chart the train/test periods: one horizontal row per entry of df_time_split,
# drawn at a height equal to that entry's index. Red is the model-building
# (training) cohort, blue the validation (test) cohort; dashed segments are
# extended by the corresponding label timespan.
sns.set_style("whitegrid")
sns.set_context(
    "poster",
    font_scale=1,
    rc={"lines.linewidth": 1, "lines.markersize": 4},
)

fig, ax = plt.subplots(1, figsize=(16, 8))

for row_idx, split in df_time_split.iterrows():
    # Positional unpacking relies on the column order of the query result.
    (train_start, train_end, test_start, test_end,
     train_label_timespan, test_label_timespan) = split
    level = [row_idx, row_idx]

    # NOTE(review): for training, the solid segment is [start, end] and the
    # dashed one is the label window after `end`. For test, the solid segment
    # is the label window after `test_start` and the dashed one starts at
    # `test_start` as well - confirm this asymmetry is intended.
    segments = [
        ([train_start, train_end],
         dict(marker='o', color='red')),
        ([train_end, train_end + train_label_timespan],
         dict(marker='', linestyle='--', color='red')),
        ([test_start, test_start + test_label_timespan],
         dict(marker='o', linestyle='-', color='blue')),
        ([test_start, test_end + test_label_timespan],
         dict(marker='', linestyle='--', color='blue')),
    ]
    for span, line_style in segments:
        # Assign to `_` so the returned line objects are not echoed by IPython.
        _ = ax.plot(span, level, **line_style)

# The y position is just a row counter, so its tick labels are hidden.
_ = ax.yaxis.set_ticklabels([])
_ = ax.set_ylabel('Model Building and\nValidation Cohorts')
_ = ax.set_xlabel('Time')

# Colour-only legend entries; the individual line segments carry no labels.
legend_list = [
    mpatches.Patch(color='red', label='Model Building Cohort'),
    mpatches.Patch(color='blue', label='Validation Cohort'),
]

# display the graph.
ax.legend(handles=legend_list)

sns.despine()
plt.show()