### What are we actually predicting? 

In [None]:
import pandas as pd
import yaml
from os.path import join
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
import datetime as datetime
from dateutil.relativedelta import relativedelta
import matplotlib.pyplot as plt
import postmodeling.analyze_labels as analyze_labels
from datetime import datetime
from utils.helpers import get_database_connection
from utils.constants import PREDICTIONS_DIR, CONFIGS_PATH
from postmodeling.evaluation import get_test_pred_labels_from_csv, get_models_info, get_model_info_from_experiment_ids


In [None]:
# Open the project database connection used by every query below
db_conn = get_database_connection()

# Models and experiments under analysis; model_id is the single model
# inspected in detail further down
model_ids = [434, 418, 145, 146, 433]
model_id = 434
experiment_ids = [151]
# presumably per-county list sizes (k) — confirm against the experiment setup
joco_k, doco_k = 75, 40

# Read the length of the validation window (in months) from the 'labels'
# section of the experiment config; stored as a string for use in messages
config_path = join(CONFIGS_PATH, 'config_both_lr-dt-bl_label_12345611121314_both.yaml')
with open(config_path) as f:
    config = yaml.safe_load(f)['labels']
months_future = str(config['months_future'])


In [None]:
# Predictions with split labels for the validation period only
# (no label table is named here, so the helper's default is used)
joco_df, doco_df, both_df = analyze_labels.get_preds_split_labels(
    db_conn,
    model_ids,
)

# Same query, but reading labels from the 'split_labels_all_time' table,
# i.e. labels for all time in the future of the as-of date
joco_df_all, doco_df_all, both_df_all = analyze_labels.get_preds_split_labels(
    db_conn,
    model_ids,
    label_tablename='split_labels_all_time',
)


### Counts of the Number of People with each type of label

In [None]:

# Show label aggregations twice: first for the all-time labels, then for
# the validation-period labels, each preceded by a header line.
print('\nCOUNTS FOR ALL TIME IN FUTURE OF THE AS OF DATE')

# Aggregate the all-time predictions/labels frame; label_counts_all is
# reused by the plotting cell further down
label_counts_all = analyze_labels.get_label_aggregations(both_df_all)
display(label_counts_all)


# months_future was cast with str() when the config was read, so plain
# string concatenation works here
print('COUNTS FOR THE VALIDATION PERIOD OF ' + months_future + ' MONTHS')
label_counts = analyze_labels.get_label_aggregations(both_df)
display(label_counts)




In [None]:
# First plot: label counts over all time after the as-of date
p = analyze_labels.plot_split_labels(label_counts_all, model_id, months_future='any')

# Reuse the upper x-limit of the all-time plot so both charts share a scale
# (p is presumably a matplotlib Axes — it only needs to expose get_xlim())
shared_xmax = p.get_xlim()[1]

# Second plot: label counts restricted to the validation window
p = analyze_labels.plot_split_labels(
    label_counts,
    model_id,
    months_future=months_future,
    xmax=shared_xmax,
)


### Counts of each flag
i.e., a person can be counted more than once if they have multiple qualifying events.

In [None]:
# Rows belonging to the model under inspection, ordered by county_k
is_selected_model = both_df['model_id'] == model_id
specific_model_df = both_df[is_selected_model].sort_values('county_k')

# Person ids (joid) for every row of that model
# NOTE(review): no cutoff on county_k is applied here; presumably both_df is
# already limited to the people below k — confirm
joid_list = list(specific_model_df['joid'])

# As-of date of that model: the first distinct calendar date in the column
# (assumes the model has a single as_of_date — TODO confirm)
distinct_as_of_dates = specific_model_df['as_of_date'].dt.date.unique()
as_of_date = distinct_as_of_dates[0]



In [None]:
# Fetch and show every ambulance, ER, or death event for each person in
# joid_list, along with the relevant flags for that event.
# as_of_date is passed as the reference date; presumably only events after
# it are returned — confirm in analyze_labels.get_all_flagged_events

joid_list_all_future_events = analyze_labels.get_all_flagged_events(db_conn, joid_list, as_of_date)
display(joid_list_all_future_events)



In [None]:
# Build a table with the total count of each flag, for all time and for the
# validation period only.

# Columns that identify the event rather than flag it; they are removed
# BEFORE summing so that ids and text are never added/concatenated (summing
# them is wasted work and raises TypeError if 'source' contains nulls).
non_flag_columns = ['joid', 'source']

# Totals per flag over every event returned for the as-of date
flag_totals_all_time = joid_list_all_future_events.drop(columns=non_flag_columns).sum(axis=0)
event_counts = flag_totals_all_time.to_frame(name='all_time')

# End of the validation window: as-of date plus months_future months
validation_end_date = as_of_date + relativedelta(months=int(months_future))

# Same query anchored at the end of the validation window
# (presumably returns only events after that date — confirm in analyze_labels)
joid_list_all_future_events_end = analyze_labels.get_all_flagged_events(db_conn, joid_list, validation_end_date)
flag_totals_after_validation = joid_list_all_future_events_end.drop(columns=non_flag_columns).sum(axis=0)

# Events inside the validation window = all events minus those after its end
event_counts['validation_period'] = flag_totals_all_time - flag_totals_after_validation
display(event_counts)