## Imports / Global Variables

In [2]:
# Imports
import os
import pickle
import warnings
from urllib.parse import quote

import numpy as np
import pandas as pd
import pymysql
from sqlalchemy import create_engine, text
# NOTE(review): category=Warning is the base class of every warning, so this ignores
# all warnings for the rest of the notebook (deprecation and pandas warnings included).
warnings.simplefilter(action='ignore', category=Warning)

# Relative reference to the parent directory ('..')
PARENT_DIRECTORY = os.pardir

# Raw and processed data folders, both located under <parent>/data
RAW_DATA_FOLDER = os.path.join(PARENT_DIRECTORY, 'data', 'raw')
PROCESSED_DATA_FOLDER = os.path.join(PARENT_DIRECTORY, 'data', 'processed')

# Reference date (YYYY-MM-DD) for generating validation and test predictions
DATE_NOW = '2022-06-25'

## Load the Data

In [3]:
# Location of the local MySQL credentials file (first line: username, second line: password)
sql_pw_filepath = os.path.join(PARENT_DIRECTORY, 'sql_password.txt')

# Fail fast with a descriptive error when the credentials file is missing
credentials_file_present = os.path.exists(sql_pw_filepath)
if not credentials_file_present:
    raise FileNotFoundError('Local MySQL password file not found! Please check directory.')

# Read both lines, then expose them to later cells as environment variables
with open(sql_pw_filepath, 'r') as credentials_file:
    username_line = credentials_file.readline()
    password_line = credentials_file.readline()
os.environ['sql_username'] = username_line.strip()
os.environ['sql_password'] = password_line.strip()

In [4]:
# Create DB connection
# Percent-encode the credentials so reserved URL characters (e.g. '@', ':', '/') in the
# username or password cannot corrupt the connection string. Indexing os.environ (instead
# of .get) raises KeyError if a credential is missing rather than embedding the text "None".
# NOTE(review): assumes sql_password.txt stores the raw (not already URL-encoded) values.
db_username = quote(os.environ['sql_username'], safe='')
db_password = quote(os.environ['sql_password'], safe='')
db_connection_str = f'mysql+pymysql://{db_username}:{db_password}@aipi510.mysql.database.azure.com:3306/project'
# SSL options handed to the PyMySQL driver through connect_args
db_connection_args = {'ssl': {'enable_tls': True}}
sql_engine = create_engine(db_connection_str, connect_args=db_connection_args)
db_connection = sql_engine.connect()

In [10]:
# Pull the full train_data table from the DB and discard its stored 'index' column
main_sql_query = 'SELECT * FROM train_data'
df_main = pd.read_sql(main_sql_query, db_connection).drop('index', axis=1)

## Load the Model

In [11]:
# Retrieve trained model from DB
# text() wraps the raw SQL so Connection.execute() accepts it on both SQLAlchemy 1.x and
# 2.x (2.x no longer executes plain strings).
# NOTE(review): LIMIT 1 without ORDER BY does not pin down which row is returned if the
# table ever holds more than one model.
sql_retrieve_pickle_model = "SELECT * FROM model LIMIT 1"
cursor_result = db_connection.execute(text(sql_retrieve_pickle_model))
result = cursor_result.fetchall()
if not result:
    # Same exception type result[0] would raise, but with an actionable message
    raise IndexError('No trained model found in the model table.')
# First column is stored as model_date; second column is the pickled models object.
# SECURITY: pickle.loads can execute arbitrary code — only unpickle blobs from a trusted DB.
model_date, rf_models = result[0][0], pickle.loads(result[0][1])

In [12]:
# Derive the reference dates around DATE_NOW
date_now = pd.Timestamp(DATE_NOW)
date_val = date_now - pd.Timedelta(days=49)         # lower bound of the validation window
date_val_filter = date_now - pd.Timedelta(days=21)  # earliest validation date kept after filtering
date_test = date_now + pd.Timedelta(days=28)        # upper bound of the test window

## Generate Predictions

#### Validation Predictions

In [13]:
# Build the validation predictions: one frame per county, concatenated once at the end
# (avoids re-copying the growing frame on every loop iteration).
non_feature_cols = ['date', 'County', 'Count', 'Count_p1', 'Count_p2', 'Count_p3', 'Count_p4']
val_frames = []
for county in df_main['County'].unique():
    data_county = df_main[df_main['County'] == county]
    in_window = (data_county['date'] >= date_val) & (data_county['date'] <= date_now)
    # Stable sort by date: the shift() alignment below relies on chronological row order,
    # which a SELECT without ORDER BY does not guarantee.
    val_data = data_county[in_window].sort_values('date', kind='mergesort').reset_index(drop=True)
    # .copy() so the new columns are written to an independent frame, not to a slice of val_data
    preds_county = val_data[['date', 'County', 'Count']].copy()
    X = val_data.drop(columns=non_feature_cols)
    # Round actual counts to the nearest whole number
    preds_county['Count'] = np.round(preds_county['Count'])
    # rf_models[county][h - 1] supplies the Preds_w<h> column
    for horizon in range(1, 5):
        raw_preds = pd.Series(rf_models[county][horizon - 1].predict(X), index=preds_county.index)
        # Shift predictions forward by h rows to the date they were made for, then round to nearest
        preds_county[f'Preds_w{horizon}'] = np.round(raw_preds.shift(periods=horizon))
    # Residuals/errors of predictions: actual minus predicted
    for horizon in range(1, 5):
        preds_county[f'Residual_w{horizon}'] = preds_county['Count'] - preds_county[f'Preds_w{horizon}']
    val_frames.append(preds_county)
# Keep an empty frame when df_main has no counties, matching the original starting value
preds_val = pd.concat(val_frames) if val_frames else pd.DataFrame()

In [35]:
# Keep only validation rows dated on or after date_val_filter
is_kept_validation_row = preds_val['date'] >= date_val_filter
preds_val = preds_val[is_kept_validation_row]

In [37]:
# Record the forecast date as 7 days before each row's date
validation_dates = pd.to_datetime(preds_val['date'])
preds_val['Forecasted_on'] = validation_dates - pd.Timedelta(days=7)
# Use the Preds_w1 column as the most recent prediction
preds_val['pred_cases'] = preds_val['Preds_w1']

In [39]:
# Preview the first five rows of the validation predictions
preds_val.head(5)

Unnamed: 0,date,County,Count,Preds_w1,Preds_w2,Preds_w3,Preds_w4,Residual_w1,Residual_w2,Residual_w3,Residual_w4,Forecasted_on,pred_cases
4,2022-06-04,ALBANY,17.0,23.0,21.0,20.0,24.0,-6.0,-4.0,-3.0,-7.0,2022-05-28,23.0
5,2022-06-11,ALBANY,17.0,14.0,18.0,19.0,25.0,3.0,-1.0,-2.0,-8.0,2022-06-04,14.0
6,2022-06-18,ALBANY,14.0,31.0,17.0,13.0,19.0,-17.0,-3.0,1.0,-5.0,2022-06-11,31.0
7,2022-06-25,ALBANY,2.0,25.0,29.0,11.0,4.0,-23.0,-27.0,-9.0,-2.0,2022-06-18,25.0
4,2022-06-04,ALLEGANY,7.0,7.0,7.0,7.0,6.0,0.0,0.0,0.0,1.0,2022-05-28,7.0


#### Test Predictions

In [14]:
# Build the test predictions: one frame per county, concatenated once at the end
# (avoids re-copying the growing frame on every loop iteration).
non_feature_cols = ['date', 'County', 'Count', 'Count_p1', 'Count_p2', 'Count_p3', 'Count_p4']
test_frames = []
for county in df_main['County'].unique():
    data_county = df_main[df_main['County'] == county]
    in_window = (data_county['date'] >= date_val_filter) & (data_county['date'] <= date_test)
    # Stable sort by date: the shift() alignment below relies on chronological row order,
    # which a SELECT without ORDER BY does not guarantee.
    test_data = data_county[in_window].sort_values('date', kind='mergesort').reset_index(drop=True)
    # Output frame covers every row up to date_test; .copy() so new columns are written
    # to an independent frame, not to a slice of test_data
    preds_county = test_data[['date', 'County']].copy()
    # Only rows up to date_now are passed to the models (lower bound already applied above)
    test_data = test_data[test_data['date'] <= date_now]
    X = test_data.drop(columns=non_feature_cols)
    # rf_models[county][h - 1] supplies the Preds_w<h> column
    for horizon in range(1, 5):
        # Index the predictions by the rows they were computed from so they stay aligned
        raw_preds = pd.Series(rf_models[county][horizon - 1].predict(X), index=test_data.index)
        # Rows after date_now start as NaN; shifting forward by h rows moves each prediction
        # onto the date it was made for. Then round to the nearest whole number.
        aligned_preds = raw_preds.reindex(preds_county.index).shift(periods=horizon)
        preds_county[f'Preds_w{horizon}'] = np.round(aligned_preds)
    test_frames.append(preds_county)
# Keep an empty frame when df_main has no counties, matching the original starting value
preds_test = pd.concat(test_frames) if test_frames else pd.DataFrame()

In [15]:
# Keep only rows dated strictly after date_now
is_after_now = preds_test['date'] > date_now
preds_test = preds_test[is_after_now]

In [16]:
# Add new column to record most recent prediction
# For the date k weeks after date_now (date_test minus 7 * (4 - k) days) take Preds_w<k>;
# rows matching none of the four dates get 0.
# Built in one assignment with np.select: the previous chained indexing
# (df['col'][mask] = ...) writes through a temporary object and is not a reliable way to
# modify the frame. Values are taken positionally because the index repeats per county.
# The resulting column is float, the same dtype it takes once merged with preds_val.
target_dates = pd.to_datetime(preds_test['date'])
horizon_masks = []
horizon_values = []
for weeks_ahead in range(1, 5):
    target_date = date_test - pd.to_timedelta(7 * (4 - weeks_ahead), unit='D')
    horizon_masks.append((target_dates == target_date).to_numpy())
    horizon_values.append(preds_test[f'Preds_w{weeks_ahead}'].to_numpy())
preds_test['pred_cases'] = np.select(horizon_masks, horizon_values, default=0)

In [17]:
# Every test forecast is recorded as made on date_now
preds_test = preds_test.assign(Forecasted_on=date_now)

In [19]:
# Display the test predictions (last expression of the cell, so the frame is rendered as output)
preds_test

Unnamed: 0,date,County,Preds_w1,Preds_w2,Preds_w3,Preds_w4,pred_cases,Forecasted_on
4,2022-07-02,ALBANY,19.0,20.0,23.0,3.0,19,2022-06-25
5,2022-07-09,ALBANY,,20.0,21.0,30.0,20,2022-06-25
6,2022-07-16,ALBANY,,,16.0,29.0,16,2022-06-25
7,2022-07-23,ALBANY,,,,4.0,4,2022-06-25
4,2022-07-02,ALLEGANY,6.0,5.0,4.0,5.0,6,2022-06-25
...,...,...,...,...,...,...,...,...
7,2022-07-23,WYOMING,,,,2.0,2,2022-06-25
4,2022-07-02,YATES,5.0,6.0,6.0,3.0,5,2022-06-25
5,2022-07-09,YATES,,6.0,7.0,4.0,6,2022-06-25
6,2022-07-16,YATES,,,6.0,10.0,6,2022-06-25


In [20]:
# Actual counts for dates from date_val up to (but excluding) date_val_filter
is_early_window = (df_main['date'] >= date_val) & (df_main['date'] < date_val_filter)
data_add = df_main.loc[is_early_window, ['date', 'County', 'Count']]
# Stack validation predictions, test predictions and the additional actuals, in that order
preds_merged = pd.concat([preds_val, preds_test, data_add])

In [21]:
# Per-row accuracy: (1 - absolute percentage error) * 100, rounded to one decimal place.
# NOTE(review): the error is scaled by the predicted value, whereas the textbook absolute
# percentage error divides by the actual count — confirm this is intended.
# A zero prediction is mapped to NaN before dividing; otherwise the division yields +/-inf,
# which pandas' to_sql refuses to write to MySQL.
abs_error = np.abs(preds_merged["Count"] - preds_merged["pred_cases"])
nonzero_pred = preds_merged["pred_cases"].replace(0, np.nan)
preds_merged["acc"] = np.round((1 - abs_error / nonzero_pred) * 100, 1)

# Convert Forecasted_on to type string
preds_merged['Forecasted_on'] = preds_merged['Forecasted_on'].astype(str)

In [22]:
# Map internal column names to their display names
col_map = {
    "date": "Forecast Date",
    "Count": "Actual Cases",
    "pred_cases": "Predicted Cases",
    "Forecasted_on": "Forecasted On",
    "acc": "Accuracy (%)",
}
# The per-week prediction and residual columns share one naming pattern
for week in range(1, 5):
    col_map[f"Preds_w{week}"] = f"Prediction (W - {week})"
    col_map[f"Residual_w{week}"] = f"Residuals (W - {week})"
preds_merged = preds_merged.rename(columns=col_map)

# Show the first rows with the new column names
display(preds_merged.head())

Unnamed: 0,Forecast Date,County,Actual Cases,Prediction (W - 1),Prediction (W - 2),Prediction (W - 3),Prediction (W - 4),Residuals (W - 1),Residuals (W - 2),Residuals (W - 3),Residuals (W - 4),Predicted Cases,Forecasted On,Accuracy (%)
0,2022-05-07,ALBANY,43.0,,,,,,,,,,NaT,
1,2022-05-14,ALBANY,49.0,41.0,,,,8.0,,,,,NaT,
2,2022-05-21,ALBANY,51.0,42.0,39.0,,,9.0,12.0,,,,NaT,
3,2022-05-28,ALBANY,13.0,21.0,19.0,21.0,,-8.0,-6.0,-8.0,,,NaT,
4,2022-06-04,ALBANY,17.0,23.0,21.0,20.0,24.0,-6.0,-4.0,-3.0,-7.0,,NaT,


## Write Predictions to the DB

In [None]:
# Write predictions dataframe to DB, replacing any existing table of the same name
preds_table = 'predictions'
try:
    preds_merged.to_sql(preds_table, db_connection, if_exists='replace')
    # pandas does not commit when the Connection it is given is already inside a
    # transaction (the case on SQLAlchemy 2.x after the earlier reads), and closing the
    # connection would then roll the inserts back. Commit explicitly where the
    # Connection offers commit(); connections without it are left untouched.
    commit = getattr(db_connection, 'commit', None)
    if callable(commit):
        commit()
except Exception as write_error:
    # Report the failure (ValueError included) without interrupting the notebook
    print(write_error)
else:
    print(f'Table {preds_table} created/updated successfully!')

In [None]:
# Close DB connection
# Releases the connection that was opened with sql_engine.connect()
db_connection.close()