In [None]:
import datarobot as dr
from datarobot import Project, Deployment
import pandas as pd
from pandas.io.json import json_normalize
import numpy as np
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt

import datetime as dt
from datetime import datetime
import dateutil.parser
import os
import re 
from importlib import reload
import random
import math
import numpy.ma as ma
from sklearn.metrics import mean_squared_error

import matplotlib.pyplot as plt
from matplotlib.pyplot import figure

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

# Set Pandas configuration to show all columns and never truncate cell text.
# `None` means "no limit"; the former sentinel -1 for max_colwidth was
# deprecated in pandas 1.0 and is rejected by pandas 2.x.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

In [None]:
# Configure the DataRobot client from the YAML file at ../drconfig.yaml.
# The trailing semicolon suppresses the notebook's echo of the returned object.
dr.Client(config_path='../drconfig.yaml');

### Read in data

In [None]:
# Load the lab dataset; replace the placeholder string with the CSV path.
# NOTE(review): later cells plot 'Rides' against 'Date' and compute a date
# span from 'Date', so both columns are expected to exist — confirm.
df = pd.read_csv(""" Enter Code """)

# Preview the first five rows.
df.head(5)

### Plot

In [None]:
# Plot the 'Rides' column against 'Date'; the trailing `;` hides the Axes repr.
df.plot(x='Date', y='Rides', figsize=(20, 8));

## Months (Length of training data)

In [None]:
MONTHS = ''  # Training-data length in ~30-day months, as a string; set by months()

def months(df, date_col='Date'):
    """Print the date range of `df` and record its approximate length in months.

    The length is the span between the earliest and latest value in
    `date_col`, divided by 30 days and truncated to an integer. The result is
    written to the module-level `MONTHS` string (used when building project
    names) and is also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the date column.
    date_col : str, default 'Date'
        Name of the date column. Values may be datetimes or date strings.

    Returns
    -------
    str
        Number of ~30-day months covered by the data.

    Raises
    ------
    ValueError
        If the column holds no valid dates.
    """
    global MONTHS
    # A CSV read without `parse_dates` leaves dates as strings, and string
    # subtraction raises TypeError. Convert a local copy so `df` is untouched.
    dates = pd.to_datetime(df[date_col])
    MIN_DATE = dates.min()
    MAX_DATE = dates.max()
    if pd.isna(MIN_DATE) or pd.isna(MAX_DATE):
        raise ValueError(f"Column {date_col!r} contains no valid dates")
    MONTHS = str(int((MAX_DATE - MIN_DATE).days / 30))

    print('Min Date: ', MIN_DATE)
    print('Max Date: ', MAX_DATE)
    print('Months:   ', MONTHS)
    return MONTHS

In [None]:
# Set the global MONTHS from the loaded data and print the date range.
months(df)

## Create TS Settings

In [None]:
# Column names: DATE is used as the datetime partition column and TARGET as
# the modelling target when projects are created. Replace the placeholders.
DATE      = """ Enter Code """
TARGET    = """ Enter Code """

PROJECT_NAME = 'Lab_3'

# VERSION and MODE are embedded in every generated project name.
VERSION = '1'
MODE    = 'Q'
    
# Feature derivation windows: model_factory() iterates this and reads item[0]
# (start) and item[1] (end) of each entry. Replace the placeholder.
FDWS = """ Enter Code """

# Forecast distances: iterated the same way, item[0] = start, item[1] = end.
FDS  = """ Enter Code """ 


# Common name prefix of all projects; also the search string used later to
# list the projects back from the server.
BASE   = 'L3_1_V:'

# NOTE: MONTHS must already have been set by months(df) for these two values
# to be meaningful. model_factory() rebuilds both on every call.
PREFIX = BASE + VERSION + '_Mnths:' + MONTHS + '_Mode:' + MODE
DATASET_FILENAME = 'Months_' + MONTHS
# Passed to project creation / target setting; presumably seconds — TODO confirm.
MAX_WAIT = 14400
READ_TIMEOUT = 14400

# Partitioning settings read by create_dr_project(); None leaves the choice
# to the DataRobot API (presumably its defaults — confirm).
HOLDOUT_START_DATE  = None
VALIDATION_DURATION = None
HOLDOUT_DURATION    = None
NUMBER_BACKTESTS    = 4 # Keep Backtests at 4
GAP_DURATION        = None 

# Per-feature settings passed to the partitioning specification (none here).
FEATURE_SETTINGS = []

# Calendar ID passed to the partitioning specification (none here).
CAL_ID = None

print(FEATURE_SETTINGS)
print(CAL_ID)

## Create Project

### Create function to create projects

In [None]:
def create_dr_project(df, project_name, fw_start=None, fw_end=None, fdw_start=None, fdw_end=None, dataset_filename=DATASET_FILENAME):
    """Upload `df` as a new DataRobot time-series project and start Quick Autopilot.

    Besides the two windows given as arguments, the partitioning is built from
    the module-level settings DATE, HOLDOUT_START_DATE, VALIDATION_DURATION,
    HOLDOUT_DURATION, GAP_DURATION, NUMBER_BACKTESTS, FEATURE_SETTINGS and
    CAL_ID, read at call time. TARGET, MAX_WAIT and READ_TIMEOUT are read the
    same way.

    Parameters
    ----------
    df : pandas.DataFrame
        Training data to upload.
    project_name : str
        Name given to the new project.
    fw_start, fw_end : optional
        Forecast window start/end, passed through to the partitioning spec.
    fdw_start, fdw_end : optional
        Feature derivation window start/end, passed through likewise.
    dataset_filename : str
        File name to register for the uploaded data. The default is the value
        DATASET_FILENAME had when this function was defined.

    Returns
    -------
    The project object returned by `dr.Project.create`, after `set_target`
    has been called on it.
    """

    ###############################
    # Create Datetime Specification
    ###############################
    time_partition = dr.DatetimePartitioningSpecification(
        datetime_partition_column = DATE,
        forecast_window_start     = fw_start,
        forecast_window_end       = fw_end,
        feature_derivation_window_start = fdw_start,
        feature_derivation_window_end   = fdw_end,
        holdout_start_date        = HOLDOUT_START_DATE,
        validation_duration       = VALIDATION_DURATION,
        holdout_duration          = HOLDOUT_DURATION,
        gap_duration              = GAP_DURATION,
        number_of_backtests       = NUMBER_BACKTESTS,
        feature_settings          = FEATURE_SETTINGS,
        use_time_series           = True,
        calendar_id               = CAL_ID
    )

    ################
    # Create Project
    ################
    project = dr.Project.create(
        project_name = project_name,
        sourcedata   = df,
        max_wait     = MAX_WAIT,
        read_timeout = READ_TIMEOUT,
        # Use the caller's argument; the global DATASET_FILENAME was passed
        # here before, which silently ignored the parameter.
        dataset_filename = dataset_filename
    )
    print("Post-Project MB: ", (df.memory_usage(index=True).sum()/1024/1024).round(2))
    print("Post-Project Records: {:,}".format(len(df)))
    print(f'Project {project_name} Created...')

    #################
    # Start Autopilot
    #################
    project.set_target(
        target = TARGET,
        metric = None,
        mode   = dr.AUTOPILOT_MODE.QUICK,  # alternative: dr.AUTOPILOT_MODE.FULL_AUTO
        worker_count = -1,
        partitioning_method = time_partition,
        max_wait = MAX_WAIT
    )
    return project


## Model Factory

### Function to loop through the various FDWs & FDs

In [None]:
# Accumulates every project created by model_factory().
# NOTE: a later cell rebinds `projects` to the result of dr.Project.list().
projects = []  # Keep List of all projects

In [None]:
def model_factory(df, FDWS, FDS):
    """Create one DataRobot project for every FDW x FD combination.

    Each entry of `FDWS` and `FDS` is indexed as (start, end). For every pair
    a project name encoding the settings is printed, a copy of `df` is
    uploaded through create_dr_project(), and the resulting project is
    appended to the module-level `projects` list. Nothing is returned.
    """
    # Rebuilt on each call so the current VERSION / MONTHS values are used.
    name_prefix = BASE + str(VERSION) + '_Mnths:' + MONTHS + '_Mode:' + MODE
    upload_name = 'Months_' + MONTHS

    for derivation_window in FDWS:
        for forecast_window in FDS:
            fd_start, fd_end = forecast_window[0], forecast_window[1]
            fdw_start, fdw_end = derivation_window[0], derivation_window[1]

            # Project name encodes both windows so they can be parsed back later.
            project_name = f"{name_prefix}_FDW:{fdw_start}-{fdw_end}_FD:{fd_start}-{fd_end}"
            print(project_name)

            # Upload a copy of the frame and keep track of the new project.
            projects.append(
                create_dr_project(
                    df.copy(),
                    project_name,
                    fw_start=fd_start,
                    fw_end=fd_end,
                    fdw_start=fdw_start,
                    fdw_end=fdw_end,
                    dataset_filename=upload_name,
                )
            )

## Default Model

In [None]:
# Build one project per FDW/FD combination using the settings configured above.
model_factory(df, FDWS, FDS)

## Holiday Optimize

In [None]:
# Update the version tag; model_factory() embeds VERSION in every project
# name, so the next batch of projects is distinguishable from the first.
VERSION = '2'

In [None]:
# Custom partitioning for the holiday-optimised run; replace each placeholder.
# create_dr_project() reads these globals when it builds the partitioning spec.
# NOTE(review): construct_duration_string presumably expects numeric duration
# parts (years, months, days, ...) rather than a string — confirm signature.
HOLDOUT_START_DATE  = """ Enter Code """ 
VALIDATION_DURATION = dr.helpers.partitioning_methods.construct_duration_string(""" Enter Code """)
HOLDOUT_DURATION    = dr.helpers.partitioning_methods.construct_duration_string(""" Enter Code """) 

In [None]:
# Build a second batch of projects using the partitioning globals set above.
model_factory(df, FDWS, FDS)

# Pull Results

## Get Project Names in a List

In [None]:
# Ask the server for projects whose name matches BASE.
# NOTE: this rebinds `projects`, replacing the list model_factory() appended to.
# NOTE(review): the name search presumably also returns matching projects
# left over from earlier runs — confirm before relying on the count.
projects = dr.Project.list(search_params={'project_name': BASE}) 
projects

## Get Project Names and PIDs in a List

In [None]:
# Pair every project object with its project ID.
lst = [(proj, proj.id) for proj in projects]
lst

## Unlock Holdouts

In [None]:
# Unlock the holdout of every project in `lst`, re-fetching each project by ID.
for _proj, pid in lst:
    project = Project.get(pid)
    project.unlock_holdout()

## Compute Backtests for Blenders

In [None]:
# Submit backtest scoring for every blender model of every project in `lst`.
for i in lst:
    project = Project.get(i[1])
    lb = project.get_datetime_models()

    # Blenders are recognised by the word 'Blender' in their model type.
    blenders = [candidate for candidate in lb if 'Blender' in candidate.model_type]

    for model in blenders:
        print(project.project_name, model)
        try:
            model.score_backtests()
            print(f'Computing backtests for model {model.id} in Project {project.project_name}')
        except dr.errors.ClientError as err:
            # Best effort: keep going, but report the rejection instead of
            # hiding it (presumably raised when backtests already exist — confirm).
            print(f'Backtests not submitted for model {model.id}: {err}')

    # Reported once per project rather than once per model.
    print(f'All available backtests have been submitted for scoring for project {project.project_name}')
    print(' ')

## Compute All Backtests for Top Models in Backtest 1 and Holdout groups

In [None]:
# Score partitions whose top models get all backtests computed; the values are
# used as keys into each model's per-metric score dict.
OPTIMIZE_GROUP = ['validation', 'holdout']

In [None]:
# NOTE: `project` is whatever an earlier loop left bound (its last project);
# this cell fails on a fresh kernel if no such loop has run.
PROJECT_METRIC = project.metric
# De-duplicated through set(), so the order of the list is not guaranteed.
METRICS = list(set([PROJECT_METRIC, 'MASE', 'RMSE']))

In [None]:
# For every project, metric and partition, submit all-backtest scoring for
# the three best-scoring models that do not have an all-backtests score yet.
for p in lst:
    # The leaderboard does not depend on the metric or partition, so fetch it
    # once per project instead of once per (metric, partition) combination.
    project = Project.get(p[1])
    lb = project.get_datetime_models()
    handled = set()  # model IDs already reported for this project

    for met in METRICS:
        for o in OPTIMIZE_GROUP:
            # Keep models that have a score for this metric/partition. Compare
            # with None explicitly so a legitimate score of 0 is not dropped.
            scored = [m for m in lb if (m.metrics.get(met) or {}).get(o) is not None]

            # Ascending sort assumes lower is better (true for RMSE and MASE;
            # NOTE(review): confirm for the project metric).
            top_models = sorted(scored, key=lambda m: m.metrics[met][o])[0:3]

            for mod in top_models:
                # A model can lead several metric/partition rankings; handle it once.
                if mod.id in handled:
                    continue
                handled.add(mod.id)

                if mod.metrics[met].get("backtesting") is None:
                    print(project.project_name, mod)
                    try:
                        mod.score_backtests()
                        print(f'Computing backtests for model {mod.model_type} in Project {project.project_name}')
                    except dr.errors.ClientError as err:
                        # Best effort: report the rejection and continue.
                        print(f'Backtests not submitted for model {mod.model_type}: {err}')
                    print(f'All available backtests have been submitted for scoring for project {project.project_name}')
                    print(' ')
                else:
                    print(project.project_name)
                    print(f'{mod.model_type} All Backtests Already Computed')
                    print(' ')

## Get Project and Model Scores in a DataFrame

In [None]:
# Partition whose score picks the best model of each project:
#   'validation' = Backtest 1, 'backtesting' = all backtests, 'holdout' = holdout.
OPTIMIZATION_PERIOD = 'validation'  # BackTest 1: validation  All Backtest: backtesting  Holdout: holdout 

In [None]:
# Collect the best model of every project (ranked by the project's own metric
# on OPTIMIZATION_PERIOD) into one DataFrame, one row per project.
records = []

for p in lst:
    project = Project.get(p[1])
    lb = project.get_datetime_models()

    # Models with a score for the chosen partition; `is not None` keeps a
    # legitimate score of 0.
    scored = [
        model for model in lb
        if (model.metrics.get(project.metric) or {}).get(OPTIMIZATION_PERIOD) is not None
    ]
    if not scored:
        # Nothing to rank; skip the project rather than fail with IndexError.
        print(f'No model in {project.project_name} has a {OPTIMIZATION_PERIOD} score; skipping')
        continue

    # Lowest score wins (assumes lower is better for the project metric).
    best_model = min(scored, key=lambda m: m.metrics[project.metric][OPTIMIZATION_PERIOD])

    records.append(
        {
            'Project_Name': project.project_name,
            'Project_ID': project.id,
            'Model_ID': best_model.id,
            'Model_Type': best_model.model_type,
            'Featurelist': best_model.featurelist_name,
            'Optimization_Metric': project.metric,
            'Scores': best_model.metrics,
        }
    )

# Build the frame once; DataFrame.append in a loop is quadratic and was
# removed in pandas 2.0.
scores = pd.DataFrame(records)

print(f'Scores for all {len(records)} projects have been computed')
print('')

# Flatten the nested metrics dict into "<metric>.<partition>" columns
# (pd.json_normalize is the public name since pandas 1.0).
scores = scores.join(pd.json_normalize(scores["Scores"].tolist())).drop(labels=['Scores'], axis=1)

# Drop Empty Columns
scores = scores[scores.columns.drop(list(scores.filter(regex='crossValidation$')))]

# Rename Columns. regex=False so the leading "." is matched literally instead
# of as "any character", independent of the pandas version's default.
scores.columns = scores.columns.str.replace(".backtesting", "_All_BT", regex=False)
scores.columns = scores.columns.str.replace(".holdout", "_Holdout", regex=False)
scores.columns = scores.columns.str.replace(".validation", "_BT_1", regex=False)
scores.columns = scores.columns.str.replace(' ', '_', regex=False)

# Drop the "<metric>_All_BTScores" columns that the renaming above produces.
scores = scores[scores.columns.drop(list(scores.filter(regex='_All_BTScores$')))]

scores.head(2)


### Select subsets of columns into variables for easy reference

In [None]:
# Column groups for display.
# NOTE: METRICS is rebound here from metric names to score-column names, so the
# earlier backtest cell that loops over METRICS should not be re-run after this.
METRICS = scores.filter(regex='MASE|RMSE').columns.to_list()
PROJECT = ['Project_Name', 'Project_ID', 'Model_ID', 'Model_Type', 'Featurelist']
COLS = PROJECT + METRICS

In [None]:
# Show the identifying columns together with the MASE/RMSE score columns.
scores[COLS]

In [None]:
# Recover the settings model_factory() encoded in each project name:
#   "<prefix>_Mnths:<n>_Mode:<m>_FDW:<start>-<end>_FD:<start>-<end>"
# `\d+` accepts any number of digits (the former `\d{1,2}` truncated or missed
# values of three or more digits) and `-?` allows a negative FDW end.
# expand=False yields a Series, which is what a single-column assignment wants.
# The extracted values are strings.
project_names = scores['Project_Name']
scores['FDW_Start'] = project_names.str.extract(r'FDW:(-?\d+)', expand=False)
scores['FDW_End']   = project_names.str.extract(r'FDW:-?\d+-(-?\d+)_', expand=False)
scores['FD_Start']  = project_names.str.extract(r'FD:(\d+)', expand=False)
scores['FD_End']    = project_names.str.extract(r'FD:\d+-(\d+)', expand=False)
scores['Months']    = project_names.str.extract(r'_Mnths:(\d+)_', expand=False)

# Replace the space in deviance column names, should any such column exist
# (labels not present are ignored by rename). The duplicated
# 'Holdout_Tweedie Deviance' key was removed.
scores = scores.rename(columns={'All_Backtests_Poisson Deviance':'All_Backtests_Poisson_Deviance',
                                'Backtest_1_Poisson Deviance':'Backtest_1_Poisson_Deviance',
                                'Holdout_Poisson Deviance':'Holdout_Poisson_Deviance',
                                'All_Backtests_Tweedie Deviance':'All_Backtests_Tweedie_Deviance',
                                'Backtest_1_Tweedie Deviance':'Backtest_1_Tweedie_Deviance',
                                'Holdout_Tweedie Deviance':'Holdout_Tweedie_Deviance'})


META = ['FDW_Start', 'FDW_End', 'FD_Start', 'FD_End', 'Months']
MORE = PROJECT + META + METRICS


# Sort by the correct partition
scores[MORE].sort_values(by=[""" Enter Code """], ascending=True)

## Get Best Model

In [None]:
# Keep only the rows that have a value in the chosen score column (replace the
# placeholder with that column's name).
hrmse = scores.loc[scores[""" Enter Code """].notnull()]

# Alternative: take the single best model overall
# hrmse_best = pd.DataFrame(hrmse.loc[hrmse.MASE_All_BT.idxmin()]).transpose()

# Take the best model per project name, i.e. the row with the lowest RMSE_BT_1.
# NOTE(review): RMSE_BT_1 is hard-coded here independently of the column chosen
# in the filter above — confirm the two are meant to agree.
hrmse_best = hrmse.loc[hrmse.groupby('Project_Name').RMSE_BT_1.idxmin()]

best_models = pd.DataFrame(hrmse_best) 
best_models

## Generate Predictions

### Holiday Optimized

### Select the record with the project that has the holdout set 

In [None]:
# Positional row of `best_models` to use (it is applied as .values[RECORD]);
# replace the placeholder with an integer position.
RECORD = """ Enter Code """

In [None]:
# Look up the project and model IDs at position RECORD of best_models.
PID = best_models['Project_ID'].values[RECORD]
MID = best_models['Model_ID'].values[RECORD]

# Fetch both objects from the server and print them to verify the selection.
project = dr.Project.get(PID)
model   = dr.Model.get(PID, MID)
print(project)
print(model)

## Retrain on Frozen parameters

### Enter the start and end dates to re-train the model on with frozen parameters

In [None]:
# Retraining window; replace each placeholder with a date string that
# pd.to_datetime can parse.
TRAINING_START_DATE    = pd.to_datetime(""" Enter Code """)
TRAINING_END_DATE      = pd.to_datetime(""" Enter Code """)

In [None]:
# Request a frozen retrain of `model` on the window TRAINING_START_DATE ..
# TRAINING_END_DATE; the row-count, duration and sampling options are left unset.
job = model.request_frozen_datetime_model( training_row_count     = None, 
                                           training_duration      = None,
                                           training_start_date    = TRAINING_START_DATE,
                                           training_end_date      = TRAINING_END_DATE,
                                           time_window_sample_pct = None  
                                          )

# Block until the job has finished and keep the resulting model.
retrained_model = job.get_result_when_complete()

In [None]:
# Repeat the RECORD lookup so this cell can be run on its own.
# MID is assigned here but not used within this cell.
PID = best_models['Project_ID'].values[RECORD]
MID = best_models['Model_ID'].values[RECORD]

project = dr.Project.get(PID)
print(project)

In [None]:
# Predict with the retrained (frozen-parameter) model.
model   = retrained_model 

# Upload the scoring data to the project; replace the placeholder with it.
dataset = project.upload_dataset(""" Enter Code """)

# Request predictions on the uploaded dataset and wait for the result.
pred_job = model.request_predictions(dataset_id = dataset.id)

H_preds = pred_job.get_result_when_complete()

# Parse both time columns as timezone-aware (UTC) datetimes.
H_preds['timestamp']      = pd.to_datetime(H_preds['timestamp'], utc=True)
H_preds['forecast_point'] = pd.to_datetime(H_preds['forecast_point'], utc=True)

# Rename to 'Date' / 'Holiday_Pred' (in place) for the later comparison.
H_preds.rename(columns={'timestamp':'Date', 'prediction':'Holiday_Pred'}, inplace=True)

H_preds.head(5)

## Summer Optimized

### Select the record with the project that has the default partitioning

In [None]:
# Positional row of `best_models` for the default-partitioning project;
# replace the placeholder with an integer position.
# NOTE: this rebinds RECORD, PID, MID, project and model from the holiday section.
RECORD = """ Enter Code """

# Verify correct project
PID = best_models['Project_ID'].values[RECORD]
MID = best_models['Model_ID'].values[RECORD]

project = dr.Project.get(PID)
model   = dr.Model.get(PID, MID)
print(project, model)

In [None]:
# Repeat the RECORD lookup so this cell can be run on its own.
PID = best_models['Project_ID'].values[RECORD]
MID = best_models['Model_ID'].values[RECORD]

project = dr.Project.get(PID)
print(project.project_name)
print(" ")

# The selected model is used as-is here; no frozen retrain is requested.
model   = dr.Model.get(PID, MID)

# Upload the scoring data to the project; replace the placeholder with it.
dataset = project.upload_dataset(""" Enter Code """)

# Request predictions on the uploaded dataset and wait for the result.
pred_job = model.request_predictions(dataset_id = dataset.id)

S_preds = pred_job.get_result_when_complete()

# Parse both time columns as timezone-aware (UTC) datetimes.
S_preds['timestamp']      = pd.to_datetime(S_preds['timestamp'], utc=True)          
S_preds['forecast_point'] = pd.to_datetime(S_preds['forecast_point'], utc=True)

# Rename to 'Date' / 'Summer_Pred' (in place) for the later comparison.
S_preds.rename(columns={'timestamp':'Date', 'prediction':'Summer_Pred'}, inplace=True)

S_preds.head(5)

## Actuals

### Read in the CTA_actuals dataset

In [None]:
# Load the actual values; replace the placeholder with the CSV path.
actuals = pd.read_csv(""" Enter Code """)

# Parse 'Date' as UTC so it has the same dtype as the prediction frames' 'Date'.
actuals['Date'] =  pd.to_datetime(actuals['Date'], utc=True)      
actuals.head(5)

## Compare Summer vs Holiday Model Forecasts

### Merge the Actuals, Summer, and Holiday datasets

In [None]:
# Replace the placeholder with the merge of the actuals and both prediction
# frames; the result must contain 'Summer_Pred' and 'Holiday_Pred' columns.
results = """ Enter Code """
# Cast predictions to integers.
# NOTE(review): astype(int) truncates and raises on missing values, so the
# merge presumably must leave no NaN in these columns — confirm.
results['Summer_Pred']  = results['Summer_Pred'].astype(int)
results['Holiday_Pred'] = results['Holiday_Pred'].astype(int)
results.head(5)

In [None]:
# Plot the columns of `results` against 'Date'; `;` hides the Axes repr.
results.plot(x='Date', figsize=(22, 8));

## Compare RMSE

In [None]:
# NOTE: this rebinds `actuals` from the DataFrame loaded earlier to the
# 'Rides' column of `results`.
actuals = results.Rides
# Template lines: replace each placeholder with the matching prediction
# column name (as written, with the placeholder string, they are not valid Python).
summer  = results.""" Enter Code """
holiday = results.""" Enter Code """

# Replace the placeholders with the true values, the predictions and the
# `squared` flag.
# NOTE(review): the printed labels say RMSE, so `squared` is presumably meant
# to be False; newer scikit-learn releases deprecate this argument — confirm
# against the installed version.
Summer_rmse  = mean_squared_error(""" Enter Code """, """ Enter Code """, squared=""" Enter Code """)
Holiday_rmse = mean_squared_error(""" Enter Code """, """ Enter Code """, squared=""" Enter Code """)

print('Summer RMSE:  {:,.2f}'.format(Summer_rmse))
print('Holiday RMSE: {:,.2f}'.format(Holiday_rmse))