In [27]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.pylab import rcParams
import statsmodels.api as sm
from statsmodels.tsa.stattools import adfuller
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
import numpy as np
import os
#from pmdarima import auto_arima
import sys
from datetime import datetime, timedelta
from sklearn.metrics import mean_squared_error
from statsmodels.tsa.arima_model import ARIMA
from statsmodels.tsa.statespace.sarimax import  SARIMAX
import time
from tqdm.notebook import tqdm
from google.colab import files

# Setting Inputs

## Setting Dates

In [5]:
# Every date in this notebook is a 'yyyy-mm-dd' string; edit these per run.

# First and last day covered by the observed data.
known_start_dt, known_end_dt = '2018-01-01', '2020-06-03'

# Last day of the period to forecast (generally the year end).
unknown_end_dt = '2020-06-30'

In [6]:
# Original (uncapped) data: parse the `date` column and use it as the index.
df_all = pd.read_csv('https://raw.githubusercontent.com/datacoe-publicissapient/risingai2020/master/data/Model_uncapped.csv')
df_all = df_all.assign(date = pd.to_datetime(df_all.date)).set_index('date')

In [7]:
# Outlier-treated data: parse the `date` column and use it as the index.
df_capped = pd.read_csv('https://raw.githubusercontent.com/datacoe-publicissapient/risingai2020/master/data/Model_Final.csv')
df_capped = df_capped.assign(date = pd.to_datetime(df_capped.date)).set_index('date')

In [8]:
# Locations of the per-nameplate lookup tables (columns p, d, q and
# outlier_treatment are read from them further down).
# NOTE(review): these used to be local files 'Final_Visits_PDQ.xlsx' and
# 'Final_Leads_PDQ.xlsx' — presumably superseded by the hosted CSVs; confirm.
pdq_visit_unpaid_decided = (
    'https://raw.githubusercontent.com/datacoe-publicissapient/risingai2020'
    '/master/data/Final_Visits_PDQ.csv'
)
pdq_leads_decided = (
    'https://raw.githubusercontent.com/datacoe-publicissapient/risingai2020'
    '/master/data/Final_Leads_PDQ.csv'
)


# Functions

In [9]:
# MAPE helper
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean of |y_true - y_pred| / |y_true| as a fraction.

    The result is not multiplied by 100. Zeros in ``y_true`` are not filtered
    here, so callers that may have zero actuals must drop them beforehand.
    """
    actual = np.array(y_true)
    predicted = np.array(y_pred)
    relative_error = (actual - predicted) / actual
    return np.mean(np.abs(relative_error))

In [10]:
def alldates(start_dt, end_dt):
    '''
    List every calendar day from start_dt through end_dt, both inclusive.

    Both arguments and every returned element are 'yyyy-mm-dd' strings.
    The list is empty when end_dt is earlier than start_dt.
    '''
    day_fmt = '%Y-%m-%d'
    first_day = datetime.strptime(start_dt, day_fmt).date()
    last_day = datetime.strptime(end_dt, day_fmt).date()

    # +1 so that the end date itself is part of the result
    span_days = (last_day - first_day).days + 1

    return [(first_day + timedelta(offset)).strftime(day_fmt)
            for offset in range(span_days)]

## RMSE/MAPE

In [13]:
def rmse_mape_60(df_all,df_capped,pdq,var,exog_var,if_outlier_treat,nameplate,known_start_dt,known_end_dt,unknown_end_dt):
    """Back-test an ARIMAX fit for one nameplate/variable on the tail of the known data.

    Training uses the rows from ``known_start_dt`` up to 61 days before
    ``known_end_dt``; evaluation uses the rows from 60 days before
    ``known_end_dt`` through ``known_end_dt``.

    Parameters
    ----------
    df_all : DataFrame with the uncapped data. It needs a ``Category`` column
        plus the ``var`` and ``exog_var`` columns, and must be sliceable by
        'yyyy-mm-dd' strings (the notebook indexes it by date).
    df_capped : outlier-treated counterpart of ``df_all``; only used as
        training data when ``if_outlier_treat == 'Y'``.
    pdq : ARIMA order given as three single-digit characters, e.g. ``'511'``.
    var : name of the column to model.
    exog_var : name of the column passed as exogenous regressor.
    if_outlier_treat : ``'Y'`` to train on ``df_capped``, ``'N'`` to train on
        ``df_all``. Any other value is reported as an error.
    nameplate : value of ``Category`` to select.
    known_start_dt, known_end_dt : 'yyyy-mm-dd' bounds of the known data.
    unknown_end_dt : not used by this function.

    Returns
    -------
    (rmse, mape)
        ``rmse`` is rounded to 0 decimals and computed on the whole hold-out
        window against the uncapped actuals. ``mape`` is a fraction rounded to
        2 decimals, computed after dropping rows whose actual value is 0.
        Both are the string ``'error'`` when the modelling step fails; the
        failure is printed rather than raised.
    """
    start_time = time.time()

    # Same text the log lines have always carried. It is built with format()
    # so that a non-string argument cannot make the logging itself raise.
    log_prefix = 'rmse_mape_60:nameplate={0}:variable={1}:pdq={2}:if_outlier_treat={3}'.format(
        nameplate, var, pdq, if_outlier_treat)

    # Hold-out window opens 60 days before known_end_dt; training stops the
    # day before that. Label slices on a date index include both endpoints.
    known_end = datetime.strptime(known_end_dt, '%Y-%m-%d').date()
    known_last2_start_dt = (known_end - timedelta(days = 60)).strftime("%Y-%m-%d")
    end_dt_before_last2 = (known_end - timedelta(days = 61)).strftime("%Y-%m-%d")

    # subsetting for only the given nameplate
    df_a = df_all[df_all.Category == nameplate]
    df_c = df_capped[df_capped.Category == nameplate]

    train_last60_a = df_a.loc[known_start_dt:end_dt_before_last2]
    train_last60_c = df_c.loc[known_start_dt:end_dt_before_last2]
    # Evaluation always happens against the uncapped actuals.
    test_last60 = df_a.loc[known_last2_start_dt:known_end_dt]

    try:
        # pdq is read character by character: '511' -> p=5, d=1, q=1
        p = int(pdq[0])
        d = int(pdq[1])
        q = int(pdq[2])

        # deciding training data
        if if_outlier_treat == 'Y':
            train_last60 = train_last60_c
        elif if_outlier_treat == 'N':
            train_last60 = train_last60_a
        else:
            # Fail with a clear message instead of an accidental unbound-local error.
            raise ValueError('if_outlier_treat flag wrong: if_outlier_treat= {0}'.format(if_outlier_treat))

        # ARIMAX model: fit on the training window, forecast the hold-out window
        ar = ARIMA(endog=train_last60[var], exog=train_last60[exog_var], order=(p,d,q))
        model = ar.fit()
        pred_test_last60 = model.forecast(steps=len(test_last60), exog=test_last60[exog_var])
        # element 0 of the forecast() result holds the point forecasts
        preds = pred_test_last60[0]

        # MAPE divides by the actuals, so rows with an actual of 0 are dropped
        # for MAPE only; RMSE still uses the full window.
        df_mape_inf = pd.DataFrame()
        df_mape_inf[var] = test_last60[var]
        df_mape_inf['preds'] = preds
        df_mape_inf = df_mape_inf[df_mape_inf[var] != 0]

        rmse = round(np.sqrt(mean_squared_error(test_last60[var], preds)), 0)
        mape = round(mean_absolute_percentage_error(df_mape_inf[var], df_mape_inf['preds']), 2)

    except Exception as e:
        # Best effort by design: one failing combination must not stop the sweep.
        print(log_prefix + ':error=' + str(e))
        rmse = 'error'
        mape = 'error'

    end_time = time.time()
    print(log_prefix + ':Time Taken={0} seconds'.format(end_time - start_time))

    return rmse, mape

### RMSE/MAPE for 511 and no outlier treatment

In [14]:
start_time = time.time()

# Baseline run: the same order and no outlier treatment for every combination.
pdq = '511'
if_outlier_treat = 'N'

# One dict per (nameplate, variable); turned into a DataFrame once at the end
# instead of growing a DataFrame row by row.
rmse_mape_records = []

for nameplate in tqdm(df_all.Category.unique()):

    for var in ['visits','testdrive','leads']:

        if var in ['visits','testdrive']:
            exog_var = 'total_spend_nobrand_nodig'
        elif var == 'leads':
            # This used to be a comparison (`==`), which did nothing and left
            # the visits/testdrive regressor in place for leads.
            # NOTE(review): confirm 'total_spend_nobrand' exists in the data;
            # if it does not, rmse_mape_60 reports 'error' for leads.
            exog_var = 'total_spend_nobrand'
        else:
            print('error: wrong variable: var='+var)
            continue

        rmse, mape = rmse_mape_60(df_all,df_capped,pdq,var,exog_var,if_outlier_treat,nameplate,known_start_dt,known_end_dt,unknown_end_dt)
        rmse_mape_records.append({'Nameplate':nameplate,
                                  'Variable':var,
                                  'pdq':pdq,
                                  'if_outlier_treat':if_outlier_treat,
                                  'RMSE':rmse,
                                  'MAPE':mape})

df_rmse_mape = pd.DataFrame(rmse_mape_records,
                            columns = ['Nameplate','Variable','pdq','if_outlier_treat','RMSE','MAPE'])

end_time = time.time()
print("Time Taken RMSE/MAPE: {0} seconds".format(end_time - start_time))

HBox(children=(FloatProgress(value=0.0, max=11.0), HTML(value='')))



rmse_mape_60:nameplate=Cat1:variable=visits:pdq=511:if_outlier_treat=N:Time Taken=4.046333312988281 seconds




rmse_mape_60:nameplate=Cat1:variable=testdrive:pdq=511:if_outlier_treat=N:Time Taken=1.395888090133667 seconds




rmse_mape_60:nameplate=Cat1:variable=leads:pdq=511:if_outlier_treat=N:Time Taken=0.7651863098144531 seconds




rmse_mape_60:nameplate=Cat2:variable=visits:pdq=511:if_outlier_treat=N:Time Taken=1.6667945384979248 seconds




rmse_mape_60:nameplate=Cat2:variable=testdrive:pdq=511:if_outlier_treat=N:Time Taken=0.8452863693237305 seconds




rmse_mape_60:nameplate=Cat2:variable=leads:pdq=511:if_outlier_treat=N:Time Taken=0.6139414310455322 seconds




rmse_mape_60:nameplate=Cat3:variable=visits:pdq=511:if_outlier_treat=N:Time Taken=2.091413974761963 seconds




rmse_mape_60:nameplate=Cat3:variable=testdrive:pdq=511:if_outlier_treat=N:Time Taken=0.5643813610076904 seconds




rmse_mape_60:nameplate=Cat3:variable=leads:pdq=511:if_outlier_treat=N:Time Taken=0.5427820682525635 seconds




rmse_mape_60:nameplate=Cat4:variable=visits:pdq=511:if_outlier_treat=N:Time Taken=2.9324843883514404 seconds
rmse_mape_60:nameplate=Cat4:variable=testdrive:pdq=511:if_outlier_treat=N:Time Taken=1.774078130722046 seconds




rmse_mape_60:nameplate=Cat4:variable=leads:pdq=511:if_outlier_treat=N:Time Taken=1.2879483699798584 seconds




rmse_mape_60:nameplate=Cat5:variable=visits:pdq=511:if_outlier_treat=N:Time Taken=2.7156052589416504 seconds




rmse_mape_60:nameplate=Cat5:variable=testdrive:pdq=511:if_outlier_treat=N:Time Taken=1.5589556694030762 seconds




rmse_mape_60:nameplate=Cat5:variable=leads:pdq=511:if_outlier_treat=N:Time Taken=0.5667691230773926 seconds
rmse_mape_60:nameplate=Cat6:variable=visits:pdq=511:if_outlier_treat=N:Time Taken=0.9462826251983643 seconds




rmse_mape_60:nameplate=Cat6:variable=testdrive:pdq=511:if_outlier_treat=N:Time Taken=2.1375975608825684 seconds




rmse_mape_60:nameplate=Cat6:variable=leads:pdq=511:if_outlier_treat=N:Time Taken=6.079768657684326 seconds




rmse_mape_60:nameplate=Cat7:variable=visits:pdq=511:if_outlier_treat=N:Time Taken=2.3442418575286865 seconds




rmse_mape_60:nameplate=Cat7:variable=testdrive:pdq=511:if_outlier_treat=N:Time Taken=1.3112952709197998 seconds




rmse_mape_60:nameplate=Cat7:variable=leads:pdq=511:if_outlier_treat=N:Time Taken=1.098459243774414 seconds




rmse_mape_60:nameplate=Cat8:variable=visits:pdq=511:if_outlier_treat=N:Time Taken=1.5222930908203125 seconds
rmse_mape_60:nameplate=Cat8:variable=testdrive:pdq=511:if_outlier_treat=N:Time Taken=0.5970020294189453 seconds




rmse_mape_60:nameplate=Cat8:variable=leads:pdq=511:if_outlier_treat=N:Time Taken=0.5689678192138672 seconds
rmse_mape_60:nameplate=Cat9:variable=visits:pdq=511:if_outlier_treat=N:Time Taken=0.8727695941925049 seconds




rmse_mape_60:nameplate=Cat9:variable=testdrive:pdq=511:if_outlier_treat=N:Time Taken=3.301668643951416 seconds




rmse_mape_60:nameplate=Cat9:variable=leads:pdq=511:if_outlier_treat=N:Time Taken=0.5679769515991211 seconds




rmse_mape_60:nameplate=Cat10:variable=visits:pdq=511:if_outlier_treat=N:Time Taken=0.7348644733428955 seconds
rmse_mape_60:nameplate=Cat10:variable=testdrive:pdq=511:if_outlier_treat=N:Time Taken=0.7869482040405273 seconds




rmse_mape_60:nameplate=Cat10:variable=leads:pdq=511:if_outlier_treat=N:Time Taken=1.0849981307983398 seconds




rmse_mape_60:nameplate=Cat11:variable=visits:pdq=511:if_outlier_treat=N:Time Taken=2.851475954055786 seconds




rmse_mape_60:nameplate=Cat11:variable=testdrive:pdq=511:if_outlier_treat=N:Time Taken=1.4829566478729248 seconds




rmse_mape_60:nameplate=Cat11:variable=leads:pdq=511:if_outlier_treat=N:Time Taken=0.8433663845062256 seconds

Time Taken RMSE/MAPE: 52.77849555015564 seconds


In [15]:
# Save the baseline (order 511, no outlier treatment) back-test metrics.
df_rmse_mape.to_csv(path_or_buf='MAPE_RMSE_unpaid_511_noOUTLIERtreat.csv', index=False)

### RMSE/MAPE for suggested pdqs and outlier treatment

In [16]:
# Distinct nameplates present in the uncapped data.
df_all['Category'].unique()

array(['Cat1', 'Cat2', 'Cat3', 'Cat4', 'Cat5', 'Cat6', 'Cat7', 'Cat8',
       'Cat9', 'Cat10', 'Cat11'], dtype=object)

In [17]:
start_time = time.time()

# reading finally decided pdq and outlier treat
pdq_map_vu = pd.read_csv(pdq_visit_unpaid_decided)
pdq_map_vu['Nameplate'] = pdq_map_vu['Nameplate'].astype(str)
pdq_map_l = pd.read_csv(pdq_leads_decided)
pdq_map_l['Nameplate'] = pdq_map_l['Nameplate'].astype(str)

# One dict per (nameplate, variable); turned into a DataFrame once at the end
# instead of growing a DataFrame row by row.
rmse_mape_records = []

for nameplate in tqdm(df_all.Category.unique()):

    for var in ['visits','testdrive','leads']:

        # visits/testdrive and leads each have their own regressor and lookup
        # table. 'leads' used to be listed in the first condition, which made
        # its own branch unreachable and left pdq_map_l unused.
        # NOTE(review): confirm leads is meant to use pdq_map_l and
        # 'total_spend_nobrand'; the recorded outputs predate this change.
        if var in ['visits','testdrive']:
            exog_var = 'total_spend_nobrand_nodig'
            pdq_map = pdq_map_vu
        elif var == 'leads':
            exog_var = 'total_spend_nobrand'
            pdq_map = pdq_map_l
        else:
            print('error: wrong variable: var='+var)
            continue

        pdq_map_nameplate = pdq_map[pdq_map.Nameplate == nameplate]
        if pdq_map_nameplate.empty:
            # Report a missing lookup row instead of failing with an IndexError.
            print('error: no pdq entry: nameplate='+str(nameplate)+':var='+var)
            continue

        p = str(pdq_map_nameplate['p'].values[0])
        d = str(pdq_map_nameplate['d'].values[0])
        q = str(pdq_map_nameplate['q'].values[0])
        pdq = p+d+q
        if_outlier_treat = str(pdq_map_nameplate['outlier_treatment'].values[0])

        rmse, mape = rmse_mape_60(df_all,df_capped,pdq,var,exog_var,if_outlier_treat,nameplate,known_start_dt,known_end_dt,unknown_end_dt)
        rmse_mape_records.append({'Nameplate':nameplate,
                                  'Variable':var,
                                  'pdq':pdq,
                                  'if_outlier_treat':if_outlier_treat,
                                  'RMSE':rmse,
                                  'MAPE':mape})

df_rmse_mape = pd.DataFrame(rmse_mape_records,
                            columns = ['Nameplate','Variable','pdq','if_outlier_treat','RMSE','MAPE'])

end_time = time.time()
print("Time Taken RMSE/MAPE: {0} seconds".format(end_time - start_time))

HBox(children=(FloatProgress(value=0.0, max=11.0), HTML(value='')))



rmse_mape_60:nameplate=Cat1:variable=visits:pdq=514:if_outlier_treat=N:error=The computed initial AR coefficients are not stationary
You should induce stationarity, choose a different model order, or you can
pass your own start_params.
rmse_mape_60:nameplate=Cat1:variable=visits:pdq=514:if_outlier_treat=N:Time Taken=0.07643985748291016 seconds




rmse_mape_60:nameplate=Cat1:variable=testdrive:pdq=514:if_outlier_treat=N:Time Taken=13.551164388656616 seconds




rmse_mape_60:nameplate=Cat1:variable=leads:pdq=514:if_outlier_treat=N:Time Taken=1.2453625202178955 seconds
rmse_mape_60:nameplate=Cat2:variable=visits:pdq=511:if_outlier_treat=N:Time Taken=1.70414400100708 seconds




rmse_mape_60:nameplate=Cat2:variable=testdrive:pdq=511:if_outlier_treat=N:Time Taken=0.8087983131408691 seconds




rmse_mape_60:nameplate=Cat2:variable=leads:pdq=511:if_outlier_treat=N:Time Taken=0.6335155963897705 seconds




rmse_mape_60:nameplate=Cat3:variable=visits:pdq=511:if_outlier_treat=N:Time Taken=2.106642484664917 seconds




rmse_mape_60:nameplate=Cat3:variable=testdrive:pdq=511:if_outlier_treat=N:Time Taken=0.5676822662353516 seconds




rmse_mape_60:nameplate=Cat3:variable=leads:pdq=511:if_outlier_treat=N:Time Taken=0.5710711479187012 seconds




rmse_mape_60:nameplate=Cat4:variable=visits:pdq=511:if_outlier_treat=N:Time Taken=2.943845510482788 seconds
rmse_mape_60:nameplate=Cat4:variable=testdrive:pdq=511:if_outlier_treat=N:Time Taken=1.774418830871582 seconds




rmse_mape_60:nameplate=Cat4:variable=leads:pdq=511:if_outlier_treat=N:Time Taken=1.2953507900238037 seconds




rmse_mape_60:nameplate=Cat5:variable=visits:pdq=514:if_outlier_treat=N:Time Taken=10.875273704528809 seconds
rmse_mape_60:nameplate=Cat5:variable=testdrive:pdq=514:if_outlier_treat=N:Time Taken=8.954253196716309 seconds




rmse_mape_60:nameplate=Cat5:variable=leads:pdq=514:if_outlier_treat=N:Time Taken=1.3743484020233154 seconds
rmse_mape_60:nameplate=Cat6:variable=visits:pdq=511:if_outlier_treat=N:Time Taken=0.9471840858459473 seconds




rmse_mape_60:nameplate=Cat6:variable=testdrive:pdq=511:if_outlier_treat=N:Time Taken=2.142988443374634 seconds




rmse_mape_60:nameplate=Cat6:variable=leads:pdq=511:if_outlier_treat=N:Time Taken=6.113490343093872 seconds




rmse_mape_60:nameplate=Cat7:variable=visits:pdq=511:if_outlier_treat=N:Time Taken=2.3554513454437256 seconds




rmse_mape_60:nameplate=Cat7:variable=testdrive:pdq=511:if_outlier_treat=N:Time Taken=1.2950642108917236 seconds




rmse_mape_60:nameplate=Cat7:variable=leads:pdq=511:if_outlier_treat=N:Time Taken=1.0907342433929443 seconds


  newparams = ((1-np.exp(-params))/(1+np.exp(-params))).copy()
  newparams = ((1-np.exp(-params))/(1+np.exp(-params))).copy()
  tmp = ((1-np.exp(-params))/(1+np.exp(-params))).copy()
  tmp = ((1-np.exp(-params))/(1+np.exp(-params))).copy()


rmse_mape_60:nameplate=Cat8:variable=visits:pdq=313:if_outlier_treat=N:Time Taken=2.7632904052734375 seconds




rmse_mape_60:nameplate=Cat8:variable=testdrive:pdq=313:if_outlier_treat=N:Time Taken=0.7830207347869873 seconds


  newparams = ((1-np.exp(-params))/(1+np.exp(-params))).copy()
  newparams = ((1-np.exp(-params))/(1+np.exp(-params))).copy()
  tmp = ((1-np.exp(-params))/(1+np.exp(-params))).copy()
  tmp = ((1-np.exp(-params))/(1+np.exp(-params))).copy()


rmse_mape_60:nameplate=Cat8:variable=leads:pdq=313:if_outlier_treat=N:Time Taken=1.8505322933197021 seconds
rmse_mape_60:nameplate=Cat9:variable=visits:pdq=511:if_outlier_treat=N:Time Taken=0.8736495971679688 seconds




rmse_mape_60:nameplate=Cat9:variable=testdrive:pdq=511:if_outlier_treat=N:Time Taken=3.5172784328460693 seconds




rmse_mape_60:nameplate=Cat9:variable=leads:pdq=511:if_outlier_treat=N:Time Taken=0.5679292678833008 seconds




rmse_mape_60:nameplate=Cat10:variable=visits:pdq=511:if_outlier_treat=N:Time Taken=0.7633087635040283 seconds
rmse_mape_60:nameplate=Cat10:variable=testdrive:pdq=511:if_outlier_treat=N:Time Taken=0.7731955051422119 seconds




rmse_mape_60:nameplate=Cat10:variable=leads:pdq=511:if_outlier_treat=N:Time Taken=1.07965087890625 seconds




rmse_mape_60:nameplate=Cat11:variable=visits:pdq=514:if_outlier_treat=N:Time Taken=12.156602382659912 seconds


  newparams = ((1-np.exp(-params))/(1+np.exp(-params))).copy()
  newparams = ((1-np.exp(-params))/(1+np.exp(-params))).copy()
  tmp = ((1-np.exp(-params))/(1+np.exp(-params))).copy()
  tmp = ((1-np.exp(-params))/(1+np.exp(-params))).copy()


rmse_mape_60:nameplate=Cat11:variable=testdrive:pdq=514:if_outlier_treat=N:Time Taken=15.434316873550415 seconds
rmse_mape_60:nameplate=Cat11:variable=leads:pdq=514:if_outlier_treat=N:Time Taken=1.5621440410614014 seconds

Time Taken RMSE/MAPE: 105.21390533447266 seconds




In [18]:
# Save the back-test metrics obtained with the per-nameplate settings.
df_rmse_mape.to_csv(path_or_buf='MAPE_RMSE_unpaid_suggested.csv', index=False)

# Forecast

In [20]:
def forecast_func_autoarima(df_all,df_capped,pdq,var,exog_var,if_outlier_treat,nameplate,known_start_dt,known_end_dt,unknown_end_dt):
    """Fit an ARIMAX model on all known data and forecast the unknown period.

    Despite the name, no automatic order search happens here: the order is
    taken from ``pdq``. The logic matches ``forecast_func``; only the name
    shown in the log lines differs.

    Parameters
    ----------
    df_all : DataFrame with the uncapped data. It needs a ``Category`` column
        plus the ``var`` and ``exog_var`` columns, must be sliceable by
        'yyyy-mm-dd' strings, and must already contain ``exog_var`` rows for
        the period being forecast.
    df_capped : outlier-treated counterpart; only used as training data when
        ``if_outlier_treat == 'Y'``.
    pdq : ARIMA order given as three single-digit characters, e.g. ``'511'``.
    var : name of the column to model.
    exog_var : name of the column passed as exogenous regressor.
    if_outlier_treat : ``'Y'`` or ``'N'``; any other value is reported as an error.
    nameplate : value of ``Category`` to select.
    known_start_dt, known_end_dt : 'yyyy-mm-dd' bounds of the training data.
    unknown_end_dt : last 'yyyy-mm-dd' day to forecast; forecasting starts the
        day after ``known_end_dt``.

    Returns
    -------
    DataFrame with columns ``date``, ``<var>_actual`` and ``<var>_forecast``.
    The known-period rows carry the uncapped actuals. They are followed by one
    row per calendar day of the unknown period carrying the forecast. The two
    parts keep their own 0-based indexes. When the modelling step fails, the
    error is printed and an empty frame with the same three columns is returned.
    """
    start_time = time.time()

    # Same text the log lines have always carried. It is built with format()
    # so that a non-string argument cannot make the logging itself raise.
    log_prefix = 'forecast_func_autoarima:nameplate={0}:variable={1}:pdq={2}:if_outlier_treat={3}'.format(
        nameplate, var, pdq, if_outlier_treat)

    # Forecasting starts the day after the last known observation.
    unknown_start_dt = (datetime.strptime(known_end_dt, '%Y-%m-%d').date() + timedelta(days = 1)).strftime("%Y-%m-%d")

    # subsetting for only the given nameplate
    df_a = df_all[df_all.Category == nameplate]
    df_c = df_capped[df_capped.Category == nameplate]

    train_all_known_a = df_a.loc[known_start_dt:known_end_dt]
    train_all_known_c = df_c.loc[known_start_dt:known_end_dt]
    # Only the exogenous column of these future rows is used.
    test_all_unknown = df_a.loc[unknown_start_dt:unknown_end_dt]

    try:
        # pdq is read character by character: '511' -> p=5, d=1, q=1
        p = int(pdq[0])
        d = int(pdq[1])
        q = int(pdq[2])

        # deciding training data
        if if_outlier_treat == 'Y':
            train_all_known = train_all_known_c
        elif if_outlier_treat == 'N':
            train_all_known = train_all_known_a
        else:
            # Fail with a clear message instead of an accidental unbound-local error.
            raise ValueError('if_outlier_treat flag wrong: if_outlier_treat= {0}'.format(if_outlier_treat))

        ar = ARIMA(endog=train_all_known[var], exog=train_all_known[exog_var], order=(p,d,q))
        model = ar.fit()
        # element 0 of the forecast() result holds the point forecasts
        pred_unknown = model.forecast(steps=len(test_all_unknown), exog=test_all_unknown[exog_var])

        # The output reports the uncapped actuals even when the model was
        # trained on outlier-capped data.
        df_actual = pd.DataFrame()
        df_actual['date'] = pd.Series(train_all_known_a.index)
        df_actual[var+'_actual'] = list(train_all_known_a[var])

        # One row per calendar day of the unknown period, both ends included.
        df_pred = pd.DataFrame({'date': pd.date_range(unknown_start_dt, unknown_end_dt, freq='D')})
        df_pred[var+'_forecast'] = pd.Series(pred_unknown[0])

        ar_forecast = pd.concat([df_actual, df_pred])

    except Exception as e:
        # Best effort by design: return an empty, correctly shaped frame so
        # the calling loop can carry on with the next combination.
        ar_forecast = pd.DataFrame(columns = ['date','{0}_actual'.format(var),'{0}_forecast'.format(var)])
        print(log_prefix + ':error=' + str(e))

    end_time = time.time()
    print(log_prefix + ':Time Taken={0} seconds'.format(end_time - start_time))

    return ar_forecast

In [21]:
def forecast_func(df_all,df_capped,pdq,var,exog_var,if_outlier_treat,nameplate,known_start_dt,known_end_dt,unknown_end_dt):
    """Fit an ARIMAX model on all known data and forecast the unknown period.

    Parameters
    ----------
    df_all : DataFrame with the uncapped data. It needs a ``Category`` column
        plus the ``var`` and ``exog_var`` columns, must be sliceable by
        'yyyy-mm-dd' strings, and must already contain ``exog_var`` rows for
        the period being forecast.
    df_capped : outlier-treated counterpart; only used as training data when
        ``if_outlier_treat == 'Y'``.
    pdq : ARIMA order given as three single-digit characters, e.g. ``'511'``.
    var : name of the column to model.
    exog_var : name of the column passed as exogenous regressor.
    if_outlier_treat : ``'Y'`` or ``'N'``; any other value is reported as an error.
    nameplate : value of ``Category`` to select.
    known_start_dt, known_end_dt : 'yyyy-mm-dd' bounds of the training data.
    unknown_end_dt : last 'yyyy-mm-dd' day to forecast; forecasting starts the
        day after ``known_end_dt``.

    Returns
    -------
    DataFrame with columns ``date``, ``<var>_actual`` and ``<var>_forecast``.
    The known-period rows carry the uncapped actuals. They are followed by one
    row per calendar day of the unknown period carrying the forecast. The two
    parts keep their own 0-based indexes. When the modelling step fails, the
    error is printed and an empty frame with the same three columns is returned.
    """
    start_time = time.time()

    # Same text the log lines have always carried. It is built with format()
    # so that a non-string argument cannot make the logging itself raise.
    log_prefix = 'forecast_func:nameplate={0}:variable={1}:pdq={2}:if_outlier_treat={3}'.format(
        nameplate, var, pdq, if_outlier_treat)

    # Forecasting starts the day after the last known observation.
    unknown_start_dt = (datetime.strptime(known_end_dt, '%Y-%m-%d').date() + timedelta(days = 1)).strftime("%Y-%m-%d")

    # subsetting for only the given nameplate
    df_a = df_all[df_all.Category == nameplate]
    df_c = df_capped[df_capped.Category == nameplate]

    train_all_known_a = df_a.loc[known_start_dt:known_end_dt]
    train_all_known_c = df_c.loc[known_start_dt:known_end_dt]
    # Only the exogenous column of these future rows is used.
    test_all_unknown = df_a.loc[unknown_start_dt:unknown_end_dt]

    try:
        # pdq is read character by character: '511' -> p=5, d=1, q=1
        p = int(pdq[0])
        d = int(pdq[1])
        q = int(pdq[2])

        # deciding training data
        if if_outlier_treat == 'Y':
            train_all_known = train_all_known_c
        elif if_outlier_treat == 'N':
            train_all_known = train_all_known_a
        else:
            # Fail with a clear message instead of an accidental unbound-local error.
            raise ValueError('if_outlier_treat flag wrong: if_outlier_treat= {0}'.format(if_outlier_treat))

        ar = ARIMA(endog=train_all_known[var], exog=train_all_known[exog_var], order=(p,d,q))
        model = ar.fit()
        # element 0 of the forecast() result holds the point forecasts
        pred_unknown = model.forecast(steps=len(test_all_unknown), exog=test_all_unknown[exog_var])

        # The output reports the uncapped actuals even when the model was
        # trained on outlier-capped data.
        df_actual = pd.DataFrame()
        df_actual['date'] = pd.Series(train_all_known_a.index)
        df_actual[var+'_actual'] = list(train_all_known_a[var])

        # One row per calendar day of the unknown period, both ends included.
        df_pred = pd.DataFrame({'date': pd.date_range(unknown_start_dt, unknown_end_dt, freq='D')})
        df_pred[var+'_forecast'] = pd.Series(pred_unknown[0])

        ar_forecast = pd.concat([df_actual, df_pred])

    except Exception as e:
        # Best effort by design: return an empty, correctly shaped frame so
        # the calling loop can carry on with the next combination.
        ar_forecast = pd.DataFrame(columns = ['date','{0}_actual'.format(var),'{0}_forecast'.format(var)])
        print(log_prefix + ':error=' + str(e))

    end_time = time.time()
    print(log_prefix + ':Time Taken={0} seconds'.format(end_time - start_time))

    return ar_forecast

### Forecast for 511 and no outlier treatment

In [22]:
start_time = time.time()

# Baseline run: the same order and no outlier treatment for every combination.
pdq = '511'
if_outlier_treat = 'N'

# Per-variable lists of per-nameplate forecast frames; each list is
# concatenated once after the loop instead of appending frame by frame.
forecast_frames = {'visits': [], 'testdrive': [], 'leads': []}

for nameplate in tqdm(df_all.Category.unique()):
    for var in forecast_frames:

        # 'leads' used to be listed in the first condition, which made its own
        # branch unreachable; that branch also compared (`==`) instead of assigning.
        # NOTE(review): confirm 'total_spend_nobrand' exists in the data; if it
        # does not, forecast_func prints the error and returns an empty frame.
        if var in ['visits','testdrive']:
            exog_var = 'total_spend_nobrand_nodig'
        elif var == 'leads':
            exog_var = 'total_spend_nobrand'
        else:
            print('error: wrong variable: var='+var)
            continue

        df_forecast = forecast_func(df_all,df_capped,pdq,var,exog_var,if_outlier_treat,nameplate,known_start_dt,known_end_dt,unknown_end_dt)
        df_forecast['Nameplate'] = nameplate

        forecast_frames[var].append(df_forecast)

# pd.concat rejects an empty list, so fall back to an empty frame.
visit_unpaid_forecast, lfa_unpaid_forecast, leads_total_forecast = (
    pd.concat(forecast_frames[key]) if forecast_frames[key] else pd.DataFrame()
    for key in ['visits', 'testdrive', 'leads']
)

end_time = time.time()
# This cell forecasts; the message used to say "RMSE/MAPE" after a copy-paste.
print("Time Taken Forecast: {0} seconds".format(end_time - start_time))

HBox(children=(FloatProgress(value=0.0, max=11.0), HTML(value='')))



forecast_func:nameplate=Cat1:variable=visits:pdq=511:if_outlier_treat=N:Time Taken=1.708144187927246 seconds




forecast_func:nameplate=Cat1:variable=testdrive:pdq=511:if_outlier_treat=N:Time Taken=1.6546645164489746 seconds




forecast_func:nameplate=Cat1:variable=leads:pdq=511:if_outlier_treat=N:Time Taken=0.521517276763916 seconds
forecast_func:nameplate=Cat2:variable=visits:pdq=511:if_outlier_treat=N:Time Taken=2.323678493499756 seconds




forecast_func:nameplate=Cat2:variable=testdrive:pdq=511:if_outlier_treat=N:Time Taken=2.7726967334747314 seconds




forecast_func:nameplate=Cat2:variable=leads:pdq=511:if_outlier_treat=N:Time Taken=0.7362511157989502 seconds




forecast_func:nameplate=Cat3:variable=visits:pdq=511:if_outlier_treat=N:Time Taken=5.706800699234009 seconds




forecast_func:nameplate=Cat3:variable=testdrive:pdq=511:if_outlier_treat=N:Time Taken=0.6643843650817871 seconds




forecast_func:nameplate=Cat3:variable=leads:pdq=511:if_outlier_treat=N:Time Taken=0.5927474498748779 seconds




forecast_func:nameplate=Cat4:variable=visits:pdq=511:if_outlier_treat=N:Time Taken=5.387563467025757 seconds




forecast_func:nameplate=Cat4:variable=testdrive:pdq=511:if_outlier_treat=N:Time Taken=5.253355979919434 seconds




forecast_func:nameplate=Cat4:variable=leads:pdq=511:if_outlier_treat=N:Time Taken=0.6884431838989258 seconds




forecast_func:nameplate=Cat5:variable=visits:pdq=511:if_outlier_treat=N:Time Taken=2.4429075717926025 seconds




forecast_func:nameplate=Cat5:variable=testdrive:pdq=511:if_outlier_treat=N:Time Taken=2.373345136642456 seconds




forecast_func:nameplate=Cat5:variable=leads:pdq=511:if_outlier_treat=N:Time Taken=2.0987486839294434 seconds




forecast_func:nameplate=Cat6:variable=visits:pdq=511:if_outlier_treat=N:Time Taken=0.861351728439331 seconds




forecast_func:nameplate=Cat6:variable=testdrive:pdq=511:if_outlier_treat=N:Time Taken=4.5259034633636475 seconds




forecast_func:nameplate=Cat6:variable=leads:pdq=511:if_outlier_treat=N:Time Taken=4.360277891159058 seconds




forecast_func:nameplate=Cat7:variable=visits:pdq=511:if_outlier_treat=N:Time Taken=2.5391030311584473 seconds




forecast_func:nameplate=Cat7:variable=testdrive:pdq=511:if_outlier_treat=N:Time Taken=1.5955054759979248 seconds




forecast_func:nameplate=Cat7:variable=leads:pdq=511:if_outlier_treat=N:Time Taken=1.296832799911499 seconds




forecast_func:nameplate=Cat8:variable=visits:pdq=511:if_outlier_treat=N:Time Taken=1.630885362625122 seconds




forecast_func:nameplate=Cat8:variable=testdrive:pdq=511:if_outlier_treat=N:Time Taken=0.9270370006561279 seconds




forecast_func:nameplate=Cat8:variable=leads:pdq=511:if_outlier_treat=N:Time Taken=1.1154005527496338 seconds
forecast_func:nameplate=Cat9:variable=visits:pdq=511:if_outlier_treat=N:Time Taken=1.360532522201538 seconds




forecast_func:nameplate=Cat9:variable=testdrive:pdq=511:if_outlier_treat=N:Time Taken=1.1917309761047363 seconds




forecast_func:nameplate=Cat9:variable=leads:pdq=511:if_outlier_treat=N:Time Taken=0.5426619052886963 seconds




forecast_func:nameplate=Cat10:variable=visits:pdq=511:if_outlier_treat=N:Time Taken=0.6760270595550537 seconds




forecast_func:nameplate=Cat10:variable=testdrive:pdq=511:if_outlier_treat=N:Time Taken=1.0900819301605225 seconds
forecast_func:nameplate=Cat10:variable=leads:pdq=511:if_outlier_treat=N:Time Taken=1.4695611000061035 seconds




forecast_func:nameplate=Cat11:variable=visits:pdq=511:if_outlier_treat=N:Time Taken=1.3748743534088135 seconds




forecast_func:nameplate=Cat11:variable=testdrive:pdq=511:if_outlier_treat=N:Time Taken=3.4548227787017822 seconds




forecast_func:nameplate=Cat11:variable=leads:pdq=511:if_outlier_treat=N:Time Taken=0.8799750804901123 seconds

Time Taken RMSE/MAPE: 66.01907777786255 seconds


In [23]:
# Write each baseline forecast frame to its own CSV file (index not written).
for forecast_frame, csv_name in [
    (visit_unpaid_forecast, 'visit_unpaid_forecast_511_noOUTLIERtreat.csv'),
    (lfa_unpaid_forecast, 'lfa_unpaid_forecast_511_noOUTLIERtreat.csv'),
    (leads_total_forecast, 'leads_total_forecast_511_noOUTLIERtreat.csv'),
]:
    forecast_frame.to_csv(csv_name, index = False)

### Forecast for suggested pdqs and outlier treatment

In [24]:
start_time = time.time()

# Reading the finally decided pdq and outlier treatment per nameplate.
# Each table is read once, from the configured locations. It used to be
# downloaded a second time from hard-coded copies of the same URLs, which
# also bypassed pdq_visit_unpaid_decided / pdq_leads_decided.
pdq_map_vu = pd.read_csv(pdq_visit_unpaid_decided)
pdq_map_vu['Nameplate'] = pdq_map_vu['Nameplate'].astype(str)

pdq_map_l = pd.read_csv(pdq_leads_decided)
pdq_map_l['Nameplate'] = pdq_map_l['Nameplate'].astype(str)

# creating dataframe to hold forecasts
visit_unpaid_forecast = pd.DataFrame()
lfa_unpaid_forecast = pd.DataFrame()
leads_total_forecast = pd.DataFrame()

# NOTE(review): the loop below assigns exog_var again for every variable it
# iterates over, so this initial value looks unused — confirm and remove.
exog_var = 'total_spend_nobrand_dig'

for nameplate in tqdm(df_all.Category.unique()):
#for nameplate in ['CHARGER']:
        
    for var in ['visits','testdrive','leads']:
        
        if var in ['visits','testdrive','leads']:
            exog_var = 'total_spend_nobrand_nodig'
            
            pdq_map_vu_nameplate = pdq_map_vu[pdq_map_vu.Nameplate == nameplate]
            p = str(pdq_map_vu_nameplate['p'].values[0])
            d = str(pdq_map_vu_nameplate['d'].values[0])
            q = str(pdq_map_vu_nameplate['q'].values[0])
            pdq = p+d+q
            if_outlier_treat = str(pdq_map_vu_nameplate['outlier_treatment'].values[0])
            
        elif var == 'leads':
            exog_var == 'total_spend_nobrand'
            
            pdq_map_l_nameplate = pdq_map_l[pdq_map_l.Nameplate == nameplate]
            p = str(pdq_map_l_nameplate['p'].values[0])
            d = str(pdq_map_l_nameplate['d'].values[0])
            q = str(pdq_map_l_nameplate['q'].values[0])
            pdq = p+d+q
            if_outlier_treat = str(pdq_map_l_nameplate['outlier_treatment'].values[0])
            
        else:
            print('error: wrong variable: var='+var)
            
            
        df_forecast = forecast_func(df_all,df_capped,pdq,var,exog_var,if_outlier_treat,nameplate,known_start_dt,known_end_dt,unknown_end_dt)
        
        df_forecast['Nameplate'] = nameplate
        
        if var == 'visit_unpaid':
            visit_unpaid_forecast = visit_unpaid_forecast.append(df_forecast)
        elif var == 'lfa_unpaid':
            lfa_unpaid_forecast = lfa_unpaid_forecast.append(df_forecast)
        elif var == 'leads_total':
            leads_total_forecast = leads_total_forecast.append(df_forecast)
        else:
            print('error: wrong forecast variable='+var)
    
    
end_time = time.time()
print("Time Taken RMSE/MAPE: {0} seconds".format(end_time - start_time))

HBox(children=(FloatProgress(value=0.0, max=11.0), HTML(value='')))



forecast_func:nameplate=Cat1:variable=visits:pdq=514:if_outlier_treat=N:Time Taken=10.537022352218628 seconds
error: wrong forecast variable=visits




forecast_func:nameplate=Cat1:variable=testdrive:pdq=514:if_outlier_treat=N:Time Taken=11.442726850509644 seconds
error: wrong forecast variable=testdrive


  newparams = ((1-np.exp(-params))/(1+np.exp(-params))).copy()
  newparams = ((1-np.exp(-params))/(1+np.exp(-params))).copy()
  tmp = ((1-np.exp(-params))/(1+np.exp(-params))).copy()
  tmp = ((1-np.exp(-params))/(1+np.exp(-params))).copy()


forecast_func:nameplate=Cat1:variable=leads:pdq=514:if_outlier_treat=N:Time Taken=5.574375867843628 seconds
error: wrong forecast variable=leads




forecast_func:nameplate=Cat2:variable=visits:pdq=511:if_outlier_treat=N:Time Taken=2.288087844848633 seconds
error: wrong forecast variable=visits




forecast_func:nameplate=Cat2:variable=testdrive:pdq=511:if_outlier_treat=N:Time Taken=2.8347485065460205 seconds
error: wrong forecast variable=testdrive




forecast_func:nameplate=Cat2:variable=leads:pdq=511:if_outlier_treat=N:Time Taken=0.6919100284576416 seconds
error: wrong forecast variable=leads




forecast_func:nameplate=Cat3:variable=visits:pdq=511:if_outlier_treat=N:Time Taken=5.706094264984131 seconds
error: wrong forecast variable=visits




forecast_func:nameplate=Cat3:variable=testdrive:pdq=511:if_outlier_treat=N:Time Taken=0.6715395450592041 seconds
error: wrong forecast variable=testdrive




forecast_func:nameplate=Cat3:variable=leads:pdq=511:if_outlier_treat=N:Time Taken=0.5818886756896973 seconds
error: wrong forecast variable=leads




forecast_func:nameplate=Cat4:variable=visits:pdq=511:if_outlier_treat=N:Time Taken=5.415723562240601 seconds
error: wrong forecast variable=visits




forecast_func:nameplate=Cat4:variable=testdrive:pdq=511:if_outlier_treat=N:Time Taken=5.201598644256592 seconds
error: wrong forecast variable=testdrive




forecast_func:nameplate=Cat4:variable=leads:pdq=511:if_outlier_treat=N:Time Taken=0.7098116874694824 seconds
error: wrong forecast variable=leads




forecast_func:nameplate=Cat5:variable=visits:pdq=514:if_outlier_treat=N:Time Taken=10.788244247436523 seconds
error: wrong forecast variable=visits


  newparams = ((1-np.exp(-params))/(1+np.exp(-params))).copy()
  newparams = ((1-np.exp(-params))/(1+np.exp(-params))).copy()
  tmp = ((1-np.exp(-params))/(1+np.exp(-params))).copy()
  tmp = ((1-np.exp(-params))/(1+np.exp(-params))).copy()


forecast_func:nameplate=Cat5:variable=testdrive:pdq=514:if_outlier_treat=N:Time Taken=14.219872951507568 seconds
error: wrong forecast variable=testdrive




forecast_func:nameplate=Cat5:variable=leads:pdq=514:if_outlier_treat=N:Time Taken=1.0972590446472168 seconds
error: wrong forecast variable=leads
forecast_func:nameplate=Cat6:variable=visits:pdq=511:if_outlier_treat=N:Time Taken=0.861407995223999 seconds
error: wrong forecast variable=visits




forecast_func:nameplate=Cat6:variable=testdrive:pdq=511:if_outlier_treat=N:Time Taken=4.547550439834595 seconds
error: wrong forecast variable=testdrive




forecast_func:nameplate=Cat6:variable=leads:pdq=511:if_outlier_treat=N:Time Taken=4.3671886920928955 seconds
error: wrong forecast variable=leads




forecast_func:nameplate=Cat7:variable=visits:pdq=511:if_outlier_treat=N:Time Taken=2.5431675910949707 seconds
error: wrong forecast variable=visits




forecast_func:nameplate=Cat7:variable=testdrive:pdq=511:if_outlier_treat=N:Time Taken=1.5811429023742676 seconds
error: wrong forecast variable=testdrive




forecast_func:nameplate=Cat7:variable=leads:pdq=511:if_outlier_treat=N:Time Taken=1.2889268398284912 seconds
error: wrong forecast variable=leads
forecast_func:nameplate=Cat8:variable=visits:pdq=313:if_outlier_treat=N:error=The computed initial AR coefficients are not stationary
You should induce stationarity, choose a different model order, or you can
pass your own start_params.
forecast_func:nameplate=Cat8:variable=visits:pdq=313:if_outlier_treat=N:Time Taken=0.09562230110168457 seconds
error: wrong forecast variable=visits




forecast_func:nameplate=Cat8:variable=testdrive:pdq=313:if_outlier_treat=N:Time Taken=0.7896945476531982 seconds
error: wrong forecast variable=testdrive




forecast_func:nameplate=Cat8:variable=leads:pdq=313:if_outlier_treat=N:Time Taken=0.5361120700836182 seconds
error: wrong forecast variable=leads
forecast_func:nameplate=Cat9:variable=visits:pdq=511:if_outlier_treat=N:Time Taken=1.3493399620056152 seconds
error: wrong forecast variable=visits




forecast_func:nameplate=Cat9:variable=testdrive:pdq=511:if_outlier_treat=N:Time Taken=1.1605279445648193 seconds
error: wrong forecast variable=testdrive




forecast_func:nameplate=Cat9:variable=leads:pdq=511:if_outlier_treat=N:Time Taken=0.534508466720581 seconds
error: wrong forecast variable=leads




forecast_func:nameplate=Cat10:variable=visits:pdq=511:if_outlier_treat=N:Time Taken=0.6658473014831543 seconds
error: wrong forecast variable=visits




forecast_func:nameplate=Cat10:variable=testdrive:pdq=511:if_outlier_treat=N:Time Taken=1.0870561599731445 seconds
error: wrong forecast variable=testdrive
forecast_func:nameplate=Cat10:variable=leads:pdq=511:if_outlier_treat=N:Time Taken=1.4811415672302246 seconds
error: wrong forecast variable=leads




forecast_func:nameplate=Cat11:variable=visits:pdq=514:if_outlier_treat=N:Time Taken=15.167641162872314 seconds
error: wrong forecast variable=visits




forecast_func:nameplate=Cat11:variable=testdrive:pdq=514:if_outlier_treat=N:Time Taken=5.602934837341309 seconds
error: wrong forecast variable=testdrive
forecast_func:nameplate=Cat11:variable=leads:pdq=514:if_outlier_treat=N:Time Taken=1.0363869667053223 seconds
error: wrong forecast variable=leads

Time Taken RMSE/MAPE: 122.90925550460815 seconds




In [25]:
# Write the three forecast tables from the suggested-pdq run to CSV files in
# the working directory; the row index is not written.
for _csv_name, _frame in (
    ('visit_unpaid_forecast_suggested.csv', visit_unpaid_forecast),
    ('lfa_unpaid_forecast_suggested.csv', lfa_unpaid_forecast),
    ('leads_total_forecast_suggested.csv', leads_total_forecast),
):
    _frame.to_csv(_csv_name, index=False)

In [28]:
# Hand the exported leads forecast CSV to google.colab's `files.download`.
_download_target = 'leads_total_forecast_suggested.csv'
files.download(_download_target)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>