- IMPORT THE LIBRARIES YOU WILL USE
------------------------------------------
You only need to import packages one time per notebook session. To keep your
notebook clean and organized you can handle all imports at the top of your file.
The following are included for example purposes; feel free to modify or delete
anything in this section.

In [None]:
import csv
import pandas as pd
import numpy as np
from google.cloud import bigquery
from google.oauth2 import service_account
from google.cloud.bigquery import magics
import statsmodels.api as sm
import math
import plotly.express as px
from pmdarima.arima import auto_arima
import collections.abc
#hyper needs the four following aliases to be done manually.
collections.Iterable = collections.abc.Iterable
collections.Mapping = collections.abc.Mapping
collections.MutableSet = collections.abc.MutableSet
collections.MutableMapping = collections.abc.MutableMapping
import hts
from hts.hierarchy import HierarchyTree
from hts.model import AutoArimaModel
from hts import HTSRegressor

In [None]:
get_ipython().run_cell_magic('capture', '', "%logstop\n%logstart -t -r -q ipython_command_log.py global\n\n#- IRONHACKS RESEARCH TRACKING CODE\n#----------------------------------\n# The following code is used to help our research team understand how you \n# our notebook environment. We do not collect any personal information with\n# the following code, it is used to measure when and how often you work on\n# your submission files.\n\nimport os\nfrom datetime import datetime\nimport IPython.core.history as history\n\nha = history.HistoryAccessor()\nha_tail = ha.get_tail(1)\nha_cmd = next(ha_tail)\nsession_id = str(ha_cmd[0])\ncommand_id = str(ha_cmd[1])\ntimestamp = datetime.utcnow().isoformat()\nhistory_line = ','.join([session_id, command_id, timestamp]) + '\\n'\nlogfile = open(os.environ['HOME']+'/ipython_session_log.csv', 'a')\nlogfile.write(history_line)\nlogfile.close()\n")

In [None]:
get_ipython().run_cell_magic('capture', '', "\n#- INSTALL ADDITIONAL LIBRARIES IF REQUIRED\n#------------------------------------------\n# This is normally not required. The hub environment comes preinstaled with \n# many packages that you can already use without setup. In case there is some\n# other library you would like to use that isn't on the list you run this command\n# once to install them.  If it is already installed this command has no effect.\n!pip install db-dtypes\n!python3 -m pip install pandas\n!pip install pmdarima\n!pip install plotly==5.11.0\n!pip install scikit-hts[auto-arima]\n")

- IMPORT THE LIBRARIES YOU WILL USE
------------------------------------------
You only need to import packages one time per notebook session. To keep your
notebook clean and organized you can handle all imports at the top of your file.
The following are included for example purposes; feel free to modify or delete
anything in this section.

In [None]:
import csv
import pandas as pd
import numpy as np
from google.cloud import bigquery
from google.oauth2 import service_account
from google.cloud.bigquery import magics
import statsmodels.api as sm
import math
import plotly.express as px
from pmdarima.arima import auto_arima
import collections.abc
#hyper needs the four following aliases to be done manually.
collections.Iterable = collections.abc.Iterable
collections.Mapping = collections.abc.Mapping
collections.MutableSet = collections.abc.MutableSet
collections.MutableMapping = collections.abc.MutableMapping
import hts
from hts.hierarchy import HierarchyTree
from hts.model import AutoArimaModel
from hts import HTSRegressor

- DEFINE YOUR CLASSES AND FUNCTIONS 
-----------------------------------
This is not required, but is helpful in keeping your notebook organized. 
You can use the following cell or several cells to define your functions
and classes to keep them separate from your analysis or results code.
In general it is useful to define your methods in a separate cell from where
they are run.

In [None]:
def dataExplore(data):
    '''
    Print a quick overview of a dataframe.

    The first line is the number of rows. After that, one line per column:
    identifier-like and time-like columns report how many distinct values
    they hold, every other column lists its distinct values.
    '''
    # Columns for which only the number of distinct values is reported.
    count_only = ("uu_id", "timeperiod", "week_number", "countyfips", "tract", "tract_name", "date")

    print("# of observations: ", data.shape[0])
    for name in data.columns:
        distinct = pd.unique(data[name])
        if name in count_only:
            print(f"# of {name!s}: {len(distinct)!s}")
        else:
            print(f"Unique value of {name!s}: {distinct!s}")

In [None]:
def dataBalanceCheck(data):
    '''
    Check whether the dataframe is a balanced panel.

    Prints the number of distinct ``week_number`` values (the length of a
    complete series), then one line per ``uu_id`` that has fewer rows than
    that (the id followed by its row count), and finally the percentage of
    ``uu_id`` values with an incomplete series.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``uu_id`` and ``week_number``.
    '''
    full_length = data["week_number"].nunique(dropna=False)
    print("# of observations in complete time series: ", full_length)

    # Count rows per uu_id in a single pass instead of re-filtering the whole
    # frame once per id; sort=False keeps ids in order of first appearance.
    rows_per_id = data.groupby("uu_id", sort=False).size()
    short_series = rows_per_id[rows_per_id < full_length]
    for tract_id, n_rows in short_series.items():
        print(tract_id, n_rows)

    # An empty frame has no ids; report 0% rather than dividing by zero.
    n_ids = len(rows_per_id)
    share = len(short_series) / n_ids * 100 if n_ids else 0.0
    print("% of tracts with incomplete time series: ", share)

In [None]:
def dataFillNa(data, value):
    """
    Replace missing entries in the dataframe with ``value``.

    Identifier and time columns are left untouched. In the three
    ``top_category_employer*`` columns the literal string 'N/A' is replaced
    by ``str(value)``. In every other column NaN is filled with ``value``.
    The frame is modified in place and also returned.
    """
    untouched = ("uu_id", "timeperiod", "week_number", "countyfips", "tract", "tract_name", "date")
    employer_cols = ("top_category_employer1", "top_category_employer2", "top_category_employer3")

    for name in data.columns:
        if name in untouched:
            continue
        if name in employer_cols:
            # These columns mark missing data with the text 'N/A', not NaN.
            data[name] = data[name].replace({'N/A':str(value)})
        else:
            data[name] = data[name].fillna(value)
    return data

In [None]:
def dataIdentifyDWM(data, year=2022):
    '''
    Add calendar columns derived from ``week_number``.

    Week ``n`` is mapped to the date ``(n - 1) * 7`` days after January 1st
    of ``year``. The frame is modified in place and also returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a numeric ``week_number`` column.
    year : int, default 2022
        Year whose January 1st is the date assigned to week 1.

    Returns
    -------
    pandas.DataFrame
        ``data`` with three added (or overwritten) columns: ``date`` (first
        day of the week), ``month`` (month of that date) and ``weekofmonth``
        (1 for days 1-7 of the month, 2 for days 8-14, and so on).
    '''
    first_day = pd.Timestamp(year=year, month=1, day=1)
    # Date arithmetic instead of parsing a '%Y%j' integer: the year is no
    # longer hard-coded and the day-of-year cannot overflow the format.
    data["date"] = first_day + pd.to_timedelta((data["week_number"] - 1) * 7, unit="D")
    data["month"] = data["date"].dt.month
    # Same value as ceil(day / 7) for day >= 1, computed vectorised on integers.
    data["weekofmonth"] = (data["date"].dt.day - 1) // 7 + 1
    return data

In [None]:
def MSPE(s1, s2):
    '''
    Mean of the squared element-wise differences between ``s1`` and ``s2``.

    Computed as ``sum((s1 - s2) ** 2) / len(s1)``; the inputs must support
    element-wise subtraction (e.g. pandas Series or numpy arrays).
    '''
    squared_errors = (s1 - s2) ** 2
    return sum(squared_errors) / len(s1)

In [None]:
def MAPE(s1, s2):
    '''
    Mean of the absolute element-wise differences between ``s1`` and ``s2``.

    Computed as ``sum(abs(s1 - s2)) / len(s1)``. Despite the name, the
    errors are not divided by the actual values, so the result is in the
    units of the inputs rather than a percentage.
    '''
    absolute_errors = abs(s1 - s2)
    return sum(absolute_errors) / len(s1)

In [None]:
def ARIMA_predict(df_input, cutoff_rate = 0.8, n_period = 15):
    '''
    Fit an automatically selected ARIMA model on the head of a series and
    forecast ahead.

    Parameters
    ----------
    df_input : series-like
        Observations in time order. Only the first
        ``int(cutoff_rate * len(df_input))`` rows are used for fitting.
    cutoff_rate : float, default 0.8
        Share of ``df_input`` used as the training slice.
    n_period : int, default 15
        Number of steps to forecast after the end of the training slice.

    Returns
    -------
    The output of ``model.predict(n_period)`` for the fitted model.

    Raises
    ------
    ValueError
        If the training slice would be empty.
    '''
    cutoff = int(cutoff_rate * len(df_input))
    if cutoff < 1:
        raise ValueError(
            "ARIMA_predict needs at least one training observation "
            "(len(df_input)=%d, cutoff_rate=%s)" % (len(df_input), cutoff_rate)
        )
    train = df_input[:cutoff]
    # auto_arima hands back the selected model already fitted on `train`,
    # so it is not fitted a second time here.
    model = auto_arima(train, trace=False, error_action='ignore', suppress_warnings=True)
    return model.predict(n_period)

In [None]:
# Obtain data using BigQuery
BIGQUERY_PROJECT = 'ironhacks-data'
bigquery_client = bigquery.Client(project=BIGQUERY_PROJECT)

In [None]:
query = """
SELECT
a.*,
b.average_wage
FROM 
(SELECT 
*
FROM `ironhacks-data.ironhacks_competition.unemployment_data`) a
JOIN `ironhacks-data.ironhacks_competition.wage_data` b 
ON a.uu_id=b.uu_id
"""

In [None]:
query_job = bigquery_client.query(query)
data = query_job.to_dataframe()

In [None]:
query_pred = """
SELECT * FROM `ironhacks-data.ironhacks_competition.prediction_list`
"""

In [None]:
query_job_pred = bigquery_client.query(query_pred)
data_pred_query= query_job_pred.to_dataframe()

In [None]:
# Explore input data for NA and special values
# dataExplore(data)
# dataExplore(data_pred_query)
# data_pred_query.head()
data.head()

In [None]:
# Further check tracts with average_wage as Nan
# I find three tracts with all average_wage as Nan. If I drop these tracts due to Nan value, they cannot be predicted
# The loop variable is named tract_id so the builtin id() is not shadowed for the rest of the session.
for tract_id in pd.unique(data[data['average_wage'].isna()]["uu_id"]):
    print(tract_id)
    print("All value are nan?", data[data['uu_id'] == tract_id]["average_wage"].isnull().all())
    print("Included in prediction list?", len(data_pred_query[data_pred_query['uu_id'] == tract_id]) > 0)

In [None]:
# Explore input data for NA and special values
# dataExplore(data)
# dataExplore(data_pred_query)
data_pred_query.head()
# data.head()

In [None]:
# Further check tracts with average_wage as Nan
# I find three tracts with all average_wage as Nan. If I drop these tracts due to Nan value, they cannot be predicted
# The loop variable is named tract_id so the builtin id() is not shadowed for the rest of the session.
for tract_id in pd.unique(data[data['average_wage'].isna()]["uu_id"]):
    print(tract_id)
    print("All value are nan?", data[data['uu_id'] == tract_id]["average_wage"].isnull().all())
    print("Included in prediction list?", len(data_pred_query[data_pred_query['uu_id'] == tract_id]) > 0)

In [None]:
# Explore input data for NA and special values
dataExplore(data)
# dataExplore(data_pred_query)
# data_pred_query.head()
# data.head()

Explore input data for NA and special values
dataExplore(data)
dataExplore(data_pred_query)
data_pred_query.head()
data.head()

In [None]:
# Backup the data before pre-treatment
data_backup = data.copy()
data_pred_query_backup = data_pred_query.copy()

In [None]:
# Pretreatment: convert week_number to month and week of month, to capture seasonality
data = dataIdentifyDWM(data)

In [None]:
# Check if the dataset is a balance panel (all tracts have value for all time periods)
# 54% of tracts has less than 35 observations (total number of full time series), indicating it is unbalanced
# Even if only checking data afer 2022/6/1, there are still 36% of tracts with incomplete series
dataBalanceCheck(data)
dataBalanceCheck(data[data["date"] > "2022-06-01"])

In [None]:
# To balance the dataset as panel data
data_balance = data.set_index('week_number')
data_balance = data_balance.sort_index(ascending=False)
data_balance = data_balance.set_index('uu_id',append=True)
data_balance = data_balance[~data_balance.index.duplicated(keep='first')]

In [None]:
data_balance = data_balance.reset_index(level=['week_number'])
data_balance = (data_balance.set_index('week_number',append=True).reindex(pd.MultiIndex.from_product([data_balance.index.unique(),
                                                      range(data_balance.week_number.min(),data_balance.week_number.max()+1)],
                                                     names=['uu_id','week_number'])).reset_index(level=1))

In [None]:
data_balance = data_balance.set_index('week_number',append=True)
data_balance['total_claims'] = data_balance['total_claims'].fillna(0)
data_balance['average_wage'] = data_balance['average_wage'].interpolate(method = "linear")

In [None]:
data_balance = data_balance.reset_index(level=['uu_id', "week_number"])
data_balance = dataIdentifyDWM(data_balance)

In [None]:
dataBalanceCheck(data_balance)

In [None]:
# Data clean up: convert NA to 0 for gender, race, education and top employer and recalculate unknown category
# Based on the check of Nan in average_wage above, I also convert Nan to zero as well, but try models with and without "average_wage" variable
data = dataFillNa(data, 0)

In [None]:
# Split data to training and validaton sets
# Max trainweek is 37, use a 80 / 20 rule
train_week = int(max(pd.unique(data["week_number"]))*0.8)

In [None]:
data_train = data[data["week_number"] < train_week]
data_valid = data[data["week_number"] >= train_week]

In [None]:
# Features are every column except the target. `columns=` is used because the
# positional axis argument of DataFrame.drop was removed in pandas 2.0.
data_train_x = data_train.drop(columns="total_claims")
data_train_y = data_train["total_claims"]

In [None]:
# Features are every column except the target. `columns=` is used because the
# positional axis argument of DataFrame.drop was removed in pandas 2.0.
data_valid_x = data_valid.drop(columns="total_claims")
data_valid_y = data_valid["total_claims"]

Prepare the same train/validation split for the balanced dataset.

In [None]:
data_balance_train = data_balance[data_balance["week_number"] < train_week]
data_balance_valid = data_balance[data_balance["week_number"] >= train_week]

In [None]:
# Features are every column except the target. `columns=` is used because the
# positional axis argument of DataFrame.drop was removed in pandas 2.0.
data_balance_train_x = data_balance_train.drop(columns="total_claims")
data_balance_train_y = data_balance_train["total_claims"]

In [None]:
# Features are every column except the target. `columns=` is used because the
# positional axis argument of DataFrame.drop was removed in pandas 2.0.
data_balance_valid_x = data_balance_valid.drop(columns="total_claims")
data_balance_valid_y = data_balance_valid["total_claims"]

In [None]:
data_balance_valid_y.shape

In [None]:
# Model 1 : Poisson regression with unbalanced data
# Take an explicit copy of the selected columns so the assignments below write to
# an independent frame instead of a slice of data_train_x (SettingWithCopyWarning).
data_train_x_m1 = data_train_x[["week_number","month", "weekofmonth",  "average_wage"]].copy()
# Month and week-of-month become strings so get_dummies one-hot encodes them.
data_train_x_m1["month"] = data_train_x_m1["month"].astype(str)
data_train_x_m1["weekofmonth"] = data_train_x_m1["weekofmonth"].astype(str)
# Quadratic time trend.
data_train_x_m1["week_number2"] = data_train_x_m1["week_number"]**2
data_train_x_m1 = pd.get_dummies(data_train_x_m1)

In [None]:
# Validation design matrix for Model 1, built the same way as the training one.
# Take an explicit copy of the selected columns so the assignments below write to
# an independent frame instead of a slice of data_valid_x (SettingWithCopyWarning).
data_valid_x_m1 = data_valid_x[["week_number","month", "weekofmonth",  "average_wage"]].copy()
# Month and week-of-month become strings so get_dummies one-hot encodes them.
data_valid_x_m1["month"] = data_valid_x_m1["month"].astype(str)
data_valid_x_m1["weekofmonth"] = data_valid_x_m1["weekofmonth"].astype(str)
# Quadratic time trend.
data_valid_x_m1["week_number2"] = data_valid_x_m1["week_number"]**2
data_valid_x_m1 = pd.get_dummies(data_valid_x_m1)

In [None]:
data_train_x_m1["month_8"] = 0
data_train_x_m1["month_9"] = 0

In [None]:
# Add an all-zero dummy only for months that do not occur in the validation
# weeks. Month dummies that get_dummies already created from real validation
# rows must not be overwritten with 0.
for i in range(8):
    month_col = "month_"+str(1+i)
    if month_col not in data_valid_x_m1.columns:
        data_valid_x_m1[month_col] = 0

In [None]:
# Add an all-zero dummy only for week-of-month values missing from the
# validation rows; existing dummies keep their real values.
for i in range(5):
    wom_col = "weekofmonth_"+str(1+i)
    if wom_col not in data_valid_x_m1.columns:
        data_valid_x_m1[wom_col] = 0
# The model was fitted without a formula, so predict() consumes the exog columns
# by position. Put the validation columns in exactly the training order
# (any column still missing is created as 0).
data_valid_x_m1 = data_valid_x_m1.reindex(columns=data_train_x_m1.columns, fill_value=0)

In [None]:
poission_model = sm.GLM(data_train_y.astype(int), data_train_x_m1.astype(float), family=sm.families.Poisson())
result = poission_model.fit()
result.summary()

In [None]:
data_estimate_m1 = result.predict(data_valid_x_m1.astype(float))
print("MAPE: ", MAPE(data_estimate_m1, data_valid_y))
print("MSPE: ", MSPE(data_estimate_m1, data_valid_y))

In [None]:
# Model 2: Poisson with balanced data
# Take an explicit copy of the selected columns so the assignments below write to
# an independent frame instead of a slice of data_balance_train_x (SettingWithCopyWarning).
data_balance_train_x_m1 = data_balance_train_x[["week_number","month", "weekofmonth",  "average_wage"]].copy()
# Month and week-of-month become strings so get_dummies one-hot encodes them.
data_balance_train_x_m1["month"] = data_balance_train_x_m1["month"].astype(str)
data_balance_train_x_m1["weekofmonth"] = data_balance_train_x_m1["weekofmonth"].astype(str)
# Quadratic time trend.
data_balance_train_x_m1["week_number2"] = data_balance_train_x_m1["week_number"]**2
data_balance_train_x_m1 = pd.get_dummies(data_balance_train_x_m1)

In [None]:
# Validation design matrix for Model 2, built the same way as the training one.
# Take an explicit copy of the selected columns so the assignments below write to
# an independent frame instead of a slice of data_balance_valid_x (SettingWithCopyWarning).
data_balance_valid_x_m1 = data_balance_valid_x[["week_number","month", "weekofmonth",  "average_wage"]].copy()
# Month and week-of-month become strings so get_dummies one-hot encodes them.
data_balance_valid_x_m1["month"] = data_balance_valid_x_m1["month"].astype(str)
data_balance_valid_x_m1["weekofmonth"] = data_balance_valid_x_m1["weekofmonth"].astype(str)
# Quadratic time trend.
data_balance_valid_x_m1["week_number2"] = data_balance_valid_x_m1["week_number"]**2
data_balance_valid_x_m1 = pd.get_dummies(data_balance_valid_x_m1)

In [None]:
data_balance_train_x_m1["month_8"] = 0
data_balance_train_x_m1["month_9"] = 0

In [None]:
# Add an all-zero dummy only for months that do not occur in the validation
# weeks. Month dummies that get_dummies already created from real validation
# rows must not be overwritten with 0.
for i in range(8):
    month_col = "month_"+str(1+i)
    if month_col not in data_balance_valid_x_m1.columns:
        data_balance_valid_x_m1[month_col] = 0

In [None]:
# Add an all-zero dummy only for week-of-month values missing from the
# validation rows; existing dummies keep their real values.
for i in range(5):
    wom_col = "weekofmonth_"+str(1+i)
    if wom_col not in data_balance_valid_x_m1.columns:
        data_balance_valid_x_m1[wom_col] = 0
# The model was fitted without a formula, so predict() consumes the exog columns
# by position. Put the validation columns in exactly the training order
# (any column still missing is created as 0).
data_balance_valid_x_m1 = data_balance_valid_x_m1.reindex(columns=data_balance_train_x_m1.columns, fill_value=0)

In [None]:
poission_model_m2 = sm.GLM(data_balance_train_y.astype(int), data_balance_train_x_m1.astype(float), family=sm.families.Poisson())
result_m2 = poission_model_m2.fit()
result_m2.summary()

In [None]:
data_balance_estimate_m2 = result_m2.predict(data_balance_valid_x_m1.astype(float))
print("MAPE: ", MAPE(data_balance_estimate_m2, data_balance_valid_y))
print("MSPE: ", MSPE(data_balance_estimate_m2, data_balance_valid_y))

In [None]:
# Model 3 time series
# First, visualize average total_claim
# Average only the numeric target per week; uu_id is an identifier and cannot be averaged.
data_balance_ave = data_balance.groupby("week_number", as_index=False)["total_claims"].mean()
# 4-week moving average of the weekly mean itself (not of the raw rows in `data`).
data_balance_ave['MA4'] = data_balance_ave['total_claims'].rolling(4).mean()
fig = px.line(data_balance_ave, x="week_number", y=["total_claims", "MA4"])
fig.show()

In [None]:
# model train and validation
MAPE_list = []
MSPE_list = []

In [None]:
uu_id_list = pd.unique(data_balance["uu_id"])

In [None]:
# Fit one ARIMA model per tract and score its forecast against the weeks that
# were held out of the training slice.
for i, tract_id in enumerate(uu_id_list):
    print(i)
    data_balance_tract = data_balance[data_balance["uu_id"] == tract_id]
    data_balance_tract_model = data_balance_tract[["week_number","total_claims"]]
    data_balance_tract_model = data_balance_tract_model.set_index("week_number")
    forecast = ARIMA_predict(data_balance_tract_model, cutoff_rate = 0.8, n_period = 15)

    df_forecast = pd.DataFrame(forecast)
    df_forecast.index.name = "week_number"
    df_forecast.columns = ["total_claim_pred"]

    # Compare against this tract's own observations. The inner merge keeps only
    # weeks present in both the observed series and the forecast.
    data_balance_ave_valid_check = data_balance_tract_model.merge(df_forecast, on = "week_number")
    MAPE_series = MAPE(data_balance_ave_valid_check["total_claims"], data_balance_ave_valid_check["total_claim_pred"])
    MSPE_series = MSPE(data_balance_ave_valid_check["total_claims"], data_balance_ave_valid_check["total_claim_pred"])

    MAPE_list.append(MAPE_series)
    MSPE_list.append(MSPE_series)

In [None]:
# model train and validation
MAPE_list = []
MSPE_list = []

In [None]:
uu_id_list = pd.unique(data_balance["uu_id"])

In [None]:
for i in range(len(uu_id_list)):
    print(i)
    data_balance_tract = data_balance[data_balance["uu_id"] == uu_id_list[i]]
    data_balance_tract_model = data_balance_tract[["week_number","total_claims"]]
    data_balance_tract_model = data_balance_tract_model.set_index("week_number")
    forecast = ARIMA_predict(data_balance_tract_model, cutoff_rate = 0.8, n_period = 15)
    
    df_forecast = pd.DataFrame(forecast)
    df_forecast.index.name = "week_number"
    df_forecast.columns = ["total_claim_pred"]
    
    data_balance_ave_valid_check = data_balance_tract_model.merge(df_forecast, on = "week_number")
    MAPE_series = MAPE(data_balance_ave_valid_check["total_claims"], data_balance_ave_valid_check["total_claim_pred"])
    MSPE_series = MSPE(data_balance_ave_valid_check["total_claims"], data_balance_ave_valid_check["total_claim_pred"])
    
    MAPE_list.append(MAPE_series)
    MSPE_list.append(MSPE_series)
    break

In [None]:
MAPE_list

In [None]:
i = 0
data_balance_tract = data_balance[data_balance["uu_id"] == uu_id_list[i]]
data_balance_tract_model = data_balance_tract[["week_number","total_claims"]]
data_balance_tract_model = data_balance_tract_model.set_index("week_number")
forecast = ARIMA_predict(data_balance_tract_model, cutoff_rate = 0.8, n_period = 15)

In [None]:
df_forecast = pd.DataFrame(forecast)
df_forecast.index.name = "week_number"
df_forecast.columns = ["total_claim_pred"]

In [None]:
data_balance_ave_valid_check = data_balance_tract_model.merge(df_forecast, on = "week_number")
MAPE_series = MAPE(data_balance_ave_valid_check["total_claims"], data_balance_ave_valid_check["total_claim_pred"])
MSPE_series = MSPE(data_balance_ave_valid_check["total_claims"], data_balance_ave_valid_check["total_claim_pred"])

In [None]:
data_balance_ave_valid_check

In [None]:
# model train and validation
MAPE_list = []
MSPE_list = []

In [None]:
uu_id_list = pd.unique(data_balance["uu_id"])

In [None]:
for i in range(len(uu_id_list)):
    print(i)
    data_balance_tract = data_balance[data_balance["uu_id"] == uu_id_list[i]]
    data_balance_tract_model = data_balance_tract[["week_number","total_claims"]]
    data_balance_tract_model = data_balance_tract_model.set_index("week_number")
    forecast = ARIMA_predict(data_balance_tract_model, cutoff_rate = 0.8, n_period = 15)
    
    df_forecast = pd.DataFrame(forecast)
    df_forecast.index.name = "week_number"
    df_forecast.columns = ["total_claim_pred"]
    
    data_balance_ave_valid_check = data_balance_tract_model.merge(df_forecast, on = "week_number")
    MAPE_series = MAPE(data_balance_ave_valid_check["total_claims"], data_balance_ave_valid_check["total_claim_pred"])
    MSPE_series = MSPE(data_balance_ave_valid_check["total_claims"], data_balance_ave_valid_check["total_claim_pred"])
    
    MAPE_list.append(MAPE_series)
    MSPE_list.append(MSPE_series)
    break

In [None]:
# model train and validation
MAPE_list = []
MSPE_list = []

In [None]:
uu_id_list = pd.unique(data_balance["uu_id"])

In [None]:
for i in range(len(uu_id_list)):
    print(i)
    data_balance_tract = data_balance[data_balance["uu_id"] == uu_id_list[i]]
    data_balance_tract_model = data_balance_tract[["week_number","total_claims"]]
    data_balance_tract_model = data_balance_tract_model.set_index("week_number")
    forecast = ARIMA_predict(data_balance_tract_model, cutoff_rate = 0.8, n_period = 15)
    
    df_forecast = pd.DataFrame(forecast)
    df_forecast.index.name = "week_number"
    df_forecast.columns = ["total_claim_pred"]
    
    data_balance_ave_valid_check = data_balance_tract_model.merge(df_forecast, on = "week_number")
    MAPE_series = MAPE(data_balance_ave_valid_check["total_claims"], data_balance_ave_valid_check["total_claim_pred"])
    MSPE_series = MSPE(data_balance_ave_valid_check["total_claims"], data_balance_ave_valid_check["total_claim_pred"])
    
    MAPE_list.append(MAPE_series)
    MSPE_list.append(MSPE_series)

In [None]:
# validation
print("MAPE: ", sum(MAPE_list)/len(MAPE_list))
print("MSPE: ", sum(MSPE_list)/len(MSPE_list))

In [None]:
data_balance.head()

In [None]:
data_balance.sort_values(by='uu_id',inplace=True)
data_balance["countyfips"].fillna(method='ffill')

In [None]:
data_balance.head()

In [None]:
data_balance.head()

In [None]:
# To balance the dataset as panel data
data_balance = data.set_index('week_number')
data_balance = data_balance.sort_index(ascending=False)
data_balance = data_balance.set_index('uu_id',append=True)
data_balance = data_balance[~data_balance.index.duplicated(keep='first')]

In [None]:
data_balance = data_balance.reset_index(level=['week_number'])
data_balance = (data_balance.set_index('week_number',append=True).reindex(pd.MultiIndex.from_product([data_balance.index.unique(),
                                                      range(data_balance.week_number.min(),data_balance.week_number.max()+1)],
                                                     names=['uu_id','week_number'])).reset_index(level=1))

In [None]:
data_balance = data_balance.set_index('week_number',append=True)
data_balance['total_claims'] = data_balance['total_claims'].fillna(0)
data_balance['average_wage'] = data_balance['average_wage'].interpolate(method = "linear")

In [None]:
data_balance = data_balance.reset_index(level=['uu_id', "week_number"])
data_balance = dataIdentifyDWM(data_balance)

In [None]:
data_balance.head()

In [None]:
data_balance.sort_values(by='uu_id',inplace=True)
data_balance.fillna(method='ffill')

In [None]:
# To balance the dataset as panel data
data_balance = data.set_index('week_number')
data_balance = data_balance.sort_index(ascending=False)
data_balance = data_balance.set_index('uu_id',append=True)
data_balance = data_balance[~data_balance.index.duplicated(keep='first')]

In [None]:
data_balance = data_balance.reset_index(level=['week_number'])
data_balance = (data_balance.set_index('week_number',append=True).reindex(pd.MultiIndex.from_product([data_balance.index.unique(),
                                                      range(data_balance.week_number.min(),data_balance.week_number.max()+1)],
                                                     names=['uu_id','week_number'])).reset_index(level=1))

In [None]:
data_balance = data_balance.set_index('week_number',append=True)
data_balance['total_claims'] = data_balance['total_claims'].fillna(0)
data_balance['average_wage'] = data_balance['average_wage'].interpolate(method = "linear")

In [None]:
data_balance = data_balance.reset_index(level=['uu_id', "week_number"])
data_balance = dataIdentifyDWM(data_balance)

In [None]:
data_balance

In [None]:
data_balance.sort_values(by='uu_id',inplace=True)
data_balance.fillna(method='ffill')

In [None]:
data_balance.sort_values(by='uu_id',inplace=True)
data_balance.fillna(method='ffill')
data_balance.sort_index()

In [None]:
# Order rows by tract and then by week so that filling runs forward in time
# inside each tract.
data_balance = data_balance.sort_values(by=['uu_id', 'week_number'])
# Fill gaps tract by tract: forward first, then backward for weeks before a
# tract's first observation. Grouping by uu_id keeps values such as countyfips
# from leaking in from the neighbouring tract. A column that is entirely
# missing for a tract stays missing.
fill_cols = [col for col in data_balance.columns if col != 'uu_id']
data_balance[fill_cols] = data_balance.groupby('uu_id')[fill_cols].ffill()
data_balance[fill_cols] = data_balance.groupby('uu_id')[fill_cols].bfill()
data_balance.sort_index()

In [None]:
# Split data to training and validaton sets
# Max trainweek is 37, use a 80 / 20 rule
train_week = int(max(pd.unique(data["week_number"]))*0.8)

In [None]:
data_train = data[data["week_number"] < train_week]
data_valid = data[data["week_number"] >= train_week]

In [None]:
# Features are every column except the target. `columns=` is used because the
# positional axis argument of DataFrame.drop was removed in pandas 2.0.
data_train_x = data_train.drop(columns="total_claims")
data_train_y = data_train["total_claims"]

In [None]:
# Features are every column except the target. `columns=` is used because the
# positional axis argument of DataFrame.drop was removed in pandas 2.0.
data_valid_x = data_valid.drop(columns="total_claims")
data_valid_y = data_valid["total_claims"]

Prepare the same train/validation split for the balanced dataset.

In [None]:
data_balance_train = data_balance[data_balance["week_number"] < train_week]
data_balance_valid = data_balance[data_balance["week_number"] >= train_week]

In [None]:
# Features are every column except the target. `columns=` is used because the
# positional axis argument of DataFrame.drop was removed in pandas 2.0.
data_balance_train_x = data_balance_train.drop(columns="total_claims")
data_balance_train_y = data_balance_train["total_claims"]

In [None]:
# Features are every column except the target. `columns=` is used because the
# positional axis argument of DataFrame.drop was removed in pandas 2.0.
data_balance_valid_x = data_balance_valid.drop(columns="total_claims")
data_balance_valid_y = data_balance_valid["total_claims"]

In [None]:
data_balance_valid_y.shape

In [None]:
# Model 4 Hierarchical ARIMA model
data_balance.groupby("countyfips")["uu_id"].apply(set).to_frame()

In [None]:
# Model 4 Hierarchical ARIMA model
# data_balance.groupby("countyfips")["uu_id"].apply(set).to_frame()
# Label for the bottom-level series: "<countyfips>_<uu_id>". The county column
# of this frame is called countyfips; there is no 'county' column.
data_balance["county_tract"] = data_balance.apply(lambda x: f"{x['countyfips']}_{x['uu_id']}", axis=1)

In [None]:
# Model 4 Hierarchical ARIMA model
# data_balance.groupby("countyfips")["uu_id"].apply(set).to_frame()
data_balance["county_tract"] = data_balance.apply(lambda x: f"{x['countyfips']}_{x['uu_id']}", axis=1)

In [None]:
# Bottom level: one column per tract, one row per week.
df_bottom_level = data_balance.pivot(index="week_number", columns="county_tract", values="total_claims")
# Middle level: claims summed per county and week. The grouping key is
# countyfips (the frame has no 'state' column), and only the numeric target is
# summed so string columns are not concatenated.
df_middle_level = data_balance.groupby(["week_number", "countyfips"])["total_claims"] \
                    .sum() \
                    .unstack("countyfips")
# Top level: claims summed over all tracts per week.
df_total = data_balance.groupby("week_number")["total_claims"] \
             .sum() \
             .to_frame() \
             .rename(columns={"total_claims": "total"})

In [None]:
df_bottom_level = data_balance.pivot(index="week_number", columns="county_tract", values="total_claims")
df_middle_level = data_balance.groupby(["week_number", "countyfips"]) \
                    .sum() \
                    .reset_index(drop=False) \
                    .pivot(index="week_number", columns="countyfips", values="total_claims")
df_total = data_balance.groupby("week_number")["total_claims"] \
             .sum() \
             .to_frame() \
             .rename(columns={"total_claims": "total"})

In [None]:
# Bottom level: one column per tract, one row per week.
df_bottom_level = data_balance.pivot(index="week_number", columns="county_tract", values="total_claims")
# Middle level: claims summed per county and week. Only the numeric target is
# summed so string columns are not concatenated.
df_middle_level = data_balance.groupby(["week_number", "countyfips"])["total_claims"] \
                    .sum() \
                    .unstack("countyfips")
# Top level: claims summed over all tracts per week.
df_total = data_balance.groupby("week_number")["total_claims"] \
             .sum() \
             .to_frame() \
             .rename(columns={"total_claims": "total"})
hierarchy_df = df_bottom_level.join(df_middle_level) \
                              .join(df_total)
# week_number is a week counter, not a timestamp: pd.to_datetime would read it
# as nanoseconds after 1970-01-01. Map week n to the first day of that week,
# using the same rule as dataIdentifyDWM (week 1 = 2022-01-01, 7-day steps).
hierarchy_df.index = pd.Timestamp("2022-01-01") + pd.to_timedelta((hierarchy_df.index - 1) * 7, unit="D")

In [None]:
print(f"Number of time series at the bottom level: {df_bottom_level.shape[1]}")
print(f"Number of time series at the middle level: {df_middle_level.shape[1]}")

In [None]:
hierarchy_df

In [None]:
data_balance.head()

In [None]:
# Model 4 Hierarchical ARIMA model
# data_balance.groupby("countyfips")["uu_id"].apply(set).to_frame()
data_balance["county_tract"] = data_balance.apply(lambda x: f"{x['countyfips']}_{x['uu_id']}", axis=1)
data_balance_hts = data_balance.copy()
data_balance_hts = data_balance_hts[["week_number", "uu_id", "countyfips", "total_claims", "county_tract"]]

In [None]:
data_balance_hts

In [None]:
df_bottom_level = data_balance_hts.pivot(index="week_number", columns="county_tract", values="total_claims")
df_middle_level = data_balance_hts.groupby(["week_number", "countyfips"]) \
                    .sum() \
                    .reset_index(drop=False) \
                    .pivot(index="week_number", columns="countyfips", values="total_claims")
df_total = data_balance_hts.groupby("week_number")["total_claims"] \
             .sum() \
             .to_frame() \
             .rename(columns={"total_claims": "total"})
hierarchy_df = df_bottom_level.join(df_middle_level) \
                              .join(df_total)
hierarchy_df.index = pd.to_datetime(hierarchy_df.index)

In [None]:
print(f"Number of time series at the bottom level: {df_bottom_level.shape[1]}")
print(f"Number of time series at the middle level: {df_middle_level.shape[1]}")

In [None]:
hierarchy_df

In [None]:
data_balance_hts

In [None]:
# Drop the previous hierarchy frame; Python's null value is spelled None.
hierarchy_df = None

In [None]:
# Drop the previous hierarchy frame; Python's null value is spelled None.
hierarchy_df = None

In [None]:
hierarchy_df = 0

In [None]:
data_balance_hts.sort_values(by='week_number',inplace=True)

In [None]:
data_balance_hts

In [None]:
hierarchy_df

In [None]:
df_bottom_level = data_balance_hts.pivot(index="week_number", columns="county_tract", values="total_claims")
df_middle_level = data_balance_hts.groupby(["week_number", "countyfips"]) \
                    .sum() \
                    .reset_index(drop=False) \
                    .pivot(index="week_number", columns="countyfips", values="total_claims")
df_total = data_balance_hts.groupby("week_number")["total_claims"] \
             .sum() \
             .to_frame() \
             .rename(columns={"total_claims": "total"})
hierarchy_df = df_bottom_level.join(df_middle_level) \
                              .join(df_total)
hierarchy_df.index = pd.to_datetime(hierarchy_df.index)

In [None]:
print(f"Number of time series at the bottom level: {df_bottom_level.shape[1]}")
print(f"Number of time series at the middle level: {df_middle_level.shape[1]}")

In [None]:
hierarchy_df

In [None]:
hierarchy_df.head()

In [None]:
df_bottom_level = data_balance_hts.pivot(index="week_number", columns="county_tract", values="total_claims")
df_middle_level = data_balance_hts.groupby(["week_number", "countyfips"]) \
                    .sum() \
                    .reset_index(drop=False) \
                    .pivot(index="week_number", columns="countyfips", values="total_claims")
df_total = data_balance_hts.groupby("week_number")["total_claims"] \
             .sum() \
             .to_frame() \
             .rename(columns={"total_claims": "total"})
hierarchy_df = df_bottom_level.join(df_middle_level) \
                              .join(df_total)
hierarchy_df.index = pd.to_datetime(hierarchy_df.index)

In [None]:
print(f"Number of time series at the bottom level: {df_bottom_level.shape[1]}")
print(f"Number of time series at the middle level: {df_middle_level.shape[1]}")
# The method is spelled fillna, and it returns a new frame instead of modifying
# hierarchy_df, so the result is assigned back.
hierarchy_df = hierarchy_df.fillna(0)

In [None]:
df_bottom_level = data_balance_hts.pivot(index="week_number", columns="county_tract", values="total_claims")
df_middle_level = data_balance_hts.groupby(["week_number", "countyfips"]) \
                    .sum() \
                    .reset_index(drop=False) \
                    .pivot(index="week_number", columns="countyfips", values="total_claims")
df_total = data_balance_hts.groupby("week_number")["total_claims"] \
             .sum() \
             .to_frame() \
             .rename(columns={"total_claims": "total"})
hierarchy_df = df_bottom_level.join(df_middle_level) \
                              .join(df_total)
hierarchy_df.index = pd.to_datetime(hierarchy_df.index)

In [None]:
print(f"Number of time series at the bottom level: {df_bottom_level.shape[1]}")
print(f"Number of time series at the middle level: {df_middle_level.shape[1]}")
# fillna returns a new frame instead of modifying hierarchy_df, so the result
# is assigned back.
hierarchy_df = hierarchy_df.fillna(0)

In [None]:
hierarchy_df

In [None]:
df_bottom_level = data_balance_hts.pivot(index="week_number", columns="county_tract", values="total_claims")
df_middle_level = data_balance_hts.groupby(["week_number", "countyfips"]) \
                    .sum() \
                    .reset_index(drop=False) \
                    .pivot(index="week_number", columns="countyfips", values="total_claims")
df_total = data_balance_hts.groupby("week_number")["total_claims"] \
             .sum() \
             .to_frame() \
             .rename(columns={"total_claims": "total"})
hierarchy_df = df_bottom_level.join(df_middle_level) \
                              .join(df_total)
hierarchy_df.index = pd.to_datetime(hierarchy_df.index)

In [None]:
print(f"Number of time series at the bottom level: {df_bottom_level.shape[1]}")
print(f"Number of time series at the middle level: {df_middle_level.shape[1]}")
hierarchy_df = hierarchy_df.fillna(0)

In [None]:
hierarchy_df

In [None]:
df_bottom_level

In [None]:
df_middle_level

In [None]:
# Build the three levels of the hierarchy in wide format (one row per week_number).
# Bottom level: one column per county_tract series.
df_bottom_level = data_balance_hts.pivot(
    index="week_number", columns="county_tract", values="total_claims"
)
# Middle level: claims summed per countyfips. Only total_claims is aggregated; the
# previous version summed every remaining column before pivoting on total_claims.
df_middle_level = (
    data_balance_hts
    .groupby(["week_number", "countyfips"])["total_claims"]
    .sum()
    .unstack("countyfips")
)
# Top level: claims summed over all rows of a week, in a single column named "total".
df_total = (
    data_balance_hts
    .groupby("week_number")["total_claims"]
    .sum()
    .to_frame(name="total")
)
# Join the levels side by side on the shared week_number index. The index is left
# as week_number (no datetime conversion is applied in this variant).
hierarchy_df = df_bottom_level.join(df_middle_level).join(df_total)

In [None]:
# Report the number of series at the bottom and middle levels, then replace the
# missing values in hierarchy_df with 0.
print(
    f"Number of time series at the bottom level: {df_bottom_level.shape[1]}",
    f"Number of time series at the middle level: {df_middle_level.shape[1]}",
    sep="\n",
)
hierarchy_df = hierarchy_df.fillna(value=0)

In [None]:
# Inspect hierarchy_df (rendered as the cell output).
hierarchy_df

In [None]:
# Plot the top-level ("total") series. The values are claim counts, so the title
# says "Claims" (it previously said "Trips"). The trailing semicolon hides the
# Axes repr in the cell output.
hierarchy_df["total"].plot(title="Claims - total level");

In [None]:
# Plot the children of the "total" node, i.e. one line per county code. The title
# names what is actually drawn (it previously read "Trips - state level").
ax = hierarchy_df[hierarchy['total']].plot(title="Claims - county level")
# Anchor the legend at axes coordinates (1.0, 1.0); the semicolon hides the repr.
ax.legend(bbox_to_anchor=(1.0, 1.0));

In [None]:
# Build the three levels of the hierarchy in wide format (one row per week_number).
# Bottom level: one column per county_tract series.
df_bottom_level = data_balance_hts.pivot(
    index="week_number", columns="county_tract", values="total_claims"
)
# Middle level: claims summed per countyfips. Only total_claims is aggregated; the
# previous version summed every remaining column before pivoting on total_claims.
df_middle_level = (
    data_balance_hts
    .groupby(["week_number", "countyfips"])["total_claims"]
    .sum()
    .unstack("countyfips")
)
# Top level: claims summed over all rows of a week, in a single column named "total".
df_total = (
    data_balance_hts
    .groupby("week_number")["total_claims"]
    .sum()
    .to_frame(name="total")
)
# Join the levels side by side on the shared week_number index. The index is left
# as week_number (no datetime conversion is applied in this variant).
hierarchy_df = df_bottom_level.join(df_middle_level).join(df_total)

In [None]:
# Report the number of series at the bottom and middle levels, then replace the
# missing values in hierarchy_df with 0.
print(
    f"Number of time series at the bottom level: {df_bottom_level.shape[1]}",
    f"Number of time series at the middle level: {df_middle_level.shape[1]}",
    sep="\n",
)
hierarchy_df = hierarchy_df.fillna(value=0)

In [None]:
# Collect the distinct county codes and the distinct bottom-level series names.
county = pd.unique(data_balance_hts["countyfips"])
tract = pd.unique(data_balance_hts["county_tract"])

In [None]:
# Describe the tree passed to the hierarchical model: every parent node maps to
# the list of its children. "total" is the root; its children are the county codes.
total = {'total': list(county)}
# A county's children are the bottom-level series named "<countyfips>_<uu_id>".
# Matching on the code plus the "_" separator (rather than the bare code) stops a
# county whose code is a prefix of another county's code from claiming that
# county's series, and also works when the codes are not strings.
county = {k: [v for v in tract if v.startswith(f"{k}_")] for k in county}
hierarchy = {**total, **county}

In [None]:
# Model 4: hierarchical ARIMA model -- prepare the long-format input table.
# Bottom-level series key "<countyfips>_<uu_id>", built with vectorised string
# operations instead of a row-wise apply().
data_balance["county_tract"] = (
    data_balance["countyfips"].astype(str) + "_" + data_balance["uu_id"].astype(str)
)
# Take the modelling columns as an explicit copy and store the county code as text.
# (The last line of this cell used to be an unterminated string literal, which
# made the whole cell a syntax error.)
data_balance_hts = (
    data_balance[["week_number", "uu_id", "countyfips", "total_claims", "county_tract"]]
    .copy()
    .assign(countyfips=lambda frame: frame["countyfips"].astype(str))
)

In [None]:
# Model 4: hierarchical ARIMA model -- prepare the long-format input table.
# Bottom-level series key "<countyfips>_<uu_id>", built with vectorised string
# operations instead of a row-wise apply().
data_balance["county_tract"] = (
    data_balance["countyfips"].astype(str) + "_" + data_balance["uu_id"].astype(str)
)
# Take the modelling columns as an explicit copy and convert the county code to
# text element-wise with astype(str). Calling str() on the Series itself produced
# one string (the printed form of the whole column) and wrote it into every row.
data_balance_hts = (
    data_balance[["week_number", "uu_id", "countyfips", "total_claims", "county_tract"]]
    .copy()
    .assign(countyfips=lambda frame: frame["countyfips"].astype(str))
)

In [None]:
# Order the rows by week_number. The sorted result is rebound to the name instead
# of using inplace=True, so no frame that was sliced from data_balance is mutated
# in place.
data_balance_hts = data_balance_hts.sort_values(by="week_number")

In [None]:
# Inspect the prepared modelling table (rendered as the cell output).
data_balance_hts

In [None]:
# Model 4: hierarchical ARIMA model -- prepare the long-format input table.
# Bottom-level series key "<countyfips>_<uu_id>", built with vectorised string
# operations instead of a row-wise apply().
data_balance["county_tract"] = (
    data_balance["countyfips"].astype(str) + "_" + data_balance["uu_id"].astype(str)
)
# Take the modelling columns as an explicit copy, so the county-code conversion is
# not written into a slice of another frame (SettingWithCopyWarning), and store
# the county code as text.
data_balance_hts = (
    data_balance[["week_number", "uu_id", "countyfips", "total_claims", "county_tract"]]
    .copy()
    .assign(countyfips=lambda frame: frame["countyfips"].astype(str))
)

In [None]:
# Order the rows by week_number. The sorted result is rebound to the name instead
# of using inplace=True, so no frame that was sliced from data_balance is mutated
# in place.
data_balance_hts = data_balance_hts.sort_values(by="week_number")

In [None]:
# Inspect the prepared modelling table (rendered as the cell output).
data_balance_hts

In [None]:
# Build the three levels of the hierarchy in wide format (one row per week_number).
# Bottom level: one column per county_tract series.
df_bottom_level = data_balance_hts.pivot(
    index="week_number", columns="county_tract", values="total_claims"
)
# Middle level: claims summed per countyfips. Only total_claims is aggregated; the
# previous version summed every remaining column before pivoting on total_claims.
df_middle_level = (
    data_balance_hts
    .groupby(["week_number", "countyfips"])["total_claims"]
    .sum()
    .unstack("countyfips")
)
# Top level: claims summed over all rows of a week, in a single column named "total".
df_total = (
    data_balance_hts
    .groupby("week_number")["total_claims"]
    .sum()
    .to_frame(name="total")
)
# Join the levels side by side on the shared week_number index. The index is left
# as week_number (no datetime conversion is applied in this variant).
hierarchy_df = df_bottom_level.join(df_middle_level).join(df_total)

In [None]:
# Report the number of series at the bottom and middle levels, then replace the
# missing values in hierarchy_df with 0.
print(
    f"Number of time series at the bottom level: {df_bottom_level.shape[1]}",
    f"Number of time series at the middle level: {df_middle_level.shape[1]}",
    sep="\n",
)
hierarchy_df = hierarchy_df.fillna(value=0)

In [None]:
# Collect the distinct county codes and the distinct bottom-level series names.
county = pd.unique(data_balance_hts["countyfips"])
tract = pd.unique(data_balance_hts["county_tract"])

In [None]:
# Describe the tree passed to the hierarchical model: every parent node maps to
# the list of its children. "total" is the root; its children are the county codes.
total = {'total': list(county)}
# A county's children are the bottom-level series named "<countyfips>_<uu_id>".
# Matching on the code plus the "_" separator (rather than the bare code) stops a
# county whose code is a prefix of another county's code from claiming that
# county's series, and also works when the codes are not strings.
county = {k: [v for v in tract if v.startswith(f"{k}_")] for k in county}
hierarchy = {**total, **county}

In [None]:
# Plot the children of the "total" node, i.e. one line per county code. The title
# names what is actually drawn (it previously read "Trips - state level").
ax = hierarchy_df[hierarchy['total']].plot(title="Claims - county level")
# Anchor the legend at axes coordinates (1.0, 1.0); the semicolon hides the repr.
ax.legend(bbox_to_anchor=(1.0, 1.0));

In [None]:
# Inspect the parent -> children mapping (rendered as the cell output).
hierarchy

In [None]:
# Fit a hierarchical regressor (auto_arima base model, OLS revision method) on the
# wide frame and the parent -> children mapping, then forecast 10 steps ahead.
model_ols_arima = HTSRegressor(
    model="auto_arima", revision_method="OLS", n_jobs=0
).fit(hierarchy_df, hierarchy)
pred_ols_arima = model_ols_arima.predict(steps_ahead=10)

In [None]:
# Inspect data_balance (rendered as the cell output).
data_balance

In [None]:
# Model 4: hierarchical ARIMA model -- prepare the long-format input table, this
# time keyed by the date column.
# Bottom-level series key "<countyfips>_<uu_id>", built with vectorised string
# operations instead of a row-wise apply().
data_balance["county_tract"] = (
    data_balance["countyfips"].astype(str) + "_" + data_balance["uu_id"].astype(str)
)
# Take the modelling columns as an explicit copy, store the county code as text and
# order the rows by date. Chaining on the copy avoids both the column assignment
# into a slice (SettingWithCopyWarning) and the in-place sort.
data_balance_hts = (
    data_balance[["date", "uu_id", "countyfips", "total_claims", "county_tract"]]
    .copy()
    .assign(countyfips=lambda frame: frame["countyfips"].astype(str))
    .sort_values(by="date")
)

In [None]:
# Inspect data_balance, which now carries the county_tract column.
data_balance

In [None]:
# Inspect the date-keyed modelling table (rendered as the cell output).
data_balance_hts

In [None]:
# Build the three levels of the hierarchy in wide format (one row per date).
# Bottom level: one column per county_tract series.
df_bottom_level = data_balance_hts.pivot(
    index="date", columns="county_tract", values="total_claims"
)
# Middle level: claims summed per countyfips. Only total_claims is aggregated; the
# previous version summed every remaining column before pivoting on total_claims.
df_middle_level = (
    data_balance_hts
    .groupby(["date", "countyfips"])["total_claims"]
    .sum()
    .unstack("countyfips")
)
# Top level: claims summed over all rows of a date, in a single column named "total".
df_total = (
    data_balance_hts
    .groupby("date")["total_claims"]
    .sum()
    .to_frame(name="total")
)
# Join the levels side by side on the shared date index.
hierarchy_df = df_bottom_level.join(df_middle_level).join(df_total)
# resample() needs a datetime-like index; afterwards the rows are summed into
# consecutive 7-day bins.
hierarchy_df.index = pd.to_datetime(hierarchy_df.index)
hierarchy_df = hierarchy_df.resample("7D").sum()

In [None]:
# Report the number of series at the bottom and middle levels, then replace the
# missing values in hierarchy_df with 0.
print(
    f"Number of time series at the bottom level: {df_bottom_level.shape[1]}",
    f"Number of time series at the middle level: {df_middle_level.shape[1]}",
    sep="\n",
)
hierarchy_df = hierarchy_df.fillna(value=0)

In [None]:
# Collect the distinct county codes and the distinct bottom-level series names.
county = pd.unique(data_balance_hts["countyfips"])
tract = pd.unique(data_balance_hts["county_tract"])

In [None]:
# Describe the tree passed to the hierarchical model: every parent node maps to
# the list of its children. "total" is the root; its children are the county codes.
total = {'total': list(county)}
# A county's children are the bottom-level series named "<countyfips>_<uu_id>".
# Matching on the code plus the "_" separator (rather than the bare code) stops a
# county whose code is a prefix of another county's code from claiming that
# county's series, and also works when the codes are not strings.
county = {k: [v for v in tract if v.startswith(f"{k}_")] for k in county}
hierarchy = {**total, **county}

In [None]:
# Plot the children of the "total" node, i.e. one line per county code. The title
# names what is actually drawn (it previously read "Trips - state level").
ax = hierarchy_df[hierarchy['total']].plot(title="Claims - county level")
# Anchor the legend at axes coordinates (1.0, 1.0); the semicolon hides the repr.
ax.legend(bbox_to_anchor=(1.0, 1.0));

In [None]:
# Inspect the 7-day resampled hierarchy frame (rendered as the cell output).
hierarchy_df

In [None]:
# Fit a hierarchical regressor (auto_arima base model, OLS revision method) on the
# wide frame and the parent -> children mapping, then forecast 10 steps ahead.
model_ols_arima = HTSRegressor(
    model="auto_arima", revision_method="OLS", n_jobs=0
).fit(hierarchy_df, hierarchy)
pred_ols_arima = model_ols_arima.predict(steps_ahead=10)

In [None]:
# Inspect the frame returned by predict() (rendered as the cell output).
pred_ols_arima 

In [None]:
# MAPE and MSPE favoured the ARIMA model, so the predictions below are built on it.
# Work on a copy so the queried frame itself is left untouched.
data_pred = data_pred_query.copy(deep=True)

In [None]:
# Prediction with Hierarchical ARIMA model, for week 43
# Add a total_claims column set to 0 on every row.
data_pred["total_claims"] = 0
# Run the frame through dataIdentifyDWM (a helper defined elsewhere in this
# notebook) and keep its result.
# NOTE(review): presumably the helper derives the date / week fields used by the
# later merge -- confirm. Re-running this cell feeds the already-processed frame
# back into the helper.
data_pred = dataIdentifyDWM(data_pred)

In [None]:
# Inspect data_pred (rendered as the cell output).
data_pred

In [None]:
# List the forecast column labels. `columns` is an attribute, not a method, so it
# must not be called.
pred_ols_arima.columns

In [None]:
# List the forecast column labels (rendered as the cell output).
pred_ols_arima.columns

In [None]:
# Remember the forecast column labels, then move the index into a regular column.
pred_col = pred_ols_arima.columns
pred = pred_ols_arima.copy()
pred = pred.reset_index()
# Preview the long format. reset_index() names the new column "index" when the
# index itself is unnamed, so that is the identifier column to keep (there is no
# "date" column at this point).
pd.melt(pred, id_vars='index', value_vars=pred_col)

In [None]:
# Remember the forecast column labels and build a copy of the forecasts whose
# index has been moved into a regular column.
pred_col = pred_ols_arima.columns
pred = pred_ols_arima.reset_index()

In [None]:
# Inspect pred to see what the column created by reset_index() is called.
pred

In [None]:
# Preview the long format: "index" stays as identifier, every forecast column is
# unpivoted into (variable, value) rows.
pred.melt(id_vars="index", value_vars=pred_col)

In [None]:
# Reshape the wide forecast table into long format: the former index becomes the
# "index" column and every forecast column is unpivoted into (variable, value) rows.
pred_col = pred_ols_arima.columns
pred = pred_ols_arima.reset_index()
pred_long = pred.melt(id_vars="index", value_vars=pred_col)

In [None]:
# uu_id is the text after the first underscore of each series name. str.split()
# yields a list per row, so the second element is taken element-wise with .str[1]
# instead of storing the whole list (names without an underscore get NaN).
pred_long["uu_id"] = pred_long["variable"].str.split("_", n=1).str[1]

In [None]:
# Inspect pred_long (rendered as the cell output).
pred_long

In [None]:
# uu_id is the text after the first underscore of each series name. With
# expand=True the split returns a two-column frame, which cannot be assigned to a
# single column; the second piece is therefore taken element-wise with .str[1]
# (names without an underscore get NaN).
pred_long["uu_id"] = pred_long["variable"].str.split("_", n=1).str[1]

In [None]:
# uu_id is the text after the first underscore of each series name. str.split()
# yields a list per row, so the second element is taken element-wise with .str[1]
# instead of storing the whole list (names without an underscore get NaN).
pred_long["uu_id"] = pred_long["variable"].str.split("_", n=1).str[1]

In [None]:
# Split each series name at its first underscore; new_var holds the list of pieces.
pred_long["new_var"] = pred_long["variable"].str.split("_", n=1)
# Take the last piece of every row's list. Series[-1] looks up the row labelled -1
# (a KeyError here); .str[-1] indexes into each list instead. Names without an
# underscore yield the name itself.
pred_long["uu_id"] = pred_long["new_var"].str[-1]

In [None]:
# Split each series name at its first underscore; new_var holds the list of pieces.
pred_long["new_var"] = pred_long["variable"].str.split("_", n=1)
# Series[0] returns the single list stored in the row labelled 0, not one value per
# row. The pieces are [county code, uu_id], so the uu_id is element 1 of every list
# and is taken element-wise with .str[1] (names without an underscore get NaN).
pred_long["uu_id"] = pred_long["new_var"].str[1]

In [None]:
# Split each series name at its first underscore into a two-column frame
# (0 = text before the underscore, 1 = text after it). expand=True is required:
# with expand=False `new` is a Series of lists and new[1] is just the list in row 1.
new = pred_long["variable"].str.split("_", n=1, expand=True)
pred_long["uu_id"] = new[1]

In [None]:
# Reshape the wide forecast table into long format: the former index becomes the
# "index" column and every forecast column is unpivoted into (variable, value) rows.
pred_col = pred_ols_arima.columns
pred = pred_ols_arima.reset_index()
pred_long = pred.melt(id_vars="index", value_vars=pred_col)

In [None]:
# Keep only the rows whose series name contains "_", i.e. the bottom-level
# "<countyfips>_<uu_id>" series. The explicit copy makes pred_long an independent
# frame, so later column assignments do not raise SettingWithCopyWarning.
pred_long = pred_long.loc[pred_long["variable"].str.contains("_")].copy()

In [None]:
# Split each series name at its first underscore into a two-column frame
# (0 = text before the underscore, 1 = text after it). expand=True is required:
# with expand=False `new` is a Series of lists and new[1] is a row lookup.
new = pred_long["variable"].str.split("_", n=1, expand=True)
pred_long["uu_id"] = new[1]

In [None]:
# Split each series name at its first underscore; without expansion the result is
# a Series holding one list of pieces per row.
new = pred_long["variable"].str.split(pat="_", n=1)

In [None]:
# Inspect the split result (rendered as the cell output).
new

In [None]:
# NOTE(review): if `new` is still the Series of lists from the non-expanded split,
# this is a lookup of the row labelled 1, not the second piece of every split.
new[1]

In [None]:
# Split each series name at its first underscore into a two-column frame:
# column 0 = text before the underscore, column 1 = text after it.
new = pred_long["variable"].str.split(pat="_", n=1, expand=True)

In [None]:
# Attach column 1 of the split as uu_id. assign() returns a new frame, so nothing
# is written into the filtered slice pred_long came from (no SettingWithCopyWarning).
pred_long = pred_long.assign(uu_id=new[1])

In [None]:
# Inspect pred_long with its uu_id column (rendered as the cell output).
pred_long

In [None]:
# Inspect data_pred to compare its columns with pred_long before joining.
data_pred

In [None]:
# Attach column 1 of the split as uu_id and label the forecast index column "date".
# rename() returns a new frame, so its result is bound back to pred_long;
# previously the renamed frame was only displayed and then discarded.
pred_long = pred_long.assign(uu_id=new[1]).rename(columns={"index": "date"})

In [None]:
# Attach column 1 of the split as uu_id and label the forecast index column "date".
# assign() returns a new frame, so nothing is written into the filtered slice
# pred_long came from (no SettingWithCopyWarning).
pred_long = pred_long.assign(uu_id=new[1]).rename(columns={"index": "date"})

In [None]:
# Inspect data_pred (left side of the upcoming merge).
data_pred

In [None]:
# Inspect pred_long (right side of the upcoming merge).
pred_long

In [None]:
# Left-join the forecasts onto data_pred by date and uu_id; rows of data_pred
# without a matching forecast are kept. The call previously used a full-width
# parenthesis and was never closed, so the cell did not parse.
data_pred_join = pd.merge(data_pred, pred_long, how='left', on=['date', 'uu_id'])

In [None]:
# Left-join the forecasts onto data_pred by date and uu_id; rows of data_pred
# without a matching forecast are kept. The call previously opened with a
# full-width parenthesis (a syntax error) and gave left_on without right_on.
data_pred_join = pd.merge(data_pred, pred_long, how='left', on=['date', 'uu_id'])

In [None]:
# Left-join the forecasts onto data_pred by date and uu_id; rows of data_pred
# without a matching forecast are kept. `on` names the key columns for both sides;
# left_on alone is rejected by merge() because the right-hand keys are missing.
data_pred_join = pd.merge(data_pred, pred_long, how='left', on=['date', 'uu_id'])

In [None]:
# Left-join the forecasts onto data_pred by date and uu_id; rows of data_pred
# without a matching forecast are kept.
data_pred_join = data_pred.merge(pred_long, how="left", on=["date", "uu_id"])

In [None]:
# Inspect the merged frame (rendered as the cell output).
data_pred_join

In [None]:
# MAPE and MSPE favoured the ARIMA model, so the predictions below are built on it.
# Work on a copy so the queried frame itself is left untouched.
data_pred = data_pred_query.copy(deep=True)

In [None]:
# Prediction with Hierarchical ARIMA model, for week 43
# Run the frame through dataIdentifyDWM (a helper defined elsewhere in this
# notebook) and keep its result.
# NOTE(review): presumably the helper derives the date / week fields used by the
# later merge -- confirm. Re-running this cell feeds the already-processed frame
# back into the helper.
data_pred = dataIdentifyDWM(data_pred)

In [None]:
# Reshape the wide forecast table into long format: the former index becomes the
# "index" column and every forecast column is unpivoted into (variable, value) rows.
pred_col = pred_ols_arima.columns
pred = pred_ols_arima.reset_index()
pred_long = pred.melt(id_vars="index", value_vars=pred_col)

In [None]:
# Keep only the rows whose series name contains "_", i.e. the bottom-level
# "<countyfips>_<uu_id>" series. The explicit copy makes pred_long an independent
# frame, so later column assignments do not raise SettingWithCopyWarning.
pred_long = pred_long.loc[pred_long["variable"].str.contains("_")].copy()

In [None]:
# Split each series name at its first underscore into a two-column frame:
# column 0 = text before the underscore, column 1 = text after it.
new = pred_long["variable"].str.split(pat="_", n=1, expand=True)

In [None]:
# Attach column 1 of the split as uu_id and label the forecast index column "date".
# assign() returns a new frame, so nothing is written into the filtered slice
# pred_long came from (no SettingWithCopyWarning).
pred_long = pred_long.assign(uu_id=new[1]).rename(columns={"index": "date"})

In [None]:
# Left-join the forecasts onto data_pred by date and uu_id; rows of data_pred
# without a matching forecast are kept.
data_pred_join = data_pred.merge(pred_long, how="left", on=["date", "uu_id"])

In [None]:
# The melted forecast lives in "value"; expose it under the name total_claims.
data_pred_join = data_pred_join.rename({"value": "total_claims"}, axis="columns")

In [None]:
# Inspect the merged frame with the renamed total_claims column.
data_pred_join

In [None]:
# Prediction with Hierarchical ARIMA model, for week 43
# Run the frame through dataIdentifyDWM (a helper defined elsewhere in this
# notebook) and keep its result.
# NOTE(review): data_pred is not re-copied from data_pred_query in this cell, so
# the helper receives whatever data_pred currently holds -- confirm that applying
# it more than once is harmless.
data_pred = dataIdentifyDWM(data_pred)

In [None]:
# Reshape the wide forecast table into long format: the former index becomes the
# "index" column and every forecast column is unpivoted into (variable, value) rows.
pred_col = pred_ols_arima.columns
pred = pred_ols_arima.reset_index()
pred_long = pred.melt(id_vars="index", value_vars=pred_col)

In [None]:
# Keep only the rows whose series name contains "_", i.e. the bottom-level
# "<countyfips>_<uu_id>" series. The explicit copy makes pred_long an independent
# frame, so later column assignments do not raise SettingWithCopyWarning.
pred_long = pred_long.loc[pred_long["variable"].str.contains("_")].copy()

In [None]:
# Split each series name at its first underscore into a two-column frame:
# column 0 = text before the underscore, column 1 = text after it.
new = pred_long["variable"].str.split(pat="_", n=1, expand=True)

In [None]:
# Attach column 1 of the split as uu_id and label the forecast index column "date".
# assign() returns a new frame, so nothing is written into the filtered slice
# pred_long came from (no SettingWithCopyWarning).
pred_long = pred_long.assign(uu_id=new[1]).rename(columns={"index": "date"})

In [None]:
# Left-join the forecasts onto data_pred by date and uu_id; rows of data_pred
# without a matching forecast are kept.
data_pred_join = data_pred.merge(pred_long, how="left", on=["date", "uu_id"])

In [None]:
# The melted forecast lives in "value"; expose it under the name total_claims.
data_pred_join = data_pred_join.rename({"value": "total_claims"}, axis="columns")

In [None]:
# Negative forecasts make no sense for a claim count: set them to 0. The target of
# the assignment must be total_claims itself; writing to 'B' only created a stray
# column and left the negative values in place.
data_pred_join.loc[data_pred_join['total_claims'] < 0, 'total_claims'] = 0

In [None]:
# Inspect the merged frame after the negative-value clamp.
data_pred_join

In [None]:
# Prediction with Hierarchical ARIMA model, for week 43
# Run the frame through dataIdentifyDWM (a helper defined elsewhere in this
# notebook) and keep its result.
# NOTE(review): data_pred is not re-copied from data_pred_query in this cell, so
# the helper receives whatever data_pred currently holds -- confirm that applying
# it more than once is harmless.
data_pred = dataIdentifyDWM(data_pred)

In [None]:
# Reshape the wide forecast table into long format: the former index becomes the
# "index" column and every forecast column is unpivoted into (variable, value) rows.
pred_col = pred_ols_arima.columns
pred = pred_ols_arima.reset_index()
pred_long = pred.melt(id_vars="index", value_vars=pred_col)

In [None]:
# Keep only the rows whose series name contains "_", i.e. the bottom-level
# "<countyfips>_<uu_id>" series. The explicit copy makes pred_long an independent
# frame, so later column assignments do not raise SettingWithCopyWarning.
pred_long = pred_long.loc[pred_long["variable"].str.contains("_")].copy()

In [None]:
# Split each series name at its first underscore into a two-column frame:
# column 0 = text before the underscore, column 1 = text after it.
new = pred_long["variable"].str.split(pat="_", n=1, expand=True)

In [None]:
# Attach column 1 of the split as uu_id and label the forecast index column "date".
# assign() returns a new frame, so nothing is written into the filtered slice
# pred_long came from (no SettingWithCopyWarning).
pred_long = pred_long.assign(uu_id=new[1]).rename(columns={"index": "date"})

In [None]:
# Left-join the forecasts onto data_pred by date and uu_id; rows of data_pred
# without a matching forecast are kept.
data_pred_join = data_pred.merge(pred_long, how="left", on=["date", "uu_id"])

In [None]:
# The melted forecast lives in "value"; expose it under the name total_claims.
data_pred_join = data_pred_join.rename({"value": "total_claims"}, axis="columns")

In [None]:
# Replace every negative total_claims value with 0; all other values (including
# missing ones) are left as they are.
data_pred_join["total_claims"] = data_pred_join["total_claims"].mask(
    data_pred_join["total_claims"] < 0, 0
)

In [None]:
# Inspect the merged frame after the negative-value clamp.
data_pred_join

In [None]:
# Keep only the three submission columns, in this order.
data_pred = data_pred_join.loc[:, ["uu_id", "total_claims", "week_number"]]

In [None]:
# Inspect the submission frame (rendered as the cell output).
data_pred

In [None]:
# Export step: write the submission frame to CSV, leaving out the row index.
data_pred.to_csv(path_or_buf="submission_prediction_output.csv", index=False)

In [None]:
# Export step: write the submission frame to CSV, leaving out the row index.
data_pred.to_csv(path_or_buf="submission_prediction_output.csv", index=False)

In [None]:
# Inspect the queried prediction frame that data_pred is copied from.
data_pred_query

In [None]:
# MAPE and MSPE favoured the ARIMA model, so the predictions below are built on it.
# Work on a copy so the queried frame itself is left untouched.
data_pred = data_pred_query.copy(deep=True)

In [None]:
# Inspect the fresh copy (rendered as the cell output).
data_pred

In [None]:
# Run the frame through dataIdentifyDWM (a helper defined elsewhere in this
# notebook) and keep its result.
# NOTE(review): presumably the helper derives the date / week fields used by the
# later merge -- confirm. Re-running this cell feeds the already-processed frame
# back into the helper.
data_pred = dataIdentifyDWM(data_pred)

In [None]:
# Inspect data_pred after dataIdentifyDWM (rendered as the cell output).
data_pred

In [None]:
# Hierarchical ARIMA prediction for week 43.
# Reshape the wide forecast table into long format: the former index becomes the
# "index" column and every forecast column is unpivoted into (variable, value) rows.
pred_col = pred_ols_arima.columns
pred = pred_ols_arima.reset_index()
pred_long = pred.melt(id_vars="index", value_vars=pred_col)

In [None]:
# Keep only the rows whose series name contains "_", i.e. the bottom-level
# "<countyfips>_<uu_id>" series. The explicit copy makes pred_long an independent
# frame, so later column assignments do not raise SettingWithCopyWarning.
pred_long = pred_long.loc[pred_long["variable"].str.contains("_")].copy()

In [None]:
# Split each series name at its first underscore into a two-column frame:
# column 0 = text before the underscore, column 1 = text after it.
new = pred_long["variable"].str.split(pat="_", n=1, expand=True)

In [None]:
# Attach column 1 of the split as uu_id and label the forecast index column "date".
# assign() returns a new frame, so nothing is written into the filtered slice
# pred_long came from (no SettingWithCopyWarning).
pred_long = pred_long.assign(uu_id=new[1]).rename(columns={"index": "date"})

In [None]:
# Left-join the forecasts onto data_pred by date and uu_id; rows of data_pred
# without a matching forecast are kept.
data_pred_join = data_pred.merge(pred_long, how="left", on=["date", "uu_id"])

In [None]:
# The melted forecast lives in "value"; expose it under the name total_claims.
data_pred_join = data_pred_join.rename({"value": "total_claims"}, axis="columns")

In [None]:
# Replace every negative total_claims value with 0; all other values (including
# missing ones) are left as they are.
data_pred_join["total_claims"] = data_pred_join["total_claims"].mask(
    data_pred_join["total_claims"] < 0, 0
)

In [None]:
# Inspect the merged frame (rendered as the cell output).
data_pred_join

In [None]:
# Inspect data_pred (left side of the merge).
data_pred

In [None]:
# Hierarchical ARIMA prediction for week 43.
# Reshape the wide forecast table into long format: the former index becomes the
# "index" column and every forecast column is unpivoted into (variable, value) rows.
pred_col = pred_ols_arima.columns
pred = pred_ols_arima.reset_index()
pred_long = pred.melt(id_vars="index", value_vars=pred_col)

In [None]:
# Keep only the rows whose series name contains "_", i.e. the bottom-level
# "<countyfips>_<uu_id>" series. The explicit copy makes pred_long an independent
# frame, so later column assignments do not raise SettingWithCopyWarning.
pred_long = pred_long.loc[pred_long["variable"].str.contains("_")].copy()

In [None]:
# Split each series name at its first underscore into a two-column frame:
# column 0 = text before the underscore, column 1 = text after it.
new = pred_long["variable"].str.split(pat="_", n=1, expand=True)

In [None]:
# Attach column 1 of the split as uu_id and label the forecast index column "date".
# assign() returns a new frame, so nothing is written into the filtered slice
# pred_long came from (no SettingWithCopyWarning).
pred_long = pred_long.assign(uu_id=new[1]).rename(columns={"index": "date"})

In [None]:
# Preview: for every uu_id keep only the row with the largest forecast value, then
# put the surviving rows back into index order.
# NOTE(review): the row is chosen by value, not by date -- confirm this is intended.
(
    pred_long
    .sort_values(by="value", ascending=False)
    .drop_duplicates(subset="uu_id", keep="first")
    .sort_index()
)

In [None]:
# Hierarchical ARIMA prediction for week 43.
# Reshape the wide forecast table into long format: the former index becomes the
# "index" column and every forecast column is unpivoted into (variable, value) rows.
pred_col = pred_ols_arima.columns
pred = pred_ols_arima.reset_index()
pred_long = pred.melt(id_vars="index", value_vars=pred_col)

In [None]:
# Keep only the rows whose series name contains "_", i.e. the bottom-level
# "<countyfips>_<uu_id>" series. The explicit copy makes pred_long an independent
# frame, so later column assignments do not raise SettingWithCopyWarning.
pred_long = pred_long.loc[pred_long["variable"].str.contains("_")].copy()

In [None]:
# Split each series name at its first underscore into a two-column frame:
# column 0 = text before the underscore, column 1 = text after it.
new = pred_long["variable"].str.split(pat="_", n=1, expand=True)

In [None]:
# Attach column 1 of the split as uu_id and label the forecast index column "date".
# assign() returns a new frame, so nothing is written into the filtered slice
# pred_long came from (no SettingWithCopyWarning).
pred_long = pred_long.assign(uu_id=new[1]).rename(columns={"index": "date"})

In [None]:
# For every uu_id keep only the row with the largest forecast value, then put the
# surviving rows back into index order.
# NOTE(review): the row is chosen by value, not by date -- confirm this is intended.
pred_long = (
    pred_long
    .sort_values(by="value", ascending=False)
    .drop_duplicates(subset="uu_id", keep="first")
    .sort_index()
)

In [None]:
# Left-join the forecasts onto data_pred by date and uu_id; rows of data_pred
# without a matching forecast are kept.
data_pred_join = data_pred.merge(pred_long, how="left", on=["date", "uu_id"])

In [None]:
# The melted forecast lives in "value"; expose it under the name total_claims.
data_pred_join = data_pred_join.rename({"value": "total_claims"}, axis="columns")

In [None]:
# Replace every negative total_claims value with 0; all other values (including
# missing ones) are left as they are.
data_pred_join["total_claims"] = data_pred_join["total_claims"].mask(
    data_pred_join["total_claims"] < 0, 0
)

In [None]:
# Inspect data_pred before it is replaced by the submission columns.
data_pred

In [None]:
# Keep only the three submission columns, in this order.
data_pred = data_pred_join.loc[:, ["uu_id", "total_claims", "week_number"]]

In [None]:
# Inspect the submission frame (rendered as the cell output).
data_pred

In [None]:
# Hierarchical ARIMA prediction for week 43.
# Reshape the wide forecast table into long format: the former index becomes the
# "index" column and every forecast column is unpivoted into (variable, value) rows.
pred_col = pred_ols_arima.columns
pred = pred_ols_arima.reset_index()
pred_long = pred.melt(id_vars="index", value_vars=pred_col)

In [None]:
# Keep only the rows whose series name contains "_", i.e. the bottom-level
# "<countyfips>_<uu_id>" series. The explicit copy makes pred_long an independent
# frame, so later column assignments do not raise SettingWithCopyWarning.
pred_long = pred_long.loc[pred_long["variable"].str.contains("_")].copy()

In [None]:
# Split each series name at its first underscore into a two-column frame:
# column 0 = text before the underscore, column 1 = text after it.
new = pred_long["variable"].str.split(pat="_", n=1, expand=True)

In [None]:
# Attach column 1 of the split as uu_id and label the forecast index column "date".
# assign() returns a new frame, so nothing is written into the filtered slice
# pred_long came from (no SettingWithCopyWarning).
pred_long = pred_long.assign(uu_id=new[1]).rename(columns={"index": "date"})

In [None]:
# Inspect pred_long before de-duplicating by uu_id.
pred_long

In [None]:
# For every uu_id keep only the row with the largest forecast value, then put the
# surviving rows back into index order.
# NOTE(review): the row is chosen by value, not by date -- confirm this is intended.
pred_long = (
    pred_long
    .sort_values(by="value", ascending=False)
    .drop_duplicates(subset="uu_id", keep="first")
    .sort_index()
)

In [None]:
# Inspect pred_long after de-duplicating by uu_id.
pred_long

In [None]:
# Inspect the most recently built merged frame.
data_pred_join

In [None]:
# Hierarchical ARIMA prediction for week 43.
# Reshape the wide forecast table into long format: the former index becomes the
# "index" column and every forecast column is unpivoted into (variable, value) rows.
pred_col = pred_ols_arima.columns
pred = pred_ols_arima.reset_index()
pred_long = pred.melt(id_vars="index", value_vars=pred_col)

In [None]:
# Keep only the rows whose series name contains "_", i.e. the bottom-level
# "<countyfips>_<uu_id>" series. The explicit copy makes pred_long an independent
# frame, so later column assignments do not raise SettingWithCopyWarning.
pred_long = pred_long.loc[pred_long["variable"].str.contains("_")].copy()

In [None]:
# Split each series name at its first underscore into a two-column frame:
# column 0 = text before the underscore, column 1 = text after it.
new = pred_long["variable"].str.split(pat="_", n=1, expand=True)

In [None]:
# Attach column 1 of the split as uu_id and label the forecast index column "date".
# assign() returns a new frame, so nothing is written into the filtered slice
# pred_long came from (no SettingWithCopyWarning).
pred_long = pred_long.assign(uu_id=new[1]).rename(columns={"index": "date"})

In [None]:
# Inspect pred_long to see which dates the forecast rows carry.
pred_long

In [None]:
# Hierarchical ARIMA prediction for week 43.
# Reshape the wide forecast table into long format: the former index becomes the
# "index" column and every forecast column is unpivoted into (variable, value) rows.
pred_col = pred_ols_arima.columns
pred = pred_ols_arima.reset_index()
pred_long = pred.melt(id_vars="index", value_vars=pred_col)

In [None]:
# Keep only the rows whose series name contains "_", i.e. the bottom-level
# "<countyfips>_<uu_id>" series. The explicit copy makes pred_long an independent
# frame, so later column assignments do not raise SettingWithCopyWarning.
pred_long = pred_long.loc[pred_long["variable"].str.contains("_")].copy()

In [None]:
# Split each series name at its first underscore into a two-column frame:
# column 0 = text before the underscore, column 1 = text after it.
new = pred_long["variable"].str.split(pat="_", n=1, expand=True)

In [None]:
# Attach column 1 of the split as uu_id and label the forecast index column "date".
# assign() returns a new frame, so nothing is written into the filtered slice
# pred_long came from (no SettingWithCopyWarning).
pred_long = pred_long.assign(uu_id=new[1]).rename(columns={"index": "date"})

In [None]:
# Keep only the forecast rows dated 2022-10-22. The column label has to be the
# string "date"; the bare name `date` is not defined in this notebook.
pred_long = pred_long[pred_long["date"] == "2022-10-22"]

In [None]:
# Hierarchical ARIMA prediction for week 43.
# Reshape the wide forecast table into long format: the former index becomes the
# "index" column and every forecast column is unpivoted into (variable, value) rows.
pred_col = pred_ols_arima.columns
pred = pred_ols_arima.reset_index()
pred_long = pred.melt(id_vars="index", value_vars=pred_col)

In [None]:
# Keep only the rows whose series name contains "_", i.e. the bottom-level
# "<countyfips>_<uu_id>" series. The explicit copy makes pred_long an independent
# frame, so later column assignments do not raise SettingWithCopyWarning.
pred_long = pred_long.loc[pred_long["variable"].str.contains("_")].copy()

In [None]:
# Split each series name at its first underscore into a two-column frame:
# column 0 = text before the underscore, column 1 = text after it.
new = pred_long["variable"].str.split(pat="_", n=1, expand=True)

In [None]:
# Attach column 1 of the split as uu_id and label the forecast index column "date".
# assign() returns a new frame, so nothing is written into the filtered slice
# pred_long came from (no SettingWithCopyWarning).
pred_long = pred_long.assign(uu_id=new[1]).rename(columns={"index": "date"})

In [None]:
# Keep only the forecast rows dated 2022-10-22.
pred_long = pred_long.loc[pred_long["date"] == "2022-10-22"]

In [None]:
# Inspect the forecast rows kept by the date filter.
pred_long

In [None]:
# For every uu_id keep only the row with the largest forecast value, then put the
# surviving rows back into index order.
pred_long = (
    pred_long
    .sort_values(by="value", ascending=False)
    .drop_duplicates(subset="uu_id", keep="first")
    .sort_index()
)

In [None]:
# Inspect pred_long after de-duplicating by uu_id.
pred_long

In [None]:
# Left-join the forecasts onto data_pred by date and uu_id; rows of data_pred
# without a matching forecast are kept.
data_pred_join = data_pred.merge(pred_long, how="left", on=["date", "uu_id"])

In [None]:
# The melted forecast lives in "value"; expose it under the name total_claims.
data_pred_join = data_pred_join.rename({"value": "total_claims"}, axis="columns")

In [None]:
# Replace every negative total_claims value with 0; all other values (including
# missing ones) are left as they are.
data_pred_join["total_claims"] = data_pred_join["total_claims"].mask(
    data_pred_join["total_claims"] < 0, 0
)

In [None]:
# Inspect pred_long (right side of the merge).
pred_long

In [None]:
# Inspect pred_long again (rendered as the cell output).
pred_long

In [None]:
# Left-join the forecasts onto data_pred by date and uu_id; rows of data_pred
# without a matching forecast are kept.
data_pred_join = data_pred.merge(pred_long, how="left", on=["date", "uu_id"])

In [None]:
# Inspect data_pred (left side of the merge).
data_pred

In [None]:
# MAPE and MSPE favoured the ARIMA model, so the predictions below are built on it.
# Work on a copy so the queried frame itself is left untouched.
data_pred = data_pred_query.copy(deep=True)

In [None]:
# Run the frame through dataIdentifyDWM (a helper defined elsewhere in this
# notebook) and keep its result.
# NOTE(review): presumably the helper derives the date / week fields used by the
# later merge -- confirm. Re-running this cell feeds the already-processed frame
# back into the helper.
data_pred = dataIdentifyDWM(data_pred)

In [None]:
# Left-join the forecasts onto data_pred by date and uu_id; rows of data_pred
# without a matching forecast are kept.
data_pred_join = data_pred.merge(pred_long, how="left", on=["date", "uu_id"])

In [None]:
# The melted forecast lives in "value"; expose it under the name total_claims.
data_pred_join = data_pred_join.rename({"value": "total_claims"}, axis="columns")

In [None]:
# Replace every negative total_claims value with 0; all other values (including
# missing ones) are left as they are.
data_pred_join["total_claims"] = data_pred_join["total_claims"].mask(
    data_pred_join["total_claims"] < 0, 0
)

In [None]:
# Keep only the three submission columns, in this order.
data_pred = data_pred_join.loc[:, ["uu_id", "total_claims", "week_number"]]

In [None]:
# Inspect the submission frame before it is written to disk.
data_pred

In [None]:
# Export step: write the submission frame to CSV, leaving out the row index.
data_pred.to_csv(path_or_buf="submission_prediction_output.csv", index=False)

In [None]:
# Inspect the exported submission frame (rendered as the cell output).
data_pred