In [None]:
get_ipython().run_cell_magic('capture', '', "\n#- INSTALL ADDITIONAL LIBRARIES IF REQUIRED\n#------------------------------------------\n# This is normally not required. The hub environment comes preinstalled with \n# many packages that you can already use without setup. In case there is some\n# other library you would like to use that isn't on the list you run this command\n# once to install them.  If it is already installed this command has no effect.\n# The pip line magic is used so packages land in the running kernel's environment.\n%pip install db-dtypes\n%pip install pandas\n")

- IMPORT THE LIBRARIES YOU WILL USE
------------------------------------------
You only need to import packages one time per notebook session. To keep your
notebook clean and organized you can handle all imports at the top of your file.
The following are included for example purposes; feel free to modify or delete
anything in this section.

In [None]:
import csv
import pandas as pd
import numpy as np
from google.cloud import bigquery
from google.oauth2 import service_account
from google.cloud.bigquery import magics
import statsmodels.api as sm
import math

In [None]:
# NOTE(review): this cell repeats the install cell at the top of the notebook; one of the two can be removed.
get_ipython().run_cell_magic('capture', '', "\n#- INSTALL ADDITIONAL LIBRARIES IF REQUIRED\n#------------------------------------------\n# This is normally not required. The hub environment comes preinstalled with \n# many packages that you can already use without setup. In case there is some\n# other library you would like to use that isn't on the list you run this command\n# once to install them.  If it is already installed this command has no effect.\n# The pip line magic is used so packages land in the running kernel's environment.\n%pip install db-dtypes\n%pip install pandas\n")

- IMPORT THE LIBRARIES YOU WILL USE
------------------------------------------
You only need to import packages one time per notebook session. To keep your
notebook clean and organized you can handle all imports at the top of your file.
The following are included for example purposes; feel free to modify or delete
anything in this section.

In [None]:
import csv
import pandas as pd
import numpy as np
from google.cloud import bigquery
from google.oauth2 import service_account
from google.cloud.bigquery import magics
import statsmodels.api as sm
import math

- DEFINE YOUR CLASSES AND FUNCTIONS 
-----------------------------------
This is not required, but is helpful in keeping your notebook organized. 
You can use the following cell or several cells to define your functions
and classes to keep them separate from your analysis or results code.
In general it is useful to define your methods in a separate cell from where
they are run.

In [None]:
def dataExplore(data):
    '''
    Print a quick overview of a dataframe.

    The row count is printed first. Identifier-like and time-like columns are
    then summarised by their number of distinct values; every other column is
    summarised by listing the distinct values themselves.
    '''
    count_only = ("uu_id", "timeperiod", "week_number", "countyfips", "tract", "tract_name", "date")
    print("# of observations: ", data.shape[0])
    for name in data.columns:
        distinct = pd.unique(data[name])
        if name in count_only:
            # Too many distinct values to be worth listing: report the count only.
            print("# of %s: %s" % (name, len(distinct)))
            continue
        print("Unique value of %s: %s" % (name, distinct))

In [None]:
def dataBalanceCheck(data):
    '''
    Report which tracts do not have one row for every week in the frame.

    Prints the number of distinct ``week_number`` values, then one line per
    ``uu_id`` that has fewer rows than that (the id and its row count, in order
    of first appearance), and finally the percentage of ids that fall short.
    Nothing is returned.

    Rows are counted with a single groupby, so rows whose ``uu_id`` is missing
    are not counted as a tract. An empty frame reports 0.0 percent instead of
    raising ZeroDivisionError.
    '''
    n_weeks = len(pd.unique(data["week_number"]))
    print("# of observations in complete time series: ", n_weeks)

    # One pass over the frame instead of re-filtering it once per tract.
    rows_per_id = data.groupby("uu_id", sort=False).size()
    incomplete = rows_per_id[rows_per_id < n_weeks]
    for uu_id, n_rows in incomplete.items():
        print(uu_id, n_rows)

    n_ids = len(rows_per_id)
    share = len(incomplete) / n_ids * 100 if n_ids else 0.0
    print("% of tracts with incomplete time series: ", share)

In [None]:
def dataFillNa(data, value):
    """
    Replace missing entries in ``data`` with ``value`` and return the frame.

    Identifier and time columns are left alone. In the three top-employer
    columns the literal string 'N/A' is swapped for ``str(value)``. In every
    other column NaN is filled with ``value``. Columns are reassigned on the
    frame that was passed in, and that same frame is returned.
    """
    untouched = ("uu_id", "timeperiod", "week_number", "countyfips", "tract", "tract_name", "date")
    employer_cols = ("top_category_employer1", "top_category_employer2", "top_category_employer3")
    for name in data.columns:
        if name in untouched:
            continue
        if name in employer_cols:
            # These columns mark "missing" with a text placeholder rather than NaN.
            data[name] = data[name].replace({'N/A': str(value)})
        else:
            data[name] = data[name].fillna(value)
    return data

In [None]:
def dataIdentifyDWM(data, year=2022):
    '''
    Add calendar features derived from ``week_number`` to ``data``.

    Week 1 is taken to start on 1 January of ``year`` and each later week
    starts seven days after the previous one. Three columns are written to the
    frame that was passed in, and that same frame is returned:

    * ``date``        - first day of the week
    * ``month``       - calendar month of that day
    * ``weekofmonth`` - ceil(day of month / 7), i.e. a value from 1 to 5

    Parameters:
        data: dataframe with a numeric ``week_number`` column.
        year: calendar year that week 1 belongs to. Defaults to 2022, the
            value that used to be hard-coded.

    Dates are built by adding a day offset to 1 January, so week numbers that
    run past the end of ``year`` roll over into the next year.
    '''
    offset = pd.to_timedelta((data["week_number"] - 1) * 7, unit="D")
    data["date"] = offset + pd.Timestamp(year=year, month=1, day=1)
    data["month"] = data["date"].dt.month
    # Integer form of ceil(day / 7): days 1-7 -> 1, 8-14 -> 2, ...
    data["weekofmonth"] = (data["date"].dt.day + 6) // 7
    return data

In [None]:
def MSPE(s1, s2):
    """
    Print the mean squared difference between ``s1`` and ``s2``.

    The squared element-wise differences are summed and divided by the length
    of ``s1``. The value is printed, not returned.
    """
    squared_errors = (s1 - s2) ** 2
    total = sum(squared_errors)
    print("MSPE: ", total / len(s1))

In [None]:
def MAPE(s1, s2):
    """
    Print the mean absolute difference between ``s1`` and ``s2``.

    The absolute element-wise differences are summed and divided by the length
    of ``s1``. Despite the name, the differences are not divided by the actual
    values, so the printed figure is an absolute error, not a percentage.
    The value is printed, not returned.
    """
    absolute_errors = abs(s1 - s2)
    total = sum(absolute_errors)
    print("MAPE: ", total / len(s1))

In [None]:
# Obtain data using BigQuery
# Create a client bound to the project named below; the query cells further
# down send their SQL through this client.
BIGQUERY_PROJECT = 'ironhacks-data'
bigquery_client = bigquery.Client(project=BIGQUERY_PROJECT)

In [None]:
# SQL for the modelling data: every column of the unemployment table plus the
# average_wage column of the wage table, matched on uu_id.
query = """
SELECT
a.*,
b.average_wage
FROM 
(SELECT 
*
FROM `ironhacks-data.ironhacks_competition.unemployment_data`) a
JOIN `ironhacks-data.ironhacks_competition.wage_data` b 
ON a.uu_id=b.uu_id
"""

In [None]:
# Run the joined unemployment/wage query and load the result as a dataframe.
query_job = bigquery_client.query(query)
data = query_job.to_dataframe()

In [None]:
# SQL for the list of rows that predictions have to be produced for.
query_pred = """
SELECT * FROM `ironhacks-data.ironhacks_competition.prediction_list`
"""

In [None]:
# Run the prediction-list query and load the result as a dataframe.
query_job_pred = bigquery_client.query(query_pred)
data_pred_query= query_job_pred.to_dataframe()

In [None]:
# Further check tracts with average_wage as NaN.
# I find three tracts whose average_wage is NaN in every row. If I drop these tracts because of the NaN values, they cannot be predicted.
# The loop variable is named uu_id so the builtin id() is not overwritten for the rest of the session.
for uu_id in pd.unique(data.loc[data["average_wage"].isna(), "uu_id"]):
    tract_wages = data.loc[data["uu_id"] == uu_id, "average_wage"]
    print(uu_id)
    print("All value are nan?", tract_wages.isnull().all())
    print("Included in prediction list?", (data_pred_query["uu_id"] == uu_id).any())

In [None]:
# Back up the data before pre-treatment: keep independent copies of both query
# results so later cells can modify `data` and `data_pred_query` freely.
data_backup = data.copy()
data_pred_query_backup = data_pred_query.copy()

In [None]:
# Pretreatment: derive date, month and week-of-month from week_number to
# capture seasonality. dataIdentifyDWM writes the new columns onto `data`.
data = dataIdentifyDWM(data)

In [None]:
# Balance the dataset as panel data, step 1: index the rows by
# (week_number, uu_id), latest week first, and keep only the first row seen
# for each (week_number, uu_id) pair.
data_balance = (
    data.set_index("week_number")
    .sort_index(ascending=False)
    .set_index("uu_id", append=True)
)
data_balance = data_balance.loc[~data_balance.index.duplicated(keep="first")]

In [None]:
# Step 2: expand to one row per (uu_id, week) for every week between the first
# and last observed week. Combinations that had no row come out as all-NaN.
data_balance = data_balance.reset_index(level=["week_number"])
tract_ids = data_balance.index.unique()
all_weeks = range(data_balance.week_number.min(), data_balance.week_number.max() + 1)
full_panel_index = pd.MultiIndex.from_product([tract_ids, all_weeks], names=["uu_id", "week_number"])
data_balance = (
    data_balance.set_index("week_number", append=True)
    .reindex(full_panel_index)
    .reset_index(level=1)
)

In [None]:
# Step 3: fill the rows that the reindex created.
data_balance = data_balance.set_index('week_number', append=True)
# A week with no record is treated as a week with zero claims.
data_balance['total_claims'] = data_balance['total_claims'].fillna(0)
# Interpolate wages inside each tract first, so a gap at the start or end of
# one tract is not filled with a blend of the neighbouring tract's wage.
wage_by_tract = data_balance.groupby(level="uu_id")["average_wage"].transform(
    lambda wages: wages.interpolate(method="linear", limit_direction="both")
)
# Tracts with no wage at all are still NaN here; fall back to the previous
# frame-wide interpolation for those so the model input keeps the same coverage.
data_balance['average_wage'] = wage_by_tract.interpolate(method="linear")

In [None]:
# Step 4: move uu_id and week_number back to ordinary columns, then recompute
# date/month/weekofmonth so the rows added by the reindex get them too.
data_balance = data_balance.reset_index(level=['uu_id', "week_number"])
data_balance = dataIdentifyDWM(data_balance)

In [None]:
# Sanity check: print any tract that still has fewer rows than there are weeks.
dataBalanceCheck(data_balance)

In [None]:
# Data clean-up: convert NA to 0 for the gender, race, education and top-employer columns.
# Based on the NaN check of average_wage above, its NaN values are converted to zero as well;
# models are tried with and without the "average_wage" variable.
# NOTE(review): the original note also mentions recalculating the unknown category, but no such step is visible in this cell - confirm.
data = dataFillNa(data, 0)

In [None]:
# Split data into training and validation sets: the latest observed week is the cut-off.
train_week = data["week_number"].max()

In [None]:
# Rows before the cut-off week train the model; rows from the cut-off week
# onwards (i.e. the latest week) are held out for validation.
week_of_row = data["week_number"]
data_train = data.loc[week_of_row < train_week]
data_valid = data.loc[week_of_row >= train_week]

In [None]:
# Separate the model inputs from the target for the training rows.
# `columns=` is spelled out because passing the axis positionally is deprecated.
data_train_x = data_train.drop(columns="total_claims")
data_train_y = data_train["total_claims"]

In [None]:
# Separate the model inputs from the target for the validation rows.
# `columns=` is spelled out because passing the axis positionally is deprecated.
data_valid_x = data_valid.drop(columns="total_claims")
data_valid_y = data_valid["total_claims"]

Also prepare the same training/validation split for the balanced dataset.

In [None]:
# Same cut-off as for the unbalanced data: earlier weeks train, the latest week validates.
balance_week_of_row = data_balance["week_number"]
data_balance_train = data_balance.loc[balance_week_of_row < train_week]
data_balance_valid = data_balance.loc[balance_week_of_row >= train_week]

In [None]:
# Separate the model inputs from the target for the balanced training rows.
# `columns=` is spelled out because passing the axis positionally is deprecated.
data_balance_train_x = data_balance_train.drop(columns="total_claims")
data_balance_train_y = data_balance_train["total_claims"]

In [None]:
# Separate the model inputs from the target for the balanced validation rows.
# `columns=` is spelled out because passing the axis positionally is deprecated.
data_balance_valid_x = data_balance_valid.drop(columns="total_claims")
data_balance_valid_y = data_balance_valid["total_claims"]

In [None]:
# Model 1 : Poisson regression with unbalanced data
# Design matrix: week number, its square, average wage, and one dummy column
# per month and per week-of-month. month/weekofmonth are cast to str so that
# get_dummies treats them as categories. Building the frame in one chain
# avoids assigning columns onto a slice of data_train_x (SettingWithCopyWarning).
data_train_x_m1 = pd.get_dummies(
    data_train_x[["week_number", "month", "weekofmonth", "average_wage"]]
    .astype({"month": str, "weekofmonth": str})
    .assign(week_number2=lambda frame: frame["week_number"] ** 2)
)

In [None]:
# Validation design matrix for Model 1, built with the same steps as the
# training matrix. Building the frame in one chain avoids assigning columns
# onto a slice of data_valid_x (SettingWithCopyWarning).
data_valid_x_m1 = pd.get_dummies(
    data_valid_x[["week_number", "month", "weekofmonth", "average_wage"]]
    .astype({"month": str, "weekofmonth": str})
    .assign(week_number2=lambda frame: frame["week_number"] ** 2)
)

In [None]:
# Give the validation matrix exactly the training columns, in the training
# order. Dummy columns that only occur in training are added as 0, while the
# dummies get_dummies created for the validation rows keep their values.
# (Assigning 0 to every month_*/weekofmonth_* column would wipe those values
# and leave the columns in a different order from the fitted coefficients.)
data_valid_x_m1 = data_valid_x_m1.reindex(columns=data_train_x_m1.columns, fill_value=0)

In [None]:
# Fit Model 1: a Poisson GLM of total_claims (cast to int) on the Model 1
# design matrix (cast to float), then display the fit summary.
# NOTE(review): no explicit constant column is added to the design matrix;
# presumably the full sets of dummy columns play that role - confirm.
poission_model = sm.GLM(data_train_y.astype(int), data_train_x_m1.astype(float), family=sm.families.Poisson())
result = poission_model.fit()
result.summary()

In [None]:
# Predict the held-out week with Model 1 and print both error measures
# against the observed claims.
data_estimate_m1 = result.predict(data_valid_x_m1.astype(float))
MSPE(data_estimate_m1, data_valid_y)
MAPE(data_estimate_m1, data_valid_y)

In [None]:
# Model 2: Poisson with balanced data
# Same design matrix as Model 1, built from the balanced panel. Building the
# frame in one chain avoids assigning columns onto a slice of
# data_balance_train_x (SettingWithCopyWarning).
data_balance_train_x_m1 = pd.get_dummies(
    data_balance_train_x[["week_number", "month", "weekofmonth", "average_wage"]]
    .astype({"month": str, "weekofmonth": str})
    .assign(week_number2=lambda frame: frame["week_number"] ** 2)
)

In [None]:
# Validation design matrix for Model 2, built with the same steps as its
# training matrix. Building the frame in one chain avoids assigning columns
# onto a slice of data_balance_valid_x (SettingWithCopyWarning).
data_balance_valid_x_m1 = pd.get_dummies(
    data_balance_valid_x[["week_number", "month", "weekofmonth", "average_wage"]]
    .astype({"month": str, "weekofmonth": str})
    .assign(week_number2=lambda frame: frame["week_number"] ** 2)
)

In [None]:
# Give the balanced validation matrix exactly the balanced training columns,
# in the training order. Dummy columns that only occur in training are added
# as 0, while the dummies get_dummies created for the validation rows keep
# their values. (Assigning 0 to every month_*/weekofmonth_* column would wipe
# those values and leave the columns in a different order from the fitted
# coefficients.)
data_balance_valid_x_m1 = data_balance_valid_x_m1.reindex(
    columns=data_balance_train_x_m1.columns, fill_value=0
)

In [None]:
# Fit Model 2: the same Poisson GLM specification as Model 1, estimated on the
# balanced panel, then display the fit summary.
poission_model_m2 = sm.GLM(data_balance_train_y.astype(int), data_balance_train_x_m1.astype(float), family=sm.families.Poisson())
result_m2 = poission_model_m2.fit()
result_m2.summary()

In [None]:
# Predict the held-out week with Model 2 and print both error measures
# against the balanced validation target.
data_balance_estimate_m2 = result_m2.predict(data_balance_valid_x_m1.astype(float))
MSPE(data_balance_estimate_m2, data_balance_valid_y)
MAPE(data_balance_estimate_m2, data_balance_valid_y)

In [None]:
# Although the balanced model fits the training set better, its MSPE and MAPE are still larger than the first model's.
# So for this submission, I still use m1 for prediction.
# Take each tract's average_wage from the last observed week (one row per distinct uu_id/wage pair).
data_lastWeek = data.loc[data["week_number"] == train_week, ["uu_id", "average_wage"]].drop_duplicates()

In [None]:
# Build the design matrix for the rows to predict, indexed by uu_id.
# Attach each tract's last observed average_wage; tracts without one get 0
# from dataFillNa, the same fill value used for the training data.
data_pred = data_pred_query.set_index("uu_id").join(data_lastWeek.set_index("uu_id"))
data_pred = dataIdentifyDWM(data_pred)
data_pred = dataFillNa(data_pred, 0)
# Same feature construction as the Model 1 training matrix: month and
# weekofmonth as categories, plus the squared week number.
data_pred = pd.get_dummies(
    data_pred.drop(columns="date")
    .astype({"month": str, "weekofmonth": str})
    .assign(week_number2=lambda frame: frame["week_number"] ** 2)
)
# Match the training columns exactly (names and order): dummies the prediction
# weeks do not produce are added as 0 and columns the model never saw are dropped.
data_pred = data_pred.reindex(columns=data_train_x_m1.columns, fill_value=0)

In [None]:
# Predict with Model 1 and collect the result as a two-column dataframe.
# uu_id is taken from the index of the prediction matrix, so the frame can be
# merged back onto the prediction list by that column.
output = result.predict(data_pred.astype(float))
output_df = pd.DataFrame({"uu_id": data_pred.index, "total_claims": np.asarray(output)})

In [None]:
# Attach the predictions to the prediction list and keep the submission columns.
# The merge starts from the untouched backup of the query result, so this cell
# can be re-run without tripping over a total_claims column added by an
# earlier run.
data_pred_query = data_pred_query_backup.merge(output_df, on="uu_id")
data_pred_query = data_pred_query[["uu_id", "total_claims", "week_number"]]

In [None]:
## This is also a good place to clean up any input/output and export your results to a file.
# Write the submission table to the working directory without the row index.
data_pred_query.to_csv(path_or_buf="submission_prediction_output.csv", index=False)