In [1]:
get_ipython().run_cell_magic('capture', '', "%logstop\n%logstart -t -r -q ipython_command_log.py global\n\n#- IRONHACKS RESEARCH TRACKING CODE\n#----------------------------------\n# The following code is used to help our research team understand how you \n# our notebook environment. We do not collect any personal information with\n# the following code, it is used to measure when and how often you work on\n# your submission files.\n\nimport os\nfrom datetime import datetime\nimport IPython.core.history as history\n\nha = history.HistoryAccessor()\nha_tail = ha.get_tail(1)\nha_cmd = next(ha_tail)\nsession_id = str(ha_cmd[0])\ncommand_id = str(ha_cmd[1])\ntimestamp = datetime.utcnow().isoformat()\nhistory_line = ','.join([session_id, command_id, timestamp]) + '\\n'\nlogfile = open(os.environ['HOME']+'/ipython_session_log.csv', 'a')\nlogfile.write(history_line)\nlogfile.close()\n")

In [2]:
get_ipython().run_cell_magic('capture', '', "\n#- INSTALL ADDITIONAL LIBRARIES IF REQUIRED\n#------------------------------------------\n# This is normally not required. The hub environment comes preinstaled with \n# many packages that you can already use without setup. In case there is some\n# other library you would like to use that isn't on the list you run this command\n# once to install them.  If it is already installed this command has no effect.\n!pip install db-dtypes\n!python3 -m pip install pandas\n!pip install pmdarima\n")

- IMPORT THE LIBRARIES YOU WILL USE
------------------------------------------
You only need to import packages one time per notebook session. To keep your
notebook clean and organized you can handle all imports at the top of your file.
The following are included for example purposed, feel free to modify or delete 
anything in this section.

In [3]:
import csv
import pandas as pd
import numpy as np
from google.cloud import bigquery
from google.oauth2 import service_account
from google.cloud.bigquery import magics
import statsmodels.api as sm
import math
import plotly.express as px
from pmdarima.arima import auto_arima

In [4]:
get_ipython().run_cell_magic('capture', '', "\n#- INSTALL ADDITIONAL LIBRARIES IF REQUIRED\n#------------------------------------------\n# This is normally not required. The hub environment comes preinstaled with \n# many packages that you can already use without setup. In case there is some\n# other library you would like to use that isn't on the list you run this command\n# once to install them.  If it is already installed this command has no effect.\n!pip install db-dtypes\n!python3 -m pip install pandas\n!pip install pmdarima\n!pip install plotly==5.11.0\n")

- IMPORT THE LIBRARIES YOU WILL USE
------------------------------------------
You only need to import packages one time per notebook session. To keep your
notebook clean and organized you can handle all imports at the top of your file.
The following are included for example purposed, feel free to modify or delete 
anything in this section.

In [5]:
import csv
import pandas as pd
import numpy as np
from google.cloud import bigquery
from google.oauth2 import service_account
from google.cloud.bigquery import magics
import statsmodels.api as sm
import math
import plotly.express as px
from pmdarima.arima import auto_arima

- DEFINE YOUR CLASSES AND FUNCTIONS 
-----------------------------------
This is not required, but is helpful in keeping your notebook organized. 
You can use the following cell or several cells to define your functions
and classes to keep them separate from your analysis or results code.
In general it useful to define your methods in a separate cell from where
it is run.

In [6]:
def dataExplore(data):
    '''
    Explore dataframe
    '''
    print("# of observations: ", data.shape[0])
    for col in data.columns:
        if col in ["uu_id", "timeperiod", "week_number", "countyfips", "tract", "tract_name", "date"]:
            print("# of %s: %s" % (col, len(pd.unique(data[col]))))
        else:
            print("Unique value of %s: %s" % (col, pd.unique(data[col])))

In [7]:
def dataBalanceCheck(data):
    '''
    Check the balance of data frame
    '''
    unbalance_count = 0
    print("# of observations in complete time series: ", len(pd.unique(data["week_number"])))
    for id in pd.unique(data["uu_id"]):
        if len(data[data["uu_id"] == id]) <  len(pd.unique(data["week_number"])):
            print(id, len(data[data["uu_id"] == id]))
            unbalance_count += 1
    print("% of tracts with incomplete time series: ", unbalance_count / len(pd.unique(data["uu_id"]))*100)

In [8]:
def dataFillNa(data, value):
    """
   fill NA with given value in the dataframe
    """
    for col in data.columns:
        if col in ["uu_id", "timeperiod", "week_number", "countyfips", "tract", "tract_name", "date"]:
            pass
        elif col in ["top_category_employer1", "top_category_employer2", "top_category_employer3"]:
            data[col] = data[col].replace({'N/A':str(value)})
        else:
            data[col] = data[col].fillna(value)
    return(data)

In [9]:
def dataIdentifyDWM(data):
    '''
    Input: # of week. Output: data for the first day, its month and week order in the month
    '''
    data["date"] = pd.to_datetime(2022 * 1000 + (1+(data["week_number"]-1)*7), format='%Y%j')
    data["month"] = pd.DatetimeIndex(data["date"]).month
    data["weekofmonth"]= pd.to_numeric(data["date"].dt.day/7)
    data['weekofmonth'] = data['weekofmonth'].apply(lambda x: math.ceil(x))
    return(data)

In [10]:
def MSPE(s1, s2):
    return(sum((s1 - s2)**2)/len(s1))

In [11]:
def MAPE(s1, s2):
    return(sum(abs(s1 - s2))/len(s1))

In [12]:
def ARIMA_predict(df_input, cutoff_rate = 0.8, n_period = 15):
    cutoff = int(cutoff_rate * len(df_input))
    if cutoff_rate < 1:
        valid = df_input[cutoff:]
    train = df_input[:cutoff]
    model = auto_arima(train, trace=False, error_action='ignore', suppress_warnings=True)
    model.fit(train)
    forecast = model.predict(n_period)
    return(forecast)

In [13]:
# Obtain data using BigQuery
BIGQUERY_PROJECT = 'ironhacks-data'
bigquery_client = bigquery.Client(project=BIGQUERY_PROJECT)

In [14]:
query = """
SELECT
a.*,
b.average_wage
FROM 
(SELECT 
*
FROM `ironhacks-data.ironhacks_competition.unemployment_data`) a
JOIN `ironhacks-data.ironhacks_competition.wage_data` b 
ON a.uu_id=b.uu_id
"""

In [15]:
query_job = bigquery_client.query(query)
data = query_job.to_dataframe()

In [16]:
query_pred = """
SELECT * FROM `ironhacks-data.ironhacks_competition.prediction_list`
"""

In [17]:
query_job_pred = bigquery_client.query(query_pred)
data_pred_query= query_job_pred.to_dataframe()

In [18]:
# Explore input data for NA and special values
dataExplore(data)
# dataExplore(data_pred_query)
# data_pred_query.head()

# of observations:  16833
# of uu_id: 525
# of timeperiod: 35
# of week_number: 35
# of countyfips: 60
# of tract: 425
# of tract_name: 525
Unique value of total_claims: <IntegerArray>
[ 22, 111,  39,  14, 155,  24,  19,  20,  12,  50,
 ...
 135,  95, 105, 117, 118,  93, 137, 146, 120, 110]
Length: 120, dtype: Int64
Unique value of edu_8th_or_less: <IntegerArray>
[0, <NA>, 18, 14, 10, 19, 24, 17, 28, 16, 41, 12, 33, 21, 13, 26, 11, 37]
Length: 18, dtype: Int64
Unique value of edu_grades_9_11: <IntegerArray>
[<NA>,    0,   19,   11,   12,   10,   13,   35,   18,   14,   21,   25,   16,
   17,   37,   42,   20,   15,   26,   22,   29,   47,   33,   28,   40,   27,
   43,   24,   39,   23,   44,   36,   30,   31,   32,   41,   51,   38,   57,
   78]
Length: 40, dtype: Int64
Unique value of edu_hs_grad_equiv: <IntegerArray>
[  14,  108, <NA>,   69,   15,   10,   13,   21,   50,   44,   30,    0,   11,
   17,   18,   45,   62,   37,   12,   42,   31,   16,   41,   20,   24,   27,
   23,   6

In [19]:
# Explore input data for NA and special values
# dataExplore(data)
dataExplore(data_pred_query)
# data_pred_query.head()

# of observations:  525
# of uu_id: 525
# of week_number: 1


In [20]:
# Explore input data for NA and special values
# dataExplore(data)
# dataExplore(data_pred_query)
data_pred_query.head()

Unnamed: 0,uu_id,week_number
0,5bf51fc2e162d6faf9e3cf79e4198378,44
1,420b44cc7e3f55d738df565421e59941,44
2,e39c66ecceec76ee8f9f811fa4a2d246,44
3,a90462cd11ae4e43144239bf7c4828a4,44
4,8b20a6749088c7ff1237983076ebfeaa,44


In [21]:
# Explore input data for NA and special values
dataExplore(data)
# dataExplore(data_pred_query)
# data_pred_query.head()

# of observations:  16833
# of uu_id: 525
# of timeperiod: 35
# of week_number: 35
# of countyfips: 60
# of tract: 425
# of tract_name: 525
Unique value of total_claims: <IntegerArray>
[ 22, 111,  39,  14, 155,  24,  19,  20,  12,  50,
 ...
 135,  95, 105, 117, 118,  93, 137, 146, 120, 110]
Length: 120, dtype: Int64
Unique value of edu_8th_or_less: <IntegerArray>
[0, <NA>, 18, 14, 10, 19, 24, 17, 28, 16, 41, 12, 33, 21, 13, 26, 11, 37]
Length: 18, dtype: Int64
Unique value of edu_grades_9_11: <IntegerArray>
[<NA>,    0,   19,   11,   12,   10,   13,   35,   18,   14,   21,   25,   16,
   17,   37,   42,   20,   15,   26,   22,   29,   47,   33,   28,   40,   27,
   43,   24,   39,   23,   44,   36,   30,   31,   32,   41,   51,   38,   57,
   78]
Length: 40, dtype: Int64
Unique value of edu_hs_grad_equiv: <IntegerArray>
[  14,  108, <NA>,   69,   15,   10,   13,   21,   50,   44,   30,    0,   11,
   17,   18,   45,   62,   37,   12,   42,   31,   16,   41,   20,   24,   27,
   23,   6

In [22]:
# Explore input data for NA and special values
# dataExplore(data)
# dataExplore(data_pred_query)
# data_pred_query.head()
data.head()

Unnamed: 0,uu_id,timeperiod,week_number,countyfips,tract,tract_name,total_claims,edu_8th_or_less,edu_grades_9_11,edu_hs_grad_equiv,...,gender_male,gender_na,race_amerindian,race_asian,race_black,race_noanswer,race_hawaiiannative,race_other,race_white,average_wage
0,f013068de98db1470bd986137a0c6d23,20220416,16,18003,900,"Census Tract 9, Allen County, Indiana",22,0,,14.0,...,,0,0,0.0,11.0,0.0,0.0,,,8347.125
1,21957d5517323845818d87623589e1ba,20220319,12,18089,10400,"Census Tract 104, Lake County, Indiana",111,0,,108.0,...,35.0,0,0,0.0,,,0.0,0.0,0.0,7036.636364
2,6a5609f385912113b6f1014b958ed748,20220326,13,18089,11500,"Census Tract 115, Lake County, Indiana",39,0,,,...,,0,0,0.0,39.0,0.0,0.0,0.0,0.0,7890.142857
3,46b2882ec4c373527ec33f7bd4f1388d,20220716,29,18089,20700,"Census Tract 207, Lake County, Indiana",14,0,,,...,,0,0,,10.0,0.0,0.0,,,7534.375
4,37495d17e82f7df326bfc2c4c090f7b7,20220409,15,18089,21900,"Census Tract 219, Lake County, Indiana",155,0,,69.0,...,65.0,0,0,0.0,135.0,,,,,11825.125


In [23]:
# Explore input data for NA and special values
dataExplore(data)
# dataExplore(data_pred_query)
# data_pred_query.head()
data.head()

# of observations:  16833
# of uu_id: 525
# of timeperiod: 35
# of week_number: 35
# of countyfips: 60
# of tract: 425
# of tract_name: 525
Unique value of total_claims: <IntegerArray>
[ 22, 111,  39,  14, 155,  24,  19,  20,  12,  50,
 ...
 135,  95, 105, 117, 118,  93, 137, 146, 120, 110]
Length: 120, dtype: Int64
Unique value of edu_8th_or_less: <IntegerArray>
[0, <NA>, 18, 14, 10, 19, 24, 17, 28, 16, 41, 12, 33, 21, 13, 26, 11, 37]
Length: 18, dtype: Int64
Unique value of edu_grades_9_11: <IntegerArray>
[<NA>,    0,   19,   11,   12,   10,   13,   35,   18,   14,   21,   25,   16,
   17,   37,   42,   20,   15,   26,   22,   29,   47,   33,   28,   40,   27,
   43,   24,   39,   23,   44,   36,   30,   31,   32,   41,   51,   38,   57,
   78]
Length: 40, dtype: Int64
Unique value of edu_hs_grad_equiv: <IntegerArray>
[  14,  108, <NA>,   69,   15,   10,   13,   21,   50,   44,   30,    0,   11,
   17,   18,   45,   62,   37,   12,   42,   31,   16,   41,   20,   24,   27,
   23,   6

Unnamed: 0,uu_id,timeperiod,week_number,countyfips,tract,tract_name,total_claims,edu_8th_or_less,edu_grades_9_11,edu_hs_grad_equiv,...,gender_male,gender_na,race_amerindian,race_asian,race_black,race_noanswer,race_hawaiiannative,race_other,race_white,average_wage
0,f013068de98db1470bd986137a0c6d23,20220416,16,18003,900,"Census Tract 9, Allen County, Indiana",22,0,,14.0,...,,0,0,0.0,11.0,0.0,0.0,,,8347.125
1,21957d5517323845818d87623589e1ba,20220319,12,18089,10400,"Census Tract 104, Lake County, Indiana",111,0,,108.0,...,35.0,0,0,0.0,,,0.0,0.0,0.0,7036.636364
2,6a5609f385912113b6f1014b958ed748,20220326,13,18089,11500,"Census Tract 115, Lake County, Indiana",39,0,,,...,,0,0,0.0,39.0,0.0,0.0,0.0,0.0,7890.142857
3,46b2882ec4c373527ec33f7bd4f1388d,20220716,29,18089,20700,"Census Tract 207, Lake County, Indiana",14,0,,,...,,0,0,,10.0,0.0,0.0,,,7534.375
4,37495d17e82f7df326bfc2c4c090f7b7,20220409,15,18089,21900,"Census Tract 219, Lake County, Indiana",155,0,,69.0,...,65.0,0,0,0.0,135.0,,,,,11825.125


In [24]:
# Explore input data for NA and special values
# dataExplore(data)
# dataExplore(data_pred_query)
# data_pred_query.head()
data.head()

Unnamed: 0,uu_id,timeperiod,week_number,countyfips,tract,tract_name,total_claims,edu_8th_or_less,edu_grades_9_11,edu_hs_grad_equiv,...,gender_male,gender_na,race_amerindian,race_asian,race_black,race_noanswer,race_hawaiiannative,race_other,race_white,average_wage
0,f013068de98db1470bd986137a0c6d23,20220416,16,18003,900,"Census Tract 9, Allen County, Indiana",22,0,,14.0,...,,0,0,0.0,11.0,0.0,0.0,,,8347.125
1,21957d5517323845818d87623589e1ba,20220319,12,18089,10400,"Census Tract 104, Lake County, Indiana",111,0,,108.0,...,35.0,0,0,0.0,,,0.0,0.0,0.0,7036.636364
2,6a5609f385912113b6f1014b958ed748,20220326,13,18089,11500,"Census Tract 115, Lake County, Indiana",39,0,,,...,,0,0,0.0,39.0,0.0,0.0,0.0,0.0,7890.142857
3,46b2882ec4c373527ec33f7bd4f1388d,20220716,29,18089,20700,"Census Tract 207, Lake County, Indiana",14,0,,,...,,0,0,,10.0,0.0,0.0,,,7534.375
4,37495d17e82f7df326bfc2c4c090f7b7,20220409,15,18089,21900,"Census Tract 219, Lake County, Indiana",155,0,,69.0,...,65.0,0,0,0.0,135.0,,,,,11825.125


In [25]:
get_ipython().run_cell_magic('capture', '', "\n#- INSTALL ADDITIONAL LIBRARIES IF REQUIRED\n#------------------------------------------\n# This is normally not required. The hub environment comes preinstaled with \n# many packages that you can already use without setup. In case there is some\n# other library you would like to use that isn't on the list you run this command\n# once to install them.  If it is already installed this command has no effect.\n!pip install db-dtypes\n!python3 -m pip install pandas\n!pip install pmdarima\n!pip install plotly==5.11.0\n!pip install hts_reconciliation\n")

- IMPORT THE LIBRARIES YOU WILL USE
------------------------------------------
You only need to import packages one time per notebook session. To keep your
notebook clean and organized you can handle all imports at the top of your file.
The following are included for example purposed, feel free to modify or delete 
anything in this section.

In [26]:
import csv
import pandas as pd
import numpy as np
from google.cloud import bigquery
from google.oauth2 import service_account
from google.cloud.bigquery import magics
import statsmodels.api as sm
import math
import plotly.express as px
from pmdarima.arima import auto_arima
import reconciliation_hts

ModuleNotFoundError: No module named 'reconciliation_hts'

In [27]:
%%capture

UsageError: %%capture is a cell magic, but the cell body is empty.


In [28]:
#- INSTALL ADDITIONAL LIBRARIES IF REQUIRED
#------------------------------------------
# This is normally not required. The hub environment comes preinstaled with 
# many packages that you can already use without setup. In case there is some
# other library you would like to use that isn't on the list you run this command
# once to install them.  If it is already installed this command has no effect.
get_ipython().system('pip install db-dtypes')
get_ipython().system('python3 -m pip install pandas')
get_ipython().system('pip install pmdarima')
get_ipython().system('pip install plotly==5.11.0')
get_ipython().system('pip install hts_reconciliation')





[33mDEPRECATION: Configuring installation scheme with distutils config files is deprecated and will no longer work in the near future. If you are using a Homebrew or Linuxbrew Python, please see discussion at https://github.com/Homebrew/homebrew-core/issues/76621[0m[33m
[0m



[33mDEPRECATION: Configuring installation scheme with distutils config files is deprecated and will no longer work in the near future. If you are using a Homebrew or Linuxbrew Python, please see discussion at https://github.com/Homebrew/homebrew-core/issues/76621[0m[33m
[0m

You should consider upgrading via the '/opt/homebrew/opt/python@3.9/bin/python3.9 -m pip install --upgrade pip' command.[0m[33m
[0m













[31mERROR: Could not find a version that satisfies the requirement hts_reconciliation (from versions: none)[0m
[31mERROR: No matching distribution found for hts_reconciliation[0m


In [29]:
%%capture

UsageError: %%capture is a cell magic, but the cell body is empty.


In [30]:
#- INSTALL ADDITIONAL LIBRARIES IF REQUIRED
#------------------------------------------
# This is normally not required. The hub environment comes preinstaled with 
# many packages that you can already use without setup. In case there is some
# other library you would like to use that isn't on the list you run this command
# once to install them.  If it is already installed this command has no effect.
get_ipython().system('pip install db-dtypes')
get_ipython().system('python3 -m pip install pandas')
get_ipython().system('pip install pmdarima')
get_ipython().system('pip install plotly==5.11.0')
get_ipython().system('pip install scikit-hts[auto-arima]')





[33mDEPRECATION: Configuring installation scheme with distutils config files is deprecated and will no longer work in the near future. If you are using a Homebrew or Linuxbrew Python, please see discussion at https://github.com/Homebrew/homebrew-core/issues/76621[0m[33m
[0m



[33mDEPRECATION: Configuring installation scheme with distutils config files is deprecated and will no longer work in the near future. If you are using a Homebrew or Linuxbrew Python, please see discussion at https://github.com/Homebrew/homebrew-core/issues/76621[0m[33m
You should consider upgrading via the '/opt/homebrew/opt/python@3.9/bin/python3.9 -m pip install --upgrade pip' command.[0m[33m
[0m











zsh:1: no matches found: scikit-hts[auto-arima]


- IMPORT THE LIBRARIES YOU WILL USE
------------------------------------------
You only need to import packages one time per notebook session. To keep your
notebook clean and organized you can handle all imports at the top of your file.
The following are included for example purposed, feel free to modify or delete 
anything in this section.

In [31]:
import csv
import pandas as pd
import numpy as np
from google.cloud import bigquery
from google.oauth2 import service_account
from google.cloud.bigquery import magics
import statsmodels.api as sm
import math
import plotly.express as px
from pmdarima.arima import auto_arima
import hts
from hts.hierarchy import HierarchyTree
from hts.model import AutoArimaModel
from hts import HTSRegressor

ModuleNotFoundError: No module named 'hts'

- IMPORT THE LIBRARIES YOU WILL USE
------------------------------------------
You only need to import packages one time per notebook session. To keep your
notebook clean and organized you can handle all imports at the top of your file.
The following are included for example purposed, feel free to modify or delete 
anything in this section.

In [32]:
import csv
import pandas as pd
import numpy as np
from google.cloud import bigquery
from google.oauth2 import service_account
from google.cloud.bigquery import magics
import statsmodels.api as sm
import math
import plotly.express as px
from pmdarima.arima import auto_arima
import hts

ModuleNotFoundError: No module named 'hts'

- IMPORT THE LIBRARIES YOU WILL USE
------------------------------------------
You only need to import packages one time per notebook session. To keep your
notebook clean and organized you can handle all imports at the top of your file.
The following are included for example purposed, feel free to modify or delete 
anything in this section.

In [33]:
import csv
import pandas as pd
import numpy as np
from google.cloud import bigquery
from google.oauth2 import service_account
from google.cloud.bigquery import magics
import statsmodels.api as sm
import math
import plotly.express as px
from pmdarima.arima import auto_arima

- IMPORT THE LIBRARIES YOU WILL USE
------------------------------------------
You only need to import packages one time per notebook session. To keep your
notebook clean and organized you can handle all imports at the top of your file.
The following are included for example purposed, feel free to modify or delete 
anything in this section.

In [34]:
import csv
import pandas as pd
import numpy as np
from google.cloud import bigquery
from google.oauth2 import service_account
from google.cloud.bigquery import magics
import statsmodels.api as sm
import math
import plotly.express as px
from pmdarima.arima import auto_arima
from hts.hierarchy import HierarchyTree
from hts.model import AutoArimaModel
from hts import HTSRegressor

ModuleNotFoundError: No module named 'hts'

- IMPORT THE LIBRARIES YOU WILL USE
------------------------------------------
You only need to import packages one time per notebook session. To keep your
notebook clean and organized you can handle all imports at the top of your file.
The following are included for example purposed, feel free to modify or delete 
anything in this section.

In [35]:
import csv
import pandas as pd
import numpy as np
from google.cloud import bigquery
from google.oauth2 import service_account
from google.cloud.bigquery import magics
import statsmodels.api as sm
import math
import plotly.express as px
from pmdarima.arima import auto_arima
from collections.abc import Iterable
import hts
from hts.hierarchy import HierarchyTree
from hts.model import AutoArimaModel
from hts import HTSRegressor

ModuleNotFoundError: No module named 'hts'

- IMPORT THE LIBRARIES YOU WILL USE
------------------------------------------
You only need to import packages one time per notebook session. To keep your
notebook clean and organized you can handle all imports at the top of your file.
The following are included for example purposed, feel free to modify or delete 
anything in this section.

In [36]:
import csv
import pandas as pd
import numpy as np
from google.cloud import bigquery
from google.oauth2 import service_account
from google.cloud.bigquery import magics
import statsmodels.api as sm
import math
import plotly.express as px
from pmdarima.arima import auto_arima
from collections.abc import Iterable

- IMPORT THE LIBRARIES YOU WILL USE
------------------------------------------
You only need to import packages one time per notebook session. To keep your
notebook clean and organized you can handle all imports at the top of your file.
The following are included for example purposed, feel free to modify or delete 
anything in this section.

In [37]:
import csv
import pandas as pd
import numpy as np
from google.cloud import bigquery
from google.oauth2 import service_account
from google.cloud.bigquery import magics
import statsmodels.api as sm
import math
import plotly.express as px
from pmdarima.arima import auto_arima
from collections.abc import Iterable
import hts

ModuleNotFoundError: No module named 'hts'