In [1]:
# Research-tracking boilerplate, executed under the `capture` cell magic.
# The cell body issues %logstop / %logstart (logging to ipython_command_log.py),
# then reads the most recent entry from IPython's history and appends one CSV
# line -- the entry's first two fields (named session_id and command_id in the
# cell) plus the current UTC timestamp -- to $HOME/ipython_session_log.csv.
get_ipython().run_cell_magic('capture', '', "%logstop\n%logstart -t -r -q ipython_command_log.py global\n\n#- IRONHACKS RESEARCH TRACKING CODE\n#----------------------------------\n# The following code is used to help our research team understand how you \n# our notebook environment. We do not collect any personal information with\n# the following code, it is used to measure when and how often you work on\n# your submission files.\n\nimport os\nfrom datetime import datetime\nimport IPython.core.history as history\n\nha = history.HistoryAccessor()\nha_tail = ha.get_tail(1)\nha_cmd = next(ha_tail)\nsession_id = str(ha_cmd[0])\ncommand_id = str(ha_cmd[1])\ntimestamp = datetime.utcnow().isoformat()\nhistory_line = ','.join([session_id, command_id, timestamp]) + '\\n'\nlogfile = open(os.environ['HOME']+'/ipython_session_log.csv', 'a')\nlogfile.write(history_line)\nlogfile.close()\n")

In [2]:
# Install the extra packages this notebook needs (output hidden by `capture`).
# Uses the %pip magic rather than `!pip`: %pip installs into the environment of
# the running kernel, whereas `!pip` runs whichever `pip` is first on the shell
# PATH and can silently install into a different Python.
get_ipython().run_cell_magic('capture', '', "\n#- INSTALL ADDITIONAL LIBRARIES IF REQUIRED\n#------------------------------------------\n# This is normally not required. The hub environment comes preinstaled with \n# many packages that you can already use without setup. In case there is some\n# other library you would like to use that isn't on the list you run this command\n# once to install them.  If it is already installed this command has no effect.\n\n%pip install db-dtypes pmdarima 'google-cloud-bigquery[pandas]' tqdm\n")

- IMPORT THE LIBRARIES YOU WILL USE
------------------------------------------
You only need to import packages one time per notebook session. To keep your
notebook clean and organized you can handle all imports at the top of your file.
The following are included for example purposes; feel free to modify or delete
anything in this section.

In [3]:
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import seaborn
from google.cloud import bigquery

In [4]:
from pmdarima.arima import AutoARIMA

In [5]:
# Google Cloud project that the BigQuery client is bound to.
BIGQUERY_PROJECT = 'ironhacks-data'

# Single shared client; every query in this notebook goes through it.
bigquery_client = bigquery.Client(
    project=BIGQUERY_PROJECT,
)

In [6]:
# List the tables exposed by the `ironhacks_competition` dataset.
data_tables = (
    bigquery_client
    .query("""
    SELECT table_catalog, table_schema, table_name
    FROM `ironhacks_competition.INFORMATION_SCHEMA.TABLES`
""")
    .to_dataframe()
)
print(data_tables)

    table_catalog           table_schema         table_name
0  ironhacks-data  ironhacks_competition          wage_data
1  ironhacks-data  ironhacks_competition    prediction_list
2  ironhacks-data  ironhacks_competition  unemployment_data


In [7]:
# Download every table listed in `data_tables` from
# ironhacks-data.ironhacks_competition into a {table_name: DataFrame} mapping.
data_dict = dict(
    (
        name,
        bigquery_client.query(f"""
        SELECT * FROM `ironhacks-data.ironhacks_competition.{name}`
    """).to_dataframe(),
    )
    for name in data_tables['table_name'].tolist()
)

In [8]:
# Join keys = every column present in both tables.
# sorted() makes the key order (and therefore the printed line below) stable
# across kernel restarts; iterating a set of strings follows hash order, which
# Python randomizes per process.
common_cols = sorted(
    set(data_dict['unemployment_data'].columns) & set(data_dict['wage_data'].columns)
)
print(f'Common columns: {common_cols}')

# Attach the wage columns to each unemployment row (left join keeps every
# unemployment row), order rows by county and week, drop fully identical rows
# and renumber the index.
data = (
    data_dict['unemployment_data']
    .merge(data_dict['wage_data'], on=common_cols, how='left')
    .sort_values(['countyfips', 'week_number'])
    .drop_duplicates()
    .reset_index(drop=True)
)

# `timeperiod` is parsed with the YYYYMMDD format into real timestamps.
data['timeperiod'] = pd.to_datetime(data['timeperiod'], format='%Y%m%d')
data

Common columns: ['tract', 'countyfips', 'uu_id', 'tract_name']


Unnamed: 0,uu_id,timeperiod,week_number,countyfips,tract,tract_name,total_claims,edu_8th_or_less,edu_grades_9_11,edu_hs_grad_equiv,...,gender_male,gender_na,race_amerindian,race_asian,race_black,race_noanswer,race_hawaiiannative,race_other,race_white,average_wage
0,983badfd7b568728e39a2344a9006078,2022-01-01,1,18001,30200,"Census Tract 302, Adams County, Indiana",16,,,,...,,0,0,0,0,0,0,,,11816.666667
1,983badfd7b568728e39a2344a9006078,2022-01-08,2,18001,30200,"Census Tract 302, Adams County, Indiana",23,,,16,...,12,0,0,,,0,0,,18,11816.666667
2,983badfd7b568728e39a2344a9006078,2022-01-15,3,18001,30200,"Census Tract 302, Adams County, Indiana",18,,,12,...,,0,0,,,,0,,,11816.666667
3,983badfd7b568728e39a2344a9006078,2022-01-29,5,18001,30200,"Census Tract 302, Adams County, Indiana",20,,,,...,,0,0,,0,,0,,16,11816.666667
4,983badfd7b568728e39a2344a9006078,2022-02-05,6,18001,30200,"Census Tract 302, Adams County, Indiana",16,0,,10,...,,0,0,,0,,0,0,,11816.666667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13972,fb55464f8e34af6d750d06968bf719b8,2022-08-06,32,18183,50400,"Census Tract 504, Whitley County, Indiana",14,0,,,...,,0,0,0,0,0,0,,,13613.500000
13973,fb55464f8e34af6d750d06968bf719b8,2022-08-13,33,18183,50400,"Census Tract 504, Whitley County, Indiana",21,0,,16,...,,0,0,0,0,,0,,,13613.500000
13974,fb55464f8e34af6d750d06968bf719b8,2022-08-20,34,18183,50400,"Census Tract 504, Whitley County, Indiana",11,0,,,...,,0,0,0,0,0,0,,,13613.500000
13975,0f3d45341a5b113b813ffb7be7f58bab,2022-09-10,37,18183,50300,"Census Tract 503, Whitley County, Indiana",31,0,,26,...,,0,0,0,,0,0,0,,13992.250000


In [9]:
# Convenience lookup table mapping each `week_number` (1..52) to its
# `timeperiod`: consecutive Saturdays starting 2022-01-01, stored as strings.
wt = pd.DataFrame({
    'timeperiod': pd.date_range('2022-01-01', periods=52, freq='W-SAT'),
    'week_number': range(1, 53),
}).astype({'timeperiod': str})
wt

Unnamed: 0,timeperiod,week_number
0,2022-01-01,1
1,2022-01-08,2
2,2022-01-15,3
3,2022-01-22,4
4,2022-01-29,5
5,2022-02-05,6
6,2022-02-12,7
7,2022-02-19,8
8,2022-02-26,9
9,2022-03-05,10


In [10]:
# Per-column missing-value report: absolute count and percentage of rows.
# It shows that many of the exogenous features are largely missing.
for col_name, col in data.items():
    n_missing = col.isna().sum()
    pct_missing = round(100.0 * n_missing / len(col), 2)
    print(f'Column {col_name} has {n_missing} ({pct_missing}%) missing value(s)')

Column uu_id has 0 (0.0%) missing value(s)
Column timeperiod has 0 (0.0%) missing value(s)
Column week_number has 0 (0.0%) missing value(s)
Column countyfips has 0 (0.0%) missing value(s)
Column tract has 0 (0.0%) missing value(s)
Column tract_name has 0 (0.0%) missing value(s)
Column total_claims has 0 (0.0%) missing value(s)
Column edu_8th_or_less has 2532 (18.12%) missing value(s)
Column edu_grades_9_11 has 9084 (64.99%) missing value(s)
Column edu_hs_grad_equiv has 8895 (63.64%) missing value(s)
Column edu_post_hs has 11305 (80.88%) missing value(s)
Column edu_unknown has 3922 (28.06%) missing value(s)
Column top_category_employer1 has 0 (0.0%) missing value(s)
Column top_category_employer2 has 0 (0.0%) missing value(s)
Column top_category_employer3 has 0 (0.0%) missing value(s)
Column gender_female has 10800 (77.27%) missing value(s)
Column gender_male has 10703 (76.58%) missing value(s)
Column gender_na has 784 (5.61%) missing value(s)
Column race_amerindian has 1275 (9.12%) miss

In [11]:
# --- Forecast configuration -------------------------------------------------
train_weeks = 37   # last week_number included in the training window
target_week = 39   # week_number whose total_claims is predicted
# Number of steps to forecast past the training window (loop-invariant, so it
# is computed once here rather than on every iteration).
n_periods = target_week - train_weeks

# Timestamps of weeks 1..train_weeks (label-based slice, inclusive), taken from
# the week_number <-> timeperiod lookup table `wt`.
train_timeperiods = pd.to_datetime(wt.set_index('week_number')['timeperiod'].loc[:train_weeks])

# --- One non-seasonal AutoARIMA fit per uu_id -------------------------------
pred_results = []
# groupby splits the frame in a single pass instead of re-filtering `data`
# with a boolean mask for every uu_id; sort=False keeps groups in order of
# first appearance, i.e. the same order as data['uu_id'].unique().
for i, tract_rows in tqdm(data.groupby('uu_id', sort=False)):
    # Align the claims to the full training calendar. Weeks with no row for
    # this uu_id are filled with 0 claims.
    # NOTE(review): zero-filling treats unreported weeks as "no claims" --
    # confirm that this is the intended handling of gaps.
    y = (
        tract_rows
        .set_index('timeperiod')['total_claims']
        .reindex(train_timeperiods)
        .fillna(0)
    )
    forecast = AutoARIMA(seasonal=False).fit_predict(y, n_periods=n_periods)
    # np.asarray accepts both a pandas Series and a plain ndarray, so taking
    # the last forecast step does not depend on which one pmdarima returns.
    pred = float(np.asarray(forecast)[-1])
    # Claims cannot be negative: clip the forecast at zero.
    pred_results.append([i, max(0.0, pred)])

  0%|          | 0/525 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
# Getting the output CSV for submission ready
predictions = pd.DataFrame(pred_results, columns=['uu_id', 'total_claims'])

# Guard against a partial forecasting run (e.g. the loop was interrupted):
# with a left merge every uu_id lacking a forecast would be written with an
# empty total_claims, silently producing an incomplete submission file.
missing_ids = set(data_dict['prediction_list']['uu_id']) - set(predictions['uu_id'])
if missing_ids:
    raise ValueError(
        f'{len(missing_ids)} uu_id(s) in prediction_list have no forecast in '
        'pred_results; re-run the forecasting cell to completion before '
        'writing the submission file.'
    )

# Attach the forecasts to the requested prediction rows and write the CSV.
data_dict['prediction_list'].merge(
    predictions,
    on=['uu_id'],
    how='left',
).to_csv('submission_prediction_output.csv', index=False)