In [None]:
# Sum `total_claims` per `uu_id`, then inner-join the result with the
# `wage_data` table on `uu_id`.
claims_per_id = data.groupby(['uu_id'])['total_claims'].sum().reset_index()
visual_data = claims_per_id.merge(data_dict['wage_data'], how='inner', on=['uu_id'])

# Scatter of the summed claims (x) against `average_wage` (y).
sns.scatterplot(data=visual_data, x='total_claims', y='average_wage', marker='+')

In [None]:
# Runs the cell body below under the `%%capture` cell magic. The body restarts
# IPython logging into `ipython_command_log.py`, reads the most recent entry
# from the IPython history, and appends "session_id,command_id,UTC timestamp"
# as one line to `$HOME/ipython_session_log.csv`.
get_ipython().run_cell_magic('capture', '', "%logstop\n%logstart -t -r -q ipython_command_log.py global\n\n#- IRONHACKS RESEARCH TRACKING CODE\n#----------------------------------\n# The following code is used to help our research team understand how you \n# our notebook environment. We do not collect any personal information with\n# the following code, it is used to measure when and how often you work on\n# your submission files.\n\nimport os\nfrom datetime import datetime\nimport IPython.core.history as history\n\nha = history.HistoryAccessor()\nha_tail = ha.get_tail(1)\nha_cmd = next(ha_tail)\nsession_id = str(ha_cmd[0])\ncommand_id = str(ha_cmd[1])\ntimestamp = datetime.utcnow().isoformat()\nhistory_line = ','.join([session_id, command_id, timestamp]) + '\\n'\nlogfile = open(os.environ['HOME']+'/ipython_session_log.csv', 'a')\nlogfile.write(history_line)\nlogfile.close()\n")

In [None]:
# Installs the extra packages this notebook imports, with output captured by
# `%%capture`. The `%pip` magic is used instead of `!pip` so the install
# targets the environment of the running kernel rather than whichever `pip`
# executable happens to be first on the shell PATH.
get_ipython().run_cell_magic('capture', '', "\n#- INSTALL ADDITIONAL LIBRARIES IF REQUIRED\n#------------------------------------------\n# This is normally not required. The hub environment comes preinstaled with \n# many packages that you can already use without setup. In case there is some\n# other library you would like to use that isn't on the list you run this command\n# once to install them.  If it is already installed this command has no effect.\n\n%pip install db-dtypes pmdarima 'google-cloud-bigquery[pandas]' tqdm\n")

- IMPORT THE LIBRARIES YOU WILL USE
------------------------------------------
You only need to import packages one time per notebook session. To keep your
notebook clean and organized you can handle all imports at the top of your file.
The following are included for example purposes; feel free to modify or delete
anything in this section.

In [None]:
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import seaborn as sns
from google.cloud import bigquery

In [None]:
from pmdarima.arima import AutoARIMA

In [None]:
# Project id handed to the BigQuery client constructor below.
BIGQUERY_PROJECT = 'ironhacks-data'
# Client object that the query cells in this notebook call `.query(...)` on.
bigquery_client = bigquery.Client(project=BIGQUERY_PROJECT)

In [None]:
def print_missing(data: pd.DataFrame):
    """Print, for every column of ``data``, how many values are missing.

    One line is written to stdout per column, in the form
    ``Column <name> has <count> (<percent>%) missing value(s)``, where the
    percentage is rounded to two decimals.

    Args:
        data: DataFrame to inspect. It is not modified.

    Returns:
        None. The report is printed.
    """
    for cname, cvalues in data.items():
        # Count once and reuse it for both the absolute and relative figures.
        n_missing = cvalues.isna().sum()
        n_rows = len(cvalues)
        # A frame without rows has nothing missing; report 0.0 instead of
        # dividing by zero (which would print `nan` and emit a warning).
        pct_missing = round(100.0 * n_missing / n_rows, 2) if n_rows else 0.0
        print('Column {} has {} ({}%) missing value(s)'.format(
            cname,
            n_missing,
            pct_missing,
        ))

In [None]:
# Ask INFORMATION_SCHEMA which tables exist in the `ironhacks_competition`
# dataset and print the resulting catalog/schema/name listing.
tables_sql = """
    SELECT table_catalog, table_schema, table_name
    FROM `ironhacks_competition.INFORMATION_SCHEMA.TABLES`
"""
data_tables = bigquery_client.query(tables_sql).to_dataframe()
print(data_tables)

In [None]:
# Download every table listed in `data_tables` from
# ironhacks-data.ironhacks_competition, keyed by table name.
table_names = data_tables['table_name'].tolist()
data_dict = {
    name: bigquery_client.query(f"""
        SELECT * FROM `ironhacks-data.ironhacks_competition.{name}`
    """).to_dataframe()
    for name in table_names
}

In [None]:
# Build the working frame: unemployment rows left-joined with the wage table
# on every column name the two tables have in common.
unemployment_df = data_dict['unemployment_data']
wage_df = data_dict['wage_data']
common_cols = list(set(unemployment_df.columns) & set(wage_df.columns))
print(f'Common columns: {common_cols}')

data = unemployment_df.merge(wage_df, how='left', on=common_cols)
data = data.sort_values(['countyfips', 'week_number'])
data = data.drop_duplicates()
data = data.reset_index(drop=True)

# Parse `timeperiod` using the YYYYMMDD format.
data['timeperiod'] = pd.to_datetime(data['timeperiod'], format='%Y%m%d')
data

In [None]:
# Show the column labels of the merged frame, then the per-column
# missing-value report produced by `print_missing`.
print(data.columns)
print_missing(data)

In [None]:
# For each column family (prefix 'edu', 'gender', 'race'): treat missing
# values as zero, then store `total_claims` minus the family's row sum in a
# `<prefix>_missing` column.
for cname in ['edu', 'gender', 'race']:
    missing_col = f'{cname}_missing'
    # The derived column shares the family prefix, so leave it out explicitly.
    # Otherwise re-running this cell would fold the previous residual into the
    # row sum and overwrite the residual with a wrong value.
    cols = [c for c in data.columns if c.startswith(cname) and c != missing_col]
    data[cols] = data[cols].fillna(0)
    data[missing_col] = data['total_claims'] - data[cols].sum(axis=1)

In [None]:
# Turn the literal 'N/A' placeholder in the `top_category_employer*` columns
# into real missing values, then re-print the missing-value report.
cols = [c for c in data.columns if c.startswith('top_category_employer')]
# Use an explicit NaN as the replacement: on pandas versions before 1.4,
# `replace('N/A', None)` ignores the None and forward-fills from the previous
# row instead of marking the value as missing.
data[cols] = data[cols].replace('N/A', np.nan)
# data = pd.get_dummies(data, columns=cols, dummy_na=True)
print_missing(data)

In [None]:
# Per-`uu_id` sum of `total_claims`, inner-joined with the `wage_data` table
# on `uu_id`, so each point pairs summed claims with `average_wage`.
total_claims_by_id = data.groupby(['uu_id'])['total_claims'].sum().reset_index()
wage_table = data_dict['wage_data']
visual_data = total_claims_by_id.merge(wage_table, on=['uu_id'], how='inner')

sns.scatterplot(data=visual_data, x='total_claims', y='average_wage', marker='+')

In [None]:
# Line plot of the per-`timeperiod` sums of every column whose name starts
# with 'race' (one line per column, legend shown).
cols_race = [name for name in data.columns if name.startswith('race')]
race_totals = data.groupby(['timeperiod'])[cols_race].sum()
race_totals.plot(legend=True)

- IMPORT THE LIBRARIES YOU WILL USE
------------------------------------------
You only need to import packages one time per notebook session. To keep your
notebook clean and organized you can handle all imports at the top of your file.
The following are included for example purposes; feel free to modify or delete
anything in this section.

In [None]:
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
from google.cloud import bigquery

In [None]:
from pmdarima.arima import AutoARIMA

In [None]:
# Project id handed to the BigQuery client constructor below.
# NOTE(review): the same constant and client construction also appear earlier
# in this file; running this cell builds a new client object.
BIGQUERY_PROJECT = 'ironhacks-data'
bigquery_client = bigquery.Client(project=BIGQUERY_PROJECT)

In [None]:
# One line chart per column family (prefix 'edu', 'gender', 'race'): the
# per-`timeperiod` sums of every column whose name starts with that prefix.
for cname in ['edu', 'gender', 'race']:
    cols = [c for c in data.columns if c.startswith(cname)]
    # Create the Axes explicitly and hand it to pandas. DataFrame.plot() opens
    # its own figure when no `ax` is given, so a bare plt.figure() beforehand
    # would only leave an empty figure behind.
    fig, ax = plt.subplots()
    # Plot this iteration's `cols` (not a list left over from another cell) so
    # each chart shows its own family.
    data.groupby(['timeperiod'])[cols].sum().plot(ax=ax, legend=True)
    ax.set_title(f'{cname} totals by timeperiod')

In [None]:
# One line chart per column family (prefix 'edu', 'gender', 'race'): the
# per-`timeperiod` sums of every column whose name starts with that prefix.
for cname in ['edu', 'gender', 'race']:
    cols = [c for c in data.columns if c.startswith(cname)]
    # Create the Axes explicitly and hand it to pandas. DataFrame.plot() opens
    # its own figure when no `ax` is given, so a bare plt.figure() beforehand
    # would only leave an empty figure behind.
    fig, ax = plt.subplots()
    data.groupby(['timeperiod'])[cols].sum().plot(ax=ax, legend=True)
    ax.set_title(f'{cname} totals by timeperiod')

In [None]:
# Convenience lookup table between `week_number` and `timeperiod`: weeks 1..52
# paired with 52 consecutive Saturdays starting 2022-01-01, with the dates
# stored as strings.
saturdays = pd.date_range('2022-01-01', periods=52, freq='W-SAT')
wt = pd.DataFrame({'timeperiod': saturdays, 'week_number': range(1, 53)})
wt['timeperiod'] = wt['timeperiod'].astype(str)
wt.head()