In [1]:
# IronHacks tracking cell in exported form: the body of a `%%capture` cell is
# passed as a string to run_cell_magic. The embedded code restarts IPython
# command logging to ipython_command_log.py, reads the most recent history
# entry, and appends one "session_id,command_id,UTC timestamp" row to
# $HOME/ipython_session_log.csv.
get_ipython().run_cell_magic('capture', '', "%logstop\n%logstart -t -r -q ipython_command_log.py global\n\n#- IRONHACKS RESEARCH TRACKING CODE\n#----------------------------------\n# The following code is used to help our research team understand how you \n# our notebook environment. We do not collect any personal information with\n# the following code, it is used to measure when and how often you work on\n# your submission files.\n\nimport os\nfrom datetime import datetime\nimport IPython.core.history as history\n\nha = history.HistoryAccessor()\nha_tail = ha.get_tail(1)\nha_cmd = next(ha_tail)\nsession_id = str(ha_cmd[0])\ncommand_id = str(ha_cmd[1])\ntimestamp = datetime.utcnow().isoformat()\nhistory_line = ','.join([session_id, command_id, timestamp]) + '\\n'\nlogfile = open(os.environ['HOME']+'/ipython_session_log.csv', 'a')\nlogfile.write(history_line)\nlogfile.close()\n")

- IMPORT THE LIBRARIES YOU WILL USE
------------------------------------------
You only need to import packages one time per notebook session. To keep your
notebook clean and organized you can handle all imports at the top of your file.
The following are included for example purposes; feel free to modify or delete 
anything in this section.

In [2]:
import csv
import pandas as pd
import numpy as np
from google.cloud import bigquery
from google.oauth2 import service_account
from google.cloud.bigquery import magics
import statsmodels.api as sm

- DEFINE YOUR CLASSES AND FUNCTIONS 
-----------------------------------
This is not required, but is helpful in keeping your notebook organized. 
You can use the following cell or several cells to define your functions
and classes to keep them separate from your analysis or results code.
In general it is useful to define your methods in a separate cell from where
they are run.

In [3]:
# explore dataframe
def dataExplore(data):
    """Print a column-by-column overview of ``data``.

    The row count is printed first. For the columns listed in
    ``count_only_columns`` only the number of distinct values is printed;
    for every other column the distinct values themselves are printed.
    Nothing is returned.
    """
    count_only_columns = (
        "uu_id", "timeperiod", "week_number", "countyfips",
        "tract", "tract_name", "date",
    )
    print("# of observations: ", data.shape[0])
    for column in data.columns:
        distinct_values = pd.unique(data[column])
        if column not in count_only_columns:
            # Low-cardinality / value columns: show the values themselves.
            print(f"Unique value of {column}: {distinct_values}")
            continue
        print(f"# of {column}: {len(distinct_values)}")

In [4]:
# check balance of data
def dataBalanceCheck(data):
    """Report which ``uu_id`` series have fewer rows than there are distinct weeks.

    Prints the number of distinct ``week_number`` values, then one line
    (the ``uu_id`` and its row count) for every ``uu_id`` that has fewer rows
    than that, and finally the percentage of ``uu_id`` values flagged this way.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``uu_id`` and ``week_number``.

    Returns
    -------
    None
        Results are printed only.
    """
    # Loop-invariant values: computed once instead of once per uu_id.
    n_weeks = len(pd.unique(data["week_number"]))
    tract_ids = pd.unique(data["uu_id"])
    # One pass over the column instead of filtering the whole frame per uu_id.
    rows_per_id = data["uu_id"].value_counts().to_dict()

    print("# of observations in complete time series: ", n_weeks)
    unbalance_count = 0
    for tract_id in tract_ids:
        # Missing ids are not counted by value_counts; defaulting to 0 matches
        # the row count an equality filter on a missing id produces.
        n_rows = rows_per_id.get(tract_id, 0)
        if n_rows < n_weeks:
            print(tract_id, n_rows)
            unbalance_count += 1

    # Avoid ZeroDivisionError when the frame contains no uu_id values at all.
    if len(tract_ids) == 0:
        incomplete_pct = 0.0
    else:
        incomplete_pct = unbalance_count / len(tract_ids) * 100
    print("% of tracts with incomplete time series: ", incomplete_pct)

In [5]:
# fill NA with given value
def dataFillNa(data, value):
    """Fill placeholder / missing entries of ``data`` in place and return it.

    Columns listed in ``untouched_columns`` are skipped. In the three
    ``top_category_employer*`` columns the literal string 'N/A' is replaced
    by ``str(value)``. In all remaining columns missing values are filled
    with ``value``. The same DataFrame object that was passed in is returned.
    """
    untouched_columns = (
        "uu_id", "timeperiod", "week_number", "countyfips",
        "tract", "tract_name", "date",
    )
    employer_columns = (
        "top_category_employer1",
        "top_category_employer2",
        "top_category_employer3",
    )
    for name in data.columns:
        if name in untouched_columns:
            continue
        column = data[name]
        if name in employer_columns:
            # These columns mark "missing" with the text 'N/A', not with NaN.
            data[name] = column.replace({'N/A': str(value)})
        else:
            data[name] = column.fillna(value)
    return data
def dataIdentifyDateMonth(data, year=2022):
    """Add ``date`` and ``month`` columns derived from ``week_number``.

    Week ``w`` is mapped to day-of-year ``1 + (w - 1) * 7`` of ``year``, so
    week 1 is January 1st and each later week is seven days after the
    previous one. The frame is modified in place and also returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a numeric ``week_number`` column.
    year : int, optional
        Calendar year the week numbers belong to. Defaults to 2022.

    Returns
    -------
    pandas.DataFrame
        The same object as ``data``, with ``date`` and ``month`` added
        (or overwritten if they already exist).
    """
    day_of_year = 1 + (data["week_number"] - 1) * 7
    # Build the integer YYYYDDD so it can be parsed with the %Y%j format
    # (four-digit year followed by zero-padded day of year).
    data["date"] = pd.to_datetime(year * 1000 + day_of_year, format='%Y%j')
    data["month"] = data["date"].dt.month
    return data

In [6]:
# Obtain data using BigQuery
# Project id passed to the BigQuery client; the tables queried below live
# under the same `ironhacks-data` project.
BIGQUERY_PROJECT = 'ironhacks-data'
# Client used by the later cells to submit SQL.
# NOTE(review): no credentials are passed here, so this presumably relies on
# credentials supplied by the notebook environment — confirm.
bigquery_client = bigquery.Client(project=BIGQUERY_PROJECT)

In [7]:
# SQL for the main dataset: every column of unemployment_data (alias a) plus
# average_wage from wage_data (alias b), matched on uu_id. A bare JOIN is an
# inner join, so unemployment rows whose uu_id has no wage_data row are dropped.
query = """
SELECT
a.*,
b.average_wage
FROM 
(SELECT 
*
FROM `ironhacks-data.ironhacks_competition.unemployment_data`) a
JOIN `ironhacks-data.ironhacks_competition.wage_data` b 
ON a.uu_id=b.uu_id
"""

In [8]:
# Submit the unemployment/wage SQL held in `query` and load the full result
# set into a pandas DataFrame.
query_job = bigquery_client.query(query)
data = query_job.to_dataframe()

In [9]:
# SQL that selects every column and row of the prediction_list table.
query_pred = """
SELECT * FROM `ironhacks-data.ironhacks_competition.prediction_list`
"""

In [10]:
# Submit the prediction_list SQL held in `query_pred` and load the result
# into a pandas DataFrame.
query_job_pred = bigquery_client.query(query_pred)
data_pred_query= query_job_pred.to_dataframe()

In [11]:
# Explore input data for NA and special values
# dataExplore prints the row count and a per-column summary; it is run on
# both the joined unemployment/wage frame and the prediction list frame.
dataExplore(data)
dataExplore(data_pred_query)

# of observations:  16833
# of uu_id: 525
# of timeperiod: 35
# of week_number: 35
# of countyfips: 60
# of tract: 425
# of tract_name: 525
Unique value of total_claims: <IntegerArray>
[ 22, 111,  39,  14, 155,  24,  19,  20,  12,  50,
 ...
 135,  95, 105, 117, 118,  93, 137, 146, 120, 110]
Length: 120, dtype: Int64
Unique value of edu_8th_or_less: <IntegerArray>
[0, <NA>, 18, 14, 10, 19, 24, 17, 28, 16, 41, 12, 33, 21, 13, 26, 11, 37]
Length: 18, dtype: Int64
Unique value of edu_grades_9_11: <IntegerArray>
[<NA>,    0,   19,   11,   12,   10,   13,   35,   18,   14,   21,   25,   16,
   17,   37,   42,   20,   15,   26,   22,   29,   47,   33,   28,   40,   27,
   43,   24,   39,   23,   44,   36,   30,   31,   32,   41,   51,   38,   57,
   78]
Length: 40, dtype: Int64
Unique value of edu_hs_grad_equiv: <IntegerArray>
[  14,  108, <NA>,   69,   15,   10,   13,   21,   50,   44,   30,    0,   11,
   17,   18,   45,   62,   37,   12,   42,   31,   16,   41,   20,   24,   27,
   23,   6