In [1]:
import os
import pandas as pd
import numpy as np
from google.cloud import bigquery
from google.oauth2 import service_account
from google.cloud.bigquery import magics
BIGQUERY_PROJECT = 'ironhacks-data'
bigquery_client = bigquery.Client(project=BIGQUERY_PROJECT)

In [2]:
query = """
SELECT *
FROM `ironhacks-data.ironhacks_competition.unemployment_data`
"""
# QUERY THE DATA ONCE
query_job = bigquery_client.query(query)
unemp_data = query_job.to_dataframe()
unemp_data.head(5)

Unnamed: 0,uu_id,timeperiod,week_number,countyfips,tract,tract_name,total_claims,edu_8th_or_less,edu_grades_9_11,edu_hs_grad_equiv,...,gender_female,gender_male,gender_na,race_amerindian,race_asian,race_black,race_noanswer,race_hawaiiannative,race_other,race_white
0,f013068de98db1470bd986137a0c6d23,20220416,16,18003,900,"Census Tract 9, Allen County, Indiana",22,0,,14.0,...,,,0,0,0.0,11.0,0.0,0.0,,
1,21957d5517323845818d87623589e1ba,20220319,12,18089,10400,"Census Tract 104, Lake County, Indiana",111,0,,108.0,...,76.0,35.0,0,0,0.0,,,0.0,0.0,0.0
2,6a5609f385912113b6f1014b958ed748,20220326,13,18089,11500,"Census Tract 115, Lake County, Indiana",39,0,,,...,,,0,0,0.0,39.0,0.0,0.0,0.0,0.0
3,46b2882ec4c373527ec33f7bd4f1388d,20220716,29,18089,20700,"Census Tract 207, Lake County, Indiana",14,0,,,...,,,0,0,,10.0,0.0,0.0,,
4,37495d17e82f7df326bfc2c4c090f7b7,20220409,15,18089,21900,"Census Tract 219, Lake County, Indiana",155,0,,69.0,...,90.0,65.0,0,0,0.0,135.0,,,,


In [3]:
query = """
SELECT *
FROM `ironhacks-data.ironhacks_competition.wage_data`
"""
# QUERY THE DATA ONCE
query_job = bigquery_client.query(query)
wage_data = query_job.to_dataframe()
wage_data.head()

Unnamed: 0,uu_id,countyfips,tract,tract_name,average_wage
0,585f8731c2255d6b3f817a31180848b9,18177,200,"Census Tract 2, Wayne County, Indiana",6612.0
1,8c9d2aa90948679972a9382aadcc6001,18177,900,"Census Tract 9, Wayne County, Indiana",9883.25
2,0f3d45341a5b113b813ffb7be7f58bab,18183,50300,"Census Tract 503, Whitley County, Indiana",13992.25
3,fb55464f8e34af6d750d06968bf719b8,18183,50400,"Census Tract 504, Whitley County, Indiana",13613.5
4,983badfd7b568728e39a2344a9006078,18001,30200,"Census Tract 302, Adams County, Indiana",11816.666667


In [4]:
query = """
SELECT *
FROM `ironhacks-data.ironhacks_competition.prediction_list`
"""
# QUERY THE DATA ONCE
query_job = bigquery_client.query(query)
pred_data = query_job.to_dataframe()
pred_data.head()

Unnamed: 0,uu_id,week_number
0,5bf51fc2e162d6faf9e3cf79e4198378,44
1,420b44cc7e3f55d738df565421e59941,44
2,e39c66ecceec76ee8f9f811fa4a2d246,44
3,a90462cd11ae4e43144239bf7c4828a4,44
4,8b20a6749088c7ff1237983076ebfeaa,44


In [5]:
unemp_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16833 entries, 0 to 16832
Data columns (total 25 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   uu_id                   16833 non-null  object
 1   timeperiod              16833 non-null  Int64 
 2   week_number             16833 non-null  Int64 
 3   countyfips              16833 non-null  Int64 
 4   tract                   16833 non-null  Int64 
 5   tract_name              16833 non-null  object
 6   total_claims            16833 non-null  Int64 
 7   edu_8th_or_less         13748 non-null  Int64 
 8   edu_grades_9_11         5942 non-null   Int64 
 9   edu_hs_grad_equiv       6036 non-null   Int64 
 10  edu_post_hs             3246 non-null   Int64 
 11  edu_unknown             12031 non-null  Int64 
 12  top_category_employer1  16833 non-null  object
 13  top_category_employer2  16833 non-null  object
 14  top_category_employer3  16833 non-null  object
 15  ge

In [6]:
unemp_data.isna().sum()

uu_id                         0
timeperiod                    0
week_number                   0
countyfips                    0
tract                         0
tract_name                    0
total_claims                  0
edu_8th_or_less            3085
edu_grades_9_11           10891
edu_hs_grad_equiv         10797
edu_post_hs               13587
edu_unknown                4802
top_category_employer1        0
top_category_employer2        0
top_category_employer3        0
gender_female             12998
gender_male               12902
gender_na                   965
race_amerindian            1548
race_asian                 1894
race_black                 9709
race_noanswer              6744
race_hawaiiannative         407
race_other                 8758
race_white                10441
dtype: int64

In [7]:
unemp_data['top_category_employer2'] = unemp_data['top_category_employer2'].replace('N/A',np.NaN)
unemp_data['top_category_employer3'] = unemp_data['top_category_employer3'].replace('N/A',np.NaN)

In [8]:
(unemp_data.isna().sum()/len(unemp_data))*100

uu_id                      0.000000
timeperiod                 0.000000
week_number                0.000000
countyfips                 0.000000
tract                      0.000000
tract_name                 0.000000
total_claims               0.000000
edu_8th_or_less           18.327096
edu_grades_9_11           64.700291
edu_hs_grad_equiv         64.141864
edu_post_hs               80.716450
edu_unknown               28.527298
top_category_employer1     0.000000
top_category_employer2     0.291095
top_category_employer3     1.627755
gender_female             77.217371
gender_male               76.647062
gender_na                  5.732787
race_amerindian            9.196222
race_asian                11.251708
race_black                57.678370
race_noanswer             40.064160
race_hawaiiannative        2.417870
race_other                52.028753
race_white                62.026971
dtype: float64

In [9]:
unemp_data['race_asian'] = unemp_data.groupby('countyfips')['race_asian'].transform(lambda x: x.fillna(x.mean()))

TypeError: Invalid value '0.12746858168761221' for dtype Int64

In [10]:
unemp_data['race_asian'] = unemp_data.groupby('countyfips')['race_asian'].transform(lambda x: x.fillna(int(x.mean())))

In [11]:
unemp_data['race_black'] = unemp_data.groupby('countyfips')['race_black'].transform(lambda x: x.fillna(int(x.mean())))

In [12]:
unemp_data['race_hawaiiannative'] = unemp_data.groupby('countyfips')['race_hawaiiannative'].transform(lambda x: x.fillna(int(x.mean())))
unemp_data['race_other'] = unemp_data.groupby('countyfips')['race_other'].transform(lambda x: x.fillna(int(x.mean())))
unemp_data['race_white'] = unemp_data.groupby('countyfips')['race_white'].transform(lambda x: x.fillna(int(x.mean())))

In [13]:
unemp_data['race_noanswer'] = unemp_data['race_noanswer'].fillna(0)
unemp_data['gender_na'] = unemp_data['gender_na'].fillna(0)
unemp_data['edu_unknown'] = unemp_data['edu_unknown'].fillna(0)

In [14]:
unemp_data['edu_8th_or_less'] = unemp_data.groupby('countyfips')['edu_8th_or_less'].transform(lambda x: x.fillna(int(x.mean())))
unemp_data['edu_grades_9_11'] = unemp_data.groupby('countyfips')['edu_grades_9_11'].transform(lambda x: x.fillna(int(x.mean())))
unemp_data['edu_hs_grad_equiv'] = unemp_data.groupby('countyfips')['edu_hs_grad_equiv'].transform(lambda x: x.fillna(int(x.mean())))
unemp_data['edu_post_hs'] = unemp_data.groupby('countyfips')['edu_post_hs'].transform(lambda x: x.fillna(int(x.mean())))

TypeError: int() argument must be a string, a bytes-like object or a number, not 'NAType'

In [15]:
unemp_data.isna().sum()

uu_id                         0
timeperiod                    0
week_number                   0
countyfips                    0
tract                         0
tract_name                    0
total_claims                  0
edu_8th_or_less               0
edu_grades_9_11               0
edu_hs_grad_equiv             0
edu_post_hs               13587
edu_unknown                   0
top_category_employer1        0
top_category_employer2       49
top_category_employer3      274
gender_female             12998
gender_male               12902
gender_na                     0
race_amerindian            1548
race_asian                    0
race_black                    0
race_noanswer                 0
race_hawaiiannative           0
race_other                    0
race_white                    0
dtype: int64

In [16]:
unemp_data['race_noanswer'] = unemp_data['race_noanswer'].fillna(0)
unemp_data['gender_na'] = unemp_data['gender_na'].fillna(0)
unemp_data['edu_unknown'] = unemp_data['edu_unknown'].fillna(0)

In [17]:
unemp_data['race_amerindian'] = unemp_data.groupby('countyfips')['race_amerindian'].transform(lambda x: x.fillna(int(x.mean())))

In [18]:
unemp_data.isna().sum()

uu_id                         0
timeperiod                    0
week_number                   0
countyfips                    0
tract                         0
tract_name                    0
total_claims                  0
edu_8th_or_less               0
edu_grades_9_11               0
edu_hs_grad_equiv             0
edu_post_hs               13587
edu_unknown                   0
top_category_employer1        0
top_category_employer2       49
top_category_employer3      274
gender_female             12998
gender_male               12902
gender_na                     0
race_amerindian               0
race_asian                    0
race_black                    0
race_noanswer                 0
race_hawaiiannative           0
race_other                    0
race_white                    0
dtype: int64

In [19]:
unemp_data['edu_post_hs'] = unemp_data.groupby('countyfips')['edu_post_hs'].transform(lambda x: x.fillna(int(x.mean())))

TypeError: int() argument must be a string, a bytes-like object or a number, not 'NAType'

In [20]:
unemp_data.describe()

Unnamed: 0,timeperiod,week_number,countyfips,tract,total_claims,edu_8th_or_less,edu_grades_9_11,edu_hs_grad_equiv,edu_post_hs,edu_unknown,gender_female,gender_male,gender_na,race_amerindian,race_asian,race_black,race_noanswer,race_hawaiiannative,race_other,race_white
count,16833.0,16833.0,16833.0,16833.0,16833.0,16833.0,16833.0,16833.0,3246.0,16833.0,3835.0,3931.0,16833.0,16833.0,16833.0,16833.0,16833.0,16833.0,16833.0,16833.0
mean,20220540.093329,21.329531,18088.919682,211605.900315,22.952712,0.065348,1.340462,18.376879,16.923598,0.094339,18.957757,19.689138,0.005228,0.009921,0.012475,11.101705,0.124161,0.0,0.446801,16.257233
std,268.366832,11.618554,41.019467,288715.02492,14.583655,1.03561,3.451816,8.139188,10.889373,1.499568,11.939064,12.093698,0.441002,0.469947,0.535979,12.039616,1.947116,0.0,2.40954,7.670217
min,20220101.0,1.0,18001.0,100.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,20220312.0,11.0,18063.0,10100.0,14.0,0.0,0.0,14.0,11.0,0.0,12.0,12.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,13.0
50%,20220528.0,22.0,18095.0,42801.0,18.0,0.0,0.0,18.0,14.0,0.0,15.0,16.0,0.0,0.0,0.0,7.0,0.0,0.0,0.0,15.0
75%,20220806.0,32.0,18103.0,342600.0,27.0,0.0,2.0,21.0,20.0,0.0,22.0,23.0,0.0,0.0,0.0,20.0,0.0,0.0,0.0,18.0
max,20220910.0,37.0,18183.0,976400.0,170.0,41.0,78.0,137.0,154.0,60.0,126.0,139.0,49.0,36.0,41.0,163.0,110.0,0.0,74.0,155.0


In [21]:
unemp_data['gender_female'] = unemp_data.groupby('countyfips')['gender_female'].transform(lambda x: x.fillna(int(x.mean())))
unemp_data['gender_male'] = unemp_data.groupby('countyfips')['gender_male'].transform(lambda x: x.fillna(int(x.mean())))

TypeError: int() argument must be a string, a bytes-like object or a number, not 'NAType'