In [1]:
import os
import pandas as pd
import db_dtypes
from google.cloud import bigquery
from google.oauth2 import service_account
from google.cloud.bigquery import magics

CONFIGURE THE BIGQUERY SETTINGS

In [2]:
BIGQUERY_PROJECT = 'ironhacks-data'
bigquery_client = bigquery.Client(project=BIGQUERY_PROJECT)

In [3]:
query = """
SELECT *
FROM `ironhacks-data.ironhacks_competition.unemployment_data`
"""

In [4]:
# QUERY THE DATA ONCE
query_job = bigquery_client.query(query)
print(query_job)
unemployment_data = query_job.to_dataframe()

QueryJob<project=ironhacks-data, location=US, id=da52b1be-151c-4a7d-ba97-77ffbcad6fbd>


In [5]:
unemployment_data = unemployment_data.drop_duplicates()
unemployment_data.shape

(13977, 25)

In [6]:
k = unemployment_data.copy()

In [7]:
## number of unique ids are matching the number of entries in the wage_data set
import numpy as np
pd.unique(k.uu_id).shape

(525,)

In [8]:
query = """
SELECT *
FROM `ironhacks-data.ironhacks_competition.wage_data`
"""

In [9]:
# QUERY THE DATA ONCE
query_job = bigquery_client.query(query)
print(query_job)
wage_data = query_job.to_dataframe()

QueryJob<project=ironhacks-data, location=US, id=79c3ae1d-1f8c-4b7e-a06e-c1f8e0ab6726>


In [10]:
wage_data = pd.DataFrame(wage_data)
wage_data.head()

Unnamed: 0,uu_id,countyfips,tract,tract_name,average_wage
0,585f8731c2255d6b3f817a31180848b9,18177,200,"Census Tract 2, Wayne County, Indiana",6612.0
1,8c9d2aa90948679972a9382aadcc6001,18177,900,"Census Tract 9, Wayne County, Indiana",9883.25
2,0f3d45341a5b113b813ffb7be7f58bab,18183,50300,"Census Tract 503, Whitley County, Indiana",13992.25
3,fb55464f8e34af6d750d06968bf719b8,18183,50400,"Census Tract 504, Whitley County, Indiana",13613.5
4,983badfd7b568728e39a2344a9006078,18001,30200,"Census Tract 302, Adams County, Indiana",11816.666667


In [11]:
wage_data = wage_data.drop_duplicates()
wage_data.shape
## no duplicates here!

(525, 5)

In [12]:
pd.unique(wage_data.uu_id).shape

(525,)

In [13]:
## lets join the 2 datasets on uu_id
unemployment_data.columns, wage_data.columns

(Index(['uu_id', 'timeperiod', 'week_number', 'countyfips', 'tract',
        'tract_name', 'total_claims', 'edu_8th_or_less', 'edu_grades_9_11',
        'edu_hs_grad_equiv', 'edu_post_hs', 'edu_unknown',
        'top_category_employer1', 'top_category_employer2',
        'top_category_employer3', 'gender_female', 'gender_male', 'gender_na',
        'race_amerindian', 'race_asian', 'race_black', 'race_noanswer',
        'race_hawaiiannative', 'race_other', 'race_white'],
       dtype='object'),
 Index(['uu_id', 'countyfips', 'tract', 'tract_name', 'average_wage'], dtype='object'))

In [14]:
data=pd.merge(unemployment_data,wage_data, how='inner')
print(data.shape)

(13977, 26)


In [15]:
data.columns

Index(['uu_id', 'timeperiod', 'week_number', 'countyfips', 'tract',
       'tract_name', 'total_claims', 'edu_8th_or_less', 'edu_grades_9_11',
       'edu_hs_grad_equiv', 'edu_post_hs', 'edu_unknown',
       'top_category_employer1', 'top_category_employer2',
       'top_category_employer3', 'gender_female', 'gender_male', 'gender_na',
       'race_amerindian', 'race_asian', 'race_black', 'race_noanswer',
       'race_hawaiiannative', 'race_other', 'race_white', 'average_wage'],
      dtype='object')

In [16]:
pd.set_option('display.max_columns', None)  # or 1000
pd.set_option('display.max_rows', None)  # or 1000
pd.set_option('display.max_colwidth', None)  # or 199
data.head()

Unnamed: 0,uu_id,timeperiod,week_number,countyfips,tract,tract_name,total_claims,edu_8th_or_less,edu_grades_9_11,edu_hs_grad_equiv,edu_post_hs,edu_unknown,top_category_employer1,top_category_employer2,top_category_employer3,gender_female,gender_male,gender_na,race_amerindian,race_asian,race_black,race_noanswer,race_hawaiiannative,race_other,race_white,average_wage
0,f013068de98db1470bd986137a0c6d23,20220416,16,18003,900,"Census Tract 9, Allen County, Indiana",22,0.0,,14.0,,0,31-33,51,62,,,0,0,0,11.0,0,0,,,8347.125
1,f013068de98db1470bd986137a0c6d23,20220212,7,18003,900,"Census Tract 9, Allen County, Indiana",10,0.0,0.0,,,0,31-33,23,72,,,0,0,0,,0,0,,,8347.125
2,f013068de98db1470bd986137a0c6d23,20220827,35,18003,900,"Census Tract 9, Allen County, Indiana",11,0.0,,,,0,31-33,23,81,,,0,0,0,,0,0,0.0,,8347.125
3,f013068de98db1470bd986137a0c6d23,20220312,11,18003,900,"Census Tract 9, Allen County, Indiana",11,,,,,0,56,23,44-45,,,0,0,0,,0,0,,,8347.125
4,f013068de98db1470bd986137a0c6d23,20220205,6,18003,900,"Census Tract 9, Allen County, Indiana",11,,,,,0,56,23,72,,,0,0,0,,0,0,0.0,,8347.125


In [17]:
data.isna().sum()

uu_id                         0
timeperiod                    0
week_number                   0
countyfips                    0
tract                         0
tract_name                    0
total_claims                  0
edu_8th_or_less            2532
edu_grades_9_11            9084
edu_hs_grad_equiv          8895
edu_post_hs               11305
edu_unknown                3922
top_category_employer1        0
top_category_employer2        0
top_category_employer3        0
gender_female             10800
gender_male               10703
gender_na                   784
race_amerindian            1275
race_asian                 1552
race_black                 8044
race_noanswer              5469
race_hawaiiannative         335
race_other                 7213
race_white                 8693
average_wage                  0
dtype: int64

df = pd.DataFrame()
df[['Value1', 'Value2']] = data['top_category_employer1'].str.split('-', 1, expand=True)
## replace the null values by the value before hypen
df['Value2'].fillna(df['Value1'],inplace=True)

df['Value1'] = pd.to_numeric(df['Value1'])
df['Value2'] = pd.to_numeric(df['Value2'])

df['Value3'] = (df['Value1']+df['Value2'])//2

In [18]:
def breakcolumn(a,data):
    df=pd.DataFrame()
    df[['Value1', 'Value2']] = data[a].str.split('-', 1, expand=True)
    ## replace the null values by the value before hypen
    df['Value2'].fillna(df['Value1'],inplace=True)

    df['Value1'] = pd.to_numeric(df['Value1'])
    df['Value2'] = pd.to_numeric(df['Value2'])

    df['Value3'] = (df['Value1']+df['Value2'])//2
    data[a] = df['Value3']

In [19]:
data1 = data.copy()
obj_list = ['top_category_employer1','top_category_employer2','top_category_employer3']
for i in obj_list:
    data1[i].replace('N/A',0,inplace=True)
    breakcolumn(i,data1)

  df[['Value1', 'Value2']] = data[a].str.split('-', 1, expand=True)
  df[['Value1', 'Value2']] = data[a].str.split('-', 1, expand=True)
  df[['Value1', 'Value2']] = data[a].str.split('-', 1, expand=True)


In [20]:
data1.head()    

Unnamed: 0,uu_id,timeperiod,week_number,countyfips,tract,tract_name,total_claims,edu_8th_or_less,edu_grades_9_11,edu_hs_grad_equiv,edu_post_hs,edu_unknown,top_category_employer1,top_category_employer2,top_category_employer3,gender_female,gender_male,gender_na,race_amerindian,race_asian,race_black,race_noanswer,race_hawaiiannative,race_other,race_white,average_wage
0,f013068de98db1470bd986137a0c6d23,20220416,16,18003,900,"Census Tract 9, Allen County, Indiana",22,0.0,,14.0,,0,32,51.0,62.0,,,0,0,0,11.0,0,0,,,8347.125
1,f013068de98db1470bd986137a0c6d23,20220212,7,18003,900,"Census Tract 9, Allen County, Indiana",10,0.0,0.0,,,0,32,23.0,72.0,,,0,0,0,,0,0,,,8347.125
2,f013068de98db1470bd986137a0c6d23,20220827,35,18003,900,"Census Tract 9, Allen County, Indiana",11,0.0,,,,0,32,23.0,81.0,,,0,0,0,,0,0,0.0,,8347.125
3,f013068de98db1470bd986137a0c6d23,20220312,11,18003,900,"Census Tract 9, Allen County, Indiana",11,,,,,0,56,23.0,44.0,,,0,0,0,,0,0,,,8347.125
4,f013068de98db1470bd986137a0c6d23,20220205,6,18003,900,"Census Tract 9, Allen County, Indiana",11,,,,,0,56,23.0,72.0,,,0,0,0,,0,0,0.0,,8347.125


In [21]:
data1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13977 entries, 0 to 13976
Data columns (total 26 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   uu_id                   13977 non-null  object 
 1   timeperiod              13977 non-null  Int64  
 2   week_number             13977 non-null  Int64  
 3   countyfips              13977 non-null  Int64  
 4   tract                   13977 non-null  Int64  
 5   tract_name              13977 non-null  object 
 6   total_claims            13977 non-null  Int64  
 7   edu_8th_or_less         11445 non-null  Int64  
 8   edu_grades_9_11         4893 non-null   Int64  
 9   edu_hs_grad_equiv       5082 non-null   Int64  
 10  edu_post_hs             2672 non-null   Int64  
 11  edu_unknown             10055 non-null  Int64  
 12  top_category_employer1  13977 non-null  int64  
 13  top_category_employer2  13945 non-null  float64
 14  top_category_employer3  13775 non-null

In [22]:
data1['race_black'].fillna(0,inplace=True)
data1['race_other'].fillna(0,inplace=True)
data1['club_races'] = data1['race_black'] + data1['race_other']
data1.drop(['race_black','race_other'],axis=1,inplace=True)

In [23]:
data1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13977 entries, 0 to 13976
Data columns (total 25 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   uu_id                   13977 non-null  object 
 1   timeperiod              13977 non-null  Int64  
 2   week_number             13977 non-null  Int64  
 3   countyfips              13977 non-null  Int64  
 4   tract                   13977 non-null  Int64  
 5   tract_name              13977 non-null  object 
 6   total_claims            13977 non-null  Int64  
 7   edu_8th_or_less         11445 non-null  Int64  
 8   edu_grades_9_11         4893 non-null   Int64  
 9   edu_hs_grad_equiv       5082 non-null   Int64  
 10  edu_post_hs             2672 non-null   Int64  
 11  edu_unknown             10055 non-null  Int64  
 12  top_category_employer1  13977 non-null  int64  
 13  top_category_employer2  13945 non-null  float64
 14  top_category_employer3  13775 non-null

In [24]:
data1.drop(['gender_male','gender_male','race_white','edu_grades_9_11','edu_hs_grad_equiv','edu_post_hs'],axis=1,inplace=True)
data1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13977 entries, 0 to 13976
Data columns (total 20 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   uu_id                   13977 non-null  object 
 1   timeperiod              13977 non-null  Int64  
 2   week_number             13977 non-null  Int64  
 3   countyfips              13977 non-null  Int64  
 4   tract                   13977 non-null  Int64  
 5   tract_name              13977 non-null  object 
 6   total_claims            13977 non-null  Int64  
 7   edu_8th_or_less         11445 non-null  Int64  
 8   edu_unknown             10055 non-null  Int64  
 9   top_category_employer1  13977 non-null  int64  
 10  top_category_employer2  13945 non-null  float64
 11  top_category_employer3  13775 non-null  float64
 12  gender_female           3177 non-null   Int64  
 13  gender_na               13193 non-null  Int64  
 14  race_amerindian         12702 non-null

In [25]:
data1.fillna(method='bfill',inplace=True)
data1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13977 entries, 0 to 13976
Data columns (total 20 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   uu_id                   13977 non-null  object 
 1   timeperiod              13977 non-null  Int64  
 2   week_number             13977 non-null  Int64  
 3   countyfips              13977 non-null  Int64  
 4   tract                   13977 non-null  Int64  
 5   tract_name              13977 non-null  object 
 6   total_claims            13977 non-null  Int64  
 7   edu_8th_or_less         13974 non-null  Int64  
 8   edu_unknown             13976 non-null  Int64  
 9   top_category_employer1  13977 non-null  int64  
 10  top_category_employer2  13977 non-null  float64
 11  top_category_employer3  13977 non-null  float64
 12  gender_female           13970 non-null  Int64  
 13  gender_na               13977 non-null  Int64  
 14  race_amerindian         13977 non-null

In [26]:
data1['race_asian'] = data1['race_asian'].fillna(int(np.mean(data1['race_asian'])))
data1['race_noanswer'] = data1['race_noanswer'].fillna(int(np.mean(data1['race_noanswer'])))
data1['edu_unknown'] = data1['edu_unknown'].fillna(int(np.mean(data1['edu_unknown'])))
data1['gender_female'] = data1['gender_female'].fillna(int(np.mean(data1['gender_female'])))
data1['top_category_employer3'] = data1['top_category_employer3'].fillna(int(np.mean(data1['top_category_employer3'])))

In [27]:
from sklearn import preprocessing
# label_encoder object knows how to understand word labels. 
label_encoder = preprocessing.LabelEncoder()
# Encode labels in column 'Country'. 
data1['tract_name']= label_encoder.fit_transform(data1['tract_name']) 

In [28]:
data2 = data1.copy()
data1['uu_id']= label_encoder.fit_transform(data1['uu_id']) 

In [29]:
data1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13977 entries, 0 to 13976
Data columns (total 20 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   uu_id                   13977 non-null  int64  
 1   timeperiod              13977 non-null  Int64  
 2   week_number             13977 non-null  Int64  
 3   countyfips              13977 non-null  Int64  
 4   tract                   13977 non-null  Int64  
 5   tract_name              13977 non-null  int64  
 6   total_claims            13977 non-null  Int64  
 7   edu_8th_or_less         13974 non-null  Int64  
 8   edu_unknown             13977 non-null  Int64  
 9   top_category_employer1  13977 non-null  int64  
 10  top_category_employer2  13977 non-null  float64
 11  top_category_employer3  13977 non-null  float64
 12  gender_female           13977 non-null  Int64  
 13  gender_na               13977 non-null  Int64  
 14  race_amerindian         13977 non-null

In [30]:
X = data1.drop('total_claims',axis=1)
y = data1['total_claims']
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.25,random_state = 42)

In [31]:
from sklearn.linear_model import LinearRegression
linreg = LinearRegression()
linreg.fit(X_train,y_train)
y_pred = linreg.predict(X_test)

ValueError: Input X contains NaN.
LinearRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [32]:
y_pred = np.round(y_pred)
from sklearn.metrics import mean_squared_error
mean_squared_error(y_test,y_pred)

NameError: name 'y_pred' is not defined

In [33]:
y_pred

NameError: name 'y_pred' is not defined

In [34]:
## MAPE function
def MAPE(Y_actual,Y_Predicted):
    mape = np.mean(np.abs((Y_actual - Y_Predicted)/Y_actual))*100
    return mape

In [35]:
print(MAPE(y_test,y_pred))

NameError: name 'y_pred' is not defined

In [36]:
from sklearn.ensemble import RandomForestRegressor
rfr = RandomForestRegressor(n_estimators = 500, random_state = 0)
rfr.fit(X_train, y_train)

ValueError: Input X contains NaN.
RandomForestRegressor does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [37]:
y_pred = rfr.predict(X_test)

ValueError: Input X contains NaN.
RandomForestRegressor does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [38]:
y_pred = np.round(y_pred)

NameError: name 'y_pred' is not defined

In [39]:
from sklearn.metrics import mean_squared_error
mean_squared_error(y_test,y_pred)

NameError: name 'y_pred' is not defined

In [40]:
X2 = X.copy()

In [41]:
X2 = X2.apply(lambda iterator: ((iterator - iterator.mean())/iterator.std()).round(2))

In [42]:
X_train,X_test,y_train,y_test = train_test_split(X2,y,test_size = 0.25,random_state = 42)

In [43]:
from sklearn.linear_model import LinearRegression
linreg = LinearRegression()
linreg.fit(X_train,y_train)
y_pred = linreg.predict(X_test)

ValueError: Input X contains NaN.
LinearRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [44]:
y_pred = np.round(y_pred)
from sklearn.metrics import mean_squared_error
mean_squared_error(y_test,y_pred)

NameError: name 'y_pred' is not defined

In [45]:
from sklearn.ensemble import RandomForestRegressor
rfr = RandomForestRegressor(n_estimators = 600, random_state = 0)
rfr.fit(X_train, y_train)

ValueError: Input X contains NaN.
RandomForestRegressor does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [46]:
y_pred = rfr.predict(X_test)
y_pred = np.round(y_pred)
from sklearn.metrics import mean_squared_error
mean_squared_error(y_test,y_pred)

ValueError: Input X contains NaN.
RandomForestRegressor does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [47]:
y_pred

NameError: name 'y_pred' is not defined

from sklearn.model_selection import GridSearchCV
# Create the parameter grid based on the results of random search 
param_grid = {
    'bootstrap': [True],
    'max_depth': [10,20,5,4,25,50],
    'max_features': [2, 3],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [100, 200, 300, 1000]
}
# Create a based model
rf = RandomForestRegressor()
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2)

grid_search.fit(X_train,y_train)

grid_search.best_params_
y_pred = grid_search.predict(X_test)
mean_squared_error(y_test,y_pred)

In [48]:
get_ipython().system('pip install xgboost')
from xgboost.sklearn import XGBRegressor
regressor = XGBRegressor(
    n_estimators=500,
    reg_lambda=1,
    gamma=0,
    max_depth=3)







In [49]:
regressor.fit(X_train, y_train)
y_pred = regressor.predict(X_test)
mean_squared_error(y_test,y_pred)

106.23642622278886

In [50]:
from xgboost.sklearn import XGBRegressor
from sklearn.model_selection import GridSearchCV

In [51]:
xgb1 = XGBRegressor()
parameters = {'nthread':[4], #when use hyperthread, xgboost may become slower
              'objective':['reg:linear'],
              'learning_rate': [.03, 0.05, .07], #so called `eta` value
              'max_depth': [5, 6, 7],
              'min_child_weight': [4],
              'subsample': [0.7],
              'colsample_bytree': [0.7],
              'n_estimators': [400,500,600,100]}

In [52]:
xgb_grid = GridSearchCV(xgb1,
                        parameters,
                        cv = 2,
                        n_jobs = 5,
                        verbose=True)

In [53]:
xgb_grid.fit(X_train,y_train)
y_pred = xgb_grid.predict(X_test)
mean_squared_error(y_test,y_pred)

Fitting 2 folds for each of 36 candidates, totalling 72 fits




99.10776048214952

In [54]:
query = """
SELECT *
FROM `ironhacks-data.ironhacks_competition.prediction_list`
"""

In [55]:
# QUERY THE DATA ONCE
query_job = bigquery_client.query(query)
print(query_job)
prediction = query_job.to_dataframe()

QueryJob<project=ironhacks-data, location=US, id=5b6a2689-2c4c-4ccf-af53-a0c8d13bbd27>


In [56]:
print(prediction.shape)
pd.DataFrame(prediction).head()

(525, 2)


Unnamed: 0,uu_id,week_number
0,5bf51fc2e162d6faf9e3cf79e4198378,44
1,420b44cc7e3f55d738df565421e59941,44
2,e39c66ecceec76ee8f9f811fa4a2d246,44
3,a90462cd11ae4e43144239bf7c4828a4,44
4,8b20a6749088c7ff1237983076ebfeaa,44


In [57]:
data2 = data2.drop_duplicates(subset=['uu_id'],keep='last')

In [58]:
data2 = data2.set_index('uu_id')
data2.head()

Unnamed: 0_level_0,timeperiod,week_number,countyfips,tract,tract_name,total_claims,edu_8th_or_less,edu_unknown,top_category_employer1,top_category_employer2,top_category_employer3,gender_female,gender_na,race_amerindian,race_asian,race_noanswer,race_hawaiiannative,average_wage,club_races
uu_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
f013068de98db1470bd986137a0c6d23,20220514,20,18003,900,451,24,0,0,32,44.0,71.0,76,0,0,0,19,0,8347.125,0
21957d5517323845818d87623589e1ba,20220219,8,18089,10400,28,26,0,0,72,48.0,44.0,15,0,0,0,0,0,7036.636364,21
6a5609f385912113b6f1014b958ed748,20220507,19,18089,11500,90,20,0,0,56,32.0,72.0,11,0,0,0,0,0,7890.142857,0
46b2882ec4c373527ec33f7bd4f1388d,20220319,12,18089,20700,163,82,0,0,72,48.0,44.0,26,0,0,0,0,0,7534.375,75
37495d17e82f7df326bfc2c4c090f7b7,20220514,20,18089,21900,180,16,0,0,32,48.0,44.0,14,0,0,0,0,0,11825.125,0


In [59]:
final_prediction = data2.join(prediction.set_index('uu_id'),on='uu_id',rsuffix='_other')
final_prediction.head()

Unnamed: 0_level_0,timeperiod,week_number,countyfips,tract,tract_name,total_claims,edu_8th_or_less,edu_unknown,top_category_employer1,top_category_employer2,top_category_employer3,gender_female,gender_na,race_amerindian,race_asian,race_noanswer,race_hawaiiannative,average_wage,club_races,week_number_other
uu_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
f013068de98db1470bd986137a0c6d23,20220514,20,18003,900,451,24,0,0,32,44.0,71.0,76,0,0,0,19,0,8347.125,0,44
21957d5517323845818d87623589e1ba,20220219,8,18089,10400,28,26,0,0,72,48.0,44.0,15,0,0,0,0,0,7036.636364,21,44
6a5609f385912113b6f1014b958ed748,20220507,19,18089,11500,90,20,0,0,56,32.0,72.0,11,0,0,0,0,0,7890.142857,0,44
46b2882ec4c373527ec33f7bd4f1388d,20220319,12,18089,20700,163,82,0,0,72,48.0,44.0,26,0,0,0,0,0,7534.375,75,44
37495d17e82f7df326bfc2c4c090f7b7,20220514,20,18089,21900,180,16,0,0,32,48.0,44.0,14,0,0,0,0,0,11825.125,0,44


In [60]:
final_prediction_data = pd.DataFrame()
final_prediction_data['index'] = final_prediction.index
final_prediction_data['week_number_other'] = final_prediction.week_number_other

In [61]:
final_prediction = final_prediction.drop(['week_number_other'], axis=1)
final_prediction.reset_index(drop=True, inplace=True)

In [62]:
future = final_prediction.values
future_weeks_pred = rfr.predict(future)
print(future_weeks_pred.shape)



TypeError: float() argument must be a string or a number, not 'NAType'

In [63]:
prediction['total_claims'] = future_weeks_pred.astype('int')
prediction.columns = ['uuid','week','count']
print(prediction)

NameError: name 'future_weeks_pred' is not defined

In [64]:
prediction.to_csv("submission_prediction_output.csv",index=False)

In [65]:
get_ipython().run_cell_magic('capture', '', "%logstop\n%logstart -t -r -q ipython_command_log.py global\n\n#- IRONHACKS RESEARCH TRACKING CODE\n#----------------------------------\n# The following code is used to help our research team understand how you \n# our notebook environment. We do not collect any personal information with\n# the following code, it is used to measure when and how often you work on\n# your submission files.\n\nimport os\nfrom datetime import datetime\nimport IPython.core.history as history\n\nha = history.HistoryAccessor()\nha_tail = ha.get_tail(1)\nha_cmd = next(ha_tail)\nsession_id = str(ha_cmd[0])\ncommand_id = str(ha_cmd[1])\ntimestamp = datetime.utcnow().isoformat()\nhistory_line = ','.join([session_id, command_id, timestamp]) + '\\n'\nlogfile = open(os.environ['HOME']+'/ipython_session_log.csv', 'a')\nlogfile.write(history_line)\nlogfile.close()\n")

In [66]:
pip install db_dtypes
import os
import pandas as pd
import db_dtypes
from google.cloud import bigquery
from google.oauth2 import service_account
from google.cloud.bigquery import magics

SyntaxError: invalid syntax (2755780900.py, line 1)

In [67]:
get_ipython().system('pip install db_dtypes')
import os
import pandas as pd
import db_dtypes
from google.cloud import bigquery
from google.oauth2 import service_account
from google.cloud.bigquery import magics





