In [None]:
import os
import pandas as pd
import db_dtypes
from google.cloud import bigquery
from google.oauth2 import service_account
from google.cloud.bigquery import magics

CONFIGURE THE BIGQUERY SETTINGS

In [None]:
# Project id used when creating the BigQuery client below.
BIGQUERY_PROJECT = 'ironhacks-data'
# Single client instance reused by every query in this notebook.
bigquery_client = bigquery.Client(project=BIGQUERY_PROJECT)

In [None]:
# Pull every column of the unemployment table.
query = """
SELECT *
FROM `ironhacks-data.ironhacks_competition.unemployment_data`
"""

In [None]:
# QUERY THE DATA ONCE
# Submit the query, print the job object, then materialise the result as a
# DataFrame.
query_job = bigquery_client.query(query)
print(query_job)
unemployment_data = query_job.to_dataframe()

In [None]:
# Drop fully identical rows and show the remaining (rows, columns) shape.
# The name is rebound, so the raw pull is no longer available after this cell.
unemployment_data = unemployment_data.drop_duplicates()
unemployment_data.shape

In [None]:
# Working copy used only for the uniqueness check in the next cell.
k = unemployment_data.copy()

In [None]:
## Number of distinct uu_id values. Per the original author's note this
## matches the number of rows in the wage_data set (loaded further below).
# numpy is imported here rather than in the top import cell; later cells rely
# on `np` being defined by this cell.
import numpy as np
pd.unique(k.uu_id).shape

In [None]:
# Pull every column of the wage table. `query` is reused, so the previous
# query text is overwritten.
query = """
SELECT *
FROM `ironhacks-data.ironhacks_competition.wage_data`
"""

In [None]:
# QUERY THE DATA ONCE
# Same pattern as the unemployment load: submit, print the job, materialise.
query_job = bigquery_client.query(query)
print(query_job)
wage_data = query_job.to_dataframe()

In [None]:
# Re-wrap the query result in a DataFrame and preview the first rows.
wage_data = pd.DataFrame(wage_data)
wage_data.head()

In [None]:
# Drop fully identical rows and show the (rows, columns) shape.
wage_data = wage_data.drop_duplicates()
wage_data.shape
## no duplicates here!

In [None]:
# Number of distinct uu_id values in the wage table.
pd.unique(wage_data.uu_id).shape

In [None]:
## Join the two datasets. First list the columns of both frames to see which
## ones they share.
unemployment_data.columns, wage_data.columns

In [None]:
# Inner merge. No `on=` is given, so pandas joins on every column name the two
# frames have in common, not on uu_id alone.
# NOTE(review): if uu_id is the only intended key, confirm the other shared
# columns always agree between the two tables; otherwise rows are dropped.
data=pd.merge(unemployment_data,wage_data, how='inner')
print(data.shape)

In [None]:
# Column names of the merged frame.
data.columns

In [None]:
# Remove pandas' display limits for the rest of the session (global options),
# then preview the merged frame.
pd.set_option('display.max_columns', None)  # or 1000
pd.set_option('display.max_rows', None)  # or 1000
pd.set_option('display.max_colwidth', None)  # or 199
data.head()

In [None]:
# Missing-value count per column of the merged frame.
data.isna().sum()

# Exploratory version of the "range -> midpoint" conversion for one column.
# Values shaped like "low-high" are split on the first hyphen; values without
# a hyphen are treated as low == high.
df = pd.DataFrame()
# `n` must be passed by keyword (positional use was removed in pandas 2.0).
# reindex guarantees both columns exist even if no value contains a hyphen.
parts = data['top_category_employer1'].str.split('-', n=1, expand=True).reindex(columns=[0, 1])
df['Value1'] = parts[0]
## replace the null values by the value before the hyphen
# Plain assignment instead of a chained inplace fillna, which may operate on a
# temporary copy and leave the frame unchanged.
df['Value2'] = parts[1].astype(object).fillna(parts[0])

df['Value1'] = pd.to_numeric(df['Value1'])
df['Value2'] = pd.to_numeric(df['Value2'])

# Floor of the midpoint between the two ends of the range.
df['Value3'] = (df['Value1']+df['Value2'])//2

In [None]:
def breakcolumn(a,data):
    """Replace a "low-high" range column with the floored midpoint, in place.

    Each value of ``data[a]`` is split on the first hyphen. Values without a
    hyphen are treated as ``low == high``. The column is overwritten with
    ``(low + high) // 2``.

    Parameters
    ----------
    a : str
        Name of the column in ``data`` to convert.
    data : pandas.DataFrame
        Frame that owns the column; it is modified in place.

    Returns
    -------
    None

    Notes
    -----
    Entries that are missing or not strings come out of the ``.str`` accessor
    as NaN and therefore stay NaN in the result. ``pd.to_numeric`` raises
    ``ValueError`` for pieces that are not numeric text.
    """
    # `n` must be passed by keyword (positional use was removed in pandas 2.0).
    # reindex guarantees that both columns exist even when no value contains a
    # hyphen, in which case split(expand=True) yields a single column.
    parts = data[a].str.split('-', n=1, expand=True).reindex(columns=[0, 1])

    ## replace the null values by the value before the hyphen
    # Done as a plain expression rather than a chained inplace fillna, which
    # may act on a temporary copy.
    upper_text = parts[1].astype(object).fillna(parts[0])

    lower = pd.to_numeric(parts[0])
    upper = pd.to_numeric(upper_text)

    # Floor of the midpoint of the range, aligned on the index of `data`.
    data[a] = (lower + upper) // 2

In [None]:
# Work on a copy so the merged frame `data` stays untouched.
data1 = data.copy()
# Columns holding employer-size ranges that breakcolumn turns into midpoints.
obj_list = ['top_category_employer1','top_category_employer2','top_category_employer3']
for i in obj_list:
    # Assign the result back: a chained `data1[i].replace(..., inplace=True)`
    # may modify a temporary copy and leave data1 unchanged.
    # NOTE(review): the replacement value is the integer 0, which breakcolumn's
    # `.str.split` does not treat as text, so these entries likely end up NaN
    # rather than 0. Confirm whether '0' (string) was intended.
    data1[i] = data1[i].replace('N/A', 0)
    breakcolumn(i,data1)

In [None]:
data1.head()    

In [None]:
data1.info()

In [None]:
data1['race_black'].fillna(0,inplace=True)
data1['race_other'].fillna(0,inplace=True)
data1['club_races'] = data1['race_black'] + data1['race_other']
data1.drop(['race_black','race_other'],axis=1,inplace=True)

In [None]:
data1.info()

In [None]:
data1.drop(['gender_male','gender_male','race_white','edu_grades_9_11','edu_hs_grad_equiv','edu_post_hs'],axis=1,inplace=True)
data1.info()

In [None]:
data1.fillna(method='bfill',inplace=True)
data1.info()

In [None]:
# Fill whatever is still missing in these columns with the column mean,
# truncated to an integer. Each column is processed independently.
for column_name in ('race_asian', 'race_noanswer', 'edu_unknown',
                    'gender_female', 'top_category_employer3'):
    truncated_mean = int(np.mean(data1[column_name]))
    data1[column_name] = data1[column_name].fillna(truncated_mean)

In [None]:
from sklearn import preprocessing
# LabelEncoder maps each distinct label to an integer code.
label_encoder = preprocessing.LabelEncoder()
# Replace the 'tract_name' column with its integer codes.
data1['tract_name']= label_encoder.fit_transform(data1['tract_name']) 

In [None]:
# Snapshot taken before uu_id is encoded, so data2 keeps the original uu_id
# values while data1 gets integer codes.
data2 = data1.copy()
# The same encoder object is refitted here; from this point on it holds the
# uu_id mapping, not the tract_name one.
data1['uu_id']= label_encoder.fit_transform(data1['uu_id']) 

In [None]:
data1.info()

In [None]:
# Features are every column except the target; the target is total_claims.
X = data1.drop('total_claims',axis=1)
y = data1['total_claims']
from sklearn.model_selection import train_test_split
# Hold out 25% of the rows for evaluation; the seed fixes the split.
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.25,random_state = 42)

In [None]:
# Baseline: ordinary least squares on the unscaled features.
from sklearn.linear_model import LinearRegression
linreg = LinearRegression()
linreg.fit(X_train,y_train)
y_pred = linreg.predict(X_test)

In [None]:
# Predictions are rounded to whole numbers before scoring.
y_pred = np.round(y_pred)
from sklearn.metrics import mean_squared_error
mean_squared_error(y_test,y_pred)

In [None]:
# Show the rounded predictions.
y_pred

In [None]:
## MAPE function
def MAPE(Y_actual,Y_Predicted):
    """Mean absolute percentage error, expressed in percent.

    Parameters
    ----------
    Y_actual : array-like
        Observed values.
    Y_Predicted : array-like
        Predicted values, in the same order and of the same length as
        ``Y_actual``.

    Returns
    -------
    float
        ``mean(|actual - predicted| / |actual|) * 100`` computed over the
        entries whose actual value is non-zero, or NaN when there is no such
        entry.
    """
    # Compare position by position: converting to arrays avoids pandas
    # aligning two Series on their index labels.
    actual = np.asarray(Y_actual, dtype=float)
    predicted = np.asarray(Y_Predicted, dtype=float)

    # A zero actual value has no defined percentage error; including it would
    # turn the whole mean into inf/NaN, so those entries are left out.
    nonzero = actual != 0
    if not nonzero.any():
        return float('nan')

    mape = np.mean(np.abs((actual[nonzero] - predicted[nonzero]) / actual[nonzero])) * 100
    return mape

In [None]:
# MAPE of the rounded linear-regression predictions from the cells above.
print(MAPE(y_test,y_pred))

In [None]:
# Random forest with 500 trees on the same unscaled split; seeded so the fit
# is repeatable.
from sklearn.ensemble import RandomForestRegressor
rfr = RandomForestRegressor(n_estimators = 500, random_state = 0)
rfr.fit(X_train, y_train)

In [None]:
# `y_pred` is reused: it now holds the random-forest predictions.
y_pred = rfr.predict(X_test)

In [None]:
y_pred = np.round(y_pred)

In [None]:
# MSE of the rounded random-forest predictions.
from sklearn.metrics import mean_squared_error
mean_squared_error(y_test,y_pred)

In [None]:
# Copy of the feature frame that will be standardised.
X2 = X.copy()

In [None]:
# Column-wise z-score, rounded to 2 decimals. The mean and standard deviation
# are computed on all rows of X, i.e. before the train/test split below.
# NOTE(review): a constant column has std 0 and would become NaN/inf here.
X2 = X2.apply(lambda iterator: ((iterator - iterator.mean())/iterator.std()).round(2))

In [None]:
# Re-split using the standardised features. This overwrites X_train, X_test,
# y_train and y_test from the unscaled split.
X_train,X_test,y_train,y_test = train_test_split(X2,y,test_size = 0.25,random_state = 42)

In [None]:
# Linear regression again, now on standardised features; `linreg` is rebound.
from sklearn.linear_model import LinearRegression
linreg = LinearRegression()
linreg.fit(X_train,y_train)
y_pred = linreg.predict(X_test)

In [None]:
y_pred = np.round(y_pred)
from sklearn.metrics import mean_squared_error
mean_squared_error(y_test,y_pred)

In [None]:
# Random forest with 600 trees on the standardised features. `rfr` is rebound,
# so every later use of `rfr` refers to this model, which expects
# standardised inputs with the columns of X.
from sklearn.ensemble import RandomForestRegressor
rfr = RandomForestRegressor(n_estimators = 600, random_state = 0)
rfr.fit(X_train, y_train)

In [None]:
# Score the standardised random forest on the held-out rows.
y_pred = rfr.predict(X_test)
y_pred = np.round(y_pred)
from sklearn.metrics import mean_squared_error
mean_squared_error(y_test,y_pred)

In [None]:
y_pred

from sklearn.model_selection import GridSearchCV
# Hyper-parameter grid for the random forest. Every combination is fitted
# once per fold (3 folds), so this search is expensive.
param_grid = {
    'bootstrap': [True],
    'max_depth': [10,20,5,4,25,50],
    'max_features': [2, 3],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [100, 200, 300, 1000]
}
# Base model. Seeded like the other forests in this notebook so the search
# result is repeatable from run to run.
rf = RandomForestRegressor(random_state = 0)
# 3-fold grid search using all available cores.
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2)

grid_search.fit(X_train,y_train)

# Printed explicitly: a bare expression in the middle of a cell is discarded.
print(grid_search.best_params_)
# Predict with the best estimator found and score it on the held-out rows.
y_pred = grid_search.predict(X_test)
mean_squared_error(y_test,y_pred)

In [None]:
# Installs xgboost into the environment via a shell call before importing it.
# The version is not pinned.
get_ipython().system('pip install xgboost')
from xgboost.sklearn import XGBRegressor
# Gradient-boosted trees with fixed hyper-parameters (500 trees, depth 3).
regressor = XGBRegressor(
    n_estimators=500,
    reg_lambda=1,
    gamma=0,
    max_depth=3)

In [None]:
# Fit on the current (standardised) split and score on the held-out rows.
# Unlike the earlier cells, predictions are not rounded before the MSE.
regressor.fit(X_train, y_train)
y_pred = regressor.predict(X_test)
mean_squared_error(y_test,y_pred)

In [None]:
from xgboost.sklearn import XGBRegressor
from sklearn.model_selection import GridSearchCV

In [None]:
# Base estimator whose hyper-parameters are searched below.
xgb1 = XGBRegressor()
# Grid of candidate settings; single-element lists are held fixed.
# 'reg:squarederror' is the current name of the squared-error objective; the
# old 'reg:linear' alias is deprecated and may be rejected by newer xgboost.
parameters = {'nthread':[4], #when use hyperthread, xgboost may become slower
              'objective':['reg:squarederror'],
              'learning_rate': [.03, 0.05, .07], #so called `eta` value
              'max_depth': [5, 6, 7],
              'min_child_weight': [4],
              'subsample': [0.7],
              'colsample_bytree': [0.7],
              'n_estimators': [400,500,600,100]}

In [None]:
# 2-fold grid search over `parameters`, running 5 fits in parallel.
# NOTE(review): each fit also requests 4 xgboost threads ('nthread'), so up to
# 20 threads may be active at once; confirm the machine has the cores.
xgb_grid = GridSearchCV(xgb1,
                        parameters,
                        cv = 2,
                        n_jobs = 5,
                        verbose=True)

In [None]:
# Run the search, then predict with the best estimator and report the MSE on
# the held-out rows (predictions are not rounded here).
xgb_grid.fit(X_train,y_train)
y_pred = xgb_grid.predict(X_test)
mean_squared_error(y_test,y_pred)

In [None]:
# Pull the list of rows that need a prediction. `query` is reused again.
query = """
SELECT *
FROM `ironhacks-data.ironhacks_competition.prediction_list`
"""

In [None]:
# QUERY THE DATA ONCE
# Same load pattern as before: submit, print the job, materialise.
query_job = bigquery_client.query(query)
print(query_job)
prediction = query_job.to_dataframe()

In [None]:
# Shape and first rows of the prediction list.
print(prediction.shape)
pd.DataFrame(prediction).head()

In [None]:
# Keep one row per uu_id: the last one in the current row order.
# NOTE(review): the frame is not sorted in the visible code, so "last" is the
# last row as returned by the query/merge, not necessarily the latest week.
data2 = data2.drop_duplicates(subset=['uu_id'],keep='last')

In [None]:
# Move uu_id from a column to the index.
data2 = data2.set_index('uu_id')
data2.head()

In [None]:
# Attach the prediction list to each uu_id row. Columns of `prediction` whose
# names clash with data2 get the suffix '_other' (the cells below use
# `week_number_other`).
final_prediction = data2.join(prediction.set_index('uu_id'),on='uu_id',rsuffix='_other')
final_prediction.head()

In [None]:
# Side table that remembers, row by row, which uu_id and which week each row
# of final_prediction belongs to (final_prediction loses both further below).
final_prediction_data = pd.DataFrame({'index': final_prediction.index})
# final_prediction is indexed by uu_id while this table has a 0..n-1 index.
# Assigning the Series directly would align on labels and produce only NaN,
# so the uu_id index is dropped first to copy the values by position.
final_prediction_data['week_number_other'] = final_prediction['week_number_other'].reset_index(drop=True)

In [None]:
# Remove the week column that came from the prediction list, then replace the
# uu_id index with a plain 0..n-1 index (the uu_id values are discarded from
# this frame).
final_prediction = final_prediction.drop(['week_number_other'], axis=1)
final_prediction.reset_index(drop=True, inplace=True)

In [None]:
# Build the feature matrix in exactly the layout `rfr` was fitted on.
# At this point final_prediction still contains the target `total_claims` and
# no longer contains uu_id, so passing its raw values would feed the model the
# wrong columns.
future = final_prediction.copy()
# Restore uu_id from the side table (same row order) and encode it with the
# encoder that was last fitted on uu_id.
future['uu_id'] = label_encoder.transform(final_prediction_data['index'])
# Same columns, same order as the training features; this drops total_claims.
future = future[X.columns]
# `rfr` was trained on z-scored features (X2), so apply the same scaling using
# the statistics of X and the same rounding.
future = ((future - X.mean()) / X.std()).round(2)
future_weeks_pred = rfr.predict(future)
print(future_weeks_pred.shape)

In [None]:
# The predictions are in the row order of final_prediction, not in the row
# order of `prediction`, so attach them by uu_id instead of by position.
# Rounded (not truncated) to match how the models were evaluated above.
pred_by_uu_id = pd.Series(np.round(future_weeks_pred),
                          index=final_prediction_data['index'].to_numpy())
# A uu_id repeated in the prediction list yields repeated rows built from the
# same feature row; keep one entry per uu_id.
pred_by_uu_id = pred_by_uu_id[~pred_by_uu_id.index.duplicated(keep='first')]

claims = prediction['uu_id'].map(pred_by_uu_id)
if claims.isna().any():
    # Fail loudly rather than write a submission with missing counts.
    raise ValueError('some uu_id values in the prediction list have no model prediction')

prediction['total_claims'] = claims.astype('int')
# Rename the three columns positionally to the submission header.
prediction.columns = ['uuid','week','count']
print(prediction)

In [None]:
# Write the submission table to a CSV file in the working directory, without
# the DataFrame index column.
prediction.to_csv("submission_prediction_output.csv",index=False)

In [None]:
# Tracking cell supplied with the notebook environment (see the comment inside
# the cell text). Runs under %%capture: restarts IPython command logging to
# ipython_command_log.py and appends one "session,command,timestamp" line to
# $HOME/ipython_session_log.csv.
get_ipython().run_cell_magic('capture', '', "%logstop\n%logstart -t -r -q ipython_command_log.py global\n\n#- IRONHACKS RESEARCH TRACKING CODE\n#----------------------------------\n# The following code is used to help our research team understand how you \n# our notebook environment. We do not collect any personal information with\n# the following code, it is used to measure when and how often you work on\n# your submission files.\n\nimport os\nfrom datetime import datetime\nimport IPython.core.history as history\n\nha = history.HistoryAccessor()\nha_tail = ha.get_tail(1)\nha_cmd = next(ha_tail)\nsession_id = str(ha_cmd[0])\ncommand_id = str(ha_cmd[1])\ntimestamp = datetime.utcnow().isoformat()\nhistory_line = ','.join([session_id, command_id, timestamp]) + '\\n'\nlogfile = open(os.environ['HOME']+'/ipython_session_log.csv', 'a')\nlogfile.write(history_line)\nlogfile.close()\n")

In [None]:
pip install db_dtypes
import os
import pandas as pd
import db_dtypes
from google.cloud import bigquery
from google.oauth2 import service_account
from google.cloud.bigquery import magics

In [None]:
get_ipython().system('pip install db_dtypes')
import os
import pandas as pd
import db_dtypes
from google.cloud import bigquery
from google.oauth2 import service_account
from google.cloud.bigquery import magics