In [None]:
# IronHacks tracking boilerplate, run as a `%%capture` cell so it prints nothing.
# The cell body restarts IPython command logging to ipython_command_log.py, then
# appends one "session_id,command_id,UTC timestamp" row (taken from the most
# recent history entry) to $HOME/ipython_session_log.csv.
get_ipython().run_cell_magic('capture', '', "%logstop\n%logstart -t -r -q ipython_command_log.py global\n\n#- IRONHACKS RESEARCH TRACKING CODE\n#----------------------------------\n# The following code is used to help our research team understand how you \n# our notebook environment. We do not collect any personal information with\n# the following code, it is used to measure when and how often you work on\n# your submission files.\n\nimport os\nfrom datetime import datetime\nimport IPython.core.history as history\n\nha = history.HistoryAccessor()\nha_tail = ha.get_tail(1)\nha_cmd = next(ha_tail)\nsession_id = str(ha_cmd[0])\ncommand_id = str(ha_cmd[1])\ntimestamp = datetime.utcnow().isoformat()\nhistory_line = ','.join([session_id, command_id, timestamp]) + '\\n'\nlogfile = open(os.environ['HOME']+'/ipython_session_log.csv', 'a')\nlogfile.write(history_line)\nlogfile.close()\n")

In [None]:
# Optional dependency setup, run as a `%%capture` cell so pip's output is hidden.
# The cell body shells out to pip to install `pandas` and `db-dtypes`.
get_ipython().run_cell_magic('capture', '', "\n#- INSTALL ADDITIONAL LIBRARIES IF REQUIRED\n#------------------------------------------\n# This is normally not required. The hub environment comes preinstaled with \n# many packages that you can already use without setup. In case there is some\n# other library you would like to use that isn't on the list you run this command\n# once to install them.  If it is already installed this command has no effect.\n\n!python3 -m pip install pandas\n!pip install db-dtypes\n")

- IMPORT THE LIBRARIES YOU WILL USE
------------------------------------------
You only need to import packages one time per notebook session. To keep your
notebook clean and organized you can handle all imports at the top of your file.
The following are included for example purposes; feel free to modify or delete
anything in this section.

In [None]:
import csv
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
from google.cloud import bigquery
from google.oauth2 import service_account
from google.cloud.bigquery import magics
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from statsmodels.formula.api import ols
from pandas import Series, DataFrame

CONFIGURE THE BIGQUERY SETTINGS

In [None]:
# Name of the Google Cloud project passed to the BigQuery client.
BIGQUERY_PROJECT = 'ironhacks-data'
# Single client instance; the query cells in this notebook all go through it.
bigquery_client = bigquery.Client(project=BIGQUERY_PROJECT)

In [None]:
# SQL: fetch every column and row of the competition's unemployment table.
query = """
SELECT *
FROM `ironhacks-data.ironhacks_competition.unemployment_data`
"""

In [None]:
# Submit the unemployment query and materialise the result as a DataFrame.
query_job = bigquery_client.query(query)
unemployment_data = query_job.to_dataframe()
# Preview the first rows as the cell output.
unemployment_data.head()

In [None]:
# SQL: fetch every column and row of the competition's wage table.
query2 = """
SELECT *
FROM `ironhacks-data.ironhacks_competition.wage_data`
"""

In [None]:
# Submit the wage query and materialise the result as a DataFrame.
query_job2 = bigquery_client.query(query2)
wage_data = query_job2.to_dataframe()
# Preview the first rows as the cell output.
wage_data.head()

In [None]:
# Show only the wage rows that have a missing value in at least one column.
wage_data.loc[wage_data.isna().any(axis='columns')]

In [None]:
# Mean-impute missing `average_wage` values; every other column is left as-is.
# `wage_data` itself is not modified -- the imputed copy is `wage_data2`.
wage_data2 = wage_data.fillna(
    value={'average_wage': wage_data['average_wage'].mean()}
)
wage_data2.head()

In [None]:
# SQL: fetch every column and row of the competition's prediction list.
query3 = """
SELECT *
FROM `ironhacks-data.ironhacks_competition.prediction_list`
"""

In [None]:
# Submit the prediction-list query and materialise the result as a DataFrame.
query_job3 = bigquery_client.query(query3)
prediction_list = query_job3.to_dataframe()
# Preview the first rows as the cell output.
prediction_list.head()

In [None]:
# Join the claims records with the wage table on the shared `uu_id` key.
# The mean-imputed `wage_data2` is used on purpose: merging the raw `wage_data`
# would discard that imputation, and the blanket `fillna(0)` below would then
# record every missing wage as a wage of 0.
unemploy_wage_data = pd.merge(unemployment_data, wage_data2, on=['uu_id'], how='inner')
# Drop `timeperiod` and the right-hand (`_y`) duplicates of the columns that
# exist in both tables; the left-hand `_x` versions are kept.
unemploy_wage_data = unemploy_wage_data.drop(['timeperiod', 'countyfips_y', 'tract_y', 'tract_name_y'], axis=1)
# Any remaining missing values are replaced with 0.
unemploy_wage_data = unemploy_wage_data.fillna(0)
unemploy_wage_data.head()

In [None]:
# Summary statistics of the merged frame, shown as the cell output.
unemploy_wage_data.describe()

In [None]:
# Relational plot of total claims (y) against week number (x).
sns.relplot(data=unemploy_wage_data, x='week_number', y='total_claims')

In [None]:
# Distribution of total claims in 10 bins.
# `sns.distplot` is deprecated and scheduled for removal from seaborn;
# `histplot` with a KDE overlay on a density scale is its documented
# replacement and draws the same kind of figure.
sns.histplot(unemploy_wage_data.total_claims, bins=10, kde=True, stat='density')

In [None]:
# Annotated correlation heatmap of the numeric columns.
plt.figure(figsize=(16,14))
# Restrict to numeric columns before correlating: on pandas >= 2.0 `corr()`
# no longer drops non-numeric columns silently and raises on string columns.
cor = unemploy_wage_data.select_dtypes(include='number').corr()
# Diverging palette so positive and negative correlations get distinct colours.
cmap = sns.diverging_palette(210, 20, as_cmap=True)
sns.heatmap(cor, cmap=cmap, vmax=.99, vmin=-.99, annot=True)

In [None]:
# Feature matrix: week, geography codes, education / gender / race columns
# and the average wage. Column order is preserved exactly.
X = unemploy_wage_data.loc[:, [
    'week_number',
    'countyfips_x',
    'tract_x',
    'edu_8th_or_less',
    'edu_grades_9_11',
    'edu_hs_grad_equiv',
    'edu_post_hs',
    'edu_unknown',
    'gender_female',
    'gender_male',
    'gender_na',
    'race_amerindian',
    'race_asian',
    'race_black',
    'race_noanswer',
    'race_hawaiiannative',
    'race_other',
    'race_white',
    'average_wage',
]]
# Regression target.
y = unemploy_wage_data.loc[:, 'total_claims']

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [None]:
# Fit a linear regression on the training split only.
reg = LinearRegression()  
reg.fit(X_train, y_train)
# Print the estimator's repr.
print(reg)

In [None]:
# Report the fitted intercept, then one coefficient per feature column.
print(f'intercept: {reg.intercept_}')
coef = DataFrame(
    data=reg.coef_,
    index=X.columns,
    columns=['coefficients'],
)
print(coef)

In [None]:
# Predict on the held-out rows and put actual and predicted values side by
# side, keeping the index of `y_test`.
y_pred = reg.predict(X_test)
df = DataFrame({'Actual': y_test}).assign(Predicted=y_pred)
df

In [None]:
# Average of the predicted values on the held-out rows.
df['Predicted'].mean()

In [None]:
# Evaluate on the held-out split only.
# R^2 was previously scored on the full (X, y), which includes the rows the
# model was trained on and is not comparable with the test-set errors below.
# It is reported as a percentage, so the label now says so.
print('R squared (test set): {:.2f}%'.format(reg.score(X_test, y_test) * 100))
print('MAE:', metrics.mean_absolute_error(y_test, y_pred))
# Compute MSE once and reuse it for RMSE.
mse = metrics.mean_squared_error(y_test, y_pred)
print('MSE:', mse)
print('RMSE:', np.sqrt(mse))

In [None]:
#Make prediction
# Attach the week to predict to every historical row of the same `uu_id`.
# Only `uu_id` and `week_number` are taken from `prediction_list`, so this cell
# still works after a `total_claims` column has been added to that frame
# (otherwise the merge would suffix both `total_claims` columns and the drop
# below would fail on re-run).
prediction_data = pd.merge(
    unemploy_wage_data,
    prediction_list[['uu_id', 'week_number']],
    on=['uu_id'],
    how='inner',
)
# Remove the historical week (`_x`) and the target; `week_number_y` is the
# week coming from `prediction_list`.
prediction_data = prediction_data.drop(['week_number_x','total_claims'],axis=1)
# One row per `uu_id`, averaging its numeric columns. `numeric_only=True`
# skips string columns, which make `mean()` raise on pandas >= 2.0.
prediction_data = prediction_data.groupby(['uu_id']).mean(numeric_only=True)
prediction_data

In [None]:
# Refit on all labelled rows, then predict one value per `uu_id`.
final_reg = LinearRegression()
final_reg.fit(X, y)
# Rename `week_number_y` to `week_number` so the columns match the names the
# model was fitted with; newer scikit-learn rejects mismatched feature names.
final_X = prediction_data[['week_number_y', 'countyfips_x', 'tract_x', 'edu_8th_or_less', 'edu_grades_9_11',
                           'edu_hs_grad_equiv', 'edu_post_hs', 'edu_unknown', 'gender_female', 'gender_male',
                           'gender_na', 'race_amerindian', 'race_asian', 'race_black', 'race_noanswer',
                           'race_hawaiiannative', 'race_other', 'race_white', 'average_wage']]
final_X = final_X.rename(columns={'week_number_y': 'week_number'})
pred_claims = final_reg.predict(final_X)
# `prediction_data` is indexed by `uu_id` in group-sorted order, which need not
# match the row order of `prediction_list`. Align by `uu_id` instead of
# assigning positionally, so each row receives its own prediction.
pred_by_id = pd.Series(pred_claims, index=final_X.index)
aligned_claims = prediction_list['uu_id'].map(pred_by_id)
missing = aligned_claims.isna()
if missing.any():
    # A uu_id with no historical rows has no features and so no prediction.
    raise ValueError(
        f'{int(missing.sum())} uu_id value(s) in prediction_list have no prediction'
    )
# Truncate to whole claims, as before.
prediction_list['total_claims'] = aligned_claims.astype(int)
prediction_list

In [None]:
# Write the prediction list to the submission CSV, omitting the DataFrame index.
prediction_list.to_csv('submission_prediction_output.csv', index=False)