In [None]:
# Session-tracking cell, run under %%capture so it produces no visible output.
# It restarts IPython command logging to ipython_command_log.py and appends one
# "session_id,command_id,UTC timestamp" line for the most recent history entry
# to $HOME/ipython_session_log.csv.
get_ipython().run_cell_magic('capture', '', "%logstop\n%logstart -t -r -q ipython_command_log.py global\nimport os\nfrom datetime import datetime\nimport IPython.core.history as history\n\nha = history.HistoryAccessor()\nha_tail = ha.get_tail(1)\nha_cmd = next(ha_tail)\nsession_id = str(ha_cmd[0])\ncommand_id = str(ha_cmd[1])\ntimestamp = datetime.utcnow().isoformat()\nhistory_line = ','.join([session_id, command_id, timestamp]) + '\\n'\nlogfile = open(os.environ['HOME']+'/ipython_session_log.csv', 'a')\nlogfile.write(history_line)\nlogfile.close()\n")

In [None]:
# Install the BigQuery client and its pandas extra (output hidden by %%capture).
# %pip installs into the environment of the running kernel, whereas !pip runs
# whichever `pip` executable is first on PATH and may target another interpreter.
get_ipython().run_cell_magic('capture', '', '%pip install google-cloud-bigquery\n%pip install google-cloud-bigquery[pandas]\n')

In [None]:
#- IMPORT THE LIBRARIES YOU WILL USE
#------------------------------------------
# You only need to import packages one time per notebook session. To keep your
# notebook clean and organized you can handle all imports at the top of your file.
# The following are included for example purposes; feel free to modify or delete
# anything in this section.
import csv
import pandas as pd
import numpy as np
from google.cloud import bigquery
from google.oauth2 import service_account
from google.cloud.bigquery import magics
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LinearRegression
from sklearn import metrics
import matplotlib.pyplot as plt  
import statsmodels.api as sm
import itertools
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [None]:
# BigQuery project the client is bound to; the queries in the cells below
# read tables from this same project.
BIGQUERY_PROJECT = 'ironhacks-data'
# Shared client object used by every query cell in this notebook.
bigquery_client = bigquery.Client(project=BIGQUERY_PROJECT)

In [None]:
#query 1: overview of every column of the unemployment_data table,
# ordered by ascending week number
query = """
SELECT *
FROM `ironhacks-data.ironhacks_competition.unemployment_data`
ORDER BY week_number ASC;
"""
# Submit the query, then pull the full result into a pandas DataFrame.
query_job = bigquery_client.query(query)
overview = query_job.to_dataframe()
# Preview the first rows.
overview.head()

In [None]:
#query 2: only the columns used for modelling (uu_id, week_number, total_claims)
# from the unemployment table, ordered by ascending week number
query = """
SELECT uu_id, week_number, total_claims
FROM `ironhacks-data.ironhacks_competition.unemployment_data` 
ORDER BY week_number ASC
"""
# Submit the query, then pull the result into a pandas DataFrame.
query_jobb = bigquery_client.query(query)
employ = query_jobb.to_dataframe()
# Preview the first rows as plain text.
print(employ.head())

In [None]:
#query 3: overview of prediction list (week 41)
# Every column of the prediction_list table, i.e. the rows a prediction is
# requested for.
query = """
SELECT *
FROM `ironhacks-data.ironhacks_competition.prediction_list`
"""
query_job = bigquery_client.query(query)
predn = query_job.to_dataframe()
# A bare `predn.head()` in the middle of a cell is never displayed, so only the
# explicit print is kept.
print(predn.head())

In [None]:
# Scatter of total claims against week number (blue square markers).
# The Series are passed directly: wrapping each one in a list produces a
# (1, N) input, which matplotlib interprets as N separate single-point lines.
fig, ax = plt.subplots()
ax.plot(employ.week_number, employ.total_claims, 'bs')
ax.set_title('Distribution of claims through week')
ax.set_xlabel('Weeks')
ax.set_ylabel('Total claims')
plt.show()

In [None]:
# Build the target vector and the feature matrix from the SAME frame.
# `overview` and `employ` come from two separate queries that are ordered only
# by week_number, so the order of rows within a week is not guaranteed to match
# between them; taking both from `employ` keeps labels and features row-aligned.
labels = np.array(employ['total_claims'])
# Drop the identifier and the target itself: leaving total_claims among the
# features would hand the model the answer it is supposed to predict.
features = employ.drop(columns=['uu_id', 'total_claims'])
# Column names are captured before the frame is converted to a plain array.
feature_list = list(features.columns)
features = np.array(features)

In [None]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# split repeatable across runs.
x_train, x_test, y_train, y_test = train_test_split(
    features,
    labels,
    random_state=42,
    test_size=0.20,
)

In [None]:
# Report the shape of each piece of the split, one per line.
print(
    f'Training Features Shape: {x_train.shape}',
    f'Testing Features Shape: {x_test.shape}',
    f'Training Labels Shape: {y_train.shape}',
    f'Testing Labels Shape: {y_test.shape}',
    sep='\n',
)

In [None]:
# Fit a 1000-tree random forest regressor on the training split
# (seeded so repeated runs build the same forest).
from sklearn.ensemble import RandomForestRegressor
x, y = x_train, y_train
regressor = RandomForestRegressor(
    random_state=42,
    n_estimators=1000,
)
regressor.fit(x, y)

In [None]:
#visualizing one decision tree from the regressor
# Only the first fitted estimator (estimators_[0]) is drawn, not the whole forest.
from sklearn import tree
tree.plot_tree(regressor.estimators_[0])

In [None]:
# Stand-alone demo: fit a shallow (max_depth=2) forest on synthetic regression
# data with 4 features, 2 of them informative.
# Note that this rebinds the names x and y.
from sklearn.datasets import make_regression
x, y = make_regression(
    shuffle=False,
    random_state=0,
    n_informative=2,
    n_features=4,
)
regr = RandomForestRegressor(random_state=0, max_depth=2)
regr.fit(x, y)

In [None]:
# Predict for a single all-zero, 4-feature sample with the demo forest `regr`,
# then draw the first tree of that forest.
print(regr.predict([[0, 0, 0, 0]]))
tree.plot_tree(regr.estimators_[0])

In [None]:
# Predict claim counts for the held-out split. x_test is the test-feature array
# produced by train_test_split; no variable named test_features is defined in
# this notebook.
# Round to the nearest whole claim BEFORE casting: astype(int) on its own
# truncates toward zero, after which rounding has nothing left to do.
predictions = np.round(regressor.predict(x_test)).astype(int)
print(predictions)

In [None]:
# Predict claim counts for the held-out split.
# Round to the nearest whole claim BEFORE casting: astype(int) on its own
# truncates toward zero, after which rounding has nothing left to do.
predictions = np.round(regressor.predict(x_test)).astype(int)
print(predictions)

In [None]:
# Absolute error of every held-out prediction, in claims.
errors = np.abs(y_test - predictions)
print(f'List of Errors: {errors}')
# The mean absolute error is expressed in the target's own unit (claims);
# scaling it by a constant does not turn it into a percentage.
mae = np.mean(errors)
print(f'Mean Absolute Error: {mae:.4f} claims')
# A relative figure needs a reference value: use the mean of the actual claims,
# skipping the ratio when that mean is zero.
mean_actual = np.mean(y_test)
if mean_actual:
    print(f'MAE relative to mean actual claims: {mae / mean_actual * 100:.2f}%')

As the mean absolute error is less than 10%, we can take the predictions as the final outcome.

In [None]:
# Attach a total_claims column to the prediction list and move it to the
# middle position (column order 0, 2, 1).
# NOTE(review): `predictions` holds the model output for the held-out test
# split, and join() matches purely on row position (row i of predn receives
# prediction i). Confirm that this pairing is what the submission should contain.
df = pd.DataFrame(predictions, columns=['total_claims'])
week41 = predn.join(df).iloc[:, [0, 2, 1]]
print(week41)
# Total only the values that actually ended up in the table: join() keeps
# predn's rows, so predictions beyond its length are not part of week41.
total_week41 = float(week41['total_claims'].sum())
print(f'Total predicting number of unemployment claims of week 41: {total_week41:.0f}')

In [None]:
# Write the week41 table to submission_prediction_output.csv without the index column.
# NOTE(review): to_csv presumably returns None when it is given a file path, so
# csv_data would not hold the CSV text — confirm before relying on it.
csv_data = week41.to_csv("submission_prediction_output.csv",index=False)

In [None]:
# IronHacks research tracking cell, run under %%capture so it produces no
# visible output. It restarts IPython command logging to ipython_command_log.py
# and appends one "session_id,command_id,UTC timestamp" line for the most recent
# history entry to $HOME/ipython_session_log.csv.
get_ipython().run_cell_magic('capture', '', "%logstop\n%logstart -t -r -q ipython_command_log.py global\n\n#- IRONHACKS RESEARCH TRACKING CODE\n#----------------------------------\n# The following code is used to help our research team understand how you \n# our notebook environment. We do not collect any personal information with\n# the following code, it is used to measure when and how often you work on\n# your submission files.\n\nimport os\nfrom datetime import datetime\nimport IPython.core.history as history\n\nha = history.HistoryAccessor()\nha_tail = ha.get_tail(1)\nha_cmd = next(ha_tail)\nsession_id = str(ha_cmd[0])\ncommand_id = str(ha_cmd[1])\ntimestamp = datetime.utcnow().isoformat()\nhistory_line = ','.join([session_id, command_id, timestamp]) + '\\n'\nlogfile = open(os.environ['HOME']+'/ipython_session_log.csv', 'a')\nlogfile.write(history_line)\nlogfile.close()\n")