In [None]:
# IronHacks research-tracking cell. Runs under %%capture (cell output is
# captured, not shown): restarts IPython logging to ipython_command_log.py and
# appends "session_id,command_id,UTC timestamp" to $HOME/ipython_session_log.csv.
get_ipython().run_cell_magic('capture', '', "%logstop\n%logstart -t -r -q ipython_command_log.py global\n\n#- IRONHACKS RESEARCH TRACKING CODE\n#----------------------------------\n\nimport os\nfrom datetime import datetime\nimport IPython.core.history as history\n\nha = history.HistoryAccessor()\nha_tail = ha.get_tail(1)\nha_cmd = next(ha_tail)\nsession_id = str(ha_cmd[0])\ncommand_id = str(ha_cmd[1])\ntimestamp = datetime.utcnow().isoformat()\nhistory_line = ','.join([session_id, command_id, timestamp]) + '\\n'\nlogfile = open(os.environ['HOME']+'/ipython_session_log.csv', 'a')\nlogfile.write(history_line)\nlogfile.close()\n")

- LIBRARIES
------------------------------------------
DATA PROCESSING
import pandas, google-cloud-bigquery, chardet, click, cytoolz, dask, decorator, pyjson5, jsonschema, tables

STATISTICAL MODELING
import numpy, scipy, statsmodels, statistics, scikit-learn, patsy, simpy

DATA VISUALIZATION
import plotly, bokeh, seaborn, matplotlib, vincent

In [None]:
import csv
import pandas as pd
import numpy as np

In [None]:
from google.cloud import bigquery
from google.oauth2 import service_account
from google.cloud.bigquery import magics

In [None]:
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LinearRegression
from sklearn import metrics
import seaborn as sns
import matplotlib.pyplot as plt 

- PROJECT INFORMATION
------------------------------------------
The IronHacks BigQuery project ID below is needed to create the query client.

In [None]:
# Project ID passed to the BigQuery client; the queries in this notebook read
# tables under `ironhacks-data.ironhacks_competition`.
BIGQUERY_PROJECT = 'ironhacks-data'
# Notebook-wide client object used by query_from_statement().
bigquery_client = bigquery.Client(project=BIGQUERY_PROJECT)

- DEFINE YOUR CLASSES AND FUNCTIONS 
----------------------------------

QUERY FUNCTION

In [None]:
def query_from_statement(query, client=None):
    """Run a SQL statement on BigQuery and return the result as a DataFrame.

    Parameters
    ----------
    query : str
        SQL text to execute.
    client : optional
        Any object exposing ``query(sql)`` whose return value provides
        ``to_dataframe()``. When omitted, the notebook-level
        ``bigquery_client`` is used, which keeps existing one-argument calls
        working unchanged.

    Returns
    -------
    The object produced by the query job's ``to_dataframe()`` call
    (a pandas DataFrame when a real BigQuery client is used).
    """
    if client is None:
        # Resolved at call time so the function can be defined before (or
        # re-run after) the client cell without capturing a stale object.
        client = bigquery_client
    return client.query(query).to_dataframe()

In [None]:
# All columns of the unemployment_data table for weeks 1 through 37
# (inclusive), ordered by week number.
u_claims_query = """
SELECT *
FROM `ironhacks-data.ironhacks_competition.unemployment_data`
Where week_number between 1 and 37
order by week_number
"""

In [None]:
# Execute the claims query and keep the result for training.
unemployment_claims_data = query_from_statement(u_claims_query)

In [None]:
# All columns and rows of the wage_data table (no filter).
wage_query = """
SELECT *
FROM `ironhacks-data.ironhacks_competition.wage_data`
"""

In [None]:
# Execute the wage query; this table is later joined to the claims on uu_id.
unemployment_wage_data = query_from_statement(wage_query)

In [None]:
# Count fully duplicated rows. Series.sum() is vectorised, whereas the builtin
# sum() walks the boolean Series one Python object at a time; int() keeps the
# result a plain Python integer as before.
duplicated_rows = int(unemployment_claims_data.duplicated().sum())
# The author recorded 3079 duplicated rows between weeks 1 and 37 when this was
# written; inspect `duplicated_rows` for the current figure.
unemployment_claims_data = unemployment_claims_data.drop_duplicates()

In [None]:
# Count fully duplicated rows in the wage table. Series.sum() is vectorised,
# whereas the builtin sum() iterates element by element; int() keeps the
# result a plain Python integer as before.
duplicated_rows = int(unemployment_wage_data.duplicated().sum())
# The author found no duplicates here when this was written, so nothing is dropped.

In [None]:
# Per-column count of missing values, first for the claims table and then for
# the wage table.
print(unemployment_claims_data.isna().sum())
print(unemployment_wage_data.isna().sum())

In [None]:
# Replace every missing value in both tables with 0 (applies to all columns).
unemployment_claims_data = unemployment_claims_data.fillna(0)
unemployment_wage_data = unemployment_wage_data.fillna(0)

In [None]:
# Attach the wage-table columns to each claims row by looking up uu_id.
# Column names present in both tables get an '_other' suffix on the wage side;
# those three repeated columns are discarded straight away.
data = (
    unemployment_claims_data
    .join(unemployment_wage_data.set_index('uu_id'), on='uu_id', rsuffix='_other')
    .drop(columns=['countyfips_other', 'tract_other', 'tract_name_other'])
)

In [None]:
# Remove tract_name and the three top_category_employer columns, then show the
# resulting shape and the last five rows.
data = data.drop(
    columns=[
        'tract_name',
        'top_category_employer1',
        'top_category_employer2',
        'top_category_employer3',
    ]
)
print(data.shape)
display(data.tail(5))

In [None]:
# Pairwise correlations between the numeric columns, shown as an annotated heatmap.
plt.figure(figsize=(8,6))
# `data` still contains uu_id (and timeperiod) at this point. Since pandas 2.0
# DataFrame.corr() no longer silently skips non-numeric columns and raises on
# them instead, so restrict to numeric columns explicitly. This gives the same
# matrix older pandas versions produced.
# NOTE(review): assumes uu_id is a string identifier — confirm.
cor = data.select_dtypes(include='number').corr().round(2)
sns.heatmap(cor, annot=True, cmap=plt.cm.Reds, annot_kws={"size": 6})
plt.show()

In [None]:
# uu_id and timeperiod are left out of the model's feature set.
data = data.drop(columns=['uu_id', 'timeperiod'])

In [None]:
# Regression target: total_claims copied into an array and reshaped to a single
# column, i.e. shape (n_rows, 1).
y = np.array(data['total_claims'].values).reshape(-1,1)

In [None]:
# Everything except the target becomes the feature matrix; the column order of
# `data` at this point defines the feature order of X.
data = data.drop(columns=['total_claims'])
X = data.values

In [None]:
# Hold out 20% of the rows for evaluation; random_state=0 fixes the shuffle so
# the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
# Print the shapes of the four resulting arrays.
print(f'Training Features Shape: {X_train.shape}')
print(f'Testing Features Shape: {X_test.shape}')
print(f'Training Labels Shape: {y_train.shape}')
print(f'Testing Labels Shape: {y_test.shape}')

In [None]:
# Fit a linear regression on the training split only; the intercept and
# coefficients are printed for inspection.
regressor = LinearRegression()  
regressor.fit(X_train, y_train)
print(regressor.intercept_)
print(regressor.coef_)

In [None]:
# Predictions for the held-out rows, used by the evaluation cells below.
y_pred = regressor.predict(X_test)

In [None]:
# Actual (x) against predicted (y) claims on the held-out split. The axes are
# labelled so the figure can be read without looking at the code.
plt.scatter(y_test, y_pred,  color='gray')
plt.xlabel('Actual total_claims')
plt.ylabel('Predicted total_claims')
plt.show()

In [None]:
# Mean absolute percentage error on the held-out split, shown as the cell's output.
metrics.mean_absolute_percentage_error(y_test, y_pred)

In [None]:
# All columns and rows of the prediction_list table (no filter).
prediction_query = """
SELECT *
FROM `ironhacks-data.ironhacks_competition.prediction_list`
"""

In [None]:
# Download the prediction list, then show its shape and first five rows.
unemployment_prediction_data = query_from_statement(prediction_query)
print(unemployment_prediction_data.shape)
display(unemployment_prediction_data.head(n=5))

In [None]:
# All columns of unemployment_data for weeks 1 through 37, ordered by week
# number. The statement text is the same as u_claims_query.
unemployclaims_supplemental_query = """
SELECT *
FROM `ironhacks-data.ironhacks_competition.unemployment_data`
Where week_number between 1 and 37
order by week_number
"""

In [None]:
# Fresh, unprocessed copy of the claims rows, used to build the features for
# the prediction list.
unemployclaims_supplemental_data = query_from_statement(unemployclaims_supplemental_query)

In [None]:
# Reduce the frame to one row per uu_id: the row from its most recent week.
# keep='last' only selects the latest week if rows are in week_number order, so
# sort explicitly instead of relying on the query's ORDER BY surviving the
# download. mergesort is stable, so rows within the same week keep their
# incoming order and already-sorted input gives the same result as before.
unemployclaims_supplemental_data = (
    unemployclaims_supplemental_data
    .sort_values('week_number', kind='mergesort')
    .drop_duplicates(subset=['uu_id'], keep='last')
)

In [None]:
# Repeat the training-frame preparation on the one-row-per-uu_id frame:
# join the wage columns on uu_id, then drop every column the model does not use.
# uu_id itself is kept because it is the key for the later prediction-list join.
unemployclaims_supplemental_data = (
    unemployclaims_supplemental_data
    .join(unemployment_wage_data.set_index('uu_id'), on='uu_id', rsuffix='_other')
    .drop(
        columns=[
            # columns repeated in both tables (wage-side copies)
            'countyfips_other', 'tract_other', 'tract_name_other',
            # columns also removed from the training frame
            'tract_name', 'top_category_employer1', 'top_category_employer2',
            'top_category_employer3',
            'timeperiod',
            # the target is what gets predicted, so it cannot be a feature
            'total_claims',
        ]
    )
)

In [None]:
# Replace every remaining missing value with 0, including any introduced by the wage join.
unemployclaims_supplemental_data = unemployclaims_supplemental_data.fillna(0)

In [None]:
# Check the prepared supplemental frame: its shape and first five rows.
print(unemployclaims_supplemental_data.shape)
display(unemployclaims_supplemental_data.head(n=5))

In [None]:
# Give every row of the prediction list the per-uu_id features. Columns present
# on both sides get '_other' on the supplemental side, so the prediction list's
# own week_number is kept and the supplemental one is dropped along with the
# uu_id key.
final_prediction_data = (
    unemployment_prediction_data
    .join(unemployclaims_supplemental_data.set_index('uu_id'), on='uu_id', rsuffix='_other')
    .drop(columns=['uu_id', 'week_number_other'])
)
print(final_prediction_data.shape)

In [None]:
# Refit a linear regression on all rows (X, y), not just the training split,
# for the final forecast; the intercept and coefficients are printed for inspection.
future_regressor = LinearRegression()  
future_regressor.fit(X, y)
print(future_regressor.intercept_)
print(future_regressor.coef_)

In [None]:
# Feature matrix for the rows to forecast.
# NOTE(review): the column order here must match the column order of X used in fit — confirm.
future = final_prediction_data.values
future_weeks_pred = future_regressor.predict(future)
print(future_weeks_pred.shape)

In [None]:
# Store the forecasts as whole, non-negative claim counts.
# - np.rint rounds to the nearest integer; astype(int) on its own truncates
#   toward zero (e.g. 12.9 -> 12).
# - clip(min=0) floors the result at zero, since a linear model can produce
#   negative values and a claim count cannot be negative.
# - ravel() flattens the array: the model was fit on a column-vector target, so
#   predictions are expected as shape (n, 1).
unemployment_prediction_data['total_claims'] = (
    np.rint(future_weeks_pred).clip(min=0).astype(int).ravel()
)
display(unemployment_prediction_data)

In [None]:
# Write the prediction list, including the new total_claims column, to a CSV
# file in the working directory; index=False omits the DataFrame index column.
unemployment_prediction_data.to_csv("submission_prediction_output.csv",index=False)