In [None]:
# Optional dependency install. The %pip magic is used instead of "!pip" so the packages are
# installed into the environment of the running kernel; %%capture hides the installer output.
get_ipython().run_cell_magic('capture', '', "\n#- INSTALL ADDITIONAL LIBRARIES IF REQUIRED\n#------------------------------------------\n# This is normally not required. The hub environment comes preinstaled with \n# many packages that you can already use without setup. In case there is some\n# other library you would like to use that isn't on the list you run this command\n# once to install them.  If it is already installed this command has no effect.\n\n%pip install pandas\n%pip install db-dtypes\n")

- IMPORT THE LIBRARIES YOU WILL USE
------------------------------------------
You only need to import packages one time per notebook session. To keep your
notebook clean and organized you can handle all imports at the top of your file.
The following are included for example purposes; feel free to modify or delete 
anything in this section.

In [None]:
import csv
import pandas as pd
import numpy as np
from google.cloud import bigquery
from google.oauth2 import service_account
from google.cloud.bigquery import magics
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt
import datetime,itertools
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.arima.model import ARIMA
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.stattools import adfuller

- DEFINE YOUR CLASSES AND FUNCTIONS 
-----------------------------------
This is not required, but is helpful in keeping your notebook organized. 
You can use the following cell or several cells to define your functions
and classes to keep them separate from your analysis or results code.
In general it is useful to define your methods in a separate cell from where
they are run.

In [None]:
def example_function():
    """Print a fixed greeting; placeholder showing where helper functions go."""
    greeting = 'Hello World'
    print(greeting)

In [None]:
def print_column_info(df):
    """Print the column count, then one line per column: distinct-value count, name, dtype.

    The distinct-value count is the length of ``Series.unique()``. A blank line
    is printed after the last column.
    """
    column_names = df.columns
    print(f'No. of columns: {len(column_names)}')
    for name in column_names:
        series = df[name]
        distinct_values = series.unique()
        print(len(distinct_values), name, series.dtypes)
    print()

In [None]:
def print_na_info(df):
    """Print one line per column: number of missing values, column name, dtype."""
    for name in df.columns:
        series = df[name]
        missing_count = series.isnull().sum()
        print(missing_count, name, series.dtypes)

In [None]:
def get_datetime(week_no, year=2022):
    """Return the Monday of week ``week_no`` as a pandas Timestamp.

    Parameters
    ----------
    week_no : int or str
        Week number, parsed with the ``%W`` directive of ``strptime``.
    year : int, optional
        Calendar year the week number refers to. Defaults to 2022, the value
        that used to be hard-coded, so existing calls behave as before.

    Returns
    -------
    pandas.Timestamp
        The date parsed from ``"<year>-<week_no>-1"``.
    """
    # The trailing "-1" is consumed by %w and selects Monday as the weekday.
    date = datetime.datetime.strptime(f"{year}-{week_no}-1", "%Y-%W-%w")
    return pd.to_datetime(date,format="%Y-%m-%d")

In [None]:
def sarimax_gridsearch(ts, pdq, pdqs, maxiter=100, freq='D',disp=False):
    """Fit a SARIMAX model for every order combination and return the best five by BIC.

    Parameters
    ----------
    ts
        Data passed unchanged to ``SARIMAX`` as its first argument.
    pdq : iterable
        Candidate values for the ``order`` argument.
    pdqs : iterable
        Candidate values for the ``seasonal_order`` argument. It is copied into
        a list first, so a one-shot iterator is still paired with every entry
        of ``pdq``.
    maxiter : int, optional
        Forwarded to ``fit``.
    freq : str, optional
        Accepted for backward compatibility; it is not used.
    disp : bool, optional
        Forwarded to ``fit``. Previously this argument was ignored and
        ``disp=False`` was always used; the default keeps that behavior.

    Returns
    -------
    pandas.DataFrame
        Columns ``pdq``, ``pdqs`` and ``bic``, sorted by ascending ``bic`` and
        truncated to the first five rows.
    """
    # Materialise once: the inner loop re-reads the seasonal candidates for each order.
    seasonal_candidates = list(pdqs)

    records = []
    for order in pdq:
        for seasonal_order in seasonal_candidates:
            model = SARIMAX(ts,
                            order=order,
                            seasonal_order=seasonal_order,
                            enforce_stationarity=False,
                            enforce_invertibility=False)
            fitted = model.fit(maxiter=maxiter, disp=disp)
            records.append([order, seasonal_order, fitted.bic])

    results = pd.DataFrame(records, columns=['pdq', 'pdqs', 'bic'])
    return results.sort_values(by=['bic'], ascending=True)[0:5]

In [None]:
def check_stationary(data,Print=0):
    """Run ``adfuller`` on ``data`` (``autolag="BIC"``) and report a stationarity flag.

    Parameters
    ----------
    data
        Passed unchanged to ``adfuller``.
    Print : int, optional
        When equal to 1, a table of the test results is printed.

    Returns
    -------
    int
        1 when the p-value is below 0.05 and the test statistic is below the
        5% critical value, otherwise 0.
    """
    result = adfuller(data, autolag="BIC")
    statistic, p_value, lags_used, n_obs, critical = result[:5]

    metric_labels = [
        "Test Statistics",
        "p-value",
        "No. of lags used",
        "Number of observations used",
        "critical value (1%)",
        "critical value (5%)",
        "critical value (10%)",
    ]
    metric_values = [
        statistic,
        p_value,
        lags_used,
        n_obs,
        critical['1%'],
        critical['5%'],
        critical['10%'],
    ]
    summary = pd.DataFrame({"Values": metric_values, "Metric": metric_labels})

    threshold = critical['5%']
    if Print == 1:
        print(summary)

    is_stationary = p_value < 0.05 and statistic < threshold
    return 1 if is_stationary else 0

In [None]:
def diff_inv(series, last_observation):
    """Undo a first-order difference.

    A copy of ``series`` has ``last_observation`` added to its first element
    and is then cumulatively summed. The input series is left untouched.
    """
    restored = series.copy()
    restored.iat[0] += last_observation
    return restored.cumsum()

In [None]:
def loss(pred,actual):
    """Print MAE, MSE and a MAPE-based accuracy for rounded predictions.

    ``pred`` is rounded with ``np.round`` before comparison. Accuracy is
    ``100 - mean(100 * |actual - pred| / actual)``; the division by ``actual``
    is not guarded against zeros. Nothing is returned.
    """
    rounded = np.round(pred)
    abs_err = np.abs(actual - rounded)

    mae = np.mean(abs_err)
    print(f'Mean Absolute Error: {round(mae, 2)}')

    mse = np.mean(abs_err**2)
    print(f'Mean squared Error: {round(mse, 2)}')

    # Mean absolute percentage error, turned into an accuracy figure.
    pct_err = 100 * (abs_err/actual)
    accuracy = 100 - np.mean(pct_err)
    print(f'Accuracy: {round(accuracy, 2)}%.')

BigQuery

In [None]:
# Project that the queries in this notebook read from (see the `ironhacks-data.*` table paths).
BIGQUERY_PROJECT = 'ironhacks-data'
# Shared client used by every query below. No credentials are passed explicitly;
# presumably the hub environment supplies default credentials — TODO confirm.
bigquery_client = bigquery.Client(project=BIGQUERY_PROJECT)

In [None]:
# Leftover example query against the training dataset; it is commented out and never executed.
#query = """
#SELECT 
#week_number,
#cases 
#FROM `ironhacks-data.ironhacks_training.covid19_cases`
#Where week_number between 1 and 3
#order by week_number
#"""
# Print the id of every dataset the client can list and keep a handle on the competition one.
print("Datasets available:")
for dataset in list(bigquery_client.list_datasets()):
    print(dataset.dataset_id)
    if dataset.dataset_id == "ironhacks_competition":
        # NOTE(review): mydataset is only assigned when this id is found; otherwise it stays undefined.
        mydataset = dataset

In [None]:
# Print the id of every table in the ironhacks_competition dataset.
print("\nTables available:")
for table in bigquery_client.list_tables("ironhacks_competition"):
    print(table.table_id)

In [None]:
# Full-table read of the competition's prediction_list table.
query1 = """
SELECT *
FROM `ironhacks-data.ironhacks_competition.prediction_list`
"""
# Full-table read of the competition's unemployment_data table.
query2 = """
SELECT *
FROM `ironhacks-data.ironhacks_competition.unemployment_data`
"""

In [None]:
# Full-table read of the competition's wage_data table.
query3 = """
SELECT *
FROM `ironhacks-data.ironhacks_competition.wage_data`
"""

In [None]:
# Submit the three table queries through the shared client; results are fetched in a later step.
query_job1 = bigquery_client.query(query1)
query_job2 = bigquery_client.query(query2)
query_job3 = bigquery_client.query(query3)

In [None]:
# Turn each query job's result into a DataFrame, one per source table.
prediction_data = query_job1.to_dataframe()
unemployment_data = query_job2.to_dataframe()
wage_data = query_job3.to_dataframe()

In [None]:
# Write each downloaded table to a CSV whose file name carries the week tag, without the index column.
week = "week2"
prediction_data.to_csv(f"prediction_data_{week}.csv", index=False)
unemployment_data.to_csv(f"unemployment_data_{week}.csv", index=False)
wage_data.to_csv(f"wage_data_{week}.csv", index=False)

In [None]:
# Attach average_wage to the unemployment rows by uu_id, then drop exact duplicate rows.
merged_data = (
    unemployment_data
    .merge(wage_data[["uu_id", "average_wage"]], on="uu_id")
    .drop_duplicates()
)

In [None]:
# Read per-table metadata (creation and last-modified times, row count, size) from the
# dataset's __TABLES__ listing.
query4 = """
SELECT table_id,
DATE(TIMESTAMP_MILLIS(creation_time)) AS creation_date,
DATE(TIMESTAMP_MILLIS(last_modified_time)) AS last_modified_date,
row_count,
size_bytes,
CASE
    WHEN type = 1 THEN 'table'
    WHEN type = 2 THEN 'view'
    WHEN type = 3 THEN 'external'
    ELSE '?'
END AS type,
TIMESTAMP_MILLIS(creation_time) AS creation_time,
TIMESTAMP_MILLIS(last_modified_time) AS last_modified_time,
dataset_id,
project_id
FROM `ironhacks-data.ironhacks_competition.__TABLES__`"""
query_job4 = bigquery_client.query(query4)
timestamp_data = query_job4.to_dataframe()
# For each table: two blank lines, then its id, creation time and last-modified time on separate lines.
for cnt,row in timestamp_data.iterrows():
    print(
        "\n",
        row["table_id"],
        row["creation_time"],
        row["last_modified_time"],
        sep="\n",
    )