## TPC-DS BigQuery Import  
Import TPC-DS data from Google Cloud Storage (GCS) into a BigQuery dataset that is created in the steps below.

In [None]:
from datetime import datetime, timezone

import matplotlib.pyplot as plt
import pandas as pd

In [None]:
import config, schema, bq, load

In [None]:
# Let pandas render up to 1000 rows before truncating displayed frames.
pd.options.display.max_rows = 1000

## 00. Qualification 1GB

### 00.A Create Dataset for Upload

In [None]:
# BigQuery dataset used for the 1GB qualification run; the dataset
# creation, schema creation, upload and CSV-naming cells all read it.
dataset_name = "ds_1GB_qual"

In [None]:
# Create the upload target dataset through the project's `bq` helper,
# using the dataset name chosen above.
bq.create_dataset(dataset_name=dataset_name)

### 00.B Create BigQuery Schema to Upload To

In [None]:
# Base name (without the ".ddl" extension) of the schema file to apply;
# it is also embedded in the name of the upload-results CSV.
schema_name = "bq_ds_0"

In [None]:
# The schema file lives at "<schema dir><sep><schema name>.ddl"; hand it
# to the bq helper together with the name of the target dataset.
bq.create_schema(
    schema_file=f"{config.fp_schema}{config.sep}{schema_name}.ddl",
    dataset=dataset_name,
)

In [None]:
# Keep n at roughly 5: larger values run afoul of BigQuery's upload
# limits and fail with "403 - rateLimitExceeded", see
# https://cloud.google.com/bigquery/docs/error-messages
u = load.BQPooledUpload(
    dataset_name=dataset_name,
    test="ds",
    scale=1,
    n=5,
)


### 00.C Upload from GCS to BigQuery

In [None]:
# Setting this to True will print out status during parallel pipeline loading;
# it is left False here so the pipeline cell below runs without that output.
u.verbose = False

In [None]:
# Kick off the pooled upload pipeline. The return value is turned into a
# list in the next cell.
# NOTE(review): presumably a lazy iterable of per-table results - confirm
# against load.BQPooledUpload.pipeline.
results = u.pipeline()

In [None]:
# Materialize the pipeline results into a list; the bare name on the
# last line makes the notebook display it.
a = [*results]
a

In [None]:
# Stack the pandas objects collected in u.up_data into a single frame
# and order it by index.
dfx = pd.concat(u.up_data).sort_index()

In [None]:
# Overlay the index values of the queued frame (u.df) and of the upload
# results (dfx); each series is plotted against its positional order.
# The trailing semicolon suppresses the notebook's echo of the return value.
plot_series = (
    (u.df.index, '+', 'blue', "queued"),
    (dfx.index, 'x', 'green', "uploaded"),
)
for plot_index, plot_marker, plot_color, plot_label in plot_series:
    plt.plot(plot_index, plot_marker,
             markersize=10,
             markerfacecolor=None,
             markeredgecolor=plot_color,
             label=plot_label)
plt.legend();

In [None]:
# Preview the first rows of the combined, index-sorted upload data.
dfx.head()

In [None]:
# Build a timestamped path for saving the upload results:
#   <output dir><sep>bq_upload-<dataset>-<schema>-<UTC YYYYmmdd-HHMMSS>.csv
# datetime.utcnow() is deprecated since Python 3.12, so the UTC timestamp
# is taken from a timezone-aware now(); the formatted text is unchanged.
csv_fp = (config.fp_ds_output + config.sep +
          "bq_upload-" + dataset_name + "-" + schema_name
          + "-" + datetime.now(timezone.utc).strftime("%Y%m%d-%H%M%S") + ".csv")
# Echo the path so the notebook shows where the CSV will be written.
csv_fp

In [None]:
# Write the combined upload data to the timestamped CSV path built above.
dfx.to_csv(csv_fp)