In [1]:
# Bootstrap order matters in this cell: SPARK_HOME is set first, then
# findspark.init() runs, and pyspark is only imported in a later cell.
import os

# Hard-coded Spark installation directory; overwrites any SPARK_HOME already
# present in the kernel's environment.
os.environ['SPARK_HOME'] = '/opt/spark'

import findspark

# NOTE(review): findspark is expected to read SPARK_HOME and make pyspark
# importable — confirm against the findspark documentation.
findspark.init()

In [2]:
import pyspark

In [3]:
from google.cloud import bigquery
from google.oauth2.service_account import Credentials

In [4]:
# Build service-account credentials from the key file on disk, requesting the
# cloud-platform OAuth scope.
cred = Credentials.from_service_account_file(
    filename="/etc/momo-compute.json",
    scopes=["https://www.googleapis.com/auth/cloud-platform"],
)

In [5]:
# BigQuery client authenticated with the service-account credentials and
# bound to the project those credentials carry.
client = bigquery.Client(project=cred.project_id, credentials=cred)

In [6]:
# Print the id of every dataset the client can list, one per line.
for dataset in client.list_datasets():
    print(dataset.dataset_id)

momodataset


In [7]:
# Collect the id of every table listed under the 'momodataset' dataset.
table_ids = list(
    entry.table_id
    for entry in client.list_tables('momodataset')
)

In [8]:
# Preview the first ten table ids rather than displaying the whole list.
table_ids[:10]

['CUSTOMER_LOG_NAMED_1m',
 'CUSTOMER_LOG_NAMED_1w',
 'CUSTOMER_LOG_NAMED_1y',
 'CUSTOMER_LOG_NAMED_3m',
 'CUSTOMER_LOG_NAMED_6m',
 'CUSTOMER_LOG_PROCESSED_1d',
 'CUSTOMER_LOG_PROCESSED_1m',
 'CUSTOMER_LOG_PROCESSED_1w',
 'CUSTOMER_LOG_PROCESSED_1y',
 'CUSTOMER_LOG_PROCESSED_3m']

In [9]:
# Preview the last ten table ids.
table_ids[-10:]

['substitute_same_MGROUP',
 'substitute_same_SGROUP',
 'warehouse_future_purchase_spu_sku',
 'warehouse_in_stock_group',
 'warehouse_in_stock_group_pivot',
 'warehouse_in_stock_spu_sku',
 'warehouse_sold_group',
 'warehouse_sold_group_pivot',
 'warehouse_sold_spu_sku',
 'warehouse_tbalju_cleaned']

In [10]:
# Start (or reuse) a Spark session named 'spark-exercise-bq' with master
# 'local[*]' and the driver memory set to '8g'.
spark = (
    pyspark.sql.SparkSession.builder
    .master('local[*]')
    .appName('spark-exercise-bq')
    .config('spark.driver.memory', '8g')
    .getOrCreate()
)
# Bare expression so the session is rendered as the cell output.
spark

In [11]:
import random

In [12]:
# Seed Python's global RNG so the random table pick that follows is
# repeatable when the notebook is run top to bottom.
random.seed(3333)

In [13]:
# Pick one table at random to explore. random.choice() only ever returns an
# existing element, whereas random.randint(0, len(table_ids)) includes its
# upper bound and could yield an index one past the end of the list
# (IndexError). The pick is repeatable because the global RNG is seeded
# beforehand.
table_name = random.choice(table_ids)
table_name

'ECOWNER_TF_EC_PROMO_M'

In [14]:
# Show the project id carried by the service-account credentials.
cred.project_id

'momo-logistics-pro'

In [15]:
# Load the randomly chosen table into a Spark DataFrame through the
# 'bigquery' data source, passing all reader settings in a single call.
sdf = (
    spark.read
    .format('bigquery')
    .options(
        credentialsFile="/etc/momo-compute.json",
        parentProject=cred.project_id,
        dataset="momodataset",
        table=table_name,
        viewsEnabled="true",
    )
    .load()
)

In [16]:
# Print the column names, types and nullability of the loaded DataFrame.
sdf.printSchema()

root
 |-- optype: string (nullable = true)
 |-- position: string (nullable = true)
 |-- PROMO_NO: string (nullable = true)
 |-- CHKOUT_TYPE: string (nullable = true)
 |-- DELIV_TYPE: string (nullable = true)
 |-- PROMO_START_DATE: timestamp (nullable = true)
 |-- PROMO_END_DATE: timestamp (nullable = true)
 |-- PROMO_NAME: string (nullable = true)
 |-- PROMO_DESCRIPT: string (nullable = true)
 |-- PROMO_URL: string (nullable = true)
 |-- USE_YN: string (nullable = true)
 |-- INSERT_ID: string (nullable = true)
 |-- INSERT_DATE: timestamp (nullable = true)
 |-- MODIFY_ID: string (nullable = true)
 |-- MODIFY_DATE: timestamp (nullable = true)
 |-- PROMO_TYPES: string (nullable = true)
 |-- PROMO_NOTE: string (nullable = true)
 |-- THRESHOLD_TYPE: string (nullable = true)
 |-- ORDER_PROMO: string (nullable = true)
 |-- GOODS_IMPORT_TYPE: string (nullable = true)
 |-- CART_NAME: string (nullable = true)
 |-- INTERNET_YN: string (nullable = true)
 |-- MOBILE_YN: string (nullable = true)
 |-