## 1. Data Engineering - Process CSV files into BQ Tables

### Create Spark session with BQ connector

Create a Spark session

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import FloatType, IntegerType, StructField, StructType

from pyspark.sql import SparkSession
# Build the session (getOrCreate reuses an existing one if present) and put the
# BigQuery connector jar on the classpath.
spark = (
    SparkSession.builder
    .appName('Spark - Data Eng Demo')
    .config('spark.jars', 'gs://spark-lib/bigquery/spark-bigquery-latest.jar')
    .getOrCreate()
)

Check the first 1000 bytes of a file on GCS

In [2]:
# -h prints a header line with the object name; -r 0-1000 limits the output to that byte range.
# NOTE(review): this previews fraud_data.csv, whereas the load cell reads
# transaction_data_train.csv — confirm which file is intended.
!gsutil cat -h -r 0-1000 gs://datalake-vol2-data/dataset/fraud_data.csv

==> gs://datalake-vol2-data/dataset/fraud_data.csv <==
step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0
1,PAYMENT,7817.71,C90045638,53860.0,46042.29,M573487274,0.0,0.0,0,0
1,PAYMENT,7107.77,C154988899,183195.0,176087.23,M408069119,0.0,0.0,0,0
1,PAYMENT,7861.64,C1912850431,176087.23,168225.59,M633326333,0.0,0.0,0,0
1,PAYMENT,4024.36,C1265012928,2671.0,0.0,M1176932104,0.0,0.0,0,0
1,DEBIT,5337.77,C712410124,41720.0,36382.23,C195600860,41898.0,40348.79,0,0
1,DEBIT,9644.94,C1900366749,4465.0,0.0,C997608398,10845.0,157982.12,0,0
1,PAYMENT,3099.97,C249177573,20771.0,17671.03,M2096539129,0.0

In [3]:
# GCS URI of the training CSV that is loaded into a Spark DataFrame further down.
path_to_train_csv = "gs://datalake-vol2-data/dataset/transaction_data_train.csv"

### Get Spark application ID 

This is useful for easily finding the application in the Spark History UI

In [4]:
# Read the ID of this Spark application from the session configuration.
spark.conf.get("spark.app.id")

'application_1610100240292_0036'

Load the CSV file into a Spark Dataframe

In [5]:
# Read the CSV with its first line as the header and let Spark infer the
# column types from the data.
df_transaction_data_from_csv = (
    spark.read
    .option("inferSchema", "true")
    .option("header", "true")
    .csv(path_to_train_csv)
)

In [6]:
# Inspect the column types that inferSchema derived from the CSV.
df_transaction_data_from_csv.printSchema()

root
 |-- step: integer (nullable = true)
 |-- type: string (nullable = true)
 |-- amount: double (nullable = true)
 |-- oldbalanceOrg: double (nullable = true)
 |-- newbalanceOrig: double (nullable = true)
 |-- oldbalanceDest: double (nullable = true)
 |-- newbalanceDest: double (nullable = true)
 |-- isFraud: integer (nullable = true)
 |-- transactionID: string (nullable = true)



### Create the BQ dataset and table

Create the BQ schema from the spark dataframe 
Reference for converting data types: https://github.com/GoogleCloudDataproc/spark-bigquery-connector#data-types

In [7]:
# spark to bq datatypes -> https://github.com/GoogleCloudDataproc/spark-bigquery-connector#data-types
# Translate each column's type individually. Substring replacement over the whole
# schema string would also rewrite column names containing "int" or "double" and
# would turn "bigint" into the invalid "bigint64".
_SPARK_TO_BQ_TYPE = {
    'tinyint': 'int64',
    'smallint': 'int64',
    'int': 'int64',
    'bigint': 'int64',
    'float': 'float64',
    'double': 'float64',
}
schema_inline = ','.join(
    '{}:{}'.format(
        field.name,
        # Types without an entry (e.g. string) keep their Spark name, as before.
        _SPARK_TO_BQ_TYPE.get(field.dataType.simpleString(), field.dataType.simpleString()),
    )
    for field in df_transaction_data_from_csv.schema.fields
)
schema_inline

'step:int64,type:string,amount:float64,oldbalanceOrg:float64,newbalanceOrig:float64,oldbalanceDest:float64,newbalanceDest:float64,isFraud:int64,transactionID:string'

In [8]:
# Preview the first 5 rows of the loaded CSV data.
df_transaction_data_from_csv.show(5)

+----+-----+-------+-------------+--------------+--------------+--------------+-------+--------------------+
|step| type| amount|oldbalanceOrg|newbalanceOrig|oldbalanceDest|newbalanceDest|isFraud|       transactionID|
+----+-----+-------+-------------+--------------+--------------+--------------+-------+--------------------+
| 192|DEBIT|2129.14|     254447.0|     252317.86|     441844.59|     443973.73|      0|97b164f8-7ff8-4a7...|
| 395|DEBIT|9910.44|      24040.0|      14129.56|    2368599.34|    2378509.78|      0|80828313-2a6c-4ea...|
| 589|DEBIT|2984.18|      12732.0|       9747.82|     906466.61|     909450.79|      0|831843e6-7611-4a4...|
| 266|DEBIT|1747.32|     113637.0|     111889.68|       71894.4|      73641.72|      0|84449213-8dbb-4b1...|
| 241|DEBIT|   64.6|       1458.0|        1393.4|    3810660.18|    3810724.78|      0|14162c62-0eaf-40c...|
+----+-----+-------+-------------+--------------+--------------+--------------+-------+--------------------+
only showing top 5 

Run transformations on the data

In [None]:
## Any transformations on your data can be done at this point

In [9]:
# create the name your BQ dataset
project_id = !gcloud config list --format 'value(core.project)' 2>/dev/null 
dataset_name = project_id[0] + '-raw'
dataset_name = dataset_name.replace('-', '_')
dataset_name

'datalake_vol2_test'

Create the BQ dataset by specifying the location

In [10]:
# Create the dataset in europe-west3; {dataset_name} is interpolated from the Python variable.
!bq --location=europe-west3 mk -d \
{dataset_name}

Dataset 'datalake-vol2:datalake_vol2_test' successfully created.


In [11]:
# create path to new table for creation
bq_table_name = 'transaction_data_train'
bq_table_path= dataset_name + '.' + bq_table_name
bq_table_path

'datalake_vol2_test.transaction_data_train'

In [12]:
# Create the table from the inline "name:type,..." schema string built earlier;
# {bq_table_path} and {schema_inline} are interpolated from the Python variables.
!bq mk --table \
{bq_table_path} \
{schema_inline}

Table 'datalake-vol2:datalake_vol2_test.transaction_data_train' successfully created.


#### Check that table was created

In [14]:
# Table reference in "<project>:<dataset>.<table>" form.
table = ':'.join([project_id[0], bq_table_path])
# Read the table through the BigQuery connector.
df_transaction_data_from_bq = (
    spark.read
    .format("bigquery")
    .option("table", table)
    .load()
)

In [15]:
# Schema as read back from BigQuery; the columns declared as int64 come back as long.
df_transaction_data_from_bq.printSchema()

root
 |-- step: long (nullable = true)
 |-- type: string (nullable = true)
 |-- amount: double (nullable = true)
 |-- oldbalanceOrg: double (nullable = true)
 |-- newbalanceOrig: double (nullable = true)
 |-- oldbalanceDest: double (nullable = true)
 |-- newbalanceDest: double (nullable = true)
 |-- isFraud: long (nullable = true)
 |-- transactionID: string (nullable = true)



In [16]:
# Nothing has been written to the table yet, so only the header is expected here.
df_transaction_data_from_bq.show()

+----+----+------+-------------+--------------+--------------+--------------+-------+-------------+
|step|type|amount|oldbalanceOrg|newbalanceOrig|oldbalanceDest|newbalanceDest|isFraud|transactionID|
+----+----+------+-------------+--------------+--------------+--------------+-------+-------------+
+----+----+------+-------------+--------------+--------------+--------------+-------+-------------+



Write spark dataframe to BQ table

In [17]:
# create temp GCS bucket for writing spark df to bq table
gcs_bucket = project_id[0] + '-data'
gcs_bucket

'datalake-vol2-data'

In [18]:
# Write the CSV-backed DataFrame to the BQ table in overwrite mode, using the
# GCS bucket as the connector's temporary bucket.
(
    df_transaction_data_from_csv.write
    .format("bigquery")
    .option("table", table)
    .option("temporaryGcsBucket", gcs_bucket)
    .mode('overwrite')
    .save()
)

Check if the BQ table is populated 

In [21]:
# Load the table again so the DataFrame reflects the rows just written.
df_transaction_data_from_bq = (
    spark.read
    .format("bigquery")
    .option("table", table)
    .load()
)

In [23]:
# Preview rows to confirm the table now holds data.
df_transaction_data_from_bq.show()

+----+--------+---------+-------------+--------------+--------------+--------------+-------+--------------------+
|step|    type|   amount|oldbalanceOrg|newbalanceOrig|oldbalanceDest|newbalanceDest|isFraud|       transactionID|
+----+--------+---------+-------------+--------------+--------------+--------------+-------+--------------------+
| 404|CASH_OUT| 61985.03|      21261.0|           0.0|     2066461.5|    2128446.53|      0|19b86479-2c41-4cc...|
| 404|CASH_OUT| 66737.46|      21416.0|           0.0|      13733.51|      80470.96|      0|9675a1ec-4ec2-494...|
| 404|CASH_OUT|175684.96|     289863.0|     114178.04|    6499819.75|    6675504.71|      0|a26359df-18a6-40a...|
| 404|CASH_OUT|203996.28|       1311.0|           0.0|     400765.13|      604761.4|      0|aa83bb49-23ab-41b...|
| 404|CASH_OUT|162690.16|     107933.0|           0.0|    1099970.68|    1262660.85|      0|c0ddc848-5126-4a4...|
| 404|CASH_OUT|   4560.2|     225004.0|      220443.8|    2133814.55|    2138374.74|    

### Compute statistics for columns in table

In [33]:
# Print summary statistics (count, mean, stddev, ...) for each column.
df_transaction_data_from_bq.describe().show()

+-------+------------------+--------+------------------+-----------------+-----------------+------------------+------------------+--------------------+--------------------+
|summary|              step|    type|            amount|    oldbalanceOrg|   newbalanceOrig|    oldbalanceDest|    newbalanceDest|             isFraud|       transactionID|
+-------+------------------+--------+------------------+-----------------+-----------------+------------------+------------------+--------------------+--------------------+
|  count|           5090692| 5090692|           5090692|          5090692|          5090692|           5090692|           5090692|             5090692|             5090692|
|   mean|243.43184659374404|    null|179890.20229514304|834053.2275695637|855323.9460816883|1101305.5336388673|1225649.3290917806|0.001279786716619273|                null|
| stddev|142.32545787222878|    null| 603890.0044859855|2888545.884476551|2924500.744522634|3410172.1426297785|  3684651.53400185| 0.03