# Spark SQL with PySpark

Answer the following questions

In [12]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField
from pyspark.sql.types import DoubleType, IntegerType, StringType, TimestampType, DateType

In [2]:
# Build (or reuse) the notebook's SparkSession, configured with the GCS
# connector jar so that gs:// paths can be read.
spark = (
    SparkSession.builder
    .appName("GCS_DataLake")
    .master("local")
    # NOTE(review): "< json_path >" looks like a placeholder for the
    # service-account key file path; replace it before running.
    .config("google.cloud.auth.service.account.json.keyfile", "< json_path >")
    .config("spark.executor.memory", "3g")
    .config("spark.jars", "/usr/local/spark/jars/gcs-connector-hadoop2-latest.jar")
    .getOrCreate()
)

In [10]:
# Wildcard gs:// path matching the GBPUSD_*.csv files that are loaded below.
gcs_path = "gs://solutions-public-assets/time-series-master/GBPUSD_*.csv"

In [13]:
# Explicit column layout for the CSV files: they are read with header=False,
# so column names and types are supplied here instead of being inferred.
schema = (
    StructType()
    .add("venue", StringType())
    .add("currencies", StringType())
    .add("time_stamp", TimestampType())
    .add("bid", DoubleType())
    .add("ask", DoubleType())
)

# Load every file matched by gcs_path into one DataFrame and preview it.
csv_reader = spark.read.schema(schema).option("header", False)
df = csv_reader.csv(gcs_path)
df.show()

+-----+----------+--------------------+------+------+
|venue|currencies|          time_stamp|   bid|   ask|
+-----+----------+--------------------+------+------+
|  XYZ|   GBP/USD| 2014-01-01 00:00:00|1.4995|1.5005|
|  XYZ|   GBP/USD|2014-01-01 00:00:...|1.4988|1.4998|
|  XYZ|   GBP/USD|2014-01-01 00:00:...|1.4979|1.4989|
|  XYZ|   GBP/USD|2014-01-01 00:00:...|1.4993|1.5003|
|  XYZ|   GBP/USD|2014-01-01 00:00:...|1.4989|1.4999|
|  XYZ|   GBP/USD|2014-01-01 00:00:...|1.4998|1.5008|
|  XYZ|   GBP/USD|2014-01-01 00:00:...|1.5001|1.5011|
|  XYZ|   GBP/USD|2014-01-01 00:00:...|1.4991|1.5001|
|  XYZ|   GBP/USD|2014-01-01 00:00:...|1.4978|1.4988|
|  XYZ|   GBP/USD|2014-01-01 00:00:...|1.4974|1.4984|
|  XYZ|   GBP/USD|2014-01-01 00:00:...|1.4987|1.4997|
|  XYZ|   GBP/USD|2014-01-01 00:00:...|1.4979|1.4989|
|  XYZ|   GBP/USD|2014-01-01 00:00:...|1.4979|1.4989|
|  XYZ|   GBP/USD|2014-01-01 00:01:...|1.4991|1.5001|
|  XYZ|   GBP/USD|2014-01-01 00:01:...|1.4997|1.5007|
|  XYZ|   GBP/USD|2014-01-01