In [2]:
# findspark.init() makes the local Spark installation's PySpark importable,
# so it is called here before any `pyspark` import in the notebook.
import findspark
findspark.init()

In [3]:
import pandas as pd

# Never truncate cell contents when a pandas frame is rendered.
pd.options.display.max_colwidth = None

In [4]:
import os

# Extra spark-submit arguments: ship the Hive HCatalog core jar with the
# PySpark shell. Set before the SparkSession is created further down.
os.environ["PYSPARK_SUBMIT_ARGS"] = " ".join(
    (
        "--jars",
        "/opt/hive3/lib/hive-hcatalog-core-3.1.2.jar",
        "pyspark-shell",
    )
)

In [5]:
from pyspark.sql import SparkSession

# Build (or reuse) the Hive-enabled session used by every cell below.
# NOTE(review): "spark.sql.datalake.dir" does not look like a built-in Spark
# setting — confirm whether "spark.sql.warehouse.dir" was meant or whether this
# is a custom key that other code reads back.
spark = (
    SparkSession.builder
    .appName("Stocks RAW to STD - DataFrames")
    .config("spark.sql.datalake.dir", "hdfs://localhost:9000/datalake")
    .config("spark.sql.legacy.timeParserPolicy", "LEGACY")
    .config("spark.sql.sources.partitionOverwriteMode", "dynamic")
    .enableHiveSupport()
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [6]:
# Read the airline stocks CSV from the raw zone: first row is the header and
# Spark infers the column types.
airline_stocks_raw = spark.read.options(
    header="true",
    inferSchema="true",
    recursiveFileLookup="true",
).csv("hdfs://localhost:9000/datalake/raw/stocks/AIRLINE_STOCKS.csv")

# Preview the first five rows as a pandas frame.
airline_stocks_raw.limit(5).toPandas()

                                                                                

Unnamed: 0,Datetime,EADSY_Close,BA_Close
0,2022-02-28 09:30:00-05:00,31.26,199.529999
1,2022-02-28 09:31:00-05:00,31.209999,198.389999
2,2022-02-28 09:32:00-05:00,31.1507,197.560196
3,2022-02-28 09:33:00-05:00,31.139999,197.699997
4,2022-02-28 09:34:00-05:00,,197.639999


In [7]:
# Persist the airline frame to the standardized zone as Parquet.
# "overwrite" (instead of "append") keeps this cell idempotent: the complete
# raw CSV is reloaded on every run, so appending would store every row again
# each time the cell is re-executed. Switch back to "append" only if each run
# is guaranteed to carry rows that are not yet in the target directory.
(
    airline_stocks_raw
    .coalesce(1)  # one partition, so the data lands in a single part file
    .write
    .mode("overwrite")
    .parquet("hdfs://localhost:9000/datalake/std/stocks/airline")
)

                                                                                

In [9]:
# Read the automotive stocks CSV from the raw zone: first row is the header
# and Spark infers the column types.
automotive_stocks_raw = spark.read.options(
    header="true",
    inferSchema="true",
    recursiveFileLookup="true",
).csv("hdfs://localhost:9000/datalake/raw/stocks/AUTOMOTIVE_STOCKS.csv")

# Preview the first five rows as a pandas frame.
automotive_stocks_raw.limit(5).toPandas()

Unnamed: 0,Datetime,TSLA_Close,GM_Close,F_Close
0,2022-02-28 09:30:00-05:00,822.26001,46.27,17.370001
1,2022-02-28 09:31:00-05:00,818.580017,46.145,17.43
2,2022-02-28 09:32:00-05:00,823.400024,46.080002,17.4699
3,2022-02-28 09:33:00-05:00,826.72998,46.18,17.4599
4,2022-02-28 09:34:00-05:00,829.280029,46.259998,17.469999


In [10]:
# Persist the automotive frame to the standardized zone as Parquet.
# "overwrite" (instead of "append") keeps this cell idempotent: the complete
# raw CSV is reloaded on every run, so appending would store every row again
# each time the cell is re-executed. Switch back to "append" only if each run
# is guaranteed to carry rows that are not yet in the target directory.
(
    automotive_stocks_raw
    .coalesce(1)  # one partition, so the data lands in a single part file
    .write
    .mode("overwrite")
    .parquet("hdfs://localhost:9000/datalake/std/stocks/automotive")
)

In [11]:
# Read the energy stocks CSV from the raw zone: first row is the header and
# Spark infers the column types.
energy_stocks_raw = spark.read.options(
    header="true",
    inferSchema="true",
    recursiveFileLookup="true",
).csv("hdfs://localhost:9000/datalake/raw/stocks/ENERGY_STOCKS.csv")

# Preview the first five rows as a pandas frame.
energy_stocks_raw.limit(5).toPandas()

Unnamed: 0,Datetime,XOM_Close,CVX_Close,SHELL_Close
0,2022-02-28 09:30:00-05:00,76.659599,139.550003,52.34
1,2022-02-28 09:31:00-05:00,76.610001,139.714996,52.380001
2,2022-02-28 09:32:00-05:00,76.559998,139.449997,52.289799
3,2022-02-28 09:33:00-05:00,76.6045,139.589996,52.23
4,2022-02-28 09:34:00-05:00,76.389999,139.2099,52.169998


In [12]:
# Persist the energy frame to the standardized zone as Parquet.
# "overwrite" (instead of "append") keeps this cell idempotent: the complete
# raw CSV is reloaded on every run, so appending would store every row again
# each time the cell is re-executed. Switch back to "append" only if each run
# is guaranteed to carry rows that are not yet in the target directory.
(
    energy_stocks_raw
    .coalesce(1)  # one partition, so the data lands in a single part file
    .write
    .mode("overwrite")
    .parquet("hdfs://localhost:9000/datalake/std/stocks/energy")
)

In [14]:
# Read the entertainment stocks CSV from the raw zone: first row is the header
# and Spark infers the column types.
entertainment_stocks_raw = spark.read.options(
    header="true",
    inferSchema="true",
    recursiveFileLookup="true",
).csv("hdfs://localhost:9000/datalake/raw/stocks/ENTERTAINMENT_STOCKS.csv")

# Preview the first five rows as a pandas frame.
entertainment_stocks_raw.limit(5).toPandas()

Unnamed: 0,Datetime,NFLX_Close,DIS_Close,EA_Close
0,2022-02-28 09:30:00-05:00,391.589905,148.429993,128.279999
1,2022-02-28 09:31:00-05:00,391.600006,148.210007,128.449997
2,2022-02-28 09:32:00-05:00,391.019989,148.419998,128.690002
3,2022-02-28 09:33:00-05:00,390.0,148.490005,129.220001
4,2022-02-28 09:34:00-05:00,389.975006,148.479996,129.249893


In [15]:
# Persist the entertainment frame to the standardized zone as Parquet.
# "overwrite" (instead of "append") keeps this cell idempotent: the complete
# raw CSV is reloaded on every run, so appending would store every row again
# each time the cell is re-executed. Switch back to "append" only if each run
# is guaranteed to carry rows that are not yet in the target directory.
(
    entertainment_stocks_raw
    .coalesce(1)  # one partition, so the data lands in a single part file
    .write
    .mode("overwrite")
    .parquet("hdfs://localhost:9000/datalake/std/stocks/entertainment")
)

                                                                                

In [16]:
# Read the finance stocks CSV from the raw zone: first row is the header and
# Spark infers the column types.
finance_stocks_raw = spark.read.options(
    header="true",
    inferSchema="true",
    recursiveFileLookup="true",
).csv("hdfs://localhost:9000/datalake/raw/stocks/FINANCE_STOCKS.csv")

# Preview the first five rows as a pandas frame.
finance_stocks_raw.limit(5).toPandas()

Unnamed: 0,Datetime,GS_Close,JPM_Close,AXP_Close
0,2022-02-28 09:30:00-05:00,341.410004,144.029999,189.759995
1,2022-02-28 09:31:00-05:00,340.480011,143.940002,189.630005
2,2022-02-28 09:32:00-05:00,340.880005,143.630005,189.660004
3,2022-02-28 09:33:00-05:00,340.369995,143.720093,189.229996
4,2022-02-28 09:34:00-05:00,340.410004,143.824997,190.175003


In [17]:
# Persist the finance frame to the standardized zone as Parquet.
# "overwrite" (instead of "append") keeps this cell idempotent: the complete
# raw CSV is reloaded on every run, so appending would store every row again
# each time the cell is re-executed. Switch back to "append" only if each run
# is guaranteed to carry rows that are not yet in the target directory.
(
    finance_stocks_raw
    .coalesce(1)  # one partition, so the data lands in a single part file
    .write
    .mode("overwrite")
    .parquet("hdfs://localhost:9000/datalake/std/stocks/finance")
)

In [26]:
# Read the technology stocks CSV from the raw zone: first row is the header
# and Spark infers the column types.
tech_stocks_raw = spark.read.options(
    header="true",
    inferSchema="true",
    recursiveFileLookup="true",
).csv("hdfs://localhost:9000/datalake/raw/stocks/TECHNOLOGY_STOCKS.csv")

# Preview the first five rows as a pandas frame.
tech_stocks_raw.limit(5).toPandas()

Unnamed: 0,Datetime,IBM_Close,MSFT_Close,INTC_Close
0,2022-02-28 09:30:00-05:00,122.379997,296.075012,47.34
1,2022-02-28 09:31:00-05:00,122.360001,295.76001,47.330002
2,2022-02-28 09:32:00-05:00,122.32,295.72641,47.32
3,2022-02-28 09:33:00-05:00,122.139999,295.459991,47.240002
4,2022-02-28 09:34:00-05:00,122.07,296.269989,47.150002


In [27]:
# Persist the technology frame to the standardized zone as Parquet.
# "overwrite" (instead of "append") keeps this cell idempotent: the complete
# raw CSV is reloaded on every run, so appending would store every row again
# each time the cell is re-executed. Switch back to "append" only if each run
# is guaranteed to carry rows that are not yet in the target directory.
(
    tech_stocks_raw
    .coalesce(1)  # one partition, so the data lands in a single part file
    .write
    .mode("overwrite")
    .parquet("hdfs://localhost:9000/datalake/std/stocks/tech")
)