In [1]:
# Bootstrap step: this cell runs before any `pyspark` import in the cells below.
# findspark.init() is expected to make the local Spark installation importable
# from this kernel (per findspark's documented purpose — confirm SPARK_HOME is
# set in the environment this notebook runs in).
import findspark
findspark.init()

In [2]:
import pandas as pd

# Show full cell contents in rendered frames instead of truncating wide values.
pd.options.display.max_colwidth = None

In [3]:
import os

# Extra submit arguments for PySpark: ship the Hive HCatalog jar with the shell.
# This is set here, ahead of the cell that builds the SparkSession.
os.environ["PYSPARK_SUBMIT_ARGS"] = (
    "--jars /opt/hive3/lib/hive-hcatalog-core-3.1.2.jar "
    "pyspark-shell"
)

In [4]:
from pyspark.sql.session import SparkSession

# Build (or reuse) the Hive-enabled Spark session shared by every cell below.
# NOTE(review): `spark.sql.datalake.dir` is never read back in the visible
# cells — confirm it is a deliberate custom key and not a typo for
# `spark.sql.warehouse.dir`.
spark = (
    SparkSession.builder
    .appName("Stocks RAW to STD - DataFrames")
    .config("spark.sql.datalake.dir", "hdfs://localhost:9000/datalake")
    .config("spark.sql.legacy.timeParserPolicy", "LEGACY")
    .config("spark.sql.sources.partitionOverwriteMode", "dynamic")
    .enableHiveSupport()
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [5]:
# Load every CSV found (recursively) under the raw airline directory.
# Column types are inferred from the data; the first line of each file is the header.
airline_stocks_raw = (
    spark.read
    .options(inferSchema="true", recursiveFileLookup="true")
    .csv(
        "hdfs://localhost:9000/datalake/raw/stocks/clean_airline_csv",
        header=True,
    )
)

# Preview the first five rows as a pandas frame for rich notebook display.
airline_stocks_raw.limit(5).toPandas()

Unnamed: 0,_c0,slope_x1,slope_x2,slope_x3,buy_sell_decision
0,0,0.000564,-0.003872,-0.005156,0
1,1,-0.000262,0.000564,-0.003872,0
2,2,-0.000699,-0.000262,0.000564,1
3,3,0.001094,-0.000699,-0.000262,0
4,4,-0.00201,0.001094,-0.000699,1


In [6]:
# Persist the airline frame to the standardized (std) zone as Parquet.
# "overwrite" replaces the previous "append": the read cell loads the entire
# raw directory, so appending on every run duplicated all rows already written.
# Overwriting keeps the cell idempotent under Restart & Run All.
# coalesce(1) reduces the frame to a single partition before writing.
(
    airline_stocks_raw.coalesce(1)
    .write
    .mode("overwrite")
    .parquet("hdfs://localhost:9000/datalake/std/stocks/clean_airline_csv")
)

                                                                                

In [7]:
# Load every CSV found (recursively) under the raw automotive directory.
# Column types are inferred from the data; the first line of each file is the header.
automotive_stocks_raw = (
    spark.read
    .options(inferSchema="true", recursiveFileLookup="true")
    .csv(
        "hdfs://localhost:9000/datalake/raw/stocks/clean_automotive_csv",
        header=True,
    )
)

# Preview the first five rows as a pandas frame for rich notebook display.
automotive_stocks_raw.limit(5).toPandas()

Unnamed: 0,_c0,slope_x1,slope_x2,slope_x3,buy_sell_decision
0,0,0.003856,0.005435,-0.004227,1
1,1,0.002965,0.003856,0.005435,0
2,2,-0.00205,0.002965,0.003856,0
3,3,-0.002154,-0.00205,0.002965,0
4,4,-0.001822,-0.002154,-0.00205,0


In [8]:
# Persist the automotive frame to the standardized (std) zone as Parquet.
# "overwrite" replaces the previous "append": the read cell loads the entire
# raw directory, so appending on every run duplicated all rows already written.
# Overwriting keeps the cell idempotent under Restart & Run All.
# coalesce(1) reduces the frame to a single partition before writing.
(
    automotive_stocks_raw.coalesce(1)
    .write
    .mode("overwrite")
    .parquet("hdfs://localhost:9000/datalake/std/stocks/clean_automotive_csv")
)

In [9]:
# Load every CSV found (recursively) under the raw energy directory.
# Column types are inferred from the data; the first line of each file is the header.
energy_stocks_raw = (
    spark.read
    .options(inferSchema="true", recursiveFileLookup="true")
    .csv(
        "hdfs://localhost:9000/datalake/raw/stocks/clean_energy_csv",
        header=True,
    )
)

# Preview the first five rows as a pandas frame for rich notebook display.
energy_stocks_raw.limit(5).toPandas()

Unnamed: 0,_c0,slope_x1,slope_x2,slope_x3,buy_sell_decision
0,0,0.000465,-0.001508,0.000579,0
1,1,-0.002439,0.000465,-0.001508,1
2,2,0.003459,-0.002439,0.000465,1
3,3,0.004852,0.003459,-0.002439,0
4,4,-0.000955,0.004852,0.003459,1


In [10]:
# Persist the energy frame to the standardized (std) zone as Parquet.
# "overwrite" replaces the previous "append": the read cell loads the entire
# raw directory, so appending on every run duplicated all rows already written.
# Overwriting keeps the cell idempotent under Restart & Run All.
# coalesce(1) reduces the frame to a single partition before writing.
(
    energy_stocks_raw.coalesce(1)
    .write
    .mode("overwrite")
    .parquet("hdfs://localhost:9000/datalake/std/stocks/clean_energy_csv")
)

In [11]:
# Load every CSV found (recursively) under the raw entertainment directory.
# Column types are inferred from the data; the first line of each file is the header.
entertainment_stocks_raw = (
    spark.read
    .options(inferSchema="true", recursiveFileLookup="true")
    .csv(
        "hdfs://localhost:9000/datalake/raw/stocks/clean_entertainment_csv",
        header=True,
    )
)

# Preview the first five rows as a pandas frame for rich notebook display.
entertainment_stocks_raw.limit(5).toPandas()

Unnamed: 0,_c0,slope_x1,slope_x2,slope_x3,buy_sell_decision
0,0,-0.000629,-0.000195,-6e-05,0
1,1,-8e-06,-0.000629,-0.000195,0
2,2,-0.000599,-8e-06,-0.000629,0
3,3,-0.001386,-0.000599,-8e-06,0
4,4,-0.002026,-0.001386,-0.000599,0


In [12]:
# Persist the entertainment frame to the standardized (std) zone as Parquet.
# "overwrite" replaces the previous "append": the read cell loads the entire
# raw directory, so appending on every run duplicated all rows already written.
# Overwriting keeps the cell idempotent under Restart & Run All.
# coalesce(1) reduces the frame to a single partition before writing.
(
    entertainment_stocks_raw.coalesce(1)
    .write
    .mode("overwrite")
    .parquet("hdfs://localhost:9000/datalake/std/stocks/clean_entertainment_csv")
)

In [13]:
# Load every CSV found (recursively) under the raw finance directory.
# Column types are inferred from the data; the first line of each file is the header.
finance_stocks_raw = (
    spark.read
    .options(inferSchema="true", recursiveFileLookup="true")
    .csv(
        "hdfs://localhost:9000/datalake/raw/stocks/clean_finance_csv",
        header=True,
    )
)

# Preview the first five rows as a pandas frame for rich notebook display.
finance_stocks_raw.limit(5).toPandas()

Unnamed: 0,_c0,slope_x1,slope_x2,slope_x3,buy_sell_decision
0,0,-0.001261,0.000178,-0.001703,0
1,1,0.001619,-0.001261,0.000178,0
2,2,0.001676,0.001619,-0.001261,0
3,3,0.003242,0.001676,0.001619,0
4,4,-0.001719,0.003242,0.001676,0


In [14]:
# Persist the finance frame to the standardized (std) zone as Parquet.
# "overwrite" replaces the previous "append": the read cell loads the entire
# raw directory, so appending on every run duplicated all rows already written.
# Overwriting keeps the cell idempotent under Restart & Run All.
# coalesce(1) reduces the frame to a single partition before writing.
(
    finance_stocks_raw.coalesce(1)
    .write
    .mode("overwrite")
    .parquet("hdfs://localhost:9000/datalake/std/stocks/clean_finance_csv")
)

In [16]:
# Load every CSV found (recursively) under the raw tech directory.
# Column types are inferred from the data; the first line of each file is the header.
tech_stocks_raw = (
    spark.read
    .options(inferSchema="true", recursiveFileLookup="true")
    .csv(
        "hdfs://localhost:9000/datalake/raw/stocks/clean_tech_csv",
        header=True,
    )
)

# Preview the first five rows as a pandas frame for rich notebook display.
tech_stocks_raw.limit(5).toPandas()

Unnamed: 0,_c0,slope_x1,slope_x2,slope_x3,buy_sell_decision
0,0,-0.001131,-0.00018,-0.000741,1
1,1,0.001398,-0.001131,-0.00018,0
2,2,-0.000161,0.001398,-0.001131,1
3,3,0.001128,-0.000161,0.001398,0
4,4,-0.001438,0.001128,-0.000161,0


In [17]:
# Persist the tech frame to the standardized (std) zone as Parquet.
# "overwrite" replaces the previous "append": the read cell loads the entire
# raw directory, so appending on every run duplicated all rows already written.
# Overwriting keeps the cell idempotent under Restart & Run All.
# coalesce(1) reduces the frame to a single partition before writing.
(
    tech_stocks_raw.coalesce(1)
    .write
    .mode("overwrite")
    .parquet("hdfs://localhost:9000/datalake/std/stocks/clean_tech_csv")
)