# <b> 1. Reading and Cleaning Stock Price Data

In [1]:
from pyspark.sql import SparkSession
import getpass
username = getpass.getuser()
spark = SparkSession. \
builder. \
config('spark.ui.port', '0'). \
config("spark.sql.warehouse.dir", f"/user/{username}/warehouse"). \
enableHiveSupport(). \
master('yarn'). \
getOrCreate()

In [2]:
spark

In [3]:
# Path To Files 
!hadoop fs -ls /user/itv015144/data/Stock_Ananlysis

Found 12 items
-rw-r--r--   3 itv015144 supergroup      76499 2025-01-26 01:08 /user/itv015144/data/Stock_Ananlysis/AAPL.csv
-rw-r--r--   3 itv015144 supergroup      76282 2025-01-26 01:08 /user/itv015144/data/Stock_Ananlysis/AMZN.csv
-rw-r--r--   3 itv015144 supergroup      78139 2025-01-26 01:08 /user/itv015144/data/Stock_Ananlysis/BRK-B.csv
-rw-r--r--   3 itv015144 supergroup      76248 2025-01-26 01:08 /user/itv015144/data/Stock_Ananlysis/GOOGL.csv
-rw-r--r--   3 itv015144 supergroup      78019 2025-01-26 01:08 /user/itv015144/data/Stock_Ananlysis/META.csv
-rw-r--r--   3 itv015144 supergroup      78045 2025-01-26 01:08 /user/itv015144/data/Stock_Ananlysis/MSFT.csv
-rw-r--r--   3 itv015144 supergroup      76004 2025-01-26 01:08 /user/itv015144/data/Stock_Ananlysis/NVDA.csv
-rw-r--r--   3 itv015144 supergroup      66751 2025-01-26 01:08 /user/itv015144/data/Stock_Ananlysis/QQQ.csv
-rw-r--r--   3 itv015144 supergroup      66851 2025-01-26 01:08 /user/itv015144/data/Stock_Ananlysis/SPY

In [4]:
## Reading CSV data => Stocks
stocks = spark.read \
.format("csv") \
.option("header",True) \
.load("/user/itv015144/data/Stock_Ananlysis/*")

In [5]:
## Seeing Data => Dataframe
stocks.show(5)

+------+----------+----------+-------+--------+--------+--------+
|Ticker|      Date|Close/Last| Volume|    Open|    High|     Low|
+------+----------+----------+-------+--------+--------+--------+
| BRK-B|05/31/2023|  $321.08 |6175417|$321.12 |$322.41 |$319.39 |
| BRK-B|05/30/2023|  $322.19 |3232461|$321.86 |$322.47 |$319.00 |
| BRK-B|05/26/2023|  $320.60 |3229873|$320.44 |$322.63 |$319.67 |
| BRK-B|05/25/2023|  $319.02 |4251935|$320.56 |$320.56 |$317.71 |
| BRK-B|05/24/2023|  $320.20 |3075393|$322.71 |$323.00 |$319.56 |
+------+----------+----------+-------+--------+--------+--------+
only showing top 5 rows



In [6]:
## Seeing Schema of the Data => Data Types in Dataframe
stocks.printSchema()

root
 |-- Ticker: string (nullable = true)
 |-- Date: string (nullable = true)
 |-- Close/Last: string (nullable = true)
 |-- Volume: string (nullable = true)
 |-- Open: string (nullable = true)
 |-- High: string (nullable = true)
 |-- Low: string (nullable = true)



In [7]:
## Basic select operation => Select Ticker, Date and Close price

In [8]:
stocks.select("Ticker").show(5)

+------+
|Ticker|
+------+
| BRK-B|
| BRK-B|
| BRK-B|
| BRK-B|
| BRK-B|
+------+
only showing top 5 rows



In [9]:
stocks.select(["Ticker","Date","Open"]).show(5)

+------+----------+--------+
|Ticker|      Date|    Open|
+------+----------+--------+
| BRK-B|05/31/2023|$321.12 |
| BRK-B|05/30/2023|$321.86 |
| BRK-B|05/26/2023|$320.44 |
| BRK-B|05/25/2023|$320.56 |
| BRK-B|05/24/2023|$322.71 |
+------+----------+--------+
only showing top 5 rows



In [10]:
## Filtering Data => Select rows containing Microsoft Stock in last one month

In [11]:
stocks.filter(stocks.Ticker == "MSFT").show(10)

+------+----------+----------+--------+--------+--------+--------+
|Ticker|      Date|Close/Last|  Volume|    Open|    High|     Low|
+------+----------+----------+--------+--------+--------+--------+
|  MSFT|05/31/2023|  $328.39 |45950550|$332.29 |$335.94 |$327.33 |
|  MSFT|05/30/2023|  $331.21 |29503070|$335.23 |$335.74 |$330.52 |
|  MSFT|05/26/2023|  $332.89 |36630630|$324.02 |$333.40 |$323.88 |
|  MSFT|05/25/2023|  $325.92 |43301740|$323.24 |$326.90 |$320.00 |
|  MSFT|05/24/2023|  $313.85 |23384890|$314.73 |$316.50 |$312.61 |
|  MSFT|05/23/2023|  $315.26 |30797170|$320.03 |$322.72 |$315.25 |
|  MSFT|05/22/2023|  $321.18 |24115660|$318.60 |$322.59 |$318.01 |
|  MSFT|05/19/2023|  $318.34 |27546700|$316.74 |$318.75 |$316.37 |
|  MSFT|05/18/2023|  $318.52 |27275990|$314.53 |$319.04 |$313.72 |
|  MSFT|05/17/2023|  $314.00 |24315010|$312.29 |$314.43 |$310.74 |
+------+----------+----------+--------+--------+--------+--------+
only showing top 10 rows



In [12]:
stocks.filter((stocks.Ticker == "MSFT") & (stocks.Date == "05/31/2023")).show()

+------+----------+----------+--------+--------+--------+--------+
|Ticker|      Date|Close/Last|  Volume|    Open|    High|     Low|
+------+----------+----------+--------+--------+--------+--------+
|  MSFT|05/31/2023|  $328.39 |45950550|$332.29 |$335.94 |$327.33 |
+------+----------+----------+--------+--------+--------+--------+



In [13]:
stocks.filter(((stocks.Ticker == "MSFT") | (stocks.Ticker == "V")) & (stocks.Date == "05/31/2023")).show(15)

+------+----------+----------+--------+--------+--------+--------+
|Ticker|      Date|Close/Last|  Volume|    Open|    High|     Low|
+------+----------+----------+--------+--------+--------+--------+
|  MSFT|05/31/2023|  $328.39 |45950550|$332.29 |$335.94 |$327.33 |
|     V|05/31/2023|  $221.03 |20460620|$219.96 |$221.53 |$216.14 |
+------+----------+----------+--------+--------+--------+--------+



In [14]:
stocks.filter((stocks.Ticker.isin(["MSFT","QQQ","V","SPY","TSLA"])) & (stocks.Date == "05/31/2023")).show()

+------+----------+----------+---------+--------+--------+--------+
|Ticker|      Date|Close/Last|   Volume|    Open|    High|     Low|
+------+----------+----------+---------+--------+--------+--------+
|  MSFT|05/31/2023|  $328.39 | 45950550|$332.29 |$335.94 |$327.33 |
|  TSLA|05/31/2023|  $203.93 |150711700|$199.78 |$203.95 |$195.12 |
|     V|05/31/2023|  $221.03 | 20460620|$219.96 |$221.53 |$216.14 |
|   SPY|05/31/2023|    417.85|110811800|  418.28|  419.22|  416.22|
|   QQQ|05/31/2023|    347.99| 65105380|  348.37|   350.6|  346.51|
+------+----------+----------+---------+--------+--------+--------+



In [15]:
# converting the string Date in Datetype

from pyspark.sql.functions import *

stocks = stocks.withColumn("ParseDate",to_date(stocks.Date,"MM/dd/yyyy"))

In [16]:
stocks.show()

+------+----------+----------+-------+--------+--------+--------+----------+
|Ticker|      Date|Close/Last| Volume|    Open|    High|     Low| ParseDate|
+------+----------+----------+-------+--------+--------+--------+----------+
| BRK-B|05/31/2023|  $321.08 |6175417|$321.12 |$322.41 |$319.39 |2023-05-31|
| BRK-B|05/30/2023|  $322.19 |3232461|$321.86 |$322.47 |$319.00 |2023-05-30|
| BRK-B|05/26/2023|  $320.60 |3229873|$320.44 |$322.63 |$319.67 |2023-05-26|
| BRK-B|05/25/2023|  $319.02 |4251935|$320.56 |$320.56 |$317.71 |2023-05-25|
| BRK-B|05/24/2023|  $320.20 |3075393|$322.71 |$323.00 |$319.56 |2023-05-24|
| BRK-B|05/23/2023|  $323.11 |4031342|$328.19 |$329.27 |$322.97 |2023-05-23|
| BRK-B|05/22/2023|  $329.13 |2763422|$330.75 |$331.49 |$328.35 |2023-05-22|
| BRK-B|05/19/2023|  $330.39 |4323538|$331.00 |$333.94 |$329.12 |2023-05-19|
| BRK-B|05/18/2023|  $329.76 |2808329|$326.87 |$329.98 |$325.85 |2023-05-18|
| BRK-B|05/17/2023|  $327.39 |3047626|$325.02 |$328.26 |$324.82 |2023-05-17|

In [17]:
stocks.printSchema()

root
 |-- Ticker: string (nullable = true)
 |-- Date: string (nullable = true)
 |-- Close/Last: string (nullable = true)
 |-- Volume: string (nullable = true)
 |-- Open: string (nullable = true)
 |-- High: string (nullable = true)
 |-- Low: string (nullable = true)
 |-- ParseDate: date (nullable = true)



In [18]:
# cretae function to strip $ and then convert it into float
def num_parser(value):
    if isinstance(value, str):
        # Check if the string is 'null' or any other non-numeric string
        value = value.strip().lower()  # Strip any leading/trailing whitespace and convert to lowercase
        if value == 'null' or value == '':  # Handle 'null' or empty strings
            return None
        try:
            # Attempt to convert to float after stripping "$" symbol
            return float(value.strip("$"))
        except ValueError:
            # If conversion fails, return None
            return None
    elif isinstance(value, (int, float)):
        return value
    else:
        return None


print(num_parser("$5.5"))
print(num_parser("5.5"))
print(num_parser(5.5))
print(num_parser(5))
print(num_parser(5j))
print(num_parser("'CLECL'"))

from pyspark.sql.types import *

# converting function to UDF
parser_number = udf(num_parser,FloatType())

5.5
5.5
5.5
5
None
None


In [19]:
# apllying that UDF to column 
stocks = stocks.withColumn("Open",parser_number(stocks.Open)) \
.withColumn("Close",parser_number(stocks["Close/Last"])) \
.withColumn("High",parser_number(stocks.High)) \
.withColumn("Low",parser_number(stocks.Low))

In [20]:
stocks.show()

+------+----------+----------+-------+------+------+------+----------+------+
|Ticker|      Date|Close/Last| Volume|  Open|  High|   Low| ParseDate| Close|
+------+----------+----------+-------+------+------+------+----------+------+
| BRK-B|05/31/2023|  $321.08 |6175417|321.12|322.41|319.39|2023-05-31|321.08|
| BRK-B|05/30/2023|  $322.19 |3232461|321.86|322.47| 319.0|2023-05-30|322.19|
| BRK-B|05/26/2023|  $320.60 |3229873|320.44|322.63|319.67|2023-05-26| 320.6|
| BRK-B|05/25/2023|  $319.02 |4251935|320.56|320.56|317.71|2023-05-25|319.02|
| BRK-B|05/24/2023|  $320.20 |3075393|322.71| 323.0|319.56|2023-05-24| 320.2|
| BRK-B|05/23/2023|  $323.11 |4031342|328.19|329.27|322.97|2023-05-23|323.11|
| BRK-B|05/22/2023|  $329.13 |2763422|330.75|331.49|328.35|2023-05-22|329.13|
| BRK-B|05/19/2023|  $330.39 |4323538| 331.0|333.94|329.12|2023-05-19|330.39|
| BRK-B|05/18/2023|  $329.76 |2808329|326.87|329.98|325.85|2023-05-18|329.76|
| BRK-B|05/17/2023|  $327.39 |3047626|325.02|328.26|324.82|2023-

In [21]:
stocks.printSchema()

root
 |-- Ticker: string (nullable = true)
 |-- Date: string (nullable = true)
 |-- Close/Last: string (nullable = true)
 |-- Volume: string (nullable = true)
 |-- Open: float (nullable = true)
 |-- High: float (nullable = true)
 |-- Low: float (nullable = true)
 |-- ParseDate: date (nullable = true)
 |-- Close: float (nullable = true)



In [22]:
# create udf for converting volumn to int
parse_int = udf(lambda value:int(value),IntegerType())

In [23]:
stocks = stocks.withColumn("Volume",parse_int(stocks.Volume))

In [24]:
stocks.show()

+------+----------+----------+-------+------+------+------+----------+------+
|Ticker|      Date|Close/Last| Volume|  Open|  High|   Low| ParseDate| Close|
+------+----------+----------+-------+------+------+------+----------+------+
| BRK-B|05/31/2023|  $321.08 |6175417|321.12|322.41|319.39|2023-05-31|321.08|
| BRK-B|05/30/2023|  $322.19 |3232461|321.86|322.47| 319.0|2023-05-30|322.19|
| BRK-B|05/26/2023|  $320.60 |3229873|320.44|322.63|319.67|2023-05-26| 320.6|
| BRK-B|05/25/2023|  $319.02 |4251935|320.56|320.56|317.71|2023-05-25|319.02|
| BRK-B|05/24/2023|  $320.20 |3075393|322.71| 323.0|319.56|2023-05-24| 320.2|
| BRK-B|05/23/2023|  $323.11 |4031342|328.19|329.27|322.97|2023-05-23|323.11|
| BRK-B|05/22/2023|  $329.13 |2763422|330.75|331.49|328.35|2023-05-22|329.13|
| BRK-B|05/19/2023|  $330.39 |4323538| 331.0|333.94|329.12|2023-05-19|330.39|
| BRK-B|05/18/2023|  $329.76 |2808329|326.87|329.98|325.85|2023-05-18|329.76|
| BRK-B|05/17/2023|  $327.39 |3047626|325.02|328.26|324.82|2023-

In [25]:
stocks.printSchema()

root
 |-- Ticker: string (nullable = true)
 |-- Date: string (nullable = true)
 |-- Close/Last: string (nullable = true)
 |-- Volume: integer (nullable = true)
 |-- Open: float (nullable = true)
 |-- High: float (nullable = true)
 |-- Low: float (nullable = true)
 |-- ParseDate: date (nullable = true)
 |-- Close: float (nullable = true)



In [26]:
# Selecting Columns that we need
clean_stocks = stocks.select(["Ticker","ParseDate","Volume","Close","Open","High","Low"])
clean_stocks.show()

+------+----------+-------+------+------+------+------+
|Ticker| ParseDate| Volume| Close|  Open|  High|   Low|
+------+----------+-------+------+------+------+------+
| BRK-B|2023-05-31|6175417|321.08|321.12|322.41|319.39|
| BRK-B|2023-05-30|3232461|322.19|321.86|322.47| 319.0|
| BRK-B|2023-05-26|3229873| 320.6|320.44|322.63|319.67|
| BRK-B|2023-05-25|4251935|319.02|320.56|320.56|317.71|
| BRK-B|2023-05-24|3075393| 320.2|322.71| 323.0|319.56|
| BRK-B|2023-05-23|4031342|323.11|328.19|329.27|322.97|
| BRK-B|2023-05-22|2763422|329.13|330.75|331.49|328.35|
| BRK-B|2023-05-19|4323538|330.39| 331.0|333.94|329.12|
| BRK-B|2023-05-18|2808329|329.76|326.87|329.98|325.85|
| BRK-B|2023-05-17|3047626|327.39|325.02|328.26|324.82|
| BRK-B|2023-05-16|2139996|323.75|322.46|324.69|322.36|
| BRK-B|2023-05-15|2191609|323.53|322.89|323.83|320.13|
| BRK-B|2023-05-12|1938264|322.49|323.82|324.24|320.54|
| BRK-B|2023-05-11|2549339|322.64| 321.0|322.96|319.81|
| BRK-B|2023-05-10|2641134|322.99|326.08|326.16|

In [27]:
clean_stocks.describe(["Volume","Close","Open","High","Low"]).show()

+-------+--------------------+------------------+------------------+------------------+------------------+
|summary|              Volume|             Close|              Open|              High|               Low|
+-------+--------------------+------------------+------------------+------------------+------------------+
|  count|               15108|             15108|             15108|             15108|             15108|
|   mean|5.1868408793685466E7| 180.1256089860054|180.09656566181036| 182.1253348687101| 177.9982781513109|
| stddev| 5.496484129953464E7|101.14891782168543|101.16125813324383|101.96625521621753|100.26590135955234|
|    min|              961133|             11.93|             12.07|             12.45|              11.8|
|    max|           914080943|            477.71|            479.22|            479.98|            476.06|
+-------+--------------------+------------------+------------------+------------------+------------------+



# <b> 2. Basic Stock Analysis

In [28]:
## Calculate maximum stock price for various stocks
clean_stocks.groupBy(clean_stocks.Ticker).agg(
    max(clean_stocks.Open).alias("MaxStockPrice"),
    sum(clean_stocks.Volume).alias("TotalVolume")).show()

+------+-------------+------------+
|Ticker|MaxStockPrice| TotalVolume|
+------+-------------+------------+
| BRK-B|       361.39|  5862401321|
|   TSM|       141.61| 12506470104|
|  AAPL|       182.63|139310061360|
|  META|       381.68| 30148848043|
|  TSLA|       411.47|171802975076|
|   QQQ|       405.57| 60437153773|
|     V|       250.05| 10410997871|
| GOOGL|       151.25| 43956560981|
|   SPY|       479.22|107925285300|
|  AMZN|        187.2|104503287430|
|  MSFT|       344.62| 37976660472|
|  NVDA|       405.95| 58787218324|
+------+-------------+------------+



In [29]:
# spliting Date Column in Different Component
clean_stocks = (clean_stocks.withColumn("Year",year("ParseDate"))
             .withColumn("Month",month("ParseDate"))
             .withColumn("Day",dayofmonth("ParseDate"))
             .withColumn("Week",weekofyear("ParseDate")))

In [30]:
clean_stocks.show()

+------+----------+-------+------+------+------+------+----+-----+---+----+
|Ticker| ParseDate| Volume| Close|  Open|  High|   Low|Year|Month|Day|Week|
+------+----------+-------+------+------+------+------+----+-----+---+----+
| BRK-B|2023-05-31|6175417|321.08|321.12|322.41|319.39|2023|    5| 31|  22|
| BRK-B|2023-05-30|3232461|322.19|321.86|322.47| 319.0|2023|    5| 30|  22|
| BRK-B|2023-05-26|3229873| 320.6|320.44|322.63|319.67|2023|    5| 26|  21|
| BRK-B|2023-05-25|4251935|319.02|320.56|320.56|317.71|2023|    5| 25|  21|
| BRK-B|2023-05-24|3075393| 320.2|322.71| 323.0|319.56|2023|    5| 24|  21|
| BRK-B|2023-05-23|4031342|323.11|328.19|329.27|322.97|2023|    5| 23|  21|
| BRK-B|2023-05-22|2763422|329.13|330.75|331.49|328.35|2023|    5| 22|  21|
| BRK-B|2023-05-19|4323538|330.39| 331.0|333.94|329.12|2023|    5| 19|  20|
| BRK-B|2023-05-18|2808329|329.76|326.87|329.98|325.85|2023|    5| 18|  20|
| BRK-B|2023-05-17|3047626|327.39|325.02|328.26|324.82|2023|    5| 17|  20|
| BRK-B|2023

In [31]:
# calculate Yearly Hig And Low Stockes Price
yearly = clean_stocks.groupBy(["Ticker","Year"]).agg(max("Open").alias("YearlyHigh"),min("Open").alias("YearlyLow")).sort("Ticker","Year")

In [32]:
yearly.show()

+------+----+----------+---------+
|Ticker|Year|YearlyHigh|YearlyLow|
+------+----+----------+---------+
|  AAPL|2018|      57.7|    37.04|
|  AAPL|2019|     72.78|     36.0|
|  AAPL|2020|    138.05|    57.02|
|  AAPL|2021|    181.12|   119.03|
|  AAPL|2022|    182.63|   127.99|
|  AAPL|2023|    177.33|   126.01|
|  AMZN|2018|    101.91|     67.3|
|  AMZN|2019|    101.28|    73.26|
|  AMZN|2020|    177.35|    82.08|
|  AMZN|2021|     187.2|   150.25|
|  AMZN|2022|    170.44|     82.8|
|  AMZN|2023|    122.37|    83.03|
| BRK-B|2018|     224.0|   185.43|
| BRK-B|2019|    227.27|   194.78|
| BRK-B|2020|    233.92|    165.3|
| BRK-B|2021|    300.88|   228.21|
| BRK-B|2022|    361.39|   260.58|
| BRK-B|2023|     331.0|   294.68|
| GOOGL|2018|     64.46|    49.22|
| GOOGL|2019|      68.2|    51.36|
+------+----+----------+---------+
only showing top 20 rows



In [33]:
# Calculating Monthly High And Low
monthly = clean_stocks.groupBy(["Ticker","Year","Month"]).agg(max("Open").alias("MonthlyHigh"),min("Open").alias("MonthlyLow")).sort("Ticker","Year","Month")

In [34]:
monthly.show()

+------+----+-----+-----------+----------+
|Ticker|Year|Month|MonthlyHigh|MonthlyLow|
+------+----+-----+-----------+----------+
|  AAPL|2018|    5|      46.81|     46.81|
|  AAPL|2018|    6|      48.54|     45.75|
|  AAPL|2018|    7|      48.75|     45.96|
|  AAPL|2018|    8|      56.63|     49.78|
|  AAPL|2018|    9|      57.25|     54.21|
|  AAPL|2018|   10|       57.7|     52.79|
|  AAPL|2018|   11|      54.76|     42.88|
|  AAPL|2018|   12|      46.12|     37.04|
|  AAPL|2019|    1|      41.53|      36.0|
|  AAPL|2019|    2|      43.66|     41.74|
|  AAPL|2019|    3|      48.84|     42.58|
|  AAPL|2019|    4|      51.84|     47.77|
|  AAPL|2019|    5|      52.72|     44.06|
|  AAPL|2019|    6|      50.09|     43.86|
|  AAPL|2019|    7|      54.11|      49.8|
|  AAPL|2019|    8|      53.48|     48.85|
|  AAPL|2019|    9|       56.2|     51.61|
|  AAPL|2019|   10|      62.24|     54.61|
|  AAPL|2019|   11|      66.98|     62.39|
|  AAPL|2019|   12|      72.78|     64.58|
+------+---

In [35]:
# Calculating Weely High And Low
Weekly = clean_stocks.groupBy(["Ticker","Year","Week"]).agg(max("Open").alias("WeeklyHigh"),min("Open").alias("WeeklyLow")).sort("Ticker","Year","Week")

In [36]:
Weekly.show()

+------+----+----+----------+---------+
|Ticker|Year|Week|WeeklyHigh|WeeklyLow|
+------+----+----+----------+---------+
|  AAPL|2018|   1|     39.63|    39.63|
|  AAPL|2018|  22|      47.0|    46.81|
|  AAPL|2018|  23|     48.54|    47.79|
|  AAPL|2018|  24|     48.11|    47.51|
|  AAPL|2018|  25|     46.97|    46.29|
|  AAPL|2018|  26|     46.57|    45.75|
|  AAPL|2018|  27|     46.95|    45.96|
|  AAPL|2018|  28|     47.77|    47.13|
|  AAPL|2018|  29|     47.95|    47.42|
|  AAPL|2018|  30|     48.75|    47.67|
|  AAPL|2018|  31|     51.76|    47.58|
|  AAPL|2018|  32|     52.33|    51.51|
|  AAPL|2018|  33|     53.36|    51.93|
|  AAPL|2018|  34|     54.53|    53.53|
|  AAPL|2018|  35|     56.63|    54.29|
|  AAPL|2018|  36|     57.25|    55.46|
|  AAPL|2018|  37|     56.44|     54.5|
|  AAPL|2018|  38|     55.54|    54.45|
|  AAPL|2018|  39|      56.2|    54.21|
|  AAPL|2018|  40|      57.7|    56.81|
+------+----+----+----------+---------+
only showing top 20 rows



# <b> 3. Joins

In [37]:
# Performing an inner join between clean_stocks and yearly DataFrames on Ticker and Year
yearly = yearly.withColumnRenamed("Ticker", "Yearly_Ticker").withColumnRenamed("Year", "Yearly_Year")

historic_stocks = clean_stocks.join(
    yearly,
    (clean_stocks.Ticker == yearly.Yearly_Ticker) & (clean_stocks.Year == yearly.Yearly_Year),  # Join condition
    "inner"  # Type of join (only matching rows from both DataFrames)
).drop("Yearly_Ticker","Yearly_Year")  # Drop the duplicate Ticker and Year columns from the 'yearly' DataFrame

In [38]:
historic_stocks.show()

+------+----------+-------+------+------+------+------+----+-----+---+----+----------+---------+
|Ticker| ParseDate| Volume| Close|  Open|  High|   Low|Year|Month|Day|Week|YearlyHigh|YearlyLow|
+------+----------+-------+------+------+------+------+----+-----+---+----+----------+---------+
| BRK-B|2023-05-31|6175417|321.08|321.12|322.41|319.39|2023|    5| 31|  22|     331.0|   294.68|
| BRK-B|2023-05-30|3232461|322.19|321.86|322.47| 319.0|2023|    5| 30|  22|     331.0|   294.68|
| BRK-B|2023-05-26|3229873| 320.6|320.44|322.63|319.67|2023|    5| 26|  21|     331.0|   294.68|
| BRK-B|2023-05-25|4251935|319.02|320.56|320.56|317.71|2023|    5| 25|  21|     331.0|   294.68|
| BRK-B|2023-05-24|3075393| 320.2|322.71| 323.0|319.56|2023|    5| 24|  21|     331.0|   294.68|
| BRK-B|2023-05-23|4031342|323.11|328.19|329.27|322.97|2023|    5| 23|  21|     331.0|   294.68|
| BRK-B|2023-05-22|2763422|329.13|330.75|331.49|328.35|2023|    5| 22|  21|     331.0|   294.68|
| BRK-B|2023-05-19|4323538|330

In [39]:
# joining Historic Data With Weekly to addiing column weeklyhigh as  weeklylow
Weekly = Weekly.withColumnRenamed("Ticker", "Weekly_Ticker").withColumnRenamed("Year", "Weekly_Year").withColumnRenamed("Week", "Weekly_Week")

historic_stocks = historic_stocks.join(Weekly,
                    (historic_stocks.Ticker == Weekly.Weekly_Ticker) 
                     & (historic_stocks.Year == Weekly.Weekly_Year)
                     & (historic_stocks.Week == Weekly.Weekly_Week),
                     "inner"
                    ).drop("Weekly_Ticker","Weekly_Year","Weekly_Week")

In [40]:
historic_stocks.show()

+------+----------+-------+------+------+------+------+----+-----+---+----+----------+---------+----------+---------+
|Ticker| ParseDate| Volume| Close|  Open|  High|   Low|Year|Month|Day|Week|YearlyHigh|YearlyLow|WeeklyHigh|WeeklyLow|
+------+----------+-------+------+------+------+------+----+-----+---+----+----------+---------+----------+---------+
| BRK-B|2023-05-31|6175417|321.08|321.12|322.41|319.39|2023|    5| 31|  22|     331.0|   294.68|    321.86|   321.12|
| BRK-B|2023-05-30|3232461|322.19|321.86|322.47| 319.0|2023|    5| 30|  22|     331.0|   294.68|    321.86|   321.12|
| BRK-B|2023-05-26|3229873| 320.6|320.44|322.63|319.67|2023|    5| 26|  21|     331.0|   294.68|    330.75|   320.44|
| BRK-B|2023-05-25|4251935|319.02|320.56|320.56|317.71|2023|    5| 25|  21|     331.0|   294.68|    330.75|   320.44|
| BRK-B|2023-05-24|3075393| 320.2|322.71| 323.0|319.56|2023|    5| 24|  21|     331.0|   294.68|    330.75|   320.44|
| BRK-B|2023-05-23|4031342|323.11|328.19|329.27|322.97|2

In [41]:
# new Method to join Data
historic_stocks = historic_stocks.join(monthly,["Ticker","Year","Month"])

In [42]:
historic_stocks.show()

+------+----+-----+----------+-------+------+------+------+------+---+----+----------+---------+----------+---------+-----------+----------+
|Ticker|Year|Month| ParseDate| Volume| Close|  Open|  High|   Low|Day|Week|YearlyHigh|YearlyLow|WeeklyHigh|WeeklyLow|MonthlyHigh|MonthlyLow|
+------+----+-----+----------+-------+------+------+------+------+---+----+----------+---------+----------+---------+-----------+----------+
| BRK-B|2023|    5|2023-05-31|6175417|321.08|321.12|322.41|319.39| 31|  22|     331.0|   294.68|    321.86|   321.12|      331.0|    320.44|
| BRK-B|2023|    5|2023-05-30|3232461|322.19|321.86|322.47| 319.0| 30|  22|     331.0|   294.68|    321.86|   321.12|      331.0|    320.44|
| BRK-B|2023|    5|2023-05-26|3229873| 320.6|320.44|322.63|319.67| 26|  21|     331.0|   294.68|    330.75|   320.44|      331.0|    320.44|
| BRK-B|2023|    5|2023-05-25|4251935|319.02|320.56|320.56|317.71| 25|  21|     331.0|   294.68|    330.75|   320.44|      331.0|    320.44|
| BRK-B|2023|

In [43]:
historic_stocks.columns

['Ticker',
 'Year',
 'Month',
 'ParseDate',
 'Volume',
 'Close',
 'Open',
 'High',
 'Low',
 'Day',
 'Week',
 'YearlyHigh',
 'YearlyLow',
 'WeeklyHigh',
 'WeeklyLow',
 'MonthlyHigh',
 'MonthlyLow']

In [44]:
final_stocks = historic_stocks.select(['Ticker','Year','Month','Day','Week','Volume','Close','Open','High','Low',
                                       'YearlyHigh','YearlyLow','WeeklyHigh','WeeklyLow','MonthlyHigh','MonthlyLow'])

In [45]:
final_stocks.show()

+------+----+-----+---+----+-------+------+------+------+------+----------+---------+----------+---------+-----------+----------+
|Ticker|Year|Month|Day|Week| Volume| Close|  Open|  High|   Low|YearlyHigh|YearlyLow|WeeklyHigh|WeeklyLow|MonthlyHigh|MonthlyLow|
+------+----+-----+---+----+-------+------+------+------+------+----------+---------+----------+---------+-----------+----------+
| BRK-B|2023|    5| 31|  22|6175417|321.08|321.12|322.41|319.39|     331.0|   294.68|    321.86|   321.12|      331.0|    320.44|
| BRK-B|2023|    5| 30|  22|3232461|322.19|321.86|322.47| 319.0|     331.0|   294.68|    321.86|   321.12|      331.0|    320.44|
| BRK-B|2023|    5| 26|  21|3229873| 320.6|320.44|322.63|319.67|     331.0|   294.68|    330.75|   320.44|      331.0|    320.44|
| BRK-B|2023|    5| 25|  21|4251935|319.02|320.56|320.56|317.71|     331.0|   294.68|    330.75|   320.44|      331.0|    320.44|
| BRK-B|2023|    5| 24|  21|3075393| 320.2|322.71| 323.0|319.56|     331.0|   294.68|    3

In [46]:
# Sql Quries Using pyspark
final_stocks.createOrReplaceTempView("stocks")

In [47]:
spark.sql("select * from stocks where Ticker = 'MSFT' and Year = '2023' ").show()

+------+----+-----+---+----+--------+------+------+------+------+----------+---------+----------+---------+-----------+----------+
|Ticker|Year|Month|Day|Week|  Volume| Close|  Open|  High|   Low|YearlyHigh|YearlyLow|WeeklyHigh|WeeklyLow|MonthlyHigh|MonthlyLow|
+------+----+-----+---+----+--------+------+------+------+------+----------+---------+----------+---------+-----------+----------+
|  MSFT|2023|    5| 31|  22|45950550|328.39|332.29|335.94|327.33|    335.23|    223.0|    335.23|   332.29|     335.23|    305.72|
|  MSFT|2023|    5| 30|  22|29503070|331.21|335.23|335.74|330.52|    335.23|    223.0|    335.23|   332.29|     335.23|    305.72|
|  MSFT|2023|    5| 26|  21|36630630|332.89|324.02| 333.4|323.88|    335.23|    223.0|    324.02|   314.73|     335.23|    305.72|
|  MSFT|2023|    5| 25|  21|43301740|325.92|323.24| 326.9| 320.0|    335.23|    223.0|    324.02|   314.73|     335.23|    305.72|
|  MSFT|2023|    5| 24|  21|23384890|313.85|314.73| 316.5|312.61|    335.23|    223

# <b> 4. Advanced Analysis

In [48]:
clean_stocks.show()

+------+----------+-------+------+------+------+------+----+-----+---+----+
|Ticker| ParseDate| Volume| Close|  Open|  High|   Low|Year|Month|Day|Week|
+------+----------+-------+------+------+------+------+----+-----+---+----+
| BRK-B|2023-05-31|6175417|321.08|321.12|322.41|319.39|2023|    5| 31|  22|
| BRK-B|2023-05-30|3232461|322.19|321.86|322.47| 319.0|2023|    5| 30|  22|
| BRK-B|2023-05-26|3229873| 320.6|320.44|322.63|319.67|2023|    5| 26|  21|
| BRK-B|2023-05-25|4251935|319.02|320.56|320.56|317.71|2023|    5| 25|  21|
| BRK-B|2023-05-24|3075393| 320.2|322.71| 323.0|319.56|2023|    5| 24|  21|
| BRK-B|2023-05-23|4031342|323.11|328.19|329.27|322.97|2023|    5| 23|  21|
| BRK-B|2023-05-22|2763422|329.13|330.75|331.49|328.35|2023|    5| 22|  21|
| BRK-B|2023-05-19|4323538|330.39| 331.0|333.94|329.12|2023|    5| 19|  20|
| BRK-B|2023-05-18|2808329|329.76|326.87|329.98|325.85|2023|    5| 18|  20|
| BRK-B|2023-05-17|3047626|327.39|325.02|328.26|324.82|2023|    5| 17|  20|
| BRK-B|2023

In [50]:
# select required column for prcoessing for get performances benifit
snapshot = clean_stocks.select(["Ticker","ParseDate","Open"])

In [51]:
snapshot.show()

+------+----------+------+
|Ticker| ParseDate|  Open|
+------+----------+------+
| BRK-B|2023-05-31|321.12|
| BRK-B|2023-05-30|321.86|
| BRK-B|2023-05-26|320.44|
| BRK-B|2023-05-25|320.56|
| BRK-B|2023-05-24|322.71|
| BRK-B|2023-05-23|328.19|
| BRK-B|2023-05-22|330.75|
| BRK-B|2023-05-19| 331.0|
| BRK-B|2023-05-18|326.87|
| BRK-B|2023-05-17|325.02|
| BRK-B|2023-05-16|322.46|
| BRK-B|2023-05-15|322.89|
| BRK-B|2023-05-12|323.82|
| BRK-B|2023-05-11| 321.0|
| BRK-B|2023-05-10|326.08|
| BRK-B|2023-05-09|324.87|
| BRK-B|2023-05-08|328.26|
| BRK-B|2023-05-05|323.36|
| BRK-B|2023-05-04|323.44|
| BRK-B|2023-05-03|327.13|
+------+----------+------+
only showing top 20 rows



In [57]:
#Calulate Previousday open price for stocks

from pyspark.sql.window import Window

lag1Day = Window.partitionBy(snapshot.Ticker).orderBy(snapshot.ParseDate)

snapshot.withColumn("PreviousOpen",lag("Open").over(lag1Day)).show()

+------+----------+------+------------+
|Ticker| ParseDate|  Open|PreviousOpen|
+------+----------+------+------------+
| BRK-B|2018-05-31|194.29|        null|
| BRK-B|2018-06-01| 192.9|      194.29|
| BRK-B|2018-06-04| 193.0|       192.9|
| BRK-B|2018-06-05|191.37|       193.0|
| BRK-B|2018-06-06|191.69|      191.37|
| BRK-B|2018-06-07|194.66|      191.69|
| BRK-B|2018-06-08|195.34|      194.66|
| BRK-B|2018-06-11| 196.3|      195.34|
| BRK-B|2018-06-12|195.97|       196.3|
| BRK-B|2018-06-13|195.13|      195.97|
| BRK-B|2018-06-14|194.92|      195.13|
| BRK-B|2018-06-15|192.28|      194.92|
| BRK-B|2018-06-18|190.64|      192.28|
| BRK-B|2018-06-19|190.05|      190.64|
| BRK-B|2018-06-20|190.47|      190.05|
| BRK-B|2018-06-21|189.05|      190.47|
| BRK-B|2018-06-22| 190.0|      189.05|
| BRK-B|2018-06-25|188.63|       190.0|
| BRK-B|2018-06-26|186.52|      188.63|
| BRK-B|2018-06-27| 186.5|      186.52|
+------+----------+------+------------+
only showing top 20 rows



In [60]:
# Calulating Moving Avg 

movingAvg = Window.partitionBy(snapshot.Ticker).orderBy(snapshot.ParseDate).rowsBetween(-50,Window.currentRow) 
# here we calulating moving avg for last 50 day

snapshot.withColumn("MA50",avg(snapshot.Open).over(movingAvg)).withColumn("MA50",round("MA50",2)).show()

+------+----------+------+------+
|Ticker| ParseDate|  Open|  MA50|
+------+----------+------+------+
| BRK-B|2018-05-31|194.29|194.29|
| BRK-B|2018-06-01| 192.9|193.59|
| BRK-B|2018-06-04| 193.0| 193.4|
| BRK-B|2018-06-05|191.37|192.89|
| BRK-B|2018-06-06|191.69|192.65|
| BRK-B|2018-06-07|194.66|192.98|
| BRK-B|2018-06-08|195.34|193.32|
| BRK-B|2018-06-11| 196.3|193.69|
| BRK-B|2018-06-12|195.97|193.95|
| BRK-B|2018-06-13|195.13|194.06|
| BRK-B|2018-06-14|194.92|194.14|
| BRK-B|2018-06-15|192.28|193.99|
| BRK-B|2018-06-18|190.64|193.73|
| BRK-B|2018-06-19|190.05|193.47|
| BRK-B|2018-06-20|190.47|193.27|
| BRK-B|2018-06-21|189.05| 193.0|
| BRK-B|2018-06-22| 190.0|192.83|
| BRK-B|2018-06-25|188.63|192.59|
| BRK-B|2018-06-26|186.52|192.27|
| BRK-B|2018-06-27| 186.5|191.99|
+------+----------+------+------+
only showing top 20 rows



In [62]:
# calculating top N Stocke Open Price For each Stocks

Maxstocks = Window.partitionBy(snapshot.Ticker).orderBy(snapshot.Open.desc())

snapshot.withColumn("MaxOpen",row_number().over(Maxstocks)).filter("MaxOpen <=5").show()

+------+----------+------+-------+
|Ticker| ParseDate|  Open|MaxOpen|
+------+----------+------+-------+
| BRK-B|2022-03-29|361.39|      1|
| BRK-B|2022-03-28|360.59|      2|
| BRK-B|2022-03-31| 359.0|      3|
| BRK-B|2022-03-30|354.66|      4|
| BRK-B|2022-03-25| 353.9|      5|
|   TSM|2021-02-16|141.61|      1|
|   TSM|2022-01-13|140.75|      2|
|   TSM|2021-02-17|139.21|      3|
|   TSM|2021-02-12|138.92|      4|
|   TSM|2022-01-18|136.75|      5|
|  AAPL|2022-01-04|182.63|      1|
|  AAPL|2021-12-13|181.12|      2|
|  AAPL|2021-12-28|180.16|      3|
|  AAPL|2022-01-05|179.61|      4|
|  AAPL|2021-12-30|179.47|      5|
|  META|2021-09-13|381.68|      1|
|  META|2021-09-02| 381.5|      2|
|  META|2021-09-10|381.36|      3|
|  META|2021-09-08|380.16|      4|
|  META|2021-08-31|379.95|      5|
+------+----------+------+-------+
only showing top 20 rows

