In [8]:
# To do
#Split into multiple scripts
# Apply robust enterprise level logging
# Apply unit testing and integration testing logic.
# Can I cache a particular table to speed up downstream processing?

#install polyonc client
!pip install polygon-api-client

pip show pyspark
#import relevant libraries or modules
from pyspark.sql import SparkSession, Row
from pyspark.sql.functions import col, explode, from_unixtime, regexp_replace, when, unix_timestamp, date_format
from pyspark.sql.types import StructType, StructField, StringType,DoubleType,LongType, ArrayType
from pyspark.sql import functions as F
from pyspark.sql.window import Window
import json
from logging_config  import setup_logging

import os
print(os.listdir())

In [14]:
#Create sparksession
spark = SparkSession.builder.master("local[*]").appName("stock_analysis_2023"). \
        getOrCreate()

def read_csv_into_df(stock_list):
    #Read csv file
    df_stock_name = spark.read.csv(stock_list, header=True, inferSchema=True)
    
    df_stock_name.write.parquet("stock_names_to_analyse_parquet",mode="overwrite")
    df_read=spark.read.parquet("stock_names_to_analyse_parquet")
    df_read.printSchema()
    df = df_stock_name.withColumn('symbol', 
                    when(col("symbol")==  'FB','META')
                .when(col("symbol")== 'ANTM','ELV')
                                  .otherwise(col('symbol'))
                                         )
    
    #Get the list of ticker symbols from the dataframe.
    symbols_to_analyse = [row['symbol'] for row in df.select("symbol").collect()]
    
    #check we have 100
    print(f"Number of tickers: {len(symbols_to_analyse)}")
    return symbols_to_analyse

symbols_to_analyse=read_csv_into_df(stock_list="stocks.csv")

root
 |-- company_name: string (nullable = true)
 |-- symbol: string (nullable = true)

Number of tickers: 100


In [15]:
symbols_to_analyse

['AAPL',
 'MSFT',
 'AMZN',
 'TSLA',
 'GOOGL',
 'GOOG',
 'BRK.B',
 'JNJ',
 'UNH',
 'NVDA',
 'META',
 'PG',
 'JPM',
 'XOM',
 'V',
 'HD',
 'CVX',
 'MA',
 'ABBV',
 'PFE',
 'BAC',
 'KO',
 'COST',
 'PEP',
 'AVGO',
 'LLY',
 'WMT',
 'CSCO',
 'MRK',
 'DIS',
 'VZ',
 'ABT',
 'TMO',
 'CMCSA',
 'ACN',
 'ADBE',
 'MCD',
 'INTC',
 'WFC',
 'CRM',
 'BMY',
 'DHR',
 'PM',
 'LIN',
 'TXN',
 'NKE',
 'QCOM',
 'UNP',
 'RTX',
 'NEE',
 'MDT',
 'AMGN',
 'AMD',
 'T',
 'LOW',
 'UPS',
 'CVS',
 'SPGI',
 'HON',
 'PLD',
 'IBM',
 'ELV',
 'INTU',
 'COP',
 'ORCL',
 'MS',
 'AMT',
 'CAT',
 'TGT',
 'AXP',
 'LMT',
 'DE',
 'GS',
 'SCHW',
 'MO',
 'C',
 'PYPL',
 'AMAT',
 'ADP',
 'BLK',
 'BA',
 'NOW',
 'MDLZ',
 'BKNG',
 'GE',
 'NFLX',
 'ISRG',
 'CB',
 'SBUX',
 'DUK',
 'MMC',
 'ZTS',
 'SYK',
 'MMM',
 'CI',
 'CCI',
 'ADI',
 'SO',
 'GILD',
 'CME']

In [23]:
from polygon import RESTClient
import time #Need to add delays between API request as there is a limit of 5calls/minute
import config
import json
from typing import cast
from urllib3 import HTTPResponse

#Set the constants we need for the API
time_frame='day'
start_date= '2023-01-01'
end_date = '2023-12-31'

#Initialise the API client
client = RESTClient(config.API_KEY)
batch_size=5

#Configure logging
logging.basicConfig(
    level=logging.INFO, #Set the logging level
    format="%(asctime)s - %(levelname)s - %(message)s", #Define log format
    handlers=[
        logging.FileHandler("app.log"), #Log to a file
        logging.StreamHandler() #Log to the console
    ]
)


def fetch_batch(batch):

    try:
        data=[]
        #Requests data for a ticker one by one.
        for ticker in batch:
            print(f"Fetching data for {ticker}...")
            aggregate = client.get_aggs(
                    ticker,
                    1,  #Aggregation multiplier
                    time_frame, 
                    start_date,
                    end_date,
                    raw = True #Requests raw JSON response
                    )
            #Pase the JSON response
            response_data = json.loads(aggregate.data)

            
            if response_data:
                #appends the relevant data we need to a list
                data.append({
                    'ticker':ticker,
                    'results':response_data['results']
                    })
            else:
                print(f"No results found for {ticker}")
        return data
    #Error handling
    except Exception as e:
        logger.error(f"Error fetching data for {batch}: {e}")
        return []

#Retries API call incase of failure due to network issues. 
def fetch_batch_with_rety(batch,retries=2,delay=60.1):
    for attempt in range(retries):
        try:
            return fetch_batch(batch)
        except Exception as e:
            logger.error(f"Error fetching data for batch {batch}:{e}")
            if attempt<retries-1:
                logger.error(f"Retrying in {delay} seconds...")
                time.sleep(delay)
            else:
                logger.error(f"Failed after {retries} attempts.")
                return []
def flat_map(entry):
    try:
        #ensure ticker exists and is not a string
        ticker = entry['ticker']
        if not isinstance(ticker,str):
            raise TypeError(f"Expected 'ticker' to be string, got {type(ticker)}")
        #Process the data field
        if 'data' not in entry or not isinstance(entry['data'],list):
            raise ValueError(f"Expected 'data' to be a list but got {type(entry.get('data'))}")
        
        result = []
        for item in entry['data']:
            c = float(item.get("c",0))
            t = item.get('t',0)


            if not isinstance(t,int) or t<0:
                raise ValueError(f"Invalid timestamp value for ticker {ticker}:{t}. Expected a non-negative integer.")

            result.append((ticker,c,t))
        logging.info(f"Sucessfully processed ticker: {ticker} with {len(entry['data'])} records.")
        return result

    except KeyError as e:
        logging.error(f"Missing key in entry: {entry}, KeyError: {e}")
    except ValueError as e:
        logging.error(f"Value Error in entry: {entry}, ValueError: {e}")
    except TypeError as e:
        logging.error(f"Type Error in entry: {entry}, TypeError: {e}")
    except Exception as e:
        logging.critical(f"Unexpected Error in processing: {entry}, Error as {e}", exc_info=True)
    #Return empty list if there is an error    
    return []


def process_all_tickers_and_write(symbols_to_analyse):
    #Processes all tickers by fetching it in batches
    #I'll ge the list to hold a 1000records then I do a write to disk and cleam memory.
    write_batch_size=1000
    accumulated_result = []
    for i in range(0,len(symbols_to_analyse), batch_size):
        #Iterate through tickers in batches of 5.
        batch=symbols_to_analyse[i:i+batch_size]
        #Call above function on each batch
        batch_data = fetch_batch(batch)

        #Ticker data should in a dictionary format
        for ticker_data in batch_data:
            if not isinstance(ticker_data,dict):
                print(f"Unexpected data format: {ticker_data}")
                continue
            
            #Get the ticker symbol and the associated data as per API definition
            ticker=ticker_data.get("ticker", "")
            results=ticker_data.get("results",[])
            #Create a record containing the ticker symbol and its associated results.
            #Organises data + maintains a consistent format. 
            record={
                "ticker":ticker,
                "data":results,
            }

            
            flattened_result=flat_map(record)
            if flattened_result:
                accumulated_result.extend(flattened_result)

            #write to disk if list exceeds threshold, this prevent OOM errors. 
            if len(accumulated_result)>=write_batch_size:
                write_to_parquet(accumulated_result)
                accumulated_result.clear()

        if accumulated_result:
            writer.write_to_parquet(accumulated_result)

        print("Processing of a batch complete")
        
        print("Waiting 60.1secs")
        time.sleep(60.1) #Sleep for 1 minute

    return

class DynamicWriter():
    def __init__(self, file_path):
        self.file_path=file_path
        self.is_first_write=True

    
    def write_to_parquet(self, flattened_result):
        #Define schema of the dataframe.
        schema=StructType(
            [
                StructField("ticker",StringType(),True),
                StructField("c",DoubleType(),True),
                StructField("t",LongType(),True)
            ]
        )


        write_mode="overwrite" if self.is_first_write else "append"
        spark_df = spark.createDataFrame(flattened_result,schema)
        spark_df.write.mode(write_mode).parquet(self.file_path)
        self.is_first_write=False
        print(f"Data written with {write_mode}")

file_path="output_file.parquet"
writer=DynamicWriter(file_path)
process_all_tickers_and_write(symbols_to_analyse)

Fetching data for AAPL...
Fetching data for MSFT...
Fetching data for AMZN...
Fetching data for TSLA...
Fetching data for GOOGL...


2024-12-04 23:06:09,193 - INFO - Sucessfully processed ticker: AAPL with 250 records.
2024-12-04 23:06:09,193 - INFO - Sucessfully processed ticker: MSFT with 250 records.
2024-12-04 23:06:09,193 - INFO - Sucessfully processed ticker: AMZN with 250 records.
2024-12-04 23:06:09,193 - INFO - Sucessfully processed ticker: TSLA with 250 records.
2024-12-04 23:06:18,756 - INFO - Sucessfully processed ticker: GOOGL with 250 records.


Data written with overwrite
Processing of a batch complete
Waiting 60.1secs
Fetching data for GOOG...
Fetching data for BRK.B...
Fetching data for JNJ...
Fetching data for UNH...
Fetching data for NVDA...


2024-12-04 23:07:28,179 - INFO - Sucessfully processed ticker: GOOG with 250 records.
2024-12-04 23:07:28,195 - INFO - Sucessfully processed ticker: BRK.B with 250 records.
2024-12-04 23:07:28,197 - INFO - Sucessfully processed ticker: JNJ with 250 records.
2024-12-04 23:07:36,372 - INFO - Sucessfully processed ticker: UNH with 250 records.
2024-12-04 23:07:36,372 - INFO - Sucessfully processed ticker: NVDA with 250 records.


Data written with append
Processing of a batch complete
Waiting 60.1secs
Fetching data for META...
Fetching data for PG...
Fetching data for JPM...
Fetching data for XOM...
Fetching data for V...


2024-12-04 23:08:47,943 - INFO - Sucessfully processed ticker: META with 250 records.
2024-12-04 23:08:47,943 - INFO - Sucessfully processed ticker: PG with 250 records.
2024-12-04 23:08:55,557 - INFO - Sucessfully processed ticker: JPM with 250 records.
2024-12-04 23:08:55,557 - INFO - Sucessfully processed ticker: XOM with 250 records.
2024-12-04 23:08:55,565 - INFO - Sucessfully processed ticker: V with 250 records.


Data written with append
Processing of a batch complete
Waiting 60.1secs
Fetching data for HD...
Fetching data for CVX...
Fetching data for MA...
Fetching data for ABBV...
Fetching data for PFE...


2024-12-04 23:10:06,165 - INFO - Sucessfully processed ticker: HD with 250 records.
2024-12-04 23:10:13,386 - INFO - Sucessfully processed ticker: CVX with 250 records.
2024-12-04 23:10:13,386 - INFO - Sucessfully processed ticker: MA with 250 records.
2024-12-04 23:10:13,386 - INFO - Sucessfully processed ticker: ABBV with 250 records.
2024-12-04 23:10:13,401 - INFO - Sucessfully processed ticker: PFE with 250 records.


Processing of a batch complete
Waiting 60.1secs
Fetching data for BAC...
Fetching data for KO...
Fetching data for COST...
Fetching data for PEP...
Fetching data for AVGO...


2024-12-04 23:11:21,728 - INFO - Sucessfully processed ticker: BAC with 250 records.
2024-12-04 23:11:21,728 - INFO - Sucessfully processed ticker: KO with 250 records.
2024-12-04 23:11:21,728 - INFO - Sucessfully processed ticker: COST with 250 records.
2024-12-04 23:11:21,743 - INFO - Sucessfully processed ticker: PEP with 250 records.
2024-12-04 23:11:30,933 - INFO - Sucessfully processed ticker: AVGO with 250 records.


Data written with append
Processing of a batch complete
Waiting 60.1secs
Fetching data for LLY...
Fetching data for WMT...
Fetching data for CSCO...
Fetching data for MRK...
Fetching data for DIS...


2024-12-04 23:12:42,744 - INFO - Sucessfully processed ticker: LLY with 250 records.
2024-12-04 23:12:42,746 - INFO - Sucessfully processed ticker: WMT with 250 records.
2024-12-04 23:12:42,749 - INFO - Sucessfully processed ticker: CSCO with 250 records.
2024-12-04 23:12:50,934 - INFO - Sucessfully processed ticker: MRK with 250 records.
2024-12-04 23:12:50,934 - INFO - Sucessfully processed ticker: DIS with 250 records.


Data written with append
Processing of a batch complete
Waiting 60.1secs
Fetching data for VZ...
Fetching data for ABT...
Fetching data for TMO...
Fetching data for CMCSA...
Fetching data for ACN...


2024-12-04 23:14:01,079 - INFO - Sucessfully processed ticker: VZ with 250 records.
2024-12-04 23:14:01,087 - INFO - Sucessfully processed ticker: ABT with 250 records.
2024-12-04 23:14:10,119 - INFO - Sucessfully processed ticker: TMO with 250 records.
2024-12-04 23:14:10,119 - INFO - Sucessfully processed ticker: CMCSA with 250 records.
2024-12-04 23:14:10,130 - INFO - Sucessfully processed ticker: ACN with 250 records.


Data written with append
Processing of a batch complete
Waiting 60.1secs
Fetching data for ADBE...
Fetching data for MCD...
Fetching data for INTC...
Fetching data for WFC...
Fetching data for CRM...


2024-12-04 23:15:22,065 - INFO - Sucessfully processed ticker: ADBE with 250 records.
2024-12-04 23:15:29,904 - INFO - Sucessfully processed ticker: MCD with 250 records.
2024-12-04 23:15:29,904 - INFO - Sucessfully processed ticker: INTC with 250 records.
2024-12-04 23:15:29,920 - INFO - Sucessfully processed ticker: WFC with 250 records.
2024-12-04 23:15:29,920 - INFO - Sucessfully processed ticker: CRM with 250 records.


Processing of a batch complete
Waiting 60.1secs
Fetching data for BMY...
Fetching data for DHR...
Fetching data for PM...
Fetching data for LIN...
Fetching data for TXN...


2024-12-04 23:16:41,529 - INFO - Sucessfully processed ticker: BMY with 250 records.
2024-12-04 23:16:41,545 - INFO - Sucessfully processed ticker: DHR with 250 records.
2024-12-04 23:16:41,545 - INFO - Sucessfully processed ticker: PM with 250 records.
2024-12-04 23:16:41,551 - INFO - Sucessfully processed ticker: LIN with 250 records.
2024-12-04 23:16:49,246 - INFO - Sucessfully processed ticker: TXN with 250 records.


Data written with append
Processing of a batch complete
Waiting 60.1secs
Fetching data for NKE...
Fetching data for QCOM...
Fetching data for UNP...
Fetching data for RTX...
Fetching data for NEE...


2024-12-04 23:17:59,462 - INFO - Sucessfully processed ticker: NKE with 250 records.
2024-12-04 23:17:59,462 - INFO - Sucessfully processed ticker: QCOM with 250 records.
2024-12-04 23:17:59,478 - INFO - Sucessfully processed ticker: UNP with 250 records.
2024-12-04 23:18:08,259 - INFO - Sucessfully processed ticker: RTX with 250 records.
2024-12-04 23:18:08,259 - INFO - Sucessfully processed ticker: NEE with 250 records.


Data written with append
Processing of a batch complete
Waiting 60.1secs
Fetching data for MDT...
Fetching data for AMGN...
Fetching data for AMD...
Fetching data for T...
Fetching data for LOW...


2024-12-04 23:19:19,438 - INFO - Sucessfully processed ticker: MDT with 250 records.
2024-12-04 23:19:19,438 - INFO - Sucessfully processed ticker: AMGN with 250 records.
2024-12-04 23:19:27,075 - INFO - Sucessfully processed ticker: AMD with 250 records.
2024-12-04 23:19:27,075 - INFO - Sucessfully processed ticker: T with 250 records.
2024-12-04 23:19:27,075 - INFO - Sucessfully processed ticker: LOW with 250 records.


Data written with append
Processing of a batch complete
Waiting 60.1secs




Fetching data for UPS...
Fetching data for CVS...
Fetching data for SPGI...
Fetching data for HON...


2024-12-04 23:20:36,588 - INFO - Sucessfully processed ticker: UPS with 250 records.


Fetching data for PLD...


2024-12-04 23:20:46,122 - INFO - Sucessfully processed ticker: CVS with 250 records.
2024-12-04 23:20:46,122 - INFO - Sucessfully processed ticker: SPGI with 250 records.
2024-12-04 23:20:46,122 - INFO - Sucessfully processed ticker: HON with 250 records.
2024-12-04 23:20:46,122 - INFO - Sucessfully processed ticker: PLD with 250 records.


Processing of a batch complete
Waiting 60.1secs
Fetching data for IBM...
Fetching data for ELV...
Fetching data for INTU...
Fetching data for COP...


2024-12-04 23:21:56,330 - INFO - Sucessfully processed ticker: IBM with 250 records.
2024-12-04 23:21:56,330 - INFO - Sucessfully processed ticker: ELV with 250 records.
2024-12-04 23:21:56,330 - INFO - Sucessfully processed ticker: INTU with 250 records.
2024-12-04 23:21:56,330 - INFO - Sucessfully processed ticker: COP with 250 records.


Fetching data for ORCL...


2024-12-04 23:22:07,700 - INFO - Sucessfully processed ticker: ORCL with 250 records.


Data written with append
Processing of a batch complete
Waiting 60.1secs
Fetching data for MS...
Fetching data for AMT...
Fetching data for CAT...
Fetching data for TGT...
Fetching data for AXP...


2024-12-04 23:23:19,287 - INFO - Sucessfully processed ticker: MS with 250 records.
2024-12-04 23:23:19,294 - INFO - Sucessfully processed ticker: AMT with 250 records.
2024-12-04 23:23:19,294 - INFO - Sucessfully processed ticker: CAT with 250 records.
2024-12-04 23:23:27,819 - INFO - Sucessfully processed ticker: TGT with 250 records.
2024-12-04 23:23:27,819 - INFO - Sucessfully processed ticker: AXP with 250 records.


Data written with append
Processing of a batch complete
Waiting 60.1secs
Fetching data for LMT...
Fetching data for DE...
Fetching data for GS...
Fetching data for SCHW...
Fetching data for MO...


2024-12-04 23:24:39,364 - INFO - Sucessfully processed ticker: LMT with 250 records.
2024-12-04 23:24:39,364 - INFO - Sucessfully processed ticker: DE with 250 records.
2024-12-04 23:24:47,081 - INFO - Sucessfully processed ticker: GS with 250 records.
2024-12-04 23:24:47,081 - INFO - Sucessfully processed ticker: SCHW with 250 records.
2024-12-04 23:24:47,081 - INFO - Sucessfully processed ticker: MO with 250 records.


Data written with append
Processing of a batch complete
Waiting 60.1secs
Fetching data for C...
Fetching data for PYPL...
Fetching data for AMAT...
Fetching data for ADP...
Fetching data for BLK...


2024-12-04 23:25:57,696 - INFO - Sucessfully processed ticker: C with 250 records.
2024-12-04 23:26:06,166 - INFO - Sucessfully processed ticker: PYPL with 250 records.
2024-12-04 23:26:06,166 - INFO - Sucessfully processed ticker: AMAT with 250 records.
2024-12-04 23:26:06,166 - INFO - Sucessfully processed ticker: ADP with 250 records.
2024-12-04 23:26:06,166 - INFO - Sucessfully processed ticker: BLK with 250 records.


Processing of a batch complete
Waiting 60.1secs
Fetching data for BA...
Fetching data for NOW...
Fetching data for MDLZ...
Fetching data for BKNG...


2024-12-04 23:27:18,088 - INFO - Sucessfully processed ticker: BA with 250 records.
2024-12-04 23:27:18,088 - INFO - Sucessfully processed ticker: NOW with 250 records.
2024-12-04 23:27:18,088 - INFO - Sucessfully processed ticker: MDLZ with 250 records.


Fetching data for GE...


2024-12-04 23:27:18,103 - INFO - Sucessfully processed ticker: BKNG with 250 records.
2024-12-04 23:27:26,553 - INFO - Sucessfully processed ticker: GE with 250 records.


Data written with append
Processing of a batch complete
Waiting 60.1secs
Fetching data for NFLX...
Fetching data for ISRG...
Fetching data for CB...
Fetching data for SBUX...
Fetching data for DUK...


2024-12-04 23:28:38,364 - INFO - Sucessfully processed ticker: NFLX with 250 records.
2024-12-04 23:28:38,364 - INFO - Sucessfully processed ticker: ISRG with 250 records.
2024-12-04 23:28:38,364 - INFO - Sucessfully processed ticker: CB with 250 records.
2024-12-04 23:28:46,411 - INFO - Sucessfully processed ticker: SBUX with 250 records.
2024-12-04 23:28:46,411 - INFO - Sucessfully processed ticker: DUK with 250 records.


Data written with append
Processing of a batch complete
Waiting 60.1secs
Fetching data for MMC...
Fetching data for ZTS...
Fetching data for SYK...
Fetching data for MMM...


2024-12-04 23:29:56,797 - INFO - Sucessfully processed ticker: MMC with 250 records.
2024-12-04 23:29:56,813 - INFO - Sucessfully processed ticker: ZTS with 250 records.


Fetching data for CI...


2024-12-04 23:30:04,709 - INFO - Sucessfully processed ticker: SYK with 250 records.
2024-12-04 23:30:04,709 - INFO - Sucessfully processed ticker: MMM with 250 records.
2024-12-04 23:30:04,709 - INFO - Sucessfully processed ticker: CI with 250 records.


Data written with append
Processing of a batch complete
Waiting 60.1secs
Fetching data for CCI...
Fetching data for ADI...
Fetching data for SO...
Fetching data for GILD...
Fetching data for CME...


2024-12-04 23:31:18,521 - INFO - Sucessfully processed ticker: CCI with 250 records.
2024-12-04 23:31:28,312 - INFO - Sucessfully processed ticker: ADI with 250 records.
2024-12-04 23:31:28,326 - INFO - Sucessfully processed ticker: SO with 250 records.
2024-12-04 23:31:28,326 - INFO - Sucessfully processed ticker: GILD with 250 records.
2024-12-04 23:31:28,326 - INFO - Sucessfully processed ticker: CME with 250 records.


Processing of a batch complete
Waiting 60.1secs


In [21]:
def read_data(input_file="output_file.parquet"):
    df=spark.read.parquet(input_file)
    date_col = date_format(from_unixtime(col("t")/1000),"yyyy-MM-dd")

    df = df.withColumn("date",date_col)
    df=df.drop(col("t"))
    df = df.withColumnRenamed("c","close_price")
    return df
df = read_data()
df.select(col("ticker")).distinct().show()

+------+
|ticker|
+------+
|    SO|
|   CME|
|   ADI|
|  GILD|
+------+



In [None]:

def calculate_price_difference( df, ticker_col='ticker', date_col='date',price_col='close_price'):
    #1) Repartition by ticker means grouping/aggregations can be done locally (per ticker). This reduces shuffling and maximises parrallelism.
    #data frame is partitioned and sorted for group operations
    #2) Order by is needed due to make fetching the first and last prices are correct and efficient. 
    #3) Round, filtering, percent calcs completed in lower number of transformtions. Increases readability.
    #4) Wrapping function allows reusuability, modularity, paramaterisation (for different datasets in other code bases), can write unit tests for small datasets
    #and see if it has expected results.

    df = df.repartition(ticker_col).orderBy(ticker_col,date_col)

    window_spec = Window.partitionBy("ticker").orderBy(date_col).rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)
    
    df_diff=df.withColumn("price_first", F.first(price_col).over(window_spec)) \
                .withColumn("price_last", F.last(price_col).over(window_spec)) \
                .withColumn("date_first",F.first(date_col).over(window_spec)) \
                .withColumn("date_last",F.last(date_col).over(window_spec)) \
                .filter((F.col(date_col) == F.col("date_first")) | (F.col(date_col) == F.col("date_last")))
    
    
    df_diff=df_diff.withColumn("price_diff", F.round(F.col("price_last")-F.col("price_first"),2)) \
                .withColumn("price_diff_percent", F.round(col("price_diff")*100/col("price_first"),2))
    
    df_diff_percent=df_diff.select("ticker", "date_first","date_first", "price_first", "date_last","price_last","price_diff","price_diff_percent"). \
                    distinct(). \
                    orderBy(F.col("price_diff_percent").desc())
    
    return df_diff_percent

#####################################################

In [None]:
###############################
#QUESTION 1 ANSWER: Which stock has had the greatest relative increase in price in this period? 'NVDA'
###############################

def greatest_relative_increase(df_diff_percent):
    #Get the first row and stock name.
    stock_name = df_diff_percent.collect()[0]["ticker"]
    stock_name =df_stock_name.filter(col("symbol")==stock_name).show(truncate=False)
    return stock_name
    
df_diff_percent= calculate_price_difference(df)
df_diff_percent.show(truncate=False)
greatest_relative_increase(df_diff_percent)

In [133]:
#Start Question 2

In [134]:
#Find how much this percentage increase would increase your intial portfolio
#At the end of the year.
def final_portfolio_value(df_diff_percent):
    #Find the average percentage increase over the group of stocks in 2023.
    df_diff_percent_average=df_diff_percent.agg(F.avg("price_diff_percent").alias("avg_percent_gain"))
    df_diff_percent_average=df_diff_percent_average.collect()[0]["avg_percent_gain"]
    
    initial_portfolio_value=1000000
    final_portfolio_value=initial_portfolio_value*(1+df_diff_percent_average/100)
    final_portfolio_value = round(final_portfolio_value,2)
    return final_portfolio_value
#QUESTION 2 ANSWER: If you had invested $1 million at the beginning of this period by purchasing $10,000 worth of shares in every company in the list equally, 
#how much would you have today? Technical note, you can assume that it is possible 
#to purchase fractional shares. Ans: $1682181.14
final_portfolio_value(df_diff_percent)

In [137]:
#Start Question 3

In [171]:
def monthly_cagr(df, start_month, end_month):

    #Filter dataframe for data on these dates. Make 2 seperate dataframes for each of these dates. 
    df_start=df.filter(col("date")==start_month)
    df_start=df_start.withColumnRenamed("close_price","start_price")
    df_start=df_start.withColumnRenamed("date","start_month")
    if df_start.isEmpty():
        raise ValueError("df_start dataset is empty")

    #Shows the end date and end prince we are interested in. 
    df_end=df.filter(col("date")==end_month)
    df_end=df_end.withColumnRenamed("close_price","end_price")
    df_end=df_end.withColumnRenamed("date","end_month")
    if df_end.isEmpty():
        raise ValueError("df_end dataset is empty")

    #Join these dataframes on primary key ticker. 
    df_jan_jun=df_start.join(df_end,on="ticker")

    #For extensibilty for the month constant in below calc
    from datetime import datetime
    start_date=datetime.strptime(start_month,"%Y-%m-%d")
    end_date=datetime.strptime(end_month,"%Y-%m-%d")
    from dateutil.relativedelta import relativedelta
    difference=relativedelta(end_date,start_date)
    difference_months=difference.years*12+difference.months

    #calculate the monthly Compounded annual growth rate.
    #Assumption of inclusive of January and June months
    end_val = col("end_price")
    start_val = col("start_price")
    months = difference_months+1
    
    calculation = ((end_val/start_val)**(1/months))-1
    df_jan_jun=df_jan_jun.withColumn("CAGR over defined period", calculation)

    #Sort datafram in "percent_gain" decending order
    df_jan_jun=df_jan_jun.sort(col("CAGR over defined period"), ascending=False)

    #Pick the top result
    stock_with_greatest_monthly_CAGR=df_jan_jun.collect()[0]["ticker"]
    stock_with_greatest_monthly_CAGR=df_stock_name.filter(col("symbol")==stock_with_greatest_monthly_CAGR)
    stock_with_greatest_monthly_CAGR.show(truncate=False)
    return print(stock_with_greatest_monthly_CAGR.collect()[0]["company_name"])
    
df = read_data()
stock_with_greatest_monthly_CAGR=monthly_cagr(df, start_month="2023-01-03", end_month="2023-06-30")

+------------+------+
|company_name|symbol|
+------------+------+
|Apple Inc.  |AAPL  |
+------------+------+

Apple Inc.


In [183]:
#Start Question 4
def greatest_decrease_in_price(df):

    df_week = df.withColumn("week_start",F.date_trunc("week",col("date")))
    #Window partition by ticker and week
    window_spec=Window.partitionBy("ticker", "week_start").orderBy("week_start")
    df_week=df_week.withColumn("start_price",F.first("close_price").over(window_spec)) \
                .withColumn("end_price",F.last("close_price").over(window_spec))
    df_week_final=df_week.select("ticker","week_start","start_price","end_price").distinct()
    df_week_final=df_week_final.withColumn("price_diff",F.round(F.col("end_price")-F.col("start_price"),2))
    df_week_final = df_week_final.sort(col("price_diff"),ascending=True)
    df_week_final.show()
    greatest_price_drop_stock = df_week_final.collect()[0]["ticker"]
    greatest_price_drop_week = df_week_final.collect()[0]["week_start"]
    print(greatest_price_drop_stock)
    print(greatest_price_drop_week)
    greatest_price_drop_stock_name= df_stock_name.filter(col("symbol")==greatest_price_drop_stock)
    greatest_price_drop_stock_name.show()
    return greatest_price_drop_stock_name.collect()[0]["company_name"]
greatest_decrease_in_price(df)

+------+-------------------+-----------+---------+----------+
|ticker|         week_start|start_price|end_price|price_diff|
+------+-------------------+-----------+---------+----------+
|  MSFT|2023-01-02 00:00:00|     239.58|   224.93|    -14.65|
|  AAPL|2023-07-31 00:00:00|     196.45|   181.99|    -14.46|
|  MSFT|2023-02-13 00:00:00|     271.32|   258.06|    -13.26|
|  MSFT|2023-09-18 00:00:00|     329.06|   317.01|    -12.05|
|  AAPL|2023-09-04 00:00:00|      189.7|   178.18|    -11.52|
|  MSFT|2023-06-05 00:00:00|     335.94|   326.79|     -9.15|
|  MSFT|2023-08-07 00:00:00|     330.11|   321.01|      -9.1|
|  MSFT|2023-03-06 00:00:00|     256.87|   248.59|     -8.28|
|  MSFT|2023-07-31 00:00:00|     335.92|   327.78|     -8.14|
|  MSFT|2023-09-11 00:00:00|     337.94|   330.22|     -7.72|
|  MSFT|2023-08-14 00:00:00|     324.04|   316.48|     -7.56|
|  MSFT|2023-07-24 00:00:00|     345.11|   338.37|     -6.74|
|  MSFT|2023-10-16 00:00:00|     332.64|   326.67|     -5.97|
|  AAPL|

'Microsoft Corporation'

In [197]:
def greatest_percent_drop_stock(df):
    df_week = df.withColumn("week_start",F.date_trunc("week",col("date")))
    #Window partition by ticker and week
    window_spec=Window.partitionBy("ticker", "week_start").orderBy("week_start")
    df_week=df_week.withColumn("start_price",F.first("close_price").over(window_spec)) \
                  .withColumn("end_price",F.last("close_price").over(window_spec))
    df_week_final=df_week.select("ticker","week_start","start_price","end_price").distinct()
    df_week_final=df_week_final.withColumn("price_diff",F.round(F.col("end_price")-F.col("start_price"),2))
    calc= (col("price_diff"))*100/col("start_price")
    df_week_final_percent=df_week_final.withColumn("percent_drop",calc). \
                            sort(col("percent_drop"),ascending=True)
    
    
    greatest_percent_drop_stock = df_week_final_percent.collect()[0]["ticker"]    
    greatest_percent_drop_week =df_week_final_percent.collect()[0]["week_start"]
    
    print(greatest_percent_drop_week)
    
    greatest_percent_drop_stock_full_name = df_stock_name.filter(col("symbol")==greatest_percent_drop_stock)
    greatest_percent_drop_stock_full_name = greatest_percent_drop_stock_full_name.collect()[0]["company_name"]
    print(greatest_percent_drop_stock_full_name)
    return
    
greatest_percent_drop_stock(df)

2023-07-31 00:00:00
Apple Inc.
