**Note**: Before running this notebook, create the `sp500_db_final` database in AWS Athena by running the following command in the query editor:
>CREATE DATABASE sp500_db_final

####  Run this cell to set up and start your interactive session.


In [1]:
# Glue interactive-session magics: stop any session that is still running, then
# configure the next one (15-minute idle timeout, Glue 4.0, five G.1X workers).
# The new session is created when the Python code below is first executed.
%stop_session
%idle_timeout 15
%glue_version 4.0
%worker_type G.1X
%number_of_workers 5

import sys
from pyspark.sql.functions import *
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.dynamicframe import DynamicFrame
from awsglue.job import Job

import boto3 
from pyspark.sql.functions import monotonically_increasing_id 
from pyspark.sql.window import Window 
from pyspark.sql.functions import row_number
from pyspark.sql.types import DecimalType

# Shared handles used by every section below.
# Reuse the active SparkContext if one exists, otherwise create it.
sc = SparkContext.getOrCreate()
# GlueContext wraps the SparkContext and provides the DynamicFrame readers and sinks.
glueContext = GlueContext(sc)
# Plain SparkSession, used where the Spark DataFrame reader is called directly.
spark = glueContext.spark_session
# NOTE(review): `job` is not referenced by any later cell visible in this notebook.
job = Job(glueContext)

Stopping session: 0fc21c48-db9f-4b09-ac56-62b47cebcfc1
Stopped session.
Current idle_timeout is 15 minutes.
idle_timeout has been set to 15 minutes.
Setting Glue version to: 4.0
Previous worker type: G.1X
Setting new worker type to: G.1X
Previous number of workers: 5
Setting new number of workers to: 5
Trying to create a Glue session for the kernel.
Session Type: glueetl
Worker Type: G.1X
Number of Workers: 5
Idle Timeout: 15
Session ID: 4c9ecdce-f987-4e3d-b8f5-3d2382df7244
Applying the following default arguments:
--glue_kernel_version 1.0.10
--enable-glue-datacatalog true
Waiting for session 4c9ecdce-f987-4e3d-b8f5-3d2382df7244 to get into ready status...
Session 4c9ecdce-f987-4e3d-b8f5-3d2382df7244 has been created.



## Final - Handle SP500 Market Cap Estimations

In [2]:
# Location of the S&P 500 total-market-cap CSV in S3
bucket_name = 'sp500-historical-analysis-project'
file_path = 'final_data/sp500_total_market_cap.csv'

# Read the CSV file (with a header row) from S3 into a Glue DynamicFrame
csv_read_options = {
    "connection_type": "s3",
    "connection_options": {"paths": [f"s3://{bucket_name}/{file_path}"]},
    "format": "csv",
    "format_options": {'withHeader': True},
}
dyf_original = glueContext.create_dynamic_frame.from_options(**csv_read_options)




In [3]:
# Inspect the schema as read from the CSV (every column arrives as a string).
dyf_original.printSchema()

root
|-- Date: string
|-- Closing Index Value: string
|-- Total Market Capitalization: string


In [5]:
# Convert the Glue DynamicFrame to a Spark DataFrame and type the columns:
#   - "Date" is parsed as yyyy-MM-dd and renamed to "date"
#   - year / month / day are derived from the parsed date
#   - both numeric columns become decimal(20, 2)
two_decimal_type = DecimalType(20, 2)

df = (
    dyf_original.toDF()
    .withColumn("Date", to_date(col("Date"), "yyyy-MM-dd"))
    .withColumnRenamed("Date", "date")
    .withColumn("year", year(col("date")))
    .withColumn("month", month(col("date")))
    .withColumn("day", dayofmonth(col("date")))
    .withColumn("Closing Index Value", col("Closing Index Value").cast(two_decimal_type))
    .withColumn("Total Market Capitalization", col("Total Market Capitalization").cast(two_decimal_type))
    .orderBy("date")
)

df.printSchema()

root
 |-- date: date (nullable = true)
 |-- Closing Index Value: decimal(20,2) (nullable = true)
 |-- Total Market Capitalization: decimal(20,2) (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- day: integer (nullable = true)


In [7]:
# Sanity check on the number of rows after the transformation.
df.count() #7043 rows expected

7043


In [8]:
# The Glue sink used below writes DynamicFrames, so convert the typed Spark
# DataFrame back and confirm that the column types survived the conversion.
dyf = DynamicFrame.fromDF(df, glueContext, "dyf") # Convert Spark DataFrame to Glue DynamicFrame 
dyf.printSchema() # Show the schema of the DynamicFrame 

root
|-- date: date
|-- Closing Index Value: decimal
|-- Total Market Capitalization: decimal
|-- year: int
|-- month: int
|-- day: int


In [10]:
# Delete everything under the output prefix first to prevent conflict errors on write.
import boto3 

s3 = boto3.client('s3')
# Define the bucket and prefix (folder path).
# The trailing "/" restricts the match to this folder, so another prefix that merely
# starts with the same characters is never touched (consistent with the other sections).
bucket_name = 'sp500-historical-analysis-project'
prefix = 'PARQUET_sp500_daily_expectations_from_index_values/'

# list_objects_v2 returns at most 1000 keys per call, so walk every page of the
# listing. Each page therefore also fits into one delete_objects request, which
# accepts at most 1000 keys.
delete_keys = []
paginator = s3.get_paginator('list_objects_v2')
for page in paginator.paginate(Bucket=bucket_name, Prefix=prefix):
    page_keys = [{'Key': obj['Key']} for obj in page.get('Contents', [])]
    if not page_keys:
        continue
    result = s3.delete_objects(Bucket=bucket_name, Delete={'Objects': page_keys, 'Quiet': True})
    # delete_objects reports per-key failures in the response instead of raising.
    errors = result.get('Errors', [])
    if errors:
        raise RuntimeError(f"Failed to delete {len(errors)} objects from {bucket_name}/{prefix}: {errors[:5]}")
    delete_keys.extend(page_keys)

print(f"Deleted {len(delete_keys)} objects from {bucket_name}/{prefix}")

Deleted 0 objects from sp500-historical-analysis-project/PARQUET_sp500_daily_expectations_from_index_values


In [11]:
# Glue S3 sink: snappy-compressed output under the given path, with the
# Data Catalog table updated as part of the write.
sink_settings = dict(
    path='s3://sp500-historical-analysis-project/PARQUET_sp500_daily_expectations_from_index_values/',
    connection_type="s3",
    updateBehavior="UPDATE_IN_DATABASE",
    compression="snappy",
    enableUpdateCatalog=True,
    transformation_ctx="s3output",
)
s3output = glueContext.getSink(**sink_settings)

# Target catalog database and table, then the output format.
s3output.setCatalogInfo(catalogDatabase="sp500_db_final", catalogTableName="PARQUET_sp500_marketcap_expectations")
s3output.setFormat("glueparquet")

# Write the DynamicFrame; the call's return value is the cell output.
s3output.writeFrame(dyf)

<awsglue.dynamicframe.DynamicFrame object at 0x7fb864daee30>


## Final - Handle Trading Days

In [2]:
# Location of the trading-days CSV in S3
bucket_name = 'sp500-historical-analysis-project'
file_path = 'final_data/trading_days_with_constituents.csv'

# Read the CSV file (with a header row) from S3 into a Glue DynamicFrame
csv_read_options = {
    "connection_type": "s3",
    "connection_options": {"paths": [f"s3://{bucket_name}/{file_path}"]},
    "format": "csv",
    "format_options": {'withHeader': True},
}
dyf_original = glueContext.create_dynamic_frame.from_options(**csv_read_options)




In [7]:
# Convert the Glue DynamicFrame to a Spark DataFrame and shape it for output:
#   - "trading_date" is parsed as yyyy-MM-dd, with year / month / day derived from it
#   - "number_of_constituents" is replaced by the integer column "num_constituents"
#   - the date-range columns are dropped
df = (
    dyf_original.toDF()
    .withColumn("trading_date", to_date(col("trading_date"), "yyyy-MM-dd"))
    .withColumn("year", year(col("trading_date")))
    .withColumn("month", month(col("trading_date")))
    .withColumn("day", dayofmonth(col("trading_date")))
    .withColumn("num_constituents", col("number_of_constituents").cast("int"))
    .drop("date_range_start", "date_range_end", "number_of_constituents")
    .orderBy("trading_date")
)

df.printSchema()

root
 |-- trading_date: date (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- day: integer (nullable = true)
 |-- num_constituents: integer (nullable = true)


In [6]:
# Optional data-quality checks (disabled): uncomment to list the rows in which
# the constituent count or the trading date is null.
# df.filter(col("num_constituents").isNull()).show()
# df.filter(col("trading_date").isNull()).show()

In [11]:
# Number of trading-day rows after the transformation.
df.count() 

7044


In [12]:
# The Glue sink used below writes DynamicFrames, so convert the typed Spark
# DataFrame back and confirm that the column types survived the conversion.
dyf = DynamicFrame.fromDF(df, glueContext, "dyf") # Convert Spark DataFrame to Glue DynamicFrame 
dyf.printSchema() # Show the schema of the DynamicFrame 

root
|-- trading_date: date
|-- year: int
|-- month: int
|-- day: int
|-- num_constituents: int


In [13]:
# Delete everything under the output prefix first to prevent conflict errors on write.
import boto3 

s3 = boto3.client('s3')
# Define the bucket and prefix (folder path)
bucket_name = 'sp500-historical-analysis-project'
prefix = 'PARQUET_final_trading_days/'

# list_objects_v2 returns at most 1000 keys per call, so walk every page of the
# listing. Each page therefore also fits into one delete_objects request, which
# accepts at most 1000 keys.
delete_keys = []
paginator = s3.get_paginator('list_objects_v2')
for page in paginator.paginate(Bucket=bucket_name, Prefix=prefix):
    page_keys = [{'Key': obj['Key']} for obj in page.get('Contents', [])]
    if not page_keys:
        continue
    result = s3.delete_objects(Bucket=bucket_name, Delete={'Objects': page_keys, 'Quiet': True})
    # delete_objects reports per-key failures in the response instead of raising.
    errors = result.get('Errors', [])
    if errors:
        raise RuntimeError(f"Failed to delete {len(errors)} objects from {bucket_name}/{prefix}: {errors[:5]}")
    delete_keys.extend(page_keys)

print(f"Deleted {len(delete_keys)} objects from {bucket_name}/{prefix}")

Deleted 0 objects from sp500-historical-analysis-project/PARQUET_final_trading_days/


In [14]:
# Glue S3 sink: snappy-compressed output under the given path, with the
# Data Catalog table updated as part of the write.
sink_settings = dict(
    path='s3://sp500-historical-analysis-project/PARQUET_final_trading_days/',
    connection_type="s3",
    updateBehavior="UPDATE_IN_DATABASE",
    compression="snappy",
    enableUpdateCatalog=True,
    transformation_ctx="s3output",
)
s3output = glueContext.getSink(**sink_settings)

# Target catalog database and table, then the output format.
s3output.setCatalogInfo(catalogDatabase="sp500_db_final", catalogTableName="PARQUET_trading_days")
s3output.setFormat("glueparquet")

# Write the DynamicFrame; the call's return value is the cell output.
s3output.writeFrame(dyf)

<awsglue.dynamicframe.DynamicFrame object at 0x7f9293819cc0>


## Final - Handle Company Profiles

In [15]:
# Location of the company-profile JSON files in S3
bucket_name = 'sp500-historical-analysis-project'
folder_path = 'final_data/company_profiles/'

# Read the JSON files under the folder (recursing into it) into one Glue DynamicFrame
profiles_read_options = {
    "connection_type": "s3",
    "connection_options": {"paths": [f"s3://{bucket_name}/{folder_path}"], "recurse": True},
    "format": "json",
}
dyf_original = glueContext.create_dynamic_frame.from_options(**profiles_read_options)




In [20]:
spark_df = dyf_original.toDF() #convert Glue DF to PySpark DF

# Recover the record's index from the name of the S3 object it was read from.
# The file stem is the last path segment without its ".json" extension, and the
# index is the part of the stem before the first "_".
# The pattern is anchored to the end of the path and escapes the dot: an unescaped
# "." in a Spark regex matches any character, and a fixed path-segment position
# would pick the wrong segment for files found in sub-folders by the recursive read.
file_stem = regexp_extract(input_file_name(), r"([^/]+)\.json$", 1)
spark_df = spark_df.withColumn("index", split(file_stem, "_").getItem(0).cast("int"))

# Publish the description as the string column "company_details" and drop the original.
spark_df = spark_df.withColumn("company_details", col("description").cast("string"))
spark_df = spark_df.drop('description') #Note: Spark resolves column names case-insensitively by default

spark_df = spark_df.orderBy("index") 
spark_df.show(2)

+------+------------+-----------------+--------------------+-----------+------+--------+-----+--------------------+
|ticker|company_name|           sector|            industry|is_delisted|source|exchange|index|     company_details|
+------+------------+-----------------+--------------------+-----------+------+--------+-----+--------------------+
|   CRH|     CRH plc|  Basic Materials|Construction Mate...|      false|   fmp|    NYSE|    0|CRH plc, together...|
|  CVNA| Carvana Co.|Consumer Cyclical|  Auto - Dealerships|      false|   fmp|    NYSE|    1|Carvana Co., toge...|
+------+------------+-----------------+--------------------+-----------+------+--------+-----+--------------------+
only showing top 2 rows


In [18]:
# Row count after the transformation next to the count of the frame as read from S3;
# the transformation only adds/drops columns, so the two numbers should match.
spark_df.count(), dyf_original.count() #should match

(1177, 1177)


In [19]:
# Confirm the final column set and types of the company-profile DataFrame.
spark_df.printSchema()

root
 |-- ticker: string (nullable = true)
 |-- company_name: string (nullable = true)
 |-- sector: string (nullable = true)
 |-- industry: string (nullable = true)
 |-- is_delisted: boolean (nullable = true)
 |-- source: string (nullable = true)
 |-- exchange: string (nullable = true)
 |-- index: integer (nullable = true)
 |-- company_details: string (nullable = true)


In [24]:
# The Glue sink used below writes DynamicFrames, so convert the typed Spark
# DataFrame back and confirm that the column types survived the conversion.
dyf = DynamicFrame.fromDF(spark_df, glueContext, "dyf") # Convert Spark DataFrame to Glue DynamicFrame 
dyf.printSchema() # Show the schema of the DynamicFrame 

root
|-- ticker: string
|-- company_name: string
|-- sector: string
|-- industry: string
|-- is_delisted: boolean
|-- source: string
|-- exchange: string
|-- index: int
|-- company_details: string


In [25]:
# Delete everything under the output prefix first to prevent conflict errors on write.
import boto3 

s3 = boto3.client('s3')
# Define the bucket and prefix (folder path)
bucket_name = 'sp500-historical-analysis-project'
prefix = 'PARQUET_final_company_profiles/'

# list_objects_v2 returns at most 1000 keys per call, so walk every page of the
# listing. Each page therefore also fits into one delete_objects request, which
# accepts at most 1000 keys.
delete_keys = []
paginator = s3.get_paginator('list_objects_v2')
for page in paginator.paginate(Bucket=bucket_name, Prefix=prefix):
    page_keys = [{'Key': obj['Key']} for obj in page.get('Contents', [])]
    if not page_keys:
        continue
    result = s3.delete_objects(Bucket=bucket_name, Delete={'Objects': page_keys, 'Quiet': True})
    # delete_objects reports per-key failures in the response instead of raising.
    errors = result.get('Errors', [])
    if errors:
        raise RuntimeError(f"Failed to delete {len(errors)} objects from {bucket_name}/{prefix}: {errors[:5]}")
    delete_keys.extend(page_keys)

print(f"Deleted {len(delete_keys)} objects from {bucket_name}/{prefix}")

Deleted 0 objects from sp500-historical-analysis-project/PARQUET_final_company_profiles/


In [26]:
# Glue S3 sink: snappy-compressed output under the given path, with the
# Data Catalog table updated as part of the write.
sink_settings = dict(
    path='s3://sp500-historical-analysis-project/PARQUET_final_company_profiles/',
    connection_type="s3",
    updateBehavior="UPDATE_IN_DATABASE",
    compression="snappy",
    enableUpdateCatalog=True,
    transformation_ctx="s3output",
)
s3output = glueContext.getSink(**sink_settings)

# Target catalog database and table, then the output format.
s3output.setCatalogInfo(catalogDatabase="sp500_db_final", catalogTableName="PARQUET_company_profiles")
s3output.setFormat("glueparquet")

# Write the DynamicFrame; the call's return value is the cell output.
s3output.writeFrame(dyf)

<awsglue.dynamicframe.DynamicFrame object at 0x7f92930bf850>


## Final - Handle Company Market Cap Data

In [3]:
# Read with the plain Spark JSON reader in multiline mode. Author's note: reading
# these files with glueContext.create_dynamic_frame.from_options() loses json data
# (3310722 vs 3311837 expected).
# Takes about 2-3 minutes to run.
multiline_json_reader = spark.read.option("multiline", "true")
raw_df = multiline_json_reader.json("s3://sp500-historical-analysis-project/final_data/company_market_cap_data/")




In [4]:
# Row count of the raw market-cap data, kept as the baseline for the checks below.
raw_df.count() #3,469,774 rows

3469774


In [5]:
# Recover index and ticker from the name of the S3 object each row was read from.
# The file stem is the last path segment without its ".json" extension; splitting
# the stem on "_" gives the index (first part) and the ticker (second part).
# The pattern is anchored to the end of the path and escapes the dot: an unescaped
# "." in a Spark regex matches any character, and a fixed path-segment position
# breaks as soon as the folder depth changes.
df = raw_df
file_stem = regexp_extract(input_file_name(), r"([^/]+)\.json$", 1)
stem_parts = split(file_stem, "_")
df = df.withColumn("index", stem_parts.getItem(0).cast("int"))
df = df.withColumn("ticker", stem_parts.getItem(1))

#handle dates
df = df.withColumn("date", to_date(col("date"), "yyyy-MM-dd")) # Convert to date type 
# Extract year, month, and day 
df = df.withColumn("year", year(col("date"))) 
df = df.withColumn("month", month(col("date"))) 
df = df.withColumn("day", dayofmonth(col("date")))

#handle marketcap to only have 2 decimals
df = df.withColumn("market_cap", col("market_cap").cast(DecimalType(20, 2)))

# Sort by date parts, then index. Author's observation: sorting before the write
# cut writing time by over 90%.
df = df.orderBy("year", "month", "day", "index") 
df.show(5)

+----------+-------------+-----+------+----+-----+---+
|      date|   market_cap|index|ticker|year|month|day|
+----------+-------------+-----+------+----+-----+---+
|1998-01-02|7225245173.09|  136|   BKR|1998|    1|  2|
|1998-01-02|7897793176.00|  151|     D|1998|    1|  2|
|1998-01-02|3461966943.69|  206|   HUM|1998|    1|  2|
|1998-01-02|3082839627.78|  222|   FCX|1998|    1|  2|
|1998-01-02|6488518654.73|  294|   TMO|1998|    1|  2|
+----------+-------------+-----+------+----+-----+---+
only showing top 5 rows


In [6]:
# Row count after the transformation, for comparison with the raw row count.
df.count()

3469774


In [9]:
# Duplicate check: compare the total row count with the number of distinct rows,
# where two rows are duplicates only if they agree in every column.
total_rows = df.count()
distinct_rows = df.distinct().count()

has_duplicates = total_rows != distinct_rows
if has_duplicates:
    print(f"Duplicates found: {total_rows - distinct_rows} duplicate rows.")
else:
    print("No duplicate rows found.")

No duplicate rows found.


In [10]:
# The Glue sink used below writes DynamicFrames, so convert the typed Spark
# DataFrame back and confirm that the column types survived the conversion.
dyf = DynamicFrame.fromDF(df, glueContext, "dyf") # Convert Spark DataFrame to Glue DynamicFrame 
dyf.printSchema() # Show the schema of the DynamicFrame 

root
|-- date: date
|-- market_cap: decimal
|-- index: int
|-- ticker: string
|-- year: int
|-- month: int
|-- day: int


In [11]:
# Row count of the DynamicFrame, for comparison with the Spark DataFrame count.
dyf.count()

3469774


In [13]:
# Delete everything under the output prefix first to prevent conflict errors on write.
import boto3 

s3 = boto3.client('s3')
# Define the bucket and prefix (folder path)
bucket_name = 'sp500-historical-analysis-project'
prefix = 'PARQUET_final_company_market_cap_data/'

# list_objects_v2 returns at most 1000 keys per call, so walk every page of the
# listing. Each page therefore also fits into one delete_objects request, which
# accepts at most 1000 keys.
delete_keys = []
paginator = s3.get_paginator('list_objects_v2')
for page in paginator.paginate(Bucket=bucket_name, Prefix=prefix):
    page_keys = [{'Key': obj['Key']} for obj in page.get('Contents', [])]
    if not page_keys:
        continue
    result = s3.delete_objects(Bucket=bucket_name, Delete={'Objects': page_keys, 'Quiet': True})
    # delete_objects reports per-key failures in the response instead of raising.
    errors = result.get('Errors', [])
    if errors:
        raise RuntimeError(f"Failed to delete {len(errors)} objects from {bucket_name}/{prefix}: {errors[:5]}")
    delete_keys.extend(page_keys)

print(f"Deleted {len(delete_keys)} objects from {bucket_name}/{prefix}")

Deleted 0 objects from sp500-historical-analysis-project/PARQUET_final_company_market_cap_data/


In [14]:
# A variant of this sink with partitionKeys=["year", "month"] was tried earlier;
# the version below writes the data without partitions.
# Glue S3 sink: snappy-compressed output under the given path, with the
# Data Catalog table updated as part of the write.
sink_settings = dict(
    path='s3://sp500-historical-analysis-project/PARQUET_final_company_market_cap_data/',
    connection_type="s3",
    updateBehavior="UPDATE_IN_DATABASE",
    compression="snappy",
    enableUpdateCatalog=True,
    transformation_ctx="s3output",
)
s3output = glueContext.getSink(**sink_settings)

# Target catalog database and table, then the output format.
s3output.setCatalogInfo(catalogDatabase="sp500_db_final", catalogTableName="PARQUET_marketcaps")
s3output.setFormat("glueparquet")

# Write the DynamicFrame; the call's return value is the cell output.
s3output.writeFrame(dyf)

<awsglue.dynamicframe.DynamicFrame object at 0x7f9ee64a2b30>


## Stop the Glue interactive session

In [7]:
%stop_session

Stopping session: b0a97ea2-cbb3-4de6-8809-0f33aafd520d
Stopped session.
