In [13]:
import pandas as pd
import numpy as np
from datetime import date
import yfinance as yf

In [2]:
import ssl

ssl._create_default_https_context = ssl._create_unverified_context

### Question 1: [Index] S&P 500 Stocks Added to the Index
Which year had the highest number of additions?

Using the list of S&P 500 companies from [Wikipedia's S&P 500 companies page](https://en.wikipedia.org/wiki/List_of_S%26P_500_companies), download the data including the year each company was added to the index.

Hint: you can use pandas.read_html to scrape the data into a DataFrame.

Steps:

* Create a DataFrame with company tickers, names, and the year they were added.
* Extract the year from the addition date and calculate the number of stocks added each year.
* Which year had the highest number of additions (1957 doesn't count, as it was the year when the S&P 500 index was founded)? Write down this year as your answer (the most recent one, if you have several records).

Context:

"Following the announcement, all four new entrants saw their stock prices rise in extended trading on Friday" - recent examples of S&P 500 additions include DASH, WSM, EXE, TKO in 2025 ([Nasdaq article](https://www.nasdaq.com/articles/sp-500-reshuffle-dash-tko-expe-wsm-join-worth-buying)).

Additional: How many current S&P 500 stocks have been in the index for more than 20 years? When stocks are added to the S&P 500, they usually experience a price bump as investors and index funds buy shares following the announcement.

In [7]:
df = pd.read_html('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies')[0]
df['year'] = pd.to_datetime(df['Date added']).dt.year

In [17]:
df.head()

Unnamed: 0,Symbol,Security,GICS Sector,GICS Sub-Industry,Headquarters Location,Date added,CIK,Founded,year
0,MMM,3M,Industrials,Industrial Conglomerates,"Saint Paul, Minnesota",1957-03-04,66740,1902,1957
1,AOS,A. O. Smith,Industrials,Building Products,"Milwaukee, Wisconsin",2017-07-26,91142,1916,2017
2,ABT,Abbott Laboratories,Health Care,Health Care Equipment,"North Chicago, Illinois",1957-03-04,1800,1888,1957
3,ABBV,AbbVie,Health Care,Biotechnology,"North Chicago, Illinois",2012-12-31,1551152,2013 (1888),2012
4,ACN,Accenture,Information Technology,IT Consulting & Other Services,"Dublin, Ireland",2011-07-06,1467373,1989,2011


In [19]:
# number of stocks added each year
df['year'].value_counts().head()

year
1957    53
2016    23
2017    23
2019    22
2008    17
Name: count, dtype: int64

Which year had the highest number of additions?\
-> 2016, 2017 (most recent)

In [30]:
# How many current S&P 500 stocks have been in the index for more than 20 years?
df[(date.today().year - df['year']) > 20]['Symbol'].count()

np.int64(219)

In [32]:
# total
df['Symbol'].count()

np.int64(503)

### Question 2. [Macro] Indexes YTD (as of 1 May 2025)
How many indexes (out of 10) have better year-to-date returns than the US (S&P 500) as of May 1, 2025?

Using Yahoo Finance World Indices data, compare the year-to-date (YTD) performance (1 January-1 May 2025) of major stock market indexes for the following countries:

United States - S&P 500 (^GSPC)\
China - Shanghai Composite (000001.SS)\
Hong Kong - HANG SENG INDEX (^HSI)\
Australia - S&P/ASX 200 (^AXJO)\
India - Nifty 50 (^NSEI)\
Canada - S&P/TSX Composite (^GSPTSE)\
Germany - DAX (^GDAXI)\
United Kingdom - FTSE 100 (^FTSE)\
Japan - Nikkei 225 (^N225)\
Mexico - IPC Mexico (^MXX)\
Brazil - Ibovespa (^BVSP)\
Hint: use start_date='2025-01-01' and end_date='2025-05-01' when downloading daily data in yfinance

Context:

[Global Valuations: Who's Cheap, Who's Not?](https://simplywall.st/article/beyond-the-us-global-markets-after-yet-another-tariff-update) article suggests "Other regions may be growing faster than the US and you need to diversify."

Reference: Yahoo Finance World Indices - https://finance.yahoo.com/world-indices/

Additional: How many of these indexes have better returns than the S&P 500 over 3, 5, and 10 year periods? Do you see the same trend? Note: For simplicity, ignore currency conversion effects.)

YTD(%) = ((latest close - first close) / first close) * 100%

In [22]:
ticker_obj = yf.Ticker("^GSPC")
us_dax_daily = ticker_obj.history(start = '2025-01-01', end = '2025-05-01')
us_ytd = ((us_dax_daily['Close'].values[-1] - us_dax_daily['Close'].values[0]) / us_dax_daily['Close'].values[0]) * 100
us_ytd

np.float64(-5.10330074824504)

In [24]:
ans = 0
better_returns = []
obj_names = ["000001.SS","^HSI","^AXJO","^NSEI","^GSPTSE","^GDAXI","^FTSE","^N225","^MXX","^BVSP"]
for obj_name in obj_names:
    ticker_obj = yf.Ticker(obj_name)
    dax_daily = ticker_obj.history(start = '2025-01-01', end = '2025-05-01')
    obj_ytd = ((dax_daily['Close'].values[-1] - dax_daily['Close'].values[0]) / dax_daily['Close'].values[0]) * 100
    print(obj_name, obj_ytd)
    if obj_ytd > us_ytd:
        ans += 1
        better_returns.append(obj_name)
print(ans, better_returns) 

000001.SS 0.5048172440500021
^HSI 12.720017835921466
^AXJO -0.9145002952478493
^NSEI 2.4904237409784913
^GSPTSE -0.22612571792915093
^GDAXI 12.34637838149953
^FTSE 2.8425901908435485
^N225 -8.297930804200304
^MXX 13.049444457570319
^BVSP 12.438709677419354
9 ['000001.SS', '^HSI', '^AXJO', '^NSEI', '^GSPTSE', '^GDAXI', '^FTSE', '^MXX', '^BVSP']


How many of these indexes have better returns than the S&P 500 over 3, 5, and 10 year periods? Do you see the same trend? 

In [3]:
# For 3 years
ticker_obj = yf.Ticker("^GSPC")
us_dax_daily = ticker_obj.history(start = '2022-01-01', end = '2025-05-01')
us_ytd = ((us_dax_daily['Close'].values[-1] - us_dax_daily['Close'].values[0]) / us_dax_daily['Close'].values[0]) * 100

ans = 0
better_returns = []
obj_names = ["000001.SS","^HSI","^AXJO","^NSEI","^GSPTSE","^GDAXI","^FTSE","^N225","^MXX","^BVSP"]
for obj_name in obj_names:
    ticker_obj = yf.Ticker(obj_name)
    dax_daily = ticker_obj.history(start = '2022-01-01', end = '2025-05-01')
    obj_ytd = ((dax_daily['Close'].values[-1] - dax_daily['Close'].values[0]) / dax_daily['Close'].values[0]) * 100
    if obj_ytd > us_ytd:
        ans += 1
        better_returns.append(obj_name)
print(ans, better_returns) 

5 ['^NSEI', '^GSPTSE', '^GDAXI', '^N225', '^BVSP']


In [4]:
# For 5 years
ticker_obj = yf.Ticker("^GSPC")
us_dax_daily = ticker_obj.history(start = '2020-01-01', end = '2025-05-01')
us_ytd = ((us_dax_daily['Close'].values[-1] - us_dax_daily['Close'].values[0]) / us_dax_daily['Close'].values[0]) * 100

ans = 0
better_returns = []
obj_names = ["000001.SS","^HSI","^AXJO","^NSEI","^GSPTSE","^GDAXI","^FTSE","^N225","^MXX","^BVSP"]
for obj_name in obj_names:
    ticker_obj = yf.Ticker(obj_name)
    dax_daily = ticker_obj.history(start = '2020-01-01', end = '2025-05-01')
    obj_ytd = ((dax_daily['Close'].values[-1] - dax_daily['Close'].values[0]) / dax_daily['Close'].values[0]) * 100
    if obj_ytd > us_ytd:
        ans += 1
        better_returns.append(obj_name)
print(ans, better_returns) 

1 ['^NSEI']


In [5]:
# For 10 years
ticker_obj = yf.Ticker("^GSPC")
us_dax_daily = ticker_obj.history(start = '2015-01-01', end = '2025-05-01')
us_ytd = ((us_dax_daily['Close'].values[-1] - us_dax_daily['Close'].values[0]) / us_dax_daily['Close'].values[0]) * 100

ans = 0
better_returns = []
obj_names = ["000001.SS","^HSI","^AXJO","^NSEI","^GSPTSE","^GDAXI","^FTSE","^N225","^MXX","^BVSP"]
for obj_name in obj_names:
    ticker_obj = yf.Ticker(obj_name)
    dax_daily = ticker_obj.history(start = '2015-01-01', end = '2025-05-01')
    obj_ytd = ((dax_daily['Close'].values[-1] - dax_daily['Close'].values[0]) / dax_daily['Close'].values[0]) * 100
    if obj_ytd > us_ytd:
        ans += 1
        better_returns.append(obj_name)
print(ans, better_returns) 

2 ['^NSEI', '^BVSP']


India - Nifty 50 (^NSEI) is better for all years than SP USA

### Question 3. [Index] S&P 500 Market Corrections Analysis
Calculate the median duration (in days) of significant market corrections in the S&P 500 index.

For this task, define a correction as an event when a stock index goes down by more than 5% from the closest all-time high maximum.

Steps:

* Download S&P 500 historical data (1950-present) using yfinance
* Identify all-time high points (where price exceeds all previous prices)
* For each pair of consecutive all-time highs, find the minimum price in between
* Calculate drawdown percentages: (high - low) / high × 100
* Filter for corrections with at least 5% drawdown
* Calculate the duration in days for each correction period
* Determine the 25th, 50th (median), and 75th percentiles for correction durations

Context:

Investors often wonder about the typical length of market corrections when deciding "when to buy the dip" ([Reddit discussion](https://www.reddit.com/r/investing/comments/1jrqnte/when_are_you_buying_the_dip/?rdt=64135)).
[A Wealth of Common Sense - How Often Should You Expect a Stock Market Correction?](https://awealthofcommonsense.com/2022/01/how-often-should-you-expect-a-stock-market-correction/)\
Hint (use this data to compare with your results): Here is the list of top 10 largest corrections by drawdown:

* 2007-10-09 to 2009-03-09: 56.8% drawdown over 517 days
* 2000-03-24 to 2002-10-09: 49.1% drawdown over 929 days
* 1973-01-11 to 1974-10-03: 48.2% drawdown over 630 days
* 1968-11-29 to 1970-05-26: 36.1% drawdown over 543 days
* 2020-02-19 to 2020-03-23: 33.9% drawdown over 33 days
* 1987-08-25 to 1987-12-04: 33.5% drawdown over 101 days
* 1961-12-12 to 1962-06-26: 28.0% drawdown over 196 days
* 1980-11-28 to 1982-08-12: 27.1% drawdown over 622 days
* 2022-01-03 to 2022-10-12: 25.4% drawdown over 282 days
* 1966-02-09 to 1966-10-07: 22.2% drawdown over 240 days

In [59]:
ticker_obj = yf.Ticker("^GSPC")
sp500 = ticker_obj.history(start = '1950-01-01')

# Identify all-time high points (where price exceeds all previous prices)
# Create a column for previous rolling max (excluding current day)
sp500["prev_max"] = sp500["Close"].cummax().shift(1)
sp500["all_time_high"] = sp500["Close"] > sp500["prev_max"]

In [60]:
# For each pair of consecutive all-time highs, find the minimum price in between
all_time_highs = sp500[sp500["all_time_high"]].copy()
high_dates = all_time_highs.index.tolist()

results = []
for i in range(len(high_dates) - 1):
    start = high_dates[i]
    end = high_dates[i + 1]
    between = sp500.loc[start:end].iloc[1:-1]  # exclude the high dates themselves

    if not between.empty:
        min_row = between["Close"].idxmin()
        results.append({
            "start_high_date": start,
            "next_high_date": end,
            "min_price": sp500.loc[min_row, "Close"],
            "min_price_date": min_row,
            "start_price": sp500.loc[start, "Close"],
        })

min_between_highs = pd.DataFrame(results)
min_between_highs.tail()

Unnamed: 0,start_high_date,next_high_date,min_price,min_price_date,start_price
652,2024-11-11 00:00:00-05:00,2024-11-26 00:00:00-05:00,5870.620117,2024-11-15 00:00:00-05:00,6001.350098
653,2024-11-26 00:00:00-05:00,2024-11-29 00:00:00-05:00,5998.740234,2024-11-27 00:00:00-05:00,6021.629883
654,2024-12-04 00:00:00-05:00,2024-12-06 00:00:00-05:00,6075.109863,2024-12-05 00:00:00-05:00,6086.490234
655,2024-12-06 00:00:00-05:00,2025-01-23 00:00:00-05:00,5827.040039,2025-01-10 00:00:00-05:00,6090.27002
656,2025-01-23 00:00:00-05:00,2025-02-18 00:00:00-05:00,5994.569824,2025-02-03 00:00:00-05:00,6118.709961


In [61]:
# Calculate drawdown percentages: (high - low) / high × 100
min_between_highs["drawdown_pct"] = ((min_between_highs["start_price"] - min_between_highs["min_price"]) / 
                                     min_between_highs["start_price"]) * 100

In [62]:
# Filter for corrections with at least 5% drawdown
min_between_highs = min_between_highs[min_between_highs['drawdown_pct'] >= 5]

In [63]:
# Calculate the duration in days for each correction period
min_between_highs['days'] = (min_between_highs["min_price_date"] - min_between_highs["start_high_date"]).dt.days

In [71]:
min_between_highs.sort_values(by='drawdown_pct', ascending=False).head(10)

Unnamed: 0,start_high_date,next_high_date,min_price,min_price_date,start_price,drawdown_pct,days
454,2007-10-09 00:00:00-04:00,2013-03-28 00:00:00-04:00,676.530029,2009-03-09 00:00:00-04:00,1565.150024,56.775388,517
449,2000-03-24 00:00:00-05:00,2007-05-30 00:00:00-04:00,776.76001,2002-10-09 00:00:00-04:00,1527.459961,49.146948,928
212,1973-01-11 00:00:00-05:00,1980-07-17 00:00:00-04:00,62.279999,1974-10-03 00:00:00-04:00,120.239998,48.203593,629
199,1968-11-29 00:00:00-05:00,1972-03-06 00:00:00-05:00,69.290001,1970-05-26 00:00:00-04:00,108.370003,36.061641,542
580,2020-02-19 00:00:00-05:00,2020-08-18 00:00:00-04:00,2237.399902,2020-03-23 00:00:00-04:00,3386.149902,33.92496,32
298,1987-08-25 00:00:00-04:00,1989-07-26 00:00:00-04:00,223.919998,1987-12-04 00:00:00-05:00,336.769989,33.509515,101
137,1961-12-12 00:00:00-05:00,1963-09-03 00:00:00-04:00,52.32,1962-06-26 00:00:00-04:00,72.639999,27.973568,195
225,1980-11-28 00:00:00-05:00,1982-11-03 00:00:00-05:00,102.419998,1982-08-12 00:00:00-04:00,140.520004,27.113582,621
626,2022-01-03 00:00:00-05:00,2024-01-19 00:00:00-05:00,3577.030029,2022-10-12 00:00:00-04:00,4796.560059,25.425097,281
182,1966-02-09 00:00:00-05:00,1967-05-04 00:00:00-04:00,73.199997,1966-10-07 00:00:00-04:00,94.059998,22.177335,239


In [66]:
# Determine the 25th, 50th (median), and 75th percentiles for correction durations
min_between_highs['days'].describe()

count     71.000000
mean     112.929577
std      178.898960
min        7.000000
25%       21.500000
50%       39.000000
75%       89.000000
max      928.000000
Name: days, dtype: float64

### Question 4. [Stocks] Earnings Surprise Analysis for Amazon (AMZN)
Calculate the median 2-day percentage change in stock prices following positive earnings surprises days.

Steps:

* Load earnings data from CSV (ha1_Amazon.csv) containing earnings dates, EPS estimates, and actual EPS. pandas.read_csv("ha1_Amazon.csv", delimiter=';') 
* Download complete historical price data using yfinance
* Calculate 2-day percentage changes for all historical dates: for each sequence of 3 consecutive trading days (Day 1, Day 2, Day 3), compute the return as Close_Day3 / Close_Day1 - 1. (Assume Day 2 may correspond to the earnings announcement.)
* Identify positive earnings surprises (where "actual EPS > estimated EPS"). Both fields should be present in the file. You should obtain 36 data points for use in the descriptive analysis (median) later.
* Calculate 2-day percentage changes following positive earnings surprises. Show your answer in % (closest number to the 2nd digit): return * 100.0
* (Optional) Compare the median 2-day percentage change for positive surprises vs. all historical dates. Do you see the difference?

Context: Earnings announcements, especially when they exceed analyst expectations, can significantly impact stock prices in the short term.

Reference: Yahoo Finance earnings calendar - https://finance.yahoo.com/calendar/earnings?symbol=AMZN

Additional: Is there a correlation between the magnitude of the earnings surprise and the stock price reaction? Does the market react differently to earnings surprises during bull vs. bear markets?)

In [3]:
df = pd.read_csv("ha1_Amazon.csv", delimiter=';')
df['Reported EPS'] = pd.to_numeric(df['Reported EPS'], errors='coerce')
df['EPS Estimate'] = pd.to_numeric(df['EPS Estimate'], errors='coerce')
df['Earnings Date'] = pd.to_datetime(df['Earnings Date'].str.replace(r' [A-Z]{3,4}$', '', regex=True))
print('min date in data:', df['Earnings Date'].min())

min date in data: 1997-07-10 00:00:00


In [4]:
# Calculate 2-day percentage changes for all historical dates: for each sequence of 3 consecutive trading days (Day 1, Day 2, Day 3), 
# compute the return as Close_Day3 / Close_Day1 - 1
ticker_obj = yf.Ticker("AMZN")
df_hist = ticker_obj.history(start="1997-07-01")
df_hist['2d_change'] = (df_hist['Close'] / df_hist['Close'].shift(2)) - 1
df_hist.reset_index(inplace=True)
df_hist['date'] = df_hist['Date'].dt.date

In [5]:
df_hist.tail()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,2d_change,date
7027,2025-06-06 00:00:00-04:00,212.399994,213.869995,210.5,213.570007,39832500,0.0,0.0,0.030594,2025-06-06
7028,2025-06-09 00:00:00-04:00,214.75,217.850006,212.880005,216.979996,38102500,0.0,0.0,0.043625,2025-06-09
7029,2025-06-10 00:00:00-04:00,216.779999,217.690002,214.149994,217.610001,31303300,0.0,0.0,0.018916,2025-06-10
7030,2025-06-11 00:00:00-04:00,217.410004,218.399994,212.889999,213.199997,39218800,0.0,0.0,-0.017421,2025-06-11
7031,2025-06-12 00:00:00-04:00,211.75,213.580002,211.330002,212.690002,10472180,0.0,0.0,-0.022609,2025-06-12


In [6]:
# Identify positive earnings surprises (where "actual EPS > estimated EPS")
pos_sur = df[(df['Reported EPS'] > df['EPS Estimate'])]
pos_sur['date'] = pos_sur['Earnings Date'].dt.date
pos_sur.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pos_sur['date'] = pos_sur['Earnings Date'].dt.date


(33, 7)

In [7]:
df_hist['date_prev'] = df_hist['date'].shift(1)
pos_sur_comb = pos_sur.merge(df_hist, how='left', left_on=['date'], right_on=['date_prev'])

In [8]:
pos_sur_comb.head()

Unnamed: 0,Symbol,Company,Earnings Date,EPS Estimate,Reported EPS,Surprise (%),date_x,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,2d_change,date_y,date_prev
0,AMZN,"Amazon.com, Inc.",2024-04-30 16:00:00,0.83,0.98,17.91,2024-04-30,2024-05-01 00:00:00-04:00,181.639999,185.149994,176.559998,179.0,94645100,0.0,0.0,-0.010831,2024-05-01,2024-04-30
1,AMZN,"Amazon.com, Inc.",2024-02-01 16:00:00,0.8,1.0,24.55,2024-02-01,2024-02-02 00:00:00-05:00,169.190002,172.5,167.330002,171.809998,117154900,0.0,0.0,0.107023,2024-02-02,2024-02-01
2,AMZN,"Amazon.com, Inc.",2023-10-26 16:00:00,0.58,0.94,60.85,2023-10-26,2023-10-27 00:00:00-04:00,126.199997,130.020004,125.519997,127.739998,125309300,0.0,0.0,0.052311,2023-10-27,2023-10-26
3,AMZN,"Amazon.com, Inc.",2023-08-03 16:00:00,0.35,0.65,85.73,2023-08-03,2023-08-04 00:00:00-04:00,141.059998,143.630005,139.320007,139.570007,152938700,0.0,0.0,0.088605,2023-08-04,2023-08-03
4,AMZN,"Amazon.com, Inc.",2023-04-27 16:00:00,0.21,0.31,46.36,2023-04-27,2023-04-28 00:00:00-04:00,107.730003,109.480003,104.330002,105.449997,130565000,0.0,0.0,0.004477,2023-04-28,2023-04-27


In [9]:
pos_sur_comb['2d_change'].median() * 100

np.float64(2.5702644458225343)

In [10]:
df_hist['2d_change'].median() * 100

np.float64(0.16685546345690527)

In [11]:
# Is there a correlation between the magnitude of the earnings surprise and the stock price reaction? 
# Does the market react differently to earnings surprises during bull vs. bear markets?
pos_sur_comb['surprise_magnitude'] = (
    pos_sur_comb['Reported EPS'] - pos_sur_comb['EPS Estimate']
) / pos_sur_comb['EPS Estimate']

In [15]:
np.corrcoef(pos_sur_comb['surprise_magnitude'], pos_sur_comb['2d_change'])[0][1]

np.float64(-0.022931207670493947)