# 논문 구현
[(Patel, 2023) The Santa Claus Rally in U.S. Stock Market Returns](https://openurl.ebsco.com/EPDB%3Agcd%3A15%3A17783471/detailv2?sid=ebsco%3Aplink%3Ascholar&id=ebsco%3Agcd%3A163860960&crl=c)

In [34]:
import warnings
# Silence every warning for the rest of the session. This is a blanket
# filter, so library warnings (e.g. from pandas) are hidden as well.
warnings.filterwarnings('ignore')

## 데이터셋

### 데이터 수집
- daily prices
- dates : **1999.12.31** ~ 2021.12.31
- symbols : S&P500, NASDAQ
- from : yahoo finance website

In [35]:
# Sample window handed to the price download (ISO date strings).
date_start, date_end = '1999-12-31', '2021-12-31'

# Display name -> ticker symbol for each index under study.
symbols = {'S&P500': '^GSPC', 'NASDAQ': '^IXIC'}

In [36]:
# # Install yfinance
# !pip install yfinance

import yfinance as yf

In [37]:
# Download the daily price history of every index, keyed by display name.
# NOTE(review): yfinance documents `end` as exclusive, so the row for
# date_end itself may be missing from the result -- confirm.
df_prices = {}
for name, symbol in symbols.items():
    df_prices[name] = yf.download(symbol, start=date_start, end=date_end)

[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed


### 데이터 전처리
- daily stock returns 계산
  - 5536 observations
  - dates : **2000.01.31** ~ 2021.12.31
  - standard formula
    $$ dailyStockReturns(\%) = \frac{(closing Stock Price_t - closing Stock Price_{t-1})}{closing Stock Price_{t-1}} \times 100$$

In [38]:
col_price = 'Close'
col_rtn = 'Daily Returns'
for symbol, raw in df_prices.items():
    # Drop the unneeded columns. The explicit copy makes the frame
    # independent of the downloaded data, so adding a column below never
    # writes into a slice of another frame.
    prices = raw[[col_price]].copy()

    # Daily percentage return: (P_t - P_{t-1}) / P_{t-1} * 100.
    # pct_change() has no previous close for the first row; fillna(0)
    # turns any such missing return into 0.
    prices[col_rtn] = prices[col_price].pct_change().fillna(0) * 100

    # Re-binding an existing key does not change the dict size, so it is
    # safe while iterating over items().
    df_prices[symbol] = prices

    

### 데이터 정보(요약통계량)

#### Table 1
> SUMMARY STATISTICS OF DAILY PERCENTAGE RETURNS OF THE TWO U.S. STOCK
INDICES (JANUARY 1, 2000 TO DECEMBER 31, 2021)

In [39]:
import pandas as pd

In [40]:
# Table 1: summary statistics of the daily percentage returns, one column
# per index and one row per statistic.
index_names = ['Mean', 'Standard Deviation', 'Minimum', 'Maximum', 'Range', 'Observations']
table1 = pd.DataFrame(index = index_names)

for symbol in df_prices.keys():
    rtns = df_prices[symbol][col_rtn]
    # Use the vectorised Series reductions (computed once each) instead of
    # the builtin min()/max(), which iterate element by element in Python
    # and do not skip NaN.
    lowest, highest = rtns.min(), rtns.max()
    # Values are listed in the same order as index_names.
    table1[symbol] = [
        rtns.mean(),
        rtns.std(),
        lowest,
        highest,
        highest - lowest,
        len(rtns),
    ]

In [41]:
# Show floats with 4 decimal places
pd.set_option('display.float_format', '{:.4f}'.format)

table1

Unnamed: 0,S&P500,NASDAQ
Mean,0.029,0.037
Standard Deviation,1.2369,1.5826
Minimum,-11.9841,-12.3213
Maximum,11.58,14.1732
Range,23.5641,26.4945
Observations,5536.0,5536.0


## EMPIRICAL RESULTS

### 산타랠리 기간과 그 외 기간의 일 평균 수익률 비교

- Santa Claus Rally Days
  - the last five trading days in December and the first two trading days in January of the next year
  - 154 days (= 7 days X 22 years)
- Remaining Days
  - 5382 days (= 5536 days - 154 days)

In [42]:
# Label every trading day as a Santa Claus Rally day or a remaining day.
col_period = 'period'
for symbol, frame in df_prices.items():
    frame[col_period] = 'Remaining Days'
    for year in range(2000, 2022):
        # Last 5 trading days of December
        december_days = frame.loc[f'{year}-12'].index[-5:]
        # First 2 trading days of January
        january_days = frame.loc[f'{year}-01'].index[:2]
        # Write through a single .loc call on the frame itself. A chained
        # `.loc[...].iloc[...] = value` assigns to a temporary object and
        # can leave the frame unchanged; naming the column explicitly also
        # removes the dependence on it being the last column.
        frame.loc[december_days.union(january_days), col_period] = 'Santa Claus Rally Days'

In [43]:
# Number of trading days under each period label for the S&P500
df_prices['S&P500'].loc[:, 'period'].value_counts()

period
Remaining Days            5382
Santa Claus Rally Days     154
Name: count, dtype: int64

In [44]:
# Number of trading days under each period label for the NASDAQ
df_prices['NASDAQ'].loc[:, 'period'].value_counts()

period
Remaining Days            5382
Santa Claus Rally Days     154
Name: count, dtype: int64

#### Table 2 : T-Test & Mann-Whitney U Test
> DAILY MEAN PERCENTAGE RETURNS OF SANTA CLAUS RALLY DAYS AND REMAINING DAYS (JANUARY 1, 2000 TO DECEMBER 31, 2021)

In [None]:
from scipy.stats import ttest_ind, mannwhitneyu

In [None]:
# Empty skeleton of Table 2: one column per index, every cell ''.
# Rows are listed top to bottom; '' labels are blank spacer rows and the
# plain-string labels are section headings.
table2 = pd.DataFrame(
    index=[
        *[
            (period, stat)
            for period in ('Santa Claus Rally Days', 'Remaining Days')
            for stat in ('Mean', 'Observations')
        ],
        '',
        'Significance Test Statistics',
        'T-test',
        *[
            ('T-test', stat)
            for stat in ('T-test Value', 'Significance', 'Degree of Freedom')
        ],
        '',
        'Mann-Whitney U Test',
        *[('Mann-Whitney', stat) for stat in ('Z', 'Significance')],
    ],
    columns=df_prices.keys(),
).fillna('')

In [None]:
# Fill Table 2: group means/counts plus one-sided t-test and
# Mann-Whitney U test of rally-day returns against remaining-day returns.
for symbol in df_prices.keys():
    df = df_prices[symbol]
    is_santa = df[col_period] == 'Santa Claus Rally Days'
    rtns_santa = df.loc[is_santa, col_rtn]
    rtns_remaining = df.loc[~is_santa, col_rtn]

    # T-Test (pooled variance; H1: rally-day mean is greater)
    res_ttest = ttest_ind(
        rtns_santa, rtns_remaining,
        equal_var=True, alternative='greater'
    )

    # Mann-Whitney U Test (H1: rally-day returns are greater)
    res_utest = mannwhitneyu(
        rtns_santa, rtns_remaining,
        alternative='greater'
    )

    # Row position in table2 -> value for this index.
    # NOTE(review): row 12 is labelled 'Z' but receives `statistic`,
    # presumably the U statistic rather than a z-score -- confirm.
    cells = {
        0: rtns_santa.mean(),
        1: len(rtns_santa),
        2: rtns_remaining.mean(),
        3: len(rtns_remaining),
        7: res_ttest.statistic,
        8: res_ttest.pvalue,
        9: int(res_ttest.df),
        12: res_utest.statistic,
        13: res_utest.pvalue,
    }

    # Write each cell with one positional indexer on table2 itself. The
    # chained form `table2.iloc[i][symbol] = v` assigns to a temporary row
    # object and can leave table2 unchanged.
    col = table2.columns.get_loc(symbol)
    for row, value in cells.items():
        table2.iat[row, col] = value

In [None]:
# Show floats with 4 decimal places
pd.set_option('display.float_format', '{:.4f}'.format)

table2

#### Table 3 : OLS Regression
> OLS REGRESSIONS OF SANTA CLAUS RALLY DAYS AND REMAINING DAYS (JANUARY 1, 2000 TO DECEMBER 31, 2021)

$$RETURN = \beta_0 + \beta_1 SantaClausRally + \varepsilon_t$$

- $RETURN$ : daily percentage returns of each of the two U.S. stock indices
- $SantaClausRally$ : a dummy variable
  - if Santa Claus Rally days, $SantaClausRally = 1$
  - else (Remaining days), $SantaClausRally = 0$

- estimated coefficient
  - $\beta_1$ : difference between the mean daily percentage returns of Santa Claus Rally days and the mean daily percentage returns of the remaining days of the year
    - ($\beta_1$ > 0) and (statistically significant) → "Santa Claus Rally is prevalent in U.S. stock returns"
  - $\beta_0$ : mean daily percentage return of the remaining days of the year (the expected $RETURN$ when $SantaClausRally = 0$)

In [None]:
import statsmodels.api as sm
import numpy as np

In [None]:
# Empty skeleton of Table 3: for each index one row of coefficients, one
# of t-statistics and one of p-values; every cell starts as ''.
table3 = pd.DataFrame(
    index=[
        index_name + suffix
        for index_name in ('S&P500', 'NASDAQ')
        for suffix in ('', '(T-statistics)', '(p-values)')
    ],
    columns=['Constant', 'Santa Rally Days', 'F-Value', 'Significance'],
).fillna('')

In [None]:
# Fill Table 3: regress daily returns on the Santa Claus Rally dummy for
# each index over the full sample.
for symbol in df_prices.keys():
    df = df_prices[symbol]

    y = df[col_rtn]
    # Dummy regressor: 1 on Santa Claus Rally days, 0 otherwise.
    x = np.where(df[col_period]=='Santa Claus Rally Days', 1, 0)
    x = sm.add_constant(x)

    model = sm.OLS(y, x)
    results = model.fit()

    # First two table columns hold the per-coefficient values.
    coef_cols = table3.columns[:2]

    table3.loc[symbol] = results.params.values.tolist() + [results.fvalue, results.f_pvalue]
    # Address row and columns in one .loc call. The chained form
    # `table3.loc[row].iloc[:2] = ...` assigns to a temporary row object
    # and can leave table3 unchanged.
    table3.loc[symbol+'(T-statistics)', coef_cols] = results.tvalues.values.tolist()
    table3.loc[symbol+'(p-values)', coef_cols] = results.pvalues.values.tolist()

In [None]:
# Show floats with 3 decimal places
pd.set_option('display.float_format', '{:.3f}'.format)

table3

### 기간별 산타랠리 기간과 그 외 기간의 일 평균 수익률 비교

- Panel A : 2000.01.01 ~ 2009.12.31
  - 2008~2009 : U.S. stock market had negative returns
- Panel B : 2010.01.01 ~ 2021.12.31

In [None]:
# Tag every row with its calendar sub-period (panel).
col_period_sub = 'panel'
for symbol in df_prices.keys():
    # Default: Panel B (2010 onwards).
    df_prices[symbol][col_period_sub] = 'B'
    # Panel A: everything up to the end of 2009. The slice is left open at
    # the start so that a row dated before 2000 (the download window starts
    # at 1999-12-31) is not left behind in Panel B.
    df_prices[symbol].loc[:'2009', col_period_sub] = 'A'

#### Table 4
> DAILY MEAN PERCENTAGE RETURNS OF SANTA CLAUS RALLY DAYS AND REMAINING DAYS BY TWO CALENDAR SUB-PERIODS

In [None]:
# Empty skeleton of Table 4: one column per index; for each panel a
# heading row followed by mean/observation rows for rally days, remaining
# days and the panel total.
table4 = pd.DataFrame(
    columns=df_prices.keys(),
    index=[
        label
        for panel in ('Panel A', 'Panel B')
        for label in (
            panel,
            'Santa Claus Rally Days (Mean)',
            'Santa Claus Rally Days (Observations)',
            'Remaining Days (Mean)',
            'Remaining Days (Observations)',
            'Total (Mean)',
            'Total (Observations)',
        )
    ],
)

In [None]:
# Fill Table 4: per panel, mean return and number of observations for
# rally days, remaining days and all days of that panel.
for symbol, df in df_prices.items():
    column = []
    for panel in ('A', 'B'):
        in_panel = df[df[col_period_sub] == panel]
        is_santa = in_panel[col_period] == 'Santa Claus Rally Days'

        santa = in_panel.loc[is_santa, col_rtn]
        remaining = in_panel.loc[~is_santa, col_rtn]
        total = in_panel[col_rtn]

        # The leading '' fills the panel heading row.
        column += [
            '',
            santa.mean(), len(santa),
            remaining.mean(), len(remaining),
            total.mean(), len(total),
        ]
    table4[symbol] = column

In [None]:
# Show floats with 4 decimal places
pd.set_option('display.float_format', '{:.4f}'.format)

table4

#### Table 5 : OLS Regression

> OLS REGRESSIONS OF SANTA CLAUS RALLY DAYS AND REMAINING DAYS BY TWO SUB-PERIODS

In [None]:
# Table 5 starts as an empty frame that only fixes the column layout.
table5 = pd.DataFrame(columns=['Constant', 'Santa Rally Days', 'F-Value', 'Significance'])

In [None]:
# Fill Table 5: for each index, run the Santa Claus Rally dummy regression
# separately on each calendar sub-period and append the result rows.
for symbol in df_prices.keys():
    df = df_prices[symbol]
    columns = ['Constant', 'Santa Rally Days', 'F-Value', 'Significance']

    # Panel A: January 1, 2000, to December 31, 2009
    # Panel B: January 1, 2010, to December 31, 2021
    panel_tables = []
    for panel_key, panel_label in (('A', 'Panel A'), ('B', 'Panel B')):
        df_panel = df[df[col_period_sub]==panel_key]

        y = df_panel[col_rtn]
        # Dummy regressor: 1 on Santa Claus Rally days, 0 otherwise.
        x = np.where(df_panel[col_period]=='Santa Claus Rally Days', 1, 0)
        x = sm.add_constant(x)

        model = sm.OLS(y, x)
        results = model.fit()

        # Four rows per panel: heading, coefficients, t-statistics, p-values.
        res = pd.DataFrame(
            columns = columns,
            index = [
                panel_label, symbol, symbol+'(T-statistics)', symbol+'(p-values)'
            ]
        ).fillna('')
        res.loc[symbol] = results.params.values.tolist() + [results.fvalue, results.f_pvalue]
        # Address row and columns in one .loc call. The chained form
        # `res.loc[row].iloc[:2] = ...` assigns to a temporary row object
        # and can leave `res` unchanged.
        res.loc[symbol+'(T-statistics)', columns[:2]] = results.tvalues.values.tolist()
        res.loc[symbol+'(p-values)', columns[:2]] = results.pvalues.values.tolist()
        panel_tables.append(res)

    # concatenate (Panel A rows first, then Panel B rows)
    table5 = pd.concat([table5, *panel_tables], axis=0)

In [None]:
# Show floats with 3 decimal places
pd.set_option('display.float_format', '{:.3f}'.format)

# Regroup the rows by panel instead of by index. This assumes the 16-row
# layout built by the fill loop (4 rows per index and panel); the second
# copy of each panel heading (positions 8 and 12) is dropped.
table5 = table5.iloc[
    [0, 1, 2, 3, 9, 10, 11]          # first panel: heading + both indices
    + [4, 5, 6, 7, 13, 14, 15]       # second panel: heading + both indices
]
table5

### 경기별 산타랠리 기간과 그 외 기간의 일 평균 수익률 비교

- Panel A : Expansion Periods
  1. January 2000 ~ March 2001
  2. December 2001 ~ December 2007
  3. July 2009 ~ February 2020
  4. May 2020 ~ December 2021
- Panel B : Recession Periods
  1. April 2001 ~ November 2001
  2. January 2008 ~ June 2009
  3. March 2020 ~ April 2020

In [None]:
# Tag every row with the economic cycle it falls in.
col_period_cycle = 'economic cycle'
for symbol in df_prices.keys():
    frame = df_prices[symbol]

    # Everything not listed below counts as an expansion period.
    frame[col_period_cycle] = 'Expansion Periods'

    # Recession periods as (first month, last month) label slices.
    for first_month, last_month in (
        ('2001-04', '2001-11'),
        ('2008-01', '2009-06'),
        ('2020-03', '2020-04'),
    ):
        frame.loc[first_month:last_month, col_period_cycle] = 'Recession Periods'

#### Table 6
> DAILY MEAN PERCENTAGE RETURNS OF SANTA CLAUS RALLY DAYS AND REMAINING DAYS BY ECONOMIC CYCLES

In [None]:
# Empty skeleton of Table 6: one column per index; for each economic cycle
# a heading row followed by mean/observation rows for rally days,
# remaining days and the cycle total.
table6 = pd.DataFrame(
    columns=df_prices.keys(),
    index=[
        label
        for cycle in ('Expansion Periods', 'Recession Periods')
        for label in (
            cycle,
            'Santa Claus Rally Days (Mean)',
            'Santa Claus Rally Days (Observations)',
            'Remaining Days (Mean)',
            'Remaining Days (Observations)',
            'Total (Mean)',
            'Total (Observations)',
        )
    ],
)

In [None]:
# Fill Table 6: per economic cycle, mean return and number of observations
# for rally days, remaining days and all days of that cycle.
for symbol, df in df_prices.items():
    column = []
    for cycle in ('Expansion Periods', 'Recession Periods'):
        in_cycle = df[df[col_period_cycle] == cycle]
        is_santa = in_cycle[col_period] == 'Santa Claus Rally Days'

        santa = in_cycle.loc[is_santa, col_rtn]
        remaining = in_cycle.loc[~is_santa, col_rtn]
        total = in_cycle[col_rtn]

        # The leading '' fills the cycle heading row.
        column += [
            '',
            santa.mean(), len(santa),
            remaining.mean(), len(remaining),
            total.mean(), len(total),
        ]
    table6[symbol] = column

In [None]:
# Show floats with 4 decimal places
pd.set_option('display.float_format', '{:.4f}'.format)

table6

#### Table 7 : OLS Regression

> OLS REGRESSIONS OF SANTA CLAUS RALLY DAYS AND REMAINING DAYS BY ECONOMIC CYCLES

In [None]:
# Table 7 starts as an empty frame that only fixes the column layout.
table7 = pd.DataFrame(columns=['Constant', 'Santa Rally Days', 'F-Value', 'Significance'])

In [None]:
# Fill Table 7: for each index, run the Santa Claus Rally dummy regression
# separately on each economic cycle and append the result rows.
for symbol in df_prices.keys():
    df = df_prices[symbol]
    columns = ['Constant', 'Santa Rally Days', 'F-Value', 'Significance']

    # Panel A: Expansion Periods
    # Panel B: Recession Periods
    cycle_tables = []
    for cycle in ('Expansion Periods', 'Recession Periods'):
        df_cycle = df[df[col_period_cycle]==cycle]

        y = df_cycle[col_rtn]
        # Dummy regressor: 1 on Santa Claus Rally days, 0 otherwise.
        x = np.where(df_cycle[col_period]=='Santa Claus Rally Days', 1, 0)
        x = sm.add_constant(x)

        model = sm.OLS(y, x)
        results = model.fit()

        # Four rows per cycle: heading, coefficients, t-statistics, p-values.
        res = pd.DataFrame(
            columns = columns,
            index = [
                cycle, symbol, symbol+'(T-statistics)', symbol+'(p-values)'
            ]
        ).fillna('')
        res.loc[symbol] = results.params.values.tolist() + [results.fvalue, results.f_pvalue]
        # Address row and columns in one .loc call. The chained form
        # `res.loc[row].iloc[:2] = ...` assigns to a temporary row object
        # and can leave `res` unchanged.
        res.loc[symbol+'(T-statistics)', columns[:2]] = results.tvalues.values.tolist()
        res.loc[symbol+'(p-values)', columns[:2]] = results.pvalues.values.tolist()
        cycle_tables.append(res)

    # concatenate (expansion rows first, then recession rows)
    table7 = pd.concat([table7, *cycle_tables], axis=0)

In [None]:
# Show floats with 3 decimal places
pd.set_option('display.float_format', '{:.3f}'.format)

# Regroup the rows by economic cycle instead of by index. This assumes the
# 16-row layout built by the fill loop (4 rows per index and cycle); the
# second copy of each cycle heading (positions 8 and 12) is dropped.
table7 = table7.iloc[
    [0, 1, 2, 3, 9, 10, 11]          # first cycle: heading + both indices
    + [4, 5, 6, 7, 13, 14, 15]       # second cycle: heading + both indices
]
table7