### 필요 라이브러리 호출

In [4]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np

import yfinance as yf

from scipy.stats import ttest_ind, mannwhitneyu
import statsmodels.api as sm

import seaborn as sns
import matplotlib.pyplot as plt
import platform
if platform.system() == 'Darwin': # MAC
    plt.rc('font', family='AppleGothic')
else: # Windows
    plt.rc('font', family='Malgun Gothic')

plt.rcParams['axes.unicode_minus'] = False

In [38]:
quant = ['peg', 'ncav', 'low']

# # PEG - 유의하지 않음
# name = quant[0]
# fname = '../datasets/rtns_' + quant[0] + '.csv' # peg

# NCAV - 유의하지 않음
name = quant[1]
fname = '../datasets/rtns_' + quant[1] + '.csv' # ncav

df_rtns = pd.read_csv(fname, index_col=0)
df_rtns.index = pd.to_datetime(df_rtns.index)

df_rtns['rtn_portfolio'] *= 100

# 컬럼명 변경 : rtn_portfolio -> rtn
df_rtns.columns = ['rtn', '귀속년도', 'event', 'rtn_market']
# df_rtns.columns = ['rtn_portfolio', '귀속년도', 'event', 'rtn_market']
df_rtns

Unnamed: 0,rtn,귀속년도,event,rtn_market
2008-12-12,-1.030230,2008,before,-4.383991
2008-12-15,4.247748,2008,before,4.925622
2008-12-16,0.929976,2008,before,0.290981
2008-12-17,-0.699803,2008,before,0.705081
2008-12-18,0.707293,2008,before,0.526611
...,...,...,...,...
2023-12-26,-0.051410,2023,santa,0.118487
2023-12-27,-1.330641,2023,santa,0.419194
2023-12-28,0.698222,2023,santa,1.598624
2024-01-02,-0.378831,2023,santa,0.547213


### 데이터 수집

#### 데이터 기간 설정

In [39]:
# date_start = '2000-01-01'
# date_end = '2021-12-31'

In [40]:
data21 = df_rtns[df_rtns['귀속년도']<=2021]
data21

Unnamed: 0,rtn,귀속년도,event,rtn_market
2008-12-12,-1.030230,2008,before,-4.383991
2008-12-15,4.247748,2008,before,4.925622
2008-12-16,0.929976,2008,before,0.290981
2008-12-17,-0.699803,2008,before,0.705081
2008-12-18,0.707293,2008,before,0.526611
...,...,...,...,...
2021-12-28,0.515667,2021,santa,0.689768
2021-12-29,1.570537,2021,santa,-0.892312
2021-12-30,-0.305106,2021,santa,-0.522507
2022-01-04,0.230044,2021,santa,0.389236


In [41]:
data21.pivot_table(
    index = '귀속년도',
    columns='event',
    values = 'rtn',
    aggfunc='count'
)

event,before,santa
귀속년도,Unnamed: 1_level_1,Unnamed: 2_level_1
2008,7,7
2009,7,7
2010,7,7
2011,7,7
2012,7,7
2013,7,7
2014,7,7
2015,7,7
2016,7,7
2017,7,7


#### 최종 데이터 정보(요약통계량)

In [42]:
data21.head()

Unnamed: 0,rtn,귀속년도,event,rtn_market
2008-12-12,-1.03023,2008,before,-4.383991
2008-12-15,4.247748,2008,before,4.925622
2008-12-16,0.929976,2008,before,0.290981
2008-12-17,-0.699803,2008,before,0.705081
2008-12-18,0.707293,2008,before,0.526611


In [43]:
data21.tail()

Unnamed: 0,rtn,귀속년도,event,rtn_market
2021-12-28,0.515667,2021,santa,0.689768
2021-12-29,1.570537,2021,santa,-0.892312
2021-12-30,-0.305106,2021,santa,-0.522507
2022-01-04,0.230044,2021,santa,0.389236
2022-01-05,-0.057551,2021,santa,-1.179899


In [44]:
data21[['rtn']].describe()

Unnamed: 0,rtn
count,196.0
mean,0.192806
std,1.013357
min,-4.11854
25%,-0.250606
50%,0.198108
75%,0.61161
max,4.627848


## 산타랠리 비교

In [45]:
data21['event'].value_counts()

before    98
santa     98
Name: event, dtype: int64

### 일 평균 수익률 비교

In [46]:
data21.groupby('event')[['rtn']].agg(['mean', 'std', 'min', 'max', 'count'])

Unnamed: 0_level_0,rtn,rtn,rtn,rtn,rtn
Unnamed: 0_level_1,mean,std,min,max,count
event,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
before,0.074192,0.940014,-2.504461,4.247748,98
santa,0.31142,1.073461,-4.11854,4.627848,98


### T-Test & Mann-Whitney U Test

In [47]:
df = data21

rtns_before = df[df['event']=='before']['rtn']
rtns_santa = df[df['event']=='santa']['rtn']

rtns_before.shape, rtns_santa.shape

((98,), (98,))

In [48]:
group1 = rtns_before
group2 = rtns_santa

#### H1 : before < santa

In [49]:
import pandas as pd

res_test = pd.DataFrame(
    columns = ['T-Test', 'mann-Whitney U Test', 'wilcoxon'],
    index = ['Statistic', 'p-value']
)

**독립표본 t-test**에 해당하는 가정은 다음과 같습니다:

1. 정규성 (Normality): 
   - 각 그룹의 모집단이 정규분포를 따라야 합니다. 
   - 표본 크기가 큰 경우(30 이상) 중심극한정리에 의해 정규성 가정이 상대적으로 완화될 수 있습니다.
   - `shapiro` 검정
2. 등분산성 (Homogeneity of Variance)
   - 비교하는 두 그룹의 모집단 분산이 서로 동일해야 합니다.
   - `levene` 검정
3. 독립성 (Independence)
   - 표본 데이터는 서로 독립이어야 합니다.

이러한 가정들이 충족되어야 독립표본 t-test의 결과를 신뢰할 수 있습니다.

In [50]:
import scipy.stats as stats
from scipy.stats import pearsonr

# 독립성 검정 (상관분석)
correlation, p_value = pearsonr(group1, group2)
print("독립성 확인(상관분석)")
print(f"(1) Pearson 상관계수: {correlation} -> 0에 가까운 가?")
print(f"(2) P-value: {p_value} -> 0.05보다 큰가? {p_value > 0.05}")
print()

# 정규성 검정 (Shapiro-Wilk test)
stat1, p_value1 = stats.shapiro(group2)
stat2, p_value2 = stats.shapiro(group1)

print("정규성 검정")
print(f"(1) group1(Before) 정규성 검정: Statistic = {stat2}, p-value = {p_value2}")
print(f'-> 정규성 만족 여부 : {p_value2 > 0.05}')
print(f"(2) group2(Santa) 정규성 검정: Statistic = {stat1}, p-value = {p_value1}")
print(f'-> 정규성 만족 여부 : {p_value1 > 0.05}')
print()

# 등분산성 검정 (Levene's test)
stat, p_value = stats.levene(group1, group2)

print(f"등분산성 검정: Statistic = {stat}, p-value = {p_value}")
print(f'등분산성 만족 여부 : {p_value > 0.05}')

독립성 확인(상관분석)
(1) Pearson 상관계수: -0.12474463464803955 -> 0에 가까운 가?
(2) P-value: 0.22100836412062982 -> 0.05보다 큰가? True

정규성 검정
(1) group1(Before) 정규성 검정: Statistic = 0.9456673860549927, p-value = 0.0005033783963881433
-> 정규성 만족 여부 : False
(2) group2(Santa) 정규성 검정: Statistic = 0.8828650712966919, p-value = 3.0049000088183675e-07
-> 정규성 만족 여부 : False

등분산성 검정: Statistic = 0.0029693989057992267, p-value = 0.9565991053467661
등분산성 만족 여부 : True


In [51]:
# T-Test
# H1 : mean(group1) < mean(group2(산타))
res_ttest = ttest_ind(
    group1, group2, 
    equal_var=True, alternative='greater'
)
t_statistic = res_ttest.statistic
t_pvalue = res_ttest.pvalue

res_test['T-Test'] = [t_statistic, t_pvalue]

In [52]:
# U-Test
# H1 : mean(group1) < mean(group2(산타))
group1 = rtns_before
group2 = rtns_santa

res_utest = mannwhitneyu(
    group1, group2,
    alternative='greater'
)
u_statistic = res_utest.statistic
u_pvalue = res_utest.pvalue

res_test['mann-Whitney U Test'] = [u_statistic, u_pvalue]

In [53]:
from scipy.stats import wilcoxon

# Wilcoxon Signed Rank Test
# H1 : group1 < group2(santa)
stat, p_value = wilcoxon(group1, group2, alternative='greater')

print(f"Wilcoxon Signed Rank 통계량: {stat}")
print(f"P-value: {p_value}")

res_test['wilcoxon'] = [stat, p_value]

Wilcoxon Signed Rank 통계량: 1791.0
P-value: 0.9877246396130841


In [54]:
# 검정 결과
#  H1 : before < santa

res_test

Unnamed: 0,T-Test,mann-Whitney U Test,wilcoxon
Statistic,-1.645873,3890.0,1791.0
p-value,0.949295,0.989221,0.987725


### OLS Regression

In [55]:
res_ols = pd.DataFrame(
    index = [
        name, 'T-statistics', 'p-values'
    ],
    columns = [
        'Constant', 'Santa Rally Days', 'F-Value', 'Significance'
    ]
).fillna('')

In [56]:
_df = df[(df['event']=='santa') | (df['event']=='before')]
_df['event'].value_counts()

before    98
santa     98
Name: event, dtype: int64

In [57]:
y = _df['rtn']
x = np.where(_df['event']=='santa', 1, 0)
x = sm.add_constant(x)

model = sm.OLS(y, x)
results = model.fit()

res_ols.loc[name] = results.params.values.tolist() + [results.fvalue, results.f_pvalue]
res_ols.loc['T-statistics'].iloc[:2] = results.tvalues.values.tolist()
res_ols.loc['p-values'].iloc[:2] = results.pvalues.values.tolist()

In [58]:
results.summary()

0,1,2,3
Dep. Variable:,rtn,R-squared:,0.014
Model:,OLS,Adj. R-squared:,0.009
Method:,Least Squares,F-statistic:,2.709
Date:,"Tue, 09 Jan 2024",Prob (F-statistic):,0.101
Time:,01:21:54,Log-Likelihood:,-278.85
No. Observations:,196,AIC:,561.7
Df Residuals:,194,BIC:,568.3
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.0742,0.102,0.728,0.468,-0.127,0.275
x1,0.2372,0.144,1.646,0.101,-0.047,0.522

0,1,2,3
Omnibus:,32.604,Durbin-Watson:,2.035
Prob(Omnibus):,0.0,Jarque-Bera (JB):,198.06
Skew:,0.338,Prob(JB):,9.81e-44
Kurtosis:,7.878,Cond. No.,2.62


In [59]:
res_ols

Unnamed: 0,Constant,Santa Rally Days,F-Value,Significance
ncav,0.074192,0.237228,2.708896,0.101409
T-statistics,0.727951,1.645873,,
p-values,0.467521,0.101409,,
