### 필요 라이브러리 호출

In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np

import yfinance as yf

from scipy.stats import ttest_ind, mannwhitneyu
import statsmodels.api as sm

import seaborn as sns
import matplotlib.pyplot as plt
import platform
if platform.system() == 'Darwin': # MAC
    plt.rc('font', family='AppleGothic')
else: # Windows
    plt.rc('font', family='Malgun Gothic')

plt.rcParams['axes.unicode_minus'] = False

In [2]:
import utils
import preprocessing

In [62]:
kospi = preprocessing.get_kospi(drop=True)

kospi.pivot_table(
    index = '귀속년도',
    columns='event',
    values = 'rtn',
    aggfunc='count'
)

[*********************100%%**********************]  1 of 1 completed


event,after,before,else,santa
귀속년도,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1996,7.0,6.0,,7.0
1997,7.0,7.0,222.0,7.0
1998,7.0,7.0,225.0,7.0
1999,7.0,7.0,228.0,7.0
2000,7.0,7.0,220.0,7.0
2001,7.0,7.0,225.0,7.0
2002,7.0,7.0,223.0,7.0
2003,7.0,7.0,226.0,7.0
2004,7.0,7.0,228.0,7.0
2005,7.0,7.0,228.0,7.0


### 데이터 수집

In [24]:
# name = 'KOSPI'
# symbol = '^KS11'

name = 'ETF반도체'
symbol = '091160.KS'

# ohlcv 데이터 수집 (전체 기간)
# data = preprocessing.get_data(symbol, drop=True)
data = preprocessing.get_data(drop=True)
data

[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed


Unnamed: 0_level_0,Adj Close,rtn,event,귀속년도
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2000-10-17,804.000000,-7.277128,else,2000.0
2000-10-18,802.599976,-0.174132,else,2000.0
2000-10-19,798.400024,-0.523293,else,2000.0
2000-10-20,826.200012,3.481962,else,2000.0
2000-10-23,796.400024,-3.606873,else,2000.0
...,...,...,...,...
2023-12-28,866.570007,0.788568,santa,2023.0
2024-01-02,878.929993,1.426311,santa,2023.0
2024-01-03,871.570007,-0.837380,santa,2023.0
2024-01-04,866.250000,-0.610394,after,2023.0


In [63]:
data = kospi

#### 데이터 기간 설정

In [64]:
# date_start = '2000-01-01'
# date_end = '2021-12-31'

# data21 = utils.filter_date(data, date_start, date_end)
# data21

In [65]:
data21 = utils.filter_range(data, '귀속년도', 2009, 2021)
data21

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,event,귀속년도,rtn
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2009-01-15,1138.469971,1145.410034,1108.150024,1111.339966,1111.339966,366500,else,2009,-6.032070
2009-01-16,1120.689941,1135.790039,1109.689941,1135.199951,1135.199951,335700,else,2009,2.146956
2009-01-19,1146.089966,1159.880005,1141.310059,1150.650024,1150.650024,370300,else,2009,1.361000
2009-01-20,1136.160034,1136.479980,1115.069946,1126.810059,1126.810059,319500,else,2009,-2.071869
2009-01-21,1087.430054,1118.400024,1085.719971,1103.609985,1103.609985,334600,else,2009,-2.058916
...,...,...,...,...,...,...,...,...,...
2022-01-10,2947.370117,2951.120117,2910.899902,2926.719971,2926.719971,477000,after,2021,-0.953332
2022-01-11,2930.929932,2943.570068,2909.820068,2927.379883,2927.379883,565800,after,2021,0.022548
2022-01-12,2950.780029,2972.580078,2950.310059,2972.479980,2972.479980,519000,after,2021,1.540630
2022-01-13,2979.909912,2982.139893,2958.389893,2962.090088,2962.090088,604600,after,2021,-0.349536


In [66]:
data21.pivot_table(
    index = '귀속년도',
    columns='event',
    values = 'rtn',
    aggfunc='count'
)

event,after,before,else,santa
귀속년도,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2009,7,7,232,7
2010,7,7,230,7
2011,7,7,226,7
2012,7,7,227,7
2013,7,7,226,7
2014,7,7,224,7
2015,7,7,227,7
2016,7,7,225,7
2017,7,7,220,7
2018,7,7,223,7


#### 최종 데이터 정보(요약통계량)

In [67]:
data21.head()

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,event,귀속년도,rtn
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2009-01-15,1138.469971,1145.410034,1108.150024,1111.339966,1111.339966,366500,else,2009,-6.03207
2009-01-16,1120.689941,1135.790039,1109.689941,1135.199951,1135.199951,335700,else,2009,2.146956
2009-01-19,1146.089966,1159.880005,1141.310059,1150.650024,1150.650024,370300,else,2009,1.361
2009-01-20,1136.160034,1136.47998,1115.069946,1126.810059,1126.810059,319500,else,2009,-2.071869
2009-01-21,1087.430054,1118.400024,1085.719971,1103.609985,1103.609985,334600,else,2009,-2.058916


In [68]:
data21.tail()

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,event,귀속년도,rtn
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2022-01-10,2947.370117,2951.120117,2910.899902,2926.719971,2926.719971,477000,after,2021,-0.953332
2022-01-11,2930.929932,2943.570068,2909.820068,2927.379883,2927.379883,565800,after,2021,0.022548
2022-01-12,2950.780029,2972.580078,2950.310059,2972.47998,2972.47998,519000,after,2021,1.54063
2022-01-13,2979.909912,2982.139893,2958.389893,2962.090088,2962.090088,604600,after,2021,-0.349536
2022-01-14,2937.610107,2944.969971,2914.72998,2921.919922,2921.919922,532300,after,2021,-1.356143


In [69]:
data21[['rtn']].describe()

Unnamed: 0,rtn
count,3212.0
mean,0.034049
std,1.084211
min,-8.393661
25%,-0.463427
50%,0.05217
75%,0.60152
max,8.601245


## 산타랠리 비교

In [70]:
data21['event'].value_counts()

event
else      2939
before      91
santa       91
after       91
Name: count, dtype: int64

### 일 평균 수익률 비교

In [71]:
data21.groupby('event')[['rtn']].agg(['mean', 'std', 'min', 'max', 'count'])

Unnamed: 0_level_0,rtn,rtn,rtn,rtn,rtn
Unnamed: 0_level_1,mean,std,min,max,count
event,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
after,0.113384,0.920822,-1.738225,3.974694,91
before,0.123992,0.847154,-3.425613,3.0869,91
else,0.028348,1.100967,-8.393661,8.601245,2939
santa,0.048905,0.888263,-2.195055,2.68511,91


### T-Test & Mann-Whitney U Test

In [93]:
df = data21

rtns_before = df[df['event']=='before']['rtn']
rtns_santa = df[df['event']=='santa']['rtn']
rtns_after = df[df['event']=='after']['rtn']

rtns_before.shape, rtns_santa.shape, rtns_after.shape

((91,), (91,), (91,))

In [94]:
group1 = rtns_before
group2 = rtns_santa

#### H1 : before < santa

In [107]:
import pandas as pd

res_test = pd.DataFrame(
    columns = ['T-Test', 'mann-Whitney U Test', 'wilcoxon'],
    index = ['Statistic', 'p-value']
)

**독립표본 t-test**에 해당하는 가정은 다음과 같습니다:

1. 정규성 (Normality): 
   - 각 그룹의 모집단이 정규분포를 따라야 합니다. 
   - 표본 크기가 큰 경우(30 이상) 중심극한정리에 의해 정규성 가정이 상대적으로 완화될 수 있습니다.
   - `shapiro` 검정
2. 등분산성 (Homogeneity of Variance)
   - 비교하는 두 그룹의 모집단 분산이 서로 동일해야 합니다.
   - `levene` 검정
3. 독립성 (Independence)
   - 표본 데이터는 서로 독립이어야 합니다.

이러한 가정들이 충족되어야 독립표본 t-test의 결과를 신뢰할 수 있습니다.

In [108]:
import scipy.stats as stats
from scipy.stats import pearsonr

# 독립성 검정 (상관분석)
correlation, p_value = pearsonr(group1, group2)
print("독립성 확인(상관분석)")
print(f"(1) Pearson 상관계수: {correlation} -> 0에 가까운 가?")
print(f"(2) P-value: {p_value} -> 0.05보다 큰가? {p_value > 0.05}")
print()

# 정규성 검정 (Shapiro-Wilk test)
stat1, p_value1 = stats.shapiro(group2)
stat2, p_value2 = stats.shapiro(group1)

print("정규성 검정")
print(f"(1) group1(Before) 정규성 검정: Statistic = {stat2}, p-value = {p_value2}")
print(f'-> 정규성 만족 여부 : {p_value2 > 0.05}')
print(f"(2) group2(Santa) 정규성 검정: Statistic = {stat1}, p-value = {p_value1}")
print(f'-> 정규성 만족 여부 : {p_value1 > 0.05}')
print()

# 등분산성 검정 (Levene's test)
stat, p_value = stats.levene(group1, group2)

print(f"등분산성 검정: Statistic = {stat}, p-value = {p_value}")
print(f'등분산성 만족 여부 : {p_value > 0.05}')

독립성 확인(상관분석)
(1) Pearson 상관계수: -0.020207604022441924 -> 0에 가까운 가?
(2) P-value: 0.8492128673572855 -> 0.05보다 큰가? True

정규성 검정
(1) group1(Before) 정규성 검정: Statistic = 0.9179143905639648, p-value = 2.5615267077228054e-05
-> 정규성 만족 여부 : False
(2) group2(Santa) 정규성 검정: Statistic = 0.9770123362541199, p-value = 0.10730132460594177
-> 정규성 만족 여부 : True

등분산성 검정: Statistic = 0.9457986276957543, p-value = 0.332096103196565
등분산성 만족 여부 : True


In [109]:
# T-Test
# H1 : mean(group1) < mean(group2(산타))
res_ttest = ttest_ind(
    group1, group2, 
    equal_var=True, alternative='greater'
)
t_statistic = res_ttest.statistic
t_pvalue = res_ttest.pvalue

res_test['T-Test'] = [t_statistic, t_pvalue]

In [110]:
# U-Test
# H1 : mean(group1) < mean(group2(산타))
group1 = rtns_before
group2 = rtns_santa

res_utest = mannwhitneyu(
    group1, group2,
    alternative='greater'
)
u_statistic = res_utest.statistic
u_pvalue = res_utest.pvalue

res_test['mann-Whitney U Test'] = [u_statistic, u_pvalue]

In [111]:
from scipy.stats import wilcoxon

# Wilcoxon Signed Rank Test
# H1 : group1 < group2(santa)
stat, p_value = wilcoxon(group1, group2, alternative='greater')

print(f"Wilcoxon Signed Rank 통계량: {stat}")
print(f"P-value: {p_value}")

res_test['wilcoxon'] = [stat, p_value]

Wilcoxon Signed Rank 통계량: 2221.0
P-value: 0.3062135584784446


In [112]:
# 검정 결과
#  H1 : before < santa

res_test

Unnamed: 0,T-Test,mann-Whitney U Test,wilcoxon
Statistic,0.583549,4355.0,2221.0
p-value,0.280127,0.273522,0.306214


### OLS Regression

In [113]:
res_ols = pd.DataFrame(
    index = [
        name, 'T-statistics', 'p-values'
    ],
    columns = [
        'Constant', 'Santa Rally Days', 'F-Value', 'Significance'
    ]
).fillna('')

In [114]:
_df = df[(df['event']=='santa') | (df['event']=='before')]
_df['event'].value_counts()

event
before    91
santa     91
Name: count, dtype: int64

In [115]:
y = _df['rtn']
x = np.where(_df['event']=='santa', 1, 0)
x = sm.add_constant(x)

model = sm.OLS(y, x)
results = model.fit()

res_ols.loc[name] = results.params.values.tolist() + [results.fvalue, results.f_pvalue]
res_ols.loc['T-statistics'].iloc[:2] = results.tvalues.values.tolist()
res_ols.loc['p-values'].iloc[:2] = results.pvalues.values.tolist()

In [116]:
results.summary()

0,1,2,3
Dep. Variable:,rtn,R-squared:,0.002
Model:,OLS,Adj. R-squared:,-0.004
Method:,Least Squares,F-statistic:,0.3405
Date:,"Sat, 06 Jan 2024",Prob (F-statistic):,0.56
Time:,19:57:38,Log-Likelihood:,-231.47
No. Observations:,182,AIC:,466.9
Df Residuals:,180,BIC:,473.3
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.1240,0.091,1.363,0.175,-0.056,0.304
x1,-0.0751,0.129,-0.584,0.560,-0.329,0.179

0,1,2,3
Omnibus:,15.004,Durbin-Watson:,1.902
Prob(Omnibus):,0.001,Jarque-Bera (JB):,43.082
Skew:,-0.161,Prob(JB):,4.41e-10
Kurtosis:,5.362,Cond. No.,2.62


In [117]:
res_ols

Unnamed: 0,Constant,Santa Rally Days,F-Value,Significance
ETF반도체,0.123992,-0.075087,0.34053,0.560254
T-statistics,1.362762,-0.583549,,
p-values,0.17466,0.560254,,
