### 필요 라이브러리 호출

In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np

import yfinance as yf

from scipy.stats import ttest_ind, mannwhitneyu
import statsmodels.api as sm

import seaborn as sns
import matplotlib.pyplot as plt
import platform
# Pick a Korean-capable font per operating system so Hangul labels render in plots.
# The original `else` branch was labelled "Windows" but also caught Linux, where
# 'Malgun Gothic' is not available. Unknown platforms keep the previous default.
_KOREAN_FONTS = {
    'Darwin': 'AppleGothic',      # macOS
    'Windows': 'Malgun Gothic',
    'Linux': 'NanumGothic',       # assumes the Nanum font package is installed - TODO confirm
}
plt.rc('font', family=_KOREAN_FONTS.get(platform.system(), 'Malgun Gothic'))

# Draw negative tick labels with an ASCII hyphen instead of the Unicode minus sign.
plt.rcParams['axes.unicode_minus'] = False

In [2]:
import utils
import preprocessing

In [3]:
# Load the KOSPI series through the project helper, then count the return
# observations for every ('귀속년도', event) combination.
kospi = preprocessing.get_kospi(drop=True)

pd.pivot_table(kospi, values='rtn', index='귀속년도', columns='event', aggfunc='count')

[*********************100%%**********************]  1 of 1 completed


event,after,before,else,santa
귀속년도,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1996,7.0,6.0,,7.0
1997,7.0,7.0,222.0,7.0
1998,7.0,7.0,225.0,7.0
1999,7.0,7.0,228.0,7.0
2000,7.0,7.0,220.0,7.0
2001,7.0,7.0,225.0,7.0
2002,7.0,7.0,223.0,7.0
2003,7.0,7.0,226.0,7.0
2004,7.0,7.0,228.0,7.0
2005,7.0,7.0,228.0,7.0


### 데이터 수집

In [4]:
# Alternative target (disabled): KOSPI index
# name, symbol = 'KOSPI', '^KS11'

# Target instrument: display label and the symbol handed to the data loader
name, symbol = 'ETF반도체', '091160.KS'

# Collect OHLCV data (entire period)
data = preprocessing.get_data(symbol, drop=True)
data

[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed


Unnamed: 0_level_0,Adj Close,rtn,event,귀속년도
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2007-01-30,8771.014648,-0.656804,else,2007.0
2007-01-31,8650.201172,-1.377417,else,2007.0
2007-02-01,8742.020508,1.061471,else,2007.0
2007-02-02,8988.478516,2.819234,else,2007.0
2007-02-05,9118.955078,1.451598,else,2007.0
...,...,...,...,...
2023-12-28,36275.000000,0.165677,santa,2023.0
2024-01-02,36840.000000,1.557547,santa,2023.0
2024-01-03,35915.000000,-2.510858,santa,2023.0
2024-01-04,35815.000000,-0.278435,after,2023.0


#### 데이터 기간 설정

In [5]:
# NOTE: disabled alternative - filter by calendar date range via utils.filter_date
# rather than by the '귀속년도' column.
# date_start = '2000-01-01'
# date_end = '2021-12-31'

# data21 = utils.filter_date(data, date_start, date_end)
# data21

In [6]:
# Keep the rows whose '귀속년도' lies between 2009 and 2021
# (the displayed result contains both 2009 and 2021 rows).
data21 = utils.filter_range(
    data,
    '귀속년도',
    2009,
    2021,
)
data21

Unnamed: 0_level_0,Adj Close,rtn,event,귀속년도
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2009-04-17,10452.729492,14.444442,else,2009.0
2009-04-20,10510.719727,0.554786,else,2009.0
2009-04-21,10670.194336,1.517257,else,2009.0
2009-04-22,11129.280273,4.302508,else,2009.0
2009-04-23,11076.124023,-0.477625,else,2009.0
...,...,...,...,...
2022-01-10,37381.761719,-2.155842,after,2021.0
2022-01-11,37500.843750,0.318556,after,2021.0
2022-01-12,38289.757812,2.103723,after,2021.0
2022-01-13,38354.261719,0.168463,after,2021.0


In [7]:
# Number of return observations per '귀속년도' and event label in the filtered sample.
pd.pivot_table(data21, values='rtn', index='귀속년도', columns='event', aggfunc='count')

event,after,before,else,santa
귀속년도,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2009.0,7,7,163,7
2010.0,7,7,230,7
2011.0,7,7,226,7
2012.0,7,7,227,7
2013.0,7,7,226,7
2014.0,7,7,224,7
2015.0,7,7,226,7
2016.0,7,7,225,7
2017.0,7,7,220,7
2018.0,6,7,220,7


#### 최종 데이터 정보(요약통계량)

In [8]:
# First five rows of the filtered sample.
data21.head(n=5)

Unnamed: 0_level_0,Adj Close,rtn,event,귀속년도
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2009-04-17,10452.729492,14.444442,else,2009.0
2009-04-20,10510.719727,0.554786,else,2009.0
2009-04-21,10670.194336,1.517257,else,2009.0
2009-04-22,11129.280273,4.302508,else,2009.0
2009-04-23,11076.124023,-0.477625,else,2009.0


In [9]:
# Last five rows of the filtered sample.
data21.tail(n=5)

Unnamed: 0_level_0,Adj Close,rtn,event,귀속년도
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2022-01-10,37381.761719,-2.155842,after,2021.0
2022-01-11,37500.84375,0.318556,after,2021.0
2022-01-12,38289.757812,2.103723,after,2021.0
2022-01-13,38354.261719,0.168463,after,2021.0
2022-01-14,38547.769531,0.504528,after,2021.0


In [10]:
# Summary statistics of the daily return column (kept as a one-column frame).
data21.loc[:, ['rtn']].describe()

Unnamed: 0,rtn
count,3136.0
mean,0.059798
std,1.6058
min,-9.116665
25%,-0.806293
50%,0.073235
75%,0.938009
max,14.444442


## 산타랠리 비교

In [11]:
# How many observations carry each event label.
data21.loc[:, 'event'].value_counts()

event
else      2864
before      91
santa       91
after       90
Name: count, dtype: int64

### 일 평균 수익률 비교

In [12]:
# Per-event summary of daily returns: mean, dispersion, extremes and sample size.
(
    data21[['event', 'rtn']]
    .groupby('event')
    .agg(['mean', 'std', 'min', 'max', 'count'])
)

Unnamed: 0_level_0,rtn,rtn,rtn,rtn,rtn
Unnamed: 0_level_1,mean,std,min,max,count
event,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
after,0.056915,1.336648,-2.525867,5.624808,90
before,0.119637,1.409403,-4.623476,3.342218,91
else,0.042646,1.620027,-9.116665,14.444442,2864
santa,0.542632,1.526817,-3.989518,3.177795,91


### T-Test & Mann-Whitney U Test

In [13]:
# Work on the filtered sample under a shorter alias.
df = data21

# Daily returns of each event group, selected with a single .loc lookup.
rtns_before = df.loc[df['event'] == 'before', 'rtn']
rtns_santa = df.loc[df['event'] == 'santa', 'rtn']
rtns_after = df.loc[df['event'] == 'after', 'rtn']

# Sample sizes of the three groups.
rtns_before.shape, rtns_santa.shape, rtns_after.shape

((91,), (91,), (90,))

#### H1 : santa > before

In [14]:
# Empty results table for the two hypothesis tests: one column per test,
# one row per reported quantity. (pandas is already imported in the first cell,
# so the duplicate import that used to sit here has been dropped.)
res_test = pd.DataFrame(
    columns = ['T-Test', 'mann-Whitney U Test'],
    index = ['Statistic', 'p-value', 'df']
)

In [15]:
# Independent two-sample T-Test with pooled variance, one-sided.
# H1 : mean(santa) > mean(before)
res_ttest = ttest_ind(rtns_santa, rtns_before, equal_var=True, alternative='greater')

# Pull the reported quantities out of the result object.
t_statistic, t_pvalue = res_ttest.statistic, res_ttest.pvalue
t_df = int(res_ttest.df)  # degrees of freedom, converted to an integer

res_test['T-Test'] = [t_statistic, t_pvalue, t_df]

In [16]:
# Mann-Whitney U test, one-sided.
# H1 : santa returns tend to be larger than before returns
#      (a rank-based comparison of the two samples, not a comparison of means)
res_utest = mannwhitneyu(rtns_santa, rtns_before, alternative='greater')
u_statistic, u_pvalue = res_utest.statistic, res_utest.pvalue

# No degrees of freedom are reported for this test, so the 'df' row is left blank.
res_test['mann-Whitney U Test'] = [u_statistic, u_pvalue, '']

In [17]:
# Test results
#  H1 : santa > before

res_test

Unnamed: 0,T-Test,mann-Whitney U Test
Statistic,1.941938,4996.0
p-value,0.026853,0.008065
df,180.0,


#### H1 : santa > after

In [18]:
# NOTE: this whole cell is disabled. It repeats the T-Test / U-Test procedure
# with rtns_after in place of rtns_before; re-enabling it overwrites res_test.
# res_test = pd.DataFrame(
#     columns = ['T-Test', 'mann-Whitney U Test'],
#     index = ['Statistic', 'p-value', 'df']
# )

# res_ttest = ttest_ind(
#     rtns_santa, rtns_after, 
#     equal_var=True, alternative='greater'
# )
# t_statistic = res_ttest.statistic
# t_pvalue = res_ttest.pvalue
# t_df = int(res_ttest.df)

# res_test['T-Test'] = [t_statistic, t_pvalue, t_df]

# res_utest = mannwhitneyu(
#     rtns_santa, rtns_after,
#     alternative='greater'
# )
# u_statistic = res_utest.statistic
# u_pvalue = res_utest.pvalue

# res_test['mann-Whitney U Test'] = [u_statistic, u_pvalue, '']

# # Test results
# res_test

### OLS Regression

In [19]:
# Blank results table for the regression: the first row (labelled with the
# instrument name) holds the estimates, the next two rows hold the
# t-statistics and p-values. Cells start out as empty strings.
res_ols = pd.DataFrame(
    index=[name, 'T-statistics', 'p-values'],
    columns=['Constant', 'Santa Rally Days', 'F-Value', 'Significance'],
)
res_ols = res_ols.fillna('')

In [20]:
# Restrict the sample to the two event groups being compared.
_df = df[df['event'].isin(['santa', 'before'])]
_df['event'].value_counts()

event
before    91
santa     91
Name: count, dtype: int64

In [21]:
# Regress the daily return on a dummy that is 1 for 'santa' rows and 0 otherwise
# (the subset only contains 'santa' and 'before'), with an intercept term.
y = _df['rtn']
x = np.where(_df['event']=='santa', 1, 0)
x = sm.add_constant(x)

model = sm.OLS(y, x)
results = model.fit()

# Columns that receive per-coefficient values (intercept, dummy).
coef_cols = ['Constant', 'Santa Rally Days']

# First row: both coefficients followed by the F statistic and its p-value.
res_ols.loc[name] = results.params.values.tolist() + [results.fvalue, results.f_pvalue]

# Write each row with a single .loc[row, cols] assignment. The previous
# `res_ols.loc[row].iloc[:2] = ...` was chained assignment: it writes into an
# intermediate object and leaves res_ols untouched under pandas Copy-on-Write.
# The notebook silences warnings globally, so that failure would go unnoticed.
res_ols.loc['T-statistics', coef_cols] = results.tvalues.values.tolist()
res_ols.loc['p-values', coef_cols] = results.pvalues.values.tolist()

In [22]:
# Full regression report. The x1 p-value listed under P>|t| is two-sided, which is
# why it is twice the one-sided T-Test p-value shown earlier in the notebook.
results.summary()

0,1,2,3
Dep. Variable:,rtn,R-squared:,0.021
Model:,OLS,Adj. R-squared:,0.015
Method:,Least Squares,F-statistic:,3.771
Date:,"Sat, 06 Jan 2024",Prob (F-statistic):,0.0537
Time:,16:55:39,Log-Likelihood:,-327.27
No. Observations:,182,AIC:,658.5
Df Residuals:,180,BIC:,664.9
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.1196,0.154,0.777,0.438,-0.184,0.424
x1,0.4230,0.218,1.942,0.054,-0.007,0.853

0,1,2,3
Omnibus:,10.697,Durbin-Watson:,1.974
Prob(Omnibus):,0.005,Jarque-Bera (JB):,11.247
Skew:,-0.508,Prob(JB):,0.00361
Kurtosis:,3.672,Cond. No.,2.62


In [23]:
# Compact regression table: estimates, F statistic and its p-value in the first row,
# then the t-statistics and p-values of the two coefficients.
res_ols

Unnamed: 0,Constant,Santa Rally Days,F-Value,Significance
ETF반도체,0.119637,0.422995,3.771123,0.053706
T-statistics,0.776746,1.941938,,
p-values,0.438328,0.053706,,
