### 필요 라이브러리 호출

In [1]:
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation/runtime warnings from the analysis
# libraries — consider narrowing the filter while debugging.
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np

import yfinance as yf

from scipy.stats import ttest_ind, mannwhitneyu
import statsmodels.api as sm

import seaborn as sns
import matplotlib.pyplot as plt
import platform
# Plot font selection: AppleGothic on macOS, Malgun Gothic on every other
# platform (the fallback is aimed at Windows).
# NOTE(review): Linux also falls into the fallback branch — confirm that
# Malgun Gothic is installed there, otherwise Korean labels may not render.
plt.rc(
    'font',
    family='AppleGothic' if platform.system() == 'Darwin' else 'Malgun Gothic',
)

# Render negative tick labels with an ASCII hyphen instead of the Unicode
# minus sign.
plt.rcParams['axes.unicode_minus'] = False

In [2]:
import utils
import preprocessing

### 데이터 수집

In [3]:
# NOTE(review): `name` is labelled 'KOSPI' while `symbol` is the ticker
# '117460.KS' — confirm the label matches the instrument being analysed.
# `name` is not referenced in the visible cells.
name = 'KOSPI'
symbol = '117460.KS'

# Collect OHLCV data (entire available period).
# Per the displayed output, the returned frame already carries the
# rtn, event and 귀속년도 columns in addition to the OHLCV fields.
data = preprocessing.get_data(symbol)

data

[*********************100%***********************]  1 of 1 completed


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,rtn,event,귀속년도
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2009-10-13,7800.0,7800.0,7630.0,7710.0,6968.276367,0,-1.153847,else,2009
2009-10-14,7730.0,7885.0,7715.0,7790.0,7040.580078,0,1.037613,else,2009
2009-10-15,7830.0,7860.0,7775.0,7805.0,7054.136719,0,0.192550,else,2009
2009-10-16,7805.0,7860.0,7725.0,7730.0,6986.352539,0,-0.960914,else,2009
2009-10-19,7800.0,7800.0,7630.0,7760.0,7013.466797,0,0.388103,else,2009
...,...,...,...,...,...,...,...,...,...
2023-12-28,15825.0,16005.0,15795.0,15975.0,15975.000000,4977,0.947867,santa,2023
2024-01-02,16000.0,16000.0,15825.0,15905.0,15905.000000,42973,-0.438185,santa,2023
2024-01-03,15730.0,15730.0,15545.0,15545.0,15545.000000,12038,-2.263439,santa,2023
2024-01-04,15315.0,15650.0,15270.0,15565.0,15565.000000,1746,0.128659,after,2023


In [4]:
# Compute daily returns into the 'rtn' column.
# NOTE(review): the frame returned by get_data already shows an 'rtn' column,
# and the output of this cell starts one row later (2009-10-14), so the call
# appears to recompute returns and drop the first row. Because `data` is
# rebound from itself, re-running this cell may not be idempotent — confirm
# against utils.create_rtn.
data = utils.create_rtn(data, col_rtn='rtn')
data

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,rtn,event,귀속년도
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2009-10-14,7730.0,7885.0,7715.0,7790.0,7040.580078,0,1.037613,else,2009
2009-10-15,7830.0,7860.0,7775.0,7805.0,7054.136719,0,0.192550,else,2009
2009-10-16,7805.0,7860.0,7725.0,7730.0,6986.352539,0,-0.960914,else,2009
2009-10-19,7800.0,7800.0,7630.0,7760.0,7013.466797,0,0.388103,else,2009
2009-10-20,7760.0,7840.0,7745.0,7800.0,7049.618164,0,0.515456,else,2009
...,...,...,...,...,...,...,...,...,...
2023-12-28,15825.0,16005.0,15795.0,15975.0,15975.000000,4977,0.947867,santa,2023
2024-01-02,16000.0,16000.0,15825.0,15905.0,15905.000000,42973,-0.438185,santa,2023
2024-01-03,15730.0,15730.0,15545.0,15545.0,15545.000000,12038,-2.263439,santa,2023
2024-01-04,15315.0,15650.0,15270.0,15565.0,15565.000000,1746,0.128659,after,2023


#### 데이터 기간 설정

In [5]:
# Analysis window passed to utils.filter_date.
# NOTE(review): the bounds cut through two 'santa' windows. The filtered frame
# starts with santa rows attributed to 귀속년도 2009 (2010-01-04/05), while the
# santa rows of 귀속년도 2023 that fall on 2024-01-02/03 are excluded. Confirm
# that partial windows at both ends are intended for the comparison.
date_start = '2010-01-01'
date_end = '2023-12-31'

df = utils.filter_date(data, date_start, date_end)
df

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,rtn,event,귀속년도
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2010-01-04,8200.0,8235.0,8185.0,8200.0,7411.137207,3553,0.000000,santa,2009
2010-01-05,8300.0,8340.0,8265.0,8280.0,7483.440918,41826,0.975609,santa,2009
2010-01-06,8350.0,8450.0,8350.0,8430.0,7619.010254,15584,1.811591,after,2009
2010-01-07,8500.0,8505.0,8320.0,8325.0,7524.111328,7621,-1.245555,after,2009
2010-01-08,8325.0,8420.0,8210.0,8420.0,7609.973145,6423,1.141156,after,2009
...,...,...,...,...,...,...,...,...,...
2023-12-21,16265.0,16265.0,15945.0,16080.0,16080.000000,4749,-1.470588,santa,2023
2023-12-22,16250.0,16250.0,15830.0,15835.0,15835.000000,12737,-1.523632,santa,2023
2023-12-26,15835.0,15835.0,15610.0,15680.0,15680.000000,4923,-0.978844,santa,2023
2023-12-27,15660.0,15940.0,15660.0,15825.0,15825.000000,11190,0.924745,santa,2023


#### 최종 데이터 정보(요약통계량)

In [6]:
# Preview the first rows of the date-filtered frame.
df.head()

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,rtn,event,귀속년도
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2010-01-04,8200.0,8235.0,8185.0,8200.0,7411.137207,3553,0.0,santa,2009
2010-01-05,8300.0,8340.0,8265.0,8280.0,7483.440918,41826,0.975609,santa,2009
2010-01-06,8350.0,8450.0,8350.0,8430.0,7619.010254,15584,1.811591,after,2009
2010-01-07,8500.0,8505.0,8320.0,8325.0,7524.111328,7621,-1.245555,after,2009
2010-01-08,8325.0,8420.0,8210.0,8420.0,7609.973145,6423,1.141156,after,2009


In [7]:
# Preview the last rows of the date-filtered frame.
df.tail()

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,rtn,event,귀속년도
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2023-12-21,16265.0,16265.0,15945.0,16080.0,16080.0,4749,-1.470588,santa,2023
2023-12-22,16250.0,16250.0,15830.0,15835.0,15835.0,12737,-1.523632,santa,2023
2023-12-26,15835.0,15835.0,15610.0,15680.0,15680.0,4923,-0.978844,santa,2023
2023-12-27,15660.0,15940.0,15660.0,15825.0,15825.0,11190,0.924745,santa,2023
2023-12-28,15825.0,16005.0,15795.0,15975.0,15975.0,4977,0.947867,santa,2023


In [8]:
# Summary statistics of the daily return column.
# NOTE(review): the values look like percentages rather than fractions —
# confirm against the helper that computes 'rtn'.
df[['rtn']].describe()

Unnamed: 0,rtn
count,3425.0
mean,0.037183
std,1.717301
min,-14.607559
25%,-0.8486
50%,0.032148
75%,0.91984
max,13.446806


## 산타랠리 비교

In [9]:
# Number of rows carrying each event label (else / santa / after / before).
df['event'].value_counts()

event
else      3131
santa       98
after       98
before      98
Name: count, dtype: int64

In [10]:
# Daily returns of the two samples under comparison, selected by event label
# in a single .loc step (row mask + column) instead of chained indexing.
rtns_santa = df.loc[df['event'] == 'santa', 'rtn']
rtns_before = df.loc[df['event'] == 'before', 'rtn']

# Show both sample sizes side by side.
(rtns_santa.shape, rtns_before.shape)

((98,), (98,))

### 일 평균 수익률 비교

In [11]:
# Per-event mean, standard deviation, min, max and row count of daily returns.
df.groupby('event')[['rtn']].agg(['mean', 'std', 'min', 'max', 'count'])

Unnamed: 0_level_0,rtn,rtn,rtn,rtn,rtn
Unnamed: 0_level_1,mean,std,min,max,count
event,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
after,0.228244,1.599201,-3.636365,4.233654,98
before,0.201307,1.449956,-4.438853,4.575838,98
else,0.01841,1.73728,-14.607559,13.446806,3131
santa,0.281789,1.384107,-3.301052,6.593725,98


### T-Test & Mann-Whitney U Test

In [12]:
# Empty results table: one column per test, one row each for the test
# statistic and its p-value. Cells stay NaN until the tests below fill them.
res_test = pd.DataFrame(
    index=['Statistic', 'p-value'],
    columns=['T-Test', 'mann-Whitney U Test'],
)

In [13]:
# One-sided two-sample t-test; the alternative hypothesis is that the mean
# santa-window return is greater than the mean before-window return.
# equal_var=True requests the pooled-variance (Student) form of the test.
# NOTE(review): if the equal-variance assumption is in doubt, Welch's test
# (equal_var=False) is the usual alternative — changing it alters the results.
res_ttest = ttest_ind(rtns_santa, rtns_before, alternative='greater', equal_var=True)
t_statistic, t_pvalue = res_ttest.statistic, res_ttest.pvalue

# Fill the column in the same order as the table index: statistic, p-value.
res_test['T-Test'] = [t_statistic, t_pvalue]
res_test

Unnamed: 0,T-Test,mann-Whitney U Test
Statistic,0.397468,
p-value,0.34573,


In [14]:
# Show the t-test p-value at full precision (the results table displays it rounded).
t_pvalue

0.3457295111905829

In [15]:
# One-sided Mann-Whitney U test on the same two samples, with the same
# direction of alternative ('greater': santa vs. before) as the t-test.
res_utest = mannwhitneyu(rtns_santa, rtns_before, alternative='greater')
u_statistic, u_pvalue = res_utest.statistic, res_utest.pvalue

# Fill the column in the same order as the table index: statistic, p-value.
res_test['mann-Whitney U Test'] = [u_statistic, u_pvalue]

In [16]:
# Display the results table once both test columns have been filled in.
res_test

Unnamed: 0,T-Test,mann-Whitney U Test
Statistic,0.397468,4890.5
p-value,0.34573,0.412303
