# Datetime Range 객체 활용

In [24]:
import os
# Set the MKL, NumExpr and OpenMP thread-count environment variables to "1".
# NOTE(review): presumably these must be set before numpy/pandas are imported
# to take effect — confirm.
os.environ.update(
    {
        "MKL_NUM_THREADS": "1",
        "NUMEXPR_NUM_THREADS": "1",
        "OMP_NUM_THREADS": "1",
    }
)

# Import the required packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
from datetime import timedelta

In [3]:
# Daily index covering 2016-01-01 through 2020-12-31 inclusive (1827 days).
index = pd.date_range(start='2016-01-01', end='2020-12-31', freq='D')
index

DatetimeIndex(['2016-01-01', '2016-01-02', '2016-01-03', '2016-01-04',
               '2016-01-05', '2016-01-06', '2016-01-07', '2016-01-08',
               '2016-01-09', '2016-01-10',
               ...
               '2020-12-22', '2020-12-23', '2020-12-24', '2020-12-25',
               '2020-12-26', '2020-12-27', '2020-12-28', '2020-12-29',
               '2020-12-30', '2020-12-31'],
              dtype='datetime64[ns]', length=1827, freq='D')

In [4]:
# One constant-valued row per date. The row count is taken from `index`
# rather than repeating the literal length, so the frame cannot fall out of
# step with the date range.
dummy = pd.DataFrame({'data': [1] * len(index)}, index=index)

In [6]:
# Preview the first rows of the frame.
dummy.head()

Unnamed: 0,data
2016-01-01,1
2016-01-02,1
2016-01-03,1
2016-01-04,1
2016-01-05,1


In [10]:
# Label every date with the weekly period that contains it
# (the output below shows weeks running Monday through Sunday).
dummy['week_idx'] = index.to_period(freq='W')

In [11]:
# Preview the frame again, now including the week_idx column.
dummy.head()

Unnamed: 0,data,week_idx
2016-01-01,1,2015-12-28/2016-01-03
2016-01-02,1,2015-12-28/2016-01-03
2016-01-03,1,2015-12-28/2016-01-03
2016-01-04,1,2016-01-04/2016-01-10
2016-01-05,1,2016-01-04/2016-01-10


In [13]:
# Inspect the element type stored in the week_idx column.
# Use .iloc for positional access: the Series is indexed by dates, so a bare
# integer key is not a label and only works through a deprecated fallback.
type(dummy['week_idx'].iloc[0])

pandas._libs.tslibs.period.Period

In [14]:
# Read the CSV file with read_csv() and load it into a DataFrame,
# then preview the first rows.
df = pd.read_csv('./data/all_stocks_5yr.csv')
df.head()

Unnamed: 0,date,open,high,low,close,volume,Name
0,2013-02-08,15.07,15.12,14.63,14.75,8407500,AAL
1,2013-02-11,14.89,15.01,14.26,14.46,8882000,AAL
2,2013-02-12,14.45,14.51,14.1,14.27,8126000,AAL
3,2013-02-13,14.3,14.94,14.25,14.66,10259500,AAL
4,2013-02-14,14.94,14.96,13.16,13.99,31879900,AAL


In [15]:
# Summarize column dtypes and non-null counts. In the output below, 'date'
# is still an object (string) column and open/high/low have a few
# missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 619040 entries, 0 to 619039
Data columns (total 7 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   date    619040 non-null  object 
 1   open    619029 non-null  float64
 2   high    619032 non-null  float64
 3   low     619032 non-null  float64
 4   close   619040 non-null  float64
 5   volume  619040 non-null  int64  
 6   Name    619040 non-null  object 
dtypes: float64(4), int64(1), object(2)
memory usage: 33.1+ MB


In [17]:
# Parse the string dates into pandas Timestamps, store them in a new 'dttm'
# column and promote that column to the row index. The original 'date'
# string column stays in the frame.
df = df.assign(dttm=pd.to_datetime(df['date'])).set_index('dttm')
df.head()

Unnamed: 0_level_0,date,open,high,low,close,volume,Name
dttm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2013-02-08,2013-02-08,15.07,15.12,14.63,14.75,8407500,AAL
2013-02-11,2013-02-11,14.89,15.01,14.26,14.46,8882000,AAL
2013-02-12,2013-02-12,14.45,14.51,14.1,14.27,8126000,AAL
2013-02-13,2013-02-13,14.3,14.94,14.25,14.66,10259500,AAL
2013-02-14,2013-02-14,14.94,14.96,13.16,13.99,31879900,AAL


In [27]:
# Select rows through the date index: keep only the rows dated in 2018.
df_y = df[df.index.year == 2018]
print(df_y.tail())

                  date   open   high    low  close   volume Name
dttm                                                            
2018-02-01  2018-02-01  76.84  78.27  76.69  77.82  2982259  ZTS
2018-02-02  2018-02-02  77.53  78.12  76.73  76.78  2595187  ZTS
2018-02-05  2018-02-05  76.64  76.92  73.18  73.83  2962031  ZTS
2018-02-06  2018-02-06  72.74  74.56  72.13  73.27  4924323  ZTS
2018-02-07  2018-02-07  72.70  75.00  72.69  73.86  4534912  ZTS


In [30]:
# Display the DatetimeIndex built from the 'dttm' column.
df.index

DatetimeIndex(['2013-02-08', '2013-02-11', '2013-02-12', '2013-02-13',
               '2013-02-14', '2013-02-15', '2013-02-19', '2013-02-20',
               '2013-02-21', '2013-02-22',
               ...
               '2018-01-25', '2018-01-26', '2018-01-29', '2018-01-30',
               '2018-01-31', '2018-02-01', '2018-02-02', '2018-02-05',
               '2018-02-06', '2018-02-07'],
              dtype='datetime64[ns]', name='dttm', length=619040, freq=None)

In [66]:
# Build two daily period ranges around `today`: the 11 days ending the day
# before `today`, and the 11 days starting at `today`.
# NOTE(review): `today` is not defined in the visible cells; the printed
# ranges imply it is 2018-01-01 — confirm it is set before this cell runs.
before_range = pd.period_range(
    start=today - timedelta(days=11),
    end=today - timedelta(days=1),
)
after_range = pd.period_range(
    start=today,
    end=today + timedelta(days=10),
)

# Restrict the working frame to the AAPL ticker.
# NOTE(review): the printed before_data contains 2017 rows, so df_y was
# presumably rebuilt in a cell not shown here — verify.
df_y = df_y.loc[df_y['Name'].eq('AAPL')]

print(before_range)
print(after_range)

# Daily 'high' prices whose index date falls inside each range.
before_data = df_y.loc[df_y.index.isin(before_range.to_timestamp()), 'high']
print(before_data)
after_data = df_y.loc[df_y.index.isin(after_range.to_timestamp()), 'high']
print(after_data)

PeriodIndex(['2017-12-21', '2017-12-22', '2017-12-23', '2017-12-24',
             '2017-12-25', '2017-12-26', '2017-12-27', '2017-12-28',
             '2017-12-29', '2017-12-30', '2017-12-31'],
            dtype='period[D]')
PeriodIndex(['2018-01-01', '2018-01-02', '2018-01-03', '2018-01-04',
             '2018-01-05', '2018-01-06', '2018-01-07', '2018-01-08',
             '2018-01-09', '2018-01-10', '2018-01-11'],
            dtype='period[D]')
dttm
2017-12-21    176.020
2017-12-22    175.424
2017-12-26    171.470
2017-12-27    170.780
2017-12-28    171.850
2017-12-29    170.590
Name: high, dtype: float64
dttm
2018-01-02    172.3000
2018-01-03    174.5500
2018-01-04    173.4700
2018-01-05    175.3700
2018-01-08    175.6100
2018-01-09    175.0600
2018-01-10    174.3000
2018-01-11    175.4886
Name: high, dtype: float64


In [67]:
from scipy import stats
from scipy.stats import ttest_ind

In [68]:
# Bartlett's test. Null hypothesis: the samples have equal variances.
# If the null hypothesis is not rejected, equal variances may be assumed.
stats.bartlett(before_data, after_data)

BartlettResult(statistic=2.906108803590229, pvalue=0.08824454955208097)

In [69]:
# ttest_ind(before_data, after_data) # default equal variance

In [70]:
# Choose the t-test variant from Bartlett's test: when equal variances are
# not rejected at the 5% level, run Student's t-test (equal_var=True);
# otherwise run Welch's t-test (equal_var=False).
homoschedasty = bool(stats.bartlett(before_data, after_data).pvalue >= 0.05)
ttest_ind(before_data, after_data, equal_var=homoschedasty)
# The result carries the t-statistic and the p-value.

Ttest_indResult(statistic=-1.9028221426182836, pvalue=0.08132610986278331)