# 시계열 처리

In [1]:
import numpy as np
import pandas as pd

In [3]:
idx = pd.DatetimeIndex(['2023-05-01','2023-05-02','2023-05-03','2023-05-04','2023-05-05'])
s = pd.Series([0,1,2,3,4], index=idx)

In [6]:
s

2023-05-01    0
2023-05-02    1
2023-05-03    2
2023-05-04    3
2023-05-05    4
dtype: int64

In [5]:
s['2023']

2023-05-01    0
2023-05-02    1
2023-05-03    2
2023-05-04    3
2023-05-05    4
dtype: int64

## 시계열 데이터 구조

In [7]:
from datetime import datetime

In [20]:
# 다양한 방식으로 date 선언 가능
dates = pd.to_datetime(['05-01-2023',datetime(2023,5,2),'3nd of May,2023','2023-may-4','20230504'])
dates

DatetimeIndex(['2023-05-01', '2023-05-02', '2023-05-03', '2023-05-04',
               '2023-05-04'],
              dtype='datetime64[ns]', freq=None)

In [18]:
# dtype을 'Day'로 변경
dates.to_period('D')

PeriodIndex(['2023-05-01', '2023-05-02', '2023-05-03', '2023-05-04',
             '2023-05-04'],
            dtype='period[D]')

In [16]:
dates = pd.to_datetime(['05-01-2023',datetime(2023,6,2),'3nd of July,2023','2023-august-4','20230904'])
dates-dates[0]

TimedeltaIndex(['0 days', '32 days', '63 days', '95 days', '126 days'], dtype='timedelta64[ns]', freq=None)

In [22]:
# date_range를 통해 arange 기능처럼 사용 가능
pd.date_range('2023-01-01',datetime(2023,5,1))

DatetimeIndex(['2023-01-01', '2023-01-02', '2023-01-03', '2023-01-04',
               '2023-01-05', '2023-01-06', '2023-01-07', '2023-01-08',
               '2023-01-09', '2023-01-10',
               ...
               '2023-04-22', '2023-04-23', '2023-04-24', '2023-04-25',
               '2023-04-26', '2023-04-27', '2023-04-28', '2023-04-29',
               '2023-04-30', '2023-05-01'],
              dtype='datetime64[ns]', length=121, freq='D')

In [24]:
pd.date_range('01-01-2023',periods=4)

DatetimeIndex(['2023-01-01', '2023-01-02', '2023-01-03', '2023-01-04'], dtype='datetime64[ns]', freq='D')

In [27]:
pd.date_range('01-01-2023', periods=12, freq='M')

DatetimeIndex(['2023-01-31', '2023-02-28', '2023-03-31', '2023-04-30',
               '2023-05-31', '2023-06-30', '2023-07-31', '2023-08-31',
               '2023-09-30', '2023-10-31', '2023-11-30', '2023-12-31'],
              dtype='datetime64[ns]', freq='M')

In [28]:
pd.date_range('01-01-2023', periods=10, freq='H')

DatetimeIndex(['2023-01-01 00:00:00', '2023-01-01 01:00:00',
               '2023-01-01 02:00:00', '2023-01-01 03:00:00',
               '2023-01-01 04:00:00', '2023-01-01 05:00:00',
               '2023-01-01 06:00:00', '2023-01-01 07:00:00',
               '2023-01-01 08:00:00', '2023-01-01 09:00:00'],
              dtype='datetime64[ns]', freq='H')

In [31]:
idx = pd.to_datetime(['2023-05-12 12:00:00','2023-05-12 21:00:00'] + [None])

In [32]:
pd.isnull(idx)

array([False, False,  True])

## 시계열 기본

In [45]:
dates = [datetime(2023,1,1),datetime(2023,2,1),datetime(2023,3,1),datetime(2023,4,1),datetime(2023,5,1)]

In [50]:
ts = pd.Series(np.random.rand(5),index=dates)
ts

2023-01-01    0.235561
2023-02-01    0.711092
2023-03-01    0.769904
2023-04-01    0.800735
2023-05-01    0.686720
dtype: float64

In [51]:
ts.index

DatetimeIndex(['2023-01-01', '2023-02-01', '2023-03-01', '2023-04-01',
               '2023-05-01'],
              dtype='datetime64[ns]', freq=None)

In [57]:
ts["20230101"],ts["01-01-2023"],ts["1nd of Jan,2023"],ts["1/1/2023"]

(0.23556065174403118,
 0.23556065174403118,
 0.23556065174403118,
 0.23556065174403118)

In [61]:
ts = pd.Series(np.random.randint(1,100,100),
         index = pd.date_range("2023-05-01",periods=100,freq='D'))
ts

2023-05-01    11
2023-05-02    10
2023-05-03    82
2023-05-04    37
2023-05-05    26
              ..
2023-08-04    14
2023-08-05    55
2023-08-06    70
2023-08-07    62
2023-08-08    27
Freq: D, Length: 100, dtype: int32

In [62]:
ts['2023']

2023-05-01    11
2023-05-02    10
2023-05-03    82
2023-05-04    37
2023-05-05    26
              ..
2023-08-04    14
2023-08-05    55
2023-08-06    70
2023-08-07    62
2023-08-08    27
Freq: D, Length: 100, dtype: int32

In [66]:
ts["2023-05"]

2023-05-01    11
2023-05-02    10
2023-05-03    82
2023-05-04    37
2023-05-05    26
2023-05-06    57
2023-05-07    43
2023-05-08    55
2023-05-09    33
2023-05-10    14
2023-05-11    24
2023-05-12    30
2023-05-13    78
2023-05-14    80
2023-05-15    99
2023-05-16    22
2023-05-17    63
2023-05-18    26
2023-05-19    49
2023-05-20    24
2023-05-21    36
2023-05-22    38
2023-05-23    93
2023-05-24    66
2023-05-25    84
2023-05-26    95
2023-05-27    75
2023-05-28    48
2023-05-29    75
2023-05-30    13
2023-05-31    84
Freq: D, dtype: int32

In [68]:
ts[datetime(2023,6,1):]

2023-06-01    24
2023-06-02    28
2023-06-03    23
2023-06-04    50
2023-06-05    19
              ..
2023-08-04    14
2023-08-05    55
2023-08-06    70
2023-08-07    62
2023-08-08    27
Freq: D, Length: 69, dtype: int32

In [88]:
tdf = pd.DataFrame(np.random.randn(100,4),
                   index = pd.date_range(datetime(2023,5,1),periods=100,freq='M'),
                   columns= list('ABCD')
                  )

tdf

Unnamed: 0,A,B,C,D
2023-05-31,-0.024195,0.140079,1.792515,0.771738
2023-06-30,-0.316136,0.125480,-1.322226,1.099713
2023-07-31,-1.366232,0.179371,1.972634,-0.833787
2023-08-31,-0.619396,-0.600911,1.125271,-0.492519
2023-09-30,0.464037,-0.113601,1.572085,0.945959
...,...,...,...,...
2031-04-30,0.265949,0.338222,-0.317393,0.099491
2031-05-31,0.497441,0.292675,0.250401,0.159065
2031-06-30,-0.819138,-0.642450,0.145488,-0.621157
2031-07-31,0.030934,0.434272,0.820045,0.579411


In [77]:
tdf["2024"]

  tdf["2024"]


Unnamed: 0,A,B,C,D
2024-01-31,-0.235262,-0.360952,1.312056,-1.504417
2024-02-29,0.796493,0.759729,0.116204,-0.358889
2024-03-31,-0.24397,0.358445,0.357203,-0.288139
2024-04-30,0.144146,1.267876,0.658283,-0.574603
2024-05-31,1.743211,-0.378222,-1.185546,-0.153661
2024-06-30,-0.686894,1.03618,-0.843241,-0.416184
2024-07-31,0.577035,-1.166129,-0.899284,-0.35137
2024-08-31,0.225422,0.264669,-1.046231,0.841519
2024-09-30,0.60937,0.558672,0.102154,0.358096
2024-10-31,0.634197,0.053739,-0.665849,-0.456153


In [78]:
tdf.loc["2024":"2025"]

Unnamed: 0,A,B,C,D
2024-01-31,-0.235262,-0.360952,1.312056,-1.504417
2024-02-29,0.796493,0.759729,0.116204,-0.358889
2024-03-31,-0.24397,0.358445,0.357203,-0.288139
2024-04-30,0.144146,1.267876,0.658283,-0.574603
2024-05-31,1.743211,-0.378222,-1.185546,-0.153661
2024-06-30,-0.686894,1.03618,-0.843241,-0.416184
2024-07-31,0.577035,-1.166129,-0.899284,-0.35137
2024-08-31,0.225422,0.264669,-1.046231,0.841519
2024-09-30,0.60937,0.558672,0.102154,0.358096
2024-10-31,0.634197,0.053739,-0.665849,-0.456153


In [79]:
tdf.iloc[1,:]

A   -0.021238
B   -1.313582
C    0.387178
D   -0.107727
Name: 2023-06-30 00:00:00, dtype: float64

In [82]:
ts = pd.Series(np.random.randn(5),
              index = pd.DatetimeIndex(["2023-01-01","2023-01-01","20230401",'1 aug,2023','01-04-2023']))
ts

2023-01-01   -0.722256
2023-01-01   -0.045518
2023-04-01    0.935955
2023-08-01    0.160133
2023-01-04    0.940061
dtype: float64

In [84]:
ts.groupby(level=0).mean()

2023-01-01   -0.383887
2023-01-04    0.940061
2023-04-01    0.935955
2023-08-01    0.160133
dtype: float64

In [85]:
pd.date_range('2023-05-01','2023-07-01',freq='B')  # Business 날로 쉬는 날은 제외한다.

DatetimeIndex(['2023-05-01', '2023-05-02', '2023-05-03', '2023-05-04',
               '2023-05-05', '2023-05-08', '2023-05-09', '2023-05-10',
               '2023-05-11', '2023-05-12', '2023-05-15', '2023-05-16',
               '2023-05-17', '2023-05-18', '2023-05-19', '2023-05-22',
               '2023-05-23', '2023-05-24', '2023-05-25', '2023-05-26',
               '2023-05-29', '2023-05-30', '2023-05-31', '2023-06-01',
               '2023-06-02', '2023-06-05', '2023-06-06', '2023-06-07',
               '2023-06-08', '2023-06-09', '2023-06-12', '2023-06-13',
               '2023-06-14', '2023-06-15', '2023-06-16', '2023-06-19',
               '2023-06-20', '2023-06-21', '2023-06-22', '2023-06-23',
               '2023-06-26', '2023-06-27', '2023-06-28', '2023-06-29',
               '2023-06-30'],
              dtype='datetime64[ns]', freq='B')

## 주기와 오프셋

In [89]:
pd.timedelta_range(0,periods=12,freq='D')

TimedeltaIndex([ '0 days',  '1 days',  '2 days',  '3 days',  '4 days',
                 '5 days',  '6 days',  '7 days',  '8 days',  '9 days',
                '10 days', '11 days'],
               dtype='timedelta64[ns]', freq='D')

In [97]:
pd.timedelta_range(0,periods=12,freq='T')

TimedeltaIndex(['0 days 00:00:00', '0 days 00:01:00', '0 days 00:02:00',
                '0 days 00:03:00', '0 days 00:04:00', '0 days 00:05:00',
                '0 days 00:06:00', '0 days 00:07:00', '0 days 00:08:00',
                '0 days 00:09:00', '0 days 00:10:00', '0 days 00:11:00'],
               dtype='timedelta64[ns]', freq='T')

In [102]:
pd.timedelta_range(0,periods=20,freq='1H30T2S')

TimedeltaIndex(['0 days 00:00:00', '0 days 01:30:02', '0 days 03:00:04',
                '0 days 04:30:06', '0 days 06:00:08', '0 days 07:30:10',
                '0 days 09:00:12', '0 days 10:30:14', '0 days 12:00:16',
                '0 days 13:30:18', '0 days 15:00:20', '0 days 16:30:22',
                '0 days 18:00:24', '0 days 19:30:26', '0 days 21:00:28',
                '0 days 22:30:30', '1 days 00:00:32', '1 days 01:30:34',
                '1 days 03:00:36', '1 days 04:30:38'],
               dtype='timedelta64[ns]', freq='5402S')

## 쉬프트

In [106]:
ts = pd.Series(np.random.randn(10), 
              index = pd.date_range("230501",freq='D',periods=10))
ts        

2001-05-23   -0.710366
2001-05-24    0.344990
2001-05-25   -0.020389
2001-05-26    0.001376
2001-05-27    0.693972
2001-05-28   -0.082825
2001-05-29    0.853386
2001-05-30   -0.149946
2001-05-31   -0.739257
2001-06-01    0.594498
Freq: D, dtype: float64

In [108]:
ts.shift(1)

2001-05-23         NaN
2001-05-24   -0.710366
2001-05-25    0.344990
2001-05-26   -0.020389
2001-05-27    0.001376
2001-05-28    0.693972
2001-05-29   -0.082825
2001-05-30    0.853386
2001-05-31   -0.149946
2001-06-01   -0.739257
Freq: D, dtype: float64

In [109]:
ts.shift(-3)

2001-05-23    0.001376
2001-05-24    0.693972
2001-05-25   -0.082825
2001-05-26    0.853386
2001-05-27   -0.149946
2001-05-28   -0.739257
2001-05-29    0.594498
2001-05-30         NaN
2001-05-31         NaN
2001-06-01         NaN
Freq: D, dtype: float64

In [111]:
ts.shift(2,freq="D")

2001-05-25   -0.710366
2001-05-26    0.344990
2001-05-27   -0.020389
2001-05-28    0.001376
2001-05-29    0.693972
2001-05-30   -0.082825
2001-05-31    0.853386
2001-06-01   -0.149946
2001-06-02   -0.739257
2001-06-03    0.594498
Freq: D, dtype: float64

## 시간대 처리

In [116]:
import pytz
pytz.common_timezones

['Africa/Abidjan',
 'Africa/Accra',
 'Africa/Addis_Ababa',
 'Africa/Algiers',
 'Africa/Asmara',
 'Africa/Bamako',
 'Africa/Bangui',
 'Africa/Banjul',
 'Africa/Bissau',
 'Africa/Blantyre',
 'Africa/Brazzaville',
 'Africa/Bujumbura',
 'Africa/Cairo',
 'Africa/Casablanca',
 'Africa/Ceuta',
 'Africa/Conakry',
 'Africa/Dakar',
 'Africa/Dar_es_Salaam',
 'Africa/Djibouti',
 'Africa/Douala',
 'Africa/El_Aaiun',
 'Africa/Freetown',
 'Africa/Gaborone',
 'Africa/Harare',
 'Africa/Johannesburg',
 'Africa/Juba',
 'Africa/Kampala',
 'Africa/Khartoum',
 'Africa/Kigali',
 'Africa/Kinshasa',
 'Africa/Lagos',
 'Africa/Libreville',
 'Africa/Lome',
 'Africa/Luanda',
 'Africa/Lubumbashi',
 'Africa/Lusaka',
 'Africa/Malabo',
 'Africa/Maputo',
 'Africa/Maseru',
 'Africa/Mbabane',
 'Africa/Mogadishu',
 'Africa/Monrovia',
 'Africa/Nairobi',
 'Africa/Ndjamena',
 'Africa/Niamey',
 'Africa/Nouakchott',
 'Africa/Ouagadougou',
 'Africa/Porto-Novo',
 'Africa/Sao_Tome',
 'Africa/Tripoli',
 'Africa/Tunis',
 'Africa/Wi

In [120]:
tz = pytz.timezone('Asia/Seoul')
tz

<DstTzInfo 'Asia/Seoul' LMT+8:28:00 STD>