## Pandas 의 시계열 데이터 처리 주요 함수

In [40]:
import warnings
# Ignore every warning from here on (presumably to keep cell output uncluttered).
# NOTE(review): this blanket filter also hides pandas deprecation / FutureWarning
# messages raised by later cells.
warnings.filterwarnings('ignore')

In [63]:
import pandas as pd
import numpy as np

# Build one calendar year of daily timestamps.
date_rng = pd.date_range(start='2022-01-01', end='2022-12-31', freq='D')

# String version of the same dates (MM/DD/YYYY), used later to demonstrate parsing.
date_str = date_rng.strftime('%m/%d/%Y').tolist()

# Sample values: fix the seed so the random numbers are reproducible.
np.random.seed(42)
data = np.random.randn(len(date_rng))

# Build the frame column-by-column from a dict so every column keeps its own
# dtype (datetime64 / string / float64). Stacking the columns as rows and
# transposing the result pushes them through a single object-dtype block,
# which leaves `date` as an object column instead of a real datetime column.
df = pd.DataFrame({'date': date_rng, 'date_str': date_str, 'value': data})

# Preview the first rows.
print(df.head())


                  date    date_str     value
0  2022-01-01 00:00:00  01/01/2022  0.496714
1  2022-01-02 00:00:00  01/02/2022 -0.138264
2  2022-01-03 00:00:00  01/03/2022  0.647689
3  2022-01-04 00:00:00  01/04/2022  1.523030
4  2022-01-05 00:00:00  01/05/2022 -0.234153


### 1. `to_datetime`
-  문자열이나 다른 형식의 데이터를 datetime 객체로 변환

In [64]:
# Parse the MM/DD/YYYY strings with an explicit format. The layout is known
# (the strings were produced with strftime('%m/%d/%Y')), so there is no
# day-vs-month guessing and no format inference.
pd.to_datetime(df['date_str'], format='%m/%d/%Y')

0     2022-01-01
1     2022-01-02
2     2022-01-03
3     2022-01-04
4     2022-01-05
         ...    
360   2022-12-27
361   2022-12-28
362   2022-12-29
363   2022-12-30
364   2022-12-31
Name: date_str, Length: 365, dtype: datetime64[ns]


### 2. `date_range`
-  지정한 범위의 날짜로 구성된 `DatetimeIndex`를 생성 (Series가 아님)


In [65]:
# Daily DatetimeIndex covering all of 2024; both endpoints are included.
dates = pd.date_range(pd.Timestamp(2024, 1, 1), pd.Timestamp(2024, 12, 31), freq='D')
dates

DatetimeIndex(['2024-01-01', '2024-01-02', '2024-01-03', '2024-01-04',
               '2024-01-05', '2024-01-06', '2024-01-07', '2024-01-08',
               '2024-01-09', '2024-01-10',
               ...
               '2024-12-22', '2024-12-23', '2024-12-24', '2024-12-25',
               '2024-12-26', '2024-12-27', '2024-12-28', '2024-12-29',
               '2024-12-30', '2024-12-31'],
              dtype='datetime64[ns]', length=366, freq='D')

 ### 3. `resample`
 -  시계열 데이터를 새로운 주기로 변환
    
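    | 주기 코드 | 설명                                   |
    |------------|----------------------------------------|
    | 'D'        | 일별                                   |
    | 'W'        | 주별                                   |
    | 'M'        | 월별 (pandas 2.2+ 에서는 'ME' 권장)     |
    | 'Q'        | 분기별 (pandas 2.2+ 에서는 'QE' 권장)   |
    | 'A'        | 연별 (pandas 2.2+ 에서는 'YE' 권장)     |
    | 'H'        | 시간별 (pandas 2.2+ 에서는 'h' 권장)    |
    | 'T'        | 분별 (pandas 2.2+ 에서는 'min' 권장)    |
    | 'S'        | 초별 (pandas 2.2+ 에서는 's' 권장)      |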

In [66]:
# `resample` only works on a datetime-like index. Convert `date` explicitly
# before indexing so this does not depend on pandas inferring datetimes from
# the column while it builds the index (relevant if `date` is stored as an
# object column).
sdf = df.assign(date=pd.to_datetime(df['date'])).set_index('date')
print('---- original data ----')
print(sdf.head(10))
# 'MS' = month start: each monthly mean is labelled with the first day of its month.
sdf_resampled = sdf['value'].resample('MS').mean()
print('---- resampled data ----')
print(sdf_resampled.head(10))

---- original data ----
              date_str     value
date                            
2022-01-01  01/01/2022  0.496714
2022-01-02  01/02/2022 -0.138264
2022-01-03  01/03/2022  0.647689
2022-01-04  01/04/2022  1.523030
2022-01-05  01/05/2022 -0.234153
2022-01-06  01/06/2022 -0.234137
2022-01-07  01/07/2022  1.579213
2022-01-08  01/08/2022  0.767435
2022-01-09  01/09/2022 -0.469474
2022-01-10  01/10/2022  0.542560
---- resampled data ----
date
2022-01-01   -0.201488
2022-02-01   -0.143168
2022-03-01    0.043938
2022-04-01   -0.020252
2022-05-01   -0.083815
2022-06-01    0.291292
2022-07-01    0.098341
2022-08-01   -0.021537
2022-09-01   -0.091070
2022-10-01    0.064996
Freq: MS, Name: value, dtype: float64


### 4. `rolling`
-  롤링 윈도우 계산을 수행 
-  롤링 윈도우 : 시계열 데이터에서 일정한 크기의 윈도우를 이동시키며 계산을 수행하는 방법.
-  이동평균(MA) 계산도 수행 가능
-  롤링 윈도우의 예시

    | 인덱스 | 값  | 윈도우 (크기=3) | 롤링 평균 |
    |--------|-----|------------------|-----------|
    | 0      | 10  | NaN              | NaN       |
    | 1      | 20  | NaN              | NaN       |
    | 2      | 30  | [10, 20, 30]     | 20.0      |
    | 3      | 40  | [20, 30, 40]     | 30.0      |
    | 4      | 50  | [30, 40, 50]     | 40.0      |

-  위 표에서 볼 수 있듯이, 윈도우 크기가 3인 경우 처음 두 개의 값은 NaN
-  세 번째 값부터는 윈도우 내의 값들의 평균을 계산하여 롤링 평균을 계산


In [67]:
# 7-row trailing moving average of `value`; rows are NaN until the window is full.
weekly_window = df['value'].rolling(7)
df['rolling_mean'] = weekly_window.mean()
df.head(10)

Unnamed: 0,date,date_str,value,rolling_mean
0,2022-01-01 00:00:00,01/01/2022,0.496714,
1,2022-01-02 00:00:00,01/02/2022,-0.138264,
2,2022-01-03 00:00:00,01/03/2022,0.647689,
3,2022-01-04 00:00:00,01/04/2022,1.52303,
4,2022-01-05 00:00:00,01/05/2022,-0.234153,
5,2022-01-06 00:00:00,01/06/2022,-0.234137,
6,2022-01-07 00:00:00,01/07/2022,1.579213,0.520013
7,2022-01-08 00:00:00,01/08/2022,0.767435,0.558687
8,2022-01-09 00:00:00,01/09/2022,-0.469474,0.511372
9,2022-01-10 00:00:00,01/10/2022,0.54256,0.496353


### 4-1. `rolling.apply`
-  사용자 정의 함수를 롤링 윈도우에 적용

In [None]:
import numpy as np

# User-defined window statistic (sample variance computed through np.cov).
def custom_cov(window):
    """Return the sample variance of one window as a plain float.

    For a single 1-D input ``np.cov`` reduces to the variance of that input
    (default ``N - 1`` normalisation) and returns it as a 0-d array. A
    rolling ``apply`` expects one scalar per window, so the value is
    converted to ``float`` explicitly.

    Args:
        window: 1-D sequence of numeric values (NumPy array or Series).

    Returns:
        The variance of ``window`` as a Python ``float``.
    """
    return float(np.cov(window))

# Apply the user-defined function to every 7-row window. raw=True passes each
# window as a plain NumPy array instead of wrapping it in a Series, which is
# the cheaper path for a NumPy-based reducer; the computed values are the same.
custom_rolling_cov = df['value'].rolling(window=7).apply(custom_cov, raw=True)
custom_rolling_cov.head(20)


0          NaN
1          NaN
2          NaN
3          NaN
4          NaN
5          NaN
6     0.619272
7     0.627639
8     0.720256
9     0.717058
10    0.600956
11    0.643101
12    0.609839
13    0.802016
14    0.842970
15    0.839931
16    0.582413
17    0.767185
18    0.755900
19    0.575028
Name: value, dtype: float64

### 5. `shift`
-  데이터를 다음 또는 이전 타임스텝으로 이동.

In [68]:
# Re-index by date, then place each value next to its neighbours:
#   shift(1)  -> value of the previous row (first row becomes NaN)
#   shift(-1) -> value of the next row (last row becomes NaN)
sdf = df.set_index('date')
value_col = sdf['value']
sdf['shifted_forward1'] = value_col.shift(periods=1).to_numpy()
sdf['shifted_backward1'] = value_col.shift(periods=-1).to_numpy()
sdf[['value', 'shifted_forward1', 'shifted_backward1']].head(10)

Unnamed: 0_level_0,value,shifted_forward1,shifted_backward1
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2022-01-01,0.496714,,-0.138264
2022-01-02,-0.138264,0.496714,0.647689
2022-01-03,0.647689,-0.138264,1.52303
2022-01-04,1.52303,0.647689,-0.234153
2022-01-05,-0.234153,1.52303,-0.234137
2022-01-06,-0.234137,-0.234153,1.579213
2022-01-07,1.579213,-0.234137,0.767435
2022-01-08,0.767435,1.579213,-0.469474
2022-01-09,-0.469474,0.767435,0.54256
2022-01-10,0.54256,-0.469474,-0.463418


### 6. `diff`
-  차분 계산
-  차분 이란? : 시계열 데이터에서 현재 값과 이전 값의 차이를 계산하는 방법

In [69]:
# diff1: change versus the previous row; diff2: change versus two rows back.
value_series = df['value']
df['diff1'] = value_series.diff(periods=1)
df['diff2'] = value_series.diff(periods=2)
df[['date', 'value', 'diff1', 'diff2']].head(10)

Unnamed: 0,date,value,diff1,diff2
0,2022-01-01 00:00:00,0.496714,,
1,2022-01-02 00:00:00,-0.138264,-0.634978,
2,2022-01-03 00:00:00,0.647689,0.785953,0.150974
3,2022-01-04 00:00:00,1.52303,0.875341,1.661294
4,2022-01-05 00:00:00,-0.234153,-1.757183,-0.881842
5,2022-01-06 00:00:00,-0.234137,1.6e-05,-1.757167
6,2022-01-07 00:00:00,1.579213,1.81335,1.813366
7,2022-01-08 00:00:00,0.767435,-0.811778,1.001572
8,2022-01-09 00:00:00,-0.469474,-1.236909,-2.048687
9,2022-01-10 00:00:00,0.54256,1.012034,-0.224875


### 7. `timedelta_range`
-  일정 간격의 시간 델타 범위를 생성

In [70]:
# Timedeltas from 0 to 2 days in 2-hour steps; both endpoints are included.
timedelta_range = pd.timedelta_range(pd.Timedelta(0), pd.Timedelta(days=2), freq='2h')
timedelta_range

TimedeltaIndex(['0 days 00:00:00', '0 days 02:00:00', '0 days 04:00:00',
                '0 days 06:00:00', '0 days 08:00:00', '0 days 10:00:00',
                '0 days 12:00:00', '0 days 14:00:00', '0 days 16:00:00',
                '0 days 18:00:00', '0 days 20:00:00', '0 days 22:00:00',
                '1 days 00:00:00', '1 days 02:00:00', '1 days 04:00:00',
                '1 days 06:00:00', '1 days 08:00:00', '1 days 10:00:00',
                '1 days 12:00:00', '1 days 14:00:00', '1 days 16:00:00',
                '1 days 18:00:00', '1 days 20:00:00', '1 days 22:00:00',
                '2 days 00:00:00'],
               dtype='timedelta64[ns]', freq='2h')

### 8. `asfreq`
-  시계열 데이터를 지정한 주기로 변환 (집계 없이 해당 시점의 값만 선택)

In [43]:
# `asfreq` needs a datetime-like index, so convert `date` explicitly before
# indexing instead of relying on dtype inference. 'W' keeps only the rows that
# fall on the weekly anchor day; values are selected, not aggregated.
df.assign(date=pd.to_datetime(df['date'])).set_index('date').asfreq('W').head(10)

Unnamed: 0_level_0,date_str,value,rolling_mean,shifted_forward1,shifted_backward1,diff1,diff2
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2022-01-02,01/02/2022,-0.138264,,0.496714,0.647689,-0.634978,
2022-01-09,01/09/2022,-0.469474,0.511372,0.767435,0.54256,-1.236909,-2.048687
2022-01-16,01/16/2022,-0.562288,-0.62073,-1.724918,-1.012831,1.16263,1.350993
2022-01-23,01/23/2022,0.067528,-0.244502,-0.225776,-1.424748,0.293305,-1.398121
2022-01-30,01/30/2022,-0.291694,-0.503691,-0.600639,-0.601707,0.308945,-0.667392
2022-02-06,02/06/2022,0.208864,-0.001439,-1.220844,-1.95967,1.429707,-0.613681
2022-02-13,02/13/2022,-0.301104,-0.37113,-0.115648,-1.478522,-0.185455,-0.472472
2022-02-20,02/20/2022,0.324084,-0.385317,-1.76304,-0.385082,2.087124,-0.019534
2022-02-27,02/27/2022,-0.309212,0.051932,-0.839218,0.331263,0.530005,-1.240492
2022-03-06,03/06/2022,0.812526,-0.121149,-1.196207,1.35624,2.008732,1.918861


### 9. `truncate`
-  시계열 데이터의 특정 구간을 잘라오기

In [71]:
# Index by an explicitly converted datetime column so the string bounds below
# are compared against real timestamps rather than an inferred index type.
sdf = df.assign(date=pd.to_datetime(df['date'])).set_index('date')
# Keep the rows from 2022-01-07 through 2022-01-14 (both bounds included).
cut_df = sdf.truncate(before='2022-01-07', after='2022-01-14').head(10)
cut_df


Unnamed: 0_level_0,date_str,value,rolling_mean,diff1,diff2
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2022-01-07,01/07/2022,1.579213,0.520013,1.81335,1.813366
2022-01-08,01/08/2022,0.767435,0.558687,-0.811778,1.001572
2022-01-09,01/09/2022,-0.469474,0.511372,-1.236909,-2.048687
2022-01-10,01/10/2022,0.54256,0.496353,1.012034,-0.224875
2022-01-11,01/11/2022,-0.463418,0.212575,-1.005978,0.006057
2022-01-12,01/12/2022,-0.46573,0.179493,-0.002312,-1.00829
2022-01-13,01/13/2022,0.241962,0.247507,0.707692,0.70538
2022-01-14,01/14/2022,-1.91328,-0.251421,-2.155243,-1.44755


### 10. `period_range`
-  특정 기간의 범위를 생성

In [72]:
# Twelve monthly periods of 2020 paired with sales rising by 100 each month.
monthly_periods = pd.period_range('2020-01', '2020-12', freq='M')
sales_data = {
    'period': monthly_periods,
    'sales': list(range(1500, 2700, 100)),
}
# Build the frame and move `period` into the index in one expression.
sales_df = pd.DataFrame(sales_data).set_index('period')
sales_df

Unnamed: 0_level_0,sales
period,Unnamed: 1_level_1
2020-01,1500
2020-02,1600
2020-03,1700
2020-04,1800
2020-05,1900
2020-06,2000
2020-07,2100
2020-08,2200
2020-09,2300
2020-10,2400


### 11. `dt` 접근자
-  datetime 속성에 접근할 수 있게 합니다.

In [59]:
# Show the `date` column converted to datetime; the result is only displayed,
# not stored back into `df`.
pd.to_datetime(df['date'])


0     2022-01-01
1     2022-01-02
2     2022-01-03
3     2022-01-04
4     2022-01-05
         ...    
360   2022-12-27
361   2022-12-28
362   2022-12-29
363   2022-12-30
364   2022-12-31
Name: date, Length: 365, dtype: datetime64[ns]

In [73]:
# Work on an explicit copy: assigning columns to a frame that was selected
# out of `df` is what pandas flags as SettingWithCopyWarning.
sdf = df[['date', 'value']].copy()
sdf['date'] = pd.to_datetime(sdf['date'])
# Split each timestamp into calendar components through the .dt accessor.
sdf['year'] = sdf['date'].dt.year
sdf['month'] = sdf['date'].dt.month
sdf['day'] = sdf['date'].dt.day
sdf['weekday'] = sdf['date'].dt.weekday  # Monday=0 ... Sunday=6
sdf[['date', 'year', 'month', 'day', 'weekday']].head(10)

Unnamed: 0,date,year,month,day,weekday
0,2022-01-01,2022,1,1,5
1,2022-01-02,2022,1,2,6
2,2022-01-03,2022,1,3,0
3,2022-01-04,2022,1,4,1
4,2022-01-05,2022,1,5,2
5,2022-01-06,2022,1,6,3
6,2022-01-07,2022,1,7,4
7,2022-01-08,2022,1,8,5
8,2022-01-09,2022,1,9,6
9,2022-01-10,2022,1,10,0


### 12. `Grouper` (구 `TimeGrouper`)
-  시계열 데이터를 특정 주기로 그룹화

In [79]:
# Independent two-column copy of `df`, with `date` converted to datetime.
sdf = df.loc[:, ['date', 'value']].copy()
sdf['date'] = pd.to_datetime(sdf['date'])

In [81]:
# Monthly mean of `value`, one row per calendar month labelled by its last day.
# The month-end frequency is passed as an offset object because its string
# alias depends on the pandas version ('M' is deprecated in favour of 'ME'
# since pandas 2.2).
sdf.groupby(pd.Grouper(key='date', freq=pd.offsets.MonthEnd())).mean()

Unnamed: 0_level_0,value
date,Unnamed: 1_level_1
2022-01-31,-0.201488
2022-02-28,-0.143168
2022-03-31,0.043938
2022-04-30,-0.020252
2022-05-31,-0.083815
2022-06-30,0.291292
2022-07-31,0.098341
2022-08-31,-0.021537
2022-09-30,-0.09107
2022-10-31,0.064996


### 13. `merge_asof`
-  키가 정확히 일치하지 않아도, 기준 시점과 같거나 그 이전인 가장 가까운 시계열 데이터와 병합 (기본값 `direction='backward'`)
-  결측치 처리에 유용

In [82]:
# Left table: four consecutive daily observations.
left_dates = ['2022-01-01', '2022-01-02', '2022-01-03', '2022-01-04']
left = pd.DataFrame({
    'date': pd.to_datetime(left_dates),
    'value_left': [10, 20, 30, 40],
})

# Right table: sparser observations with no row for 01-01 or 01-04.
right_dates = ['2022-01-02', '2022-01-03', '2022-01-05']
right = pd.DataFrame({
    'date': pd.to_datetime(right_dates),
    'value_right': [15, 25, 35],
})

# Backward as-of join (the default direction): each left row takes the most
# recent right row whose date is on or before it, or NaN when none exists.
merged = pd.merge_asof(left, right, on='date', direction='backward')
print(merged)

        date  value_left  value_right
0 2022-01-01          10          NaN
1 2022-01-02          20         15.0
2 2022-01-03          30         25.0
3 2022-01-04          40         25.0


### 14. `expanding`
-  누적 계산

In [85]:
# Expanding window: every row aggregates all values from the first row up to itself.
growing_window = df['value'].expanding()
df['expanding_sum'] = growing_window.sum()
df['expanding_mean'] = growing_window.mean()
df[['date', 'value', 'expanding_sum', 'expanding_mean']].head(10)

Unnamed: 0,date,value,expanding_sum,expanding_mean
0,2022-01-01 00:00:00,0.496714,0.496714,0.496714
1,2022-01-02 00:00:00,-0.138264,0.35845,0.179225
2,2022-01-03 00:00:00,0.647689,1.006138,0.335379
3,2022-01-04 00:00:00,1.52303,2.529168,0.632292
4,2022-01-05 00:00:00,-0.234153,2.295015,0.459003
5,2022-01-06 00:00:00,-0.234137,2.060878,0.34348
6,2022-01-07 00:00:00,1.579213,3.640091,0.520013
7,2022-01-08 00:00:00,0.767435,4.407525,0.550941
8,2022-01-09 00:00:00,-0.469474,3.938051,0.437561
9,2022-01-10 00:00:00,0.54256,4.480611,0.448061


### 15. `cumsum`
-  누적 합계를 계산

In [86]:
# Running total of `value`, stored next to the original column.
running_total = df['value'].cumsum()
df['cumsum'] = running_total
df[['date', 'value', 'cumsum']].head(10)

Unnamed: 0,date,value,cumsum
0,2022-01-01 00:00:00,0.496714,0.496714
1,2022-01-02 00:00:00,-0.138264,0.35845
2,2022-01-03 00:00:00,0.647689,1.006138
3,2022-01-04 00:00:00,1.52303,2.529168
4,2022-01-05 00:00:00,-0.234153,2.295015
5,2022-01-06 00:00:00,-0.234137,2.060878
6,2022-01-07 00:00:00,1.579213,3.640091
7,2022-01-08 00:00:00,0.767435,4.407525
8,2022-01-09 00:00:00,-0.469474,3.938051
9,2022-01-10 00:00:00,0.54256,4.480611


### 16. `ewm`
 -  지수 가중 이동 평균(Exponential Weighted Moving Average, EWMA) 계산

 -  최근 데이터에 더 큰 가중치를 부여하여 평균을 계산하는 방법

In [87]:
# Exponentially weighted mean of `value`; span=7 corresponds to a smoothing
# factor alpha = 2 / (span + 1) = 0.25, so recent rows carry more weight.
weighted_window = df['value'].ewm(span=7)
df['ewm'] = weighted_window.mean()
df[['date', 'value', 'ewm']].head(10)

Unnamed: 0,date,value,ewm
0,2022-01-01 00:00:00,0.496714,0.496714
1,2022-01-02 00:00:00,-0.138264,0.133869
2,2022-01-03 00:00:00,0.647689,0.356061
3,2022-01-04 00:00:00,1.52303,0.782838
4,2022-01-05 00:00:00,-0.234153,0.449484
5,2022-01-06 00:00:00,-0.234137,0.241575
6,2022-01-07 00:00:00,1.579213,0.627499
7,2022-01-08 00:00:00,0.767435,0.666375
8,2022-01-09 00:00:00,-0.469474,0.359361
9,2022-01-10 00:00:00,0.54256,0.407894
