In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt

from datetime import datetime
from dateutil.relativedelta import relativedelta
import time, calendar

import holidays
from pytimekr import pytimekr

In [41]:
# Read the raw sales file and keep only the sale date and quantity columns.
df = pd.read_csv('./day03_data/Sales.csv').loc[:, ['SalesDate', 'SalesQty']]
df.tail()

Unnamed: 0,SalesDate,SalesQty
702,2014-06-21,1
703,2014-06-22,1
704,2014-06-26,1
705,2014-06-29,1
706,2014-06-30,1


In [23]:
# Forecast settings: the first month to predict ('YYYY-MM') and how many months to predict.
ref_month = '2014-05'
pred_month = 2

In [39]:
# Derive a month key from the sale date.
sales_date = df['SalesDate'].astype(str)    # make sure the date is text before slicing
df = df.assign(
    SalesDate=sales_date,
    SalesMonth=sales_date.str[:7],          # characters 0-6 of the date string, e.g. '2013-05'
)
df.head()

Unnamed: 0,SalesDate,SalesQty,SalesMonth
0,2013-05-30,8,2013-05
1,2013-05-30,33,2013-05
2,2013-05-30,11,2013-05
3,2013-05-30,8,2013-05
4,2013-05-30,9,2013-05


In [40]:
# Total quantity sold per month.
# Only `SalesQty` is summed: the frame still carries the string `SalesDate`
# column, and summing a string column concatenates its values on pandas
# versions where `numeric_only` defaults to False.
df = df.groupby('SalesMonth', as_index = False)['SalesQty'].sum()
df.head()

Unnamed: 0,SalesMonth,SalesQty
0,2013-05,222
1,2013-06,340
2,2013-07,404
3,2013-08,254
4,2013-09,286


In [26]:
# Korean public holidays from pytimekr. No year is passed here; the output
# recorded for this cell lists 2023 dates.
kr_holidays = pytimekr.holidays()
kr_holidays

[datetime.date(2023, 1, 21),
 datetime.date(2023, 1, 22),
 datetime.date(2023, 1, 23),
 datetime.date(2023, 9, 28),
 datetime.date(2023, 9, 29),
 datetime.date(2023, 9, 30),
 datetime.date(2023, 1, 1),
 datetime.date(2023, 3, 1),
 datetime.date(2023, 5, 5),
 datetime.date(2023, 5, 26),
 datetime.date(2023, 6, 6),
 datetime.date(2023, 8, 15),
 datetime.date(2023, 10, 9),
 datetime.date(2023, 10, 3),
 datetime.date(2023, 12, 25)]

In [27]:
# Extract every calendar year covered by the sales data (inclusive).
# 'YYYY-MM' strings sort chronologically, so min()/max() give the first and
# last month regardless of row order or index labels.
start_year = int(str(df['SalesMonth'].min())[0:4])
last_year = int(str(df['SalesMonth'].max())[0:4])

# range() builds the list without mutating `start_year`, so it still holds
# the first year after this cell has run.
year_list = list(range(start_year, last_year + 1))

year_list

[2013, 2014]

In [28]:
# Collect the public holidays of every year in `year_list` into one flat list.
# NOTE: the list is bound to the name `holidays`, which hides the `holidays`
# module imported at the top of the notebook. The name is kept because the
# cells that follow read it.
holidays = []
for y in year_list:
    kr_holidays = pytimekr.holidays(year = y)
    # extend() adds all of the year's dates at once; no index loop needed.
    holidays.extend(kr_holidays)

holidays

[datetime.date(2013, 2, 9),
 datetime.date(2013, 2, 10),
 datetime.date(2013, 2, 11),
 datetime.date(2013, 9, 18),
 datetime.date(2013, 9, 19),
 datetime.date(2013, 9, 20),
 datetime.date(2013, 1, 1),
 datetime.date(2013, 3, 1),
 datetime.date(2013, 5, 5),
 datetime.date(2013, 5, 17),
 datetime.date(2013, 6, 6),
 datetime.date(2013, 8, 15),
 datetime.date(2013, 10, 9),
 datetime.date(2013, 10, 3),
 datetime.date(2013, 12, 25),
 datetime.date(2014, 1, 30),
 datetime.date(2014, 1, 31),
 datetime.date(2014, 2, 1),
 datetime.date(2014, 9, 7),
 datetime.date(2014, 9, 8),
 datetime.date(2014, 9, 9),
 datetime.date(2014, 1, 1),
 datetime.date(2014, 3, 1),
 datetime.date(2014, 5, 5),
 datetime.date(2014, 5, 6),
 datetime.date(2014, 6, 6),
 datetime.date(2014, 8, 15),
 datetime.date(2014, 10, 9),
 datetime.date(2014, 10, 3),
 datetime.date(2014, 12, 25)]

In [29]:
# Turn the list of holiday dates into a holiday count per month.
# `holidays` is rebound from the list to a DataFrame indexed by the
# 'YYYY-MM' key `hol2`, with the count in the `holidays` column.
holidays = pd.DataFrame(holidays, columns = ['holidays'])
holidays = (
    holidays
    .assign(holidays=lambda frame: pd.to_datetime(frame['holidays']))
    .assign(hol2=lambda frame: frame['holidays'].astype(str).str[:7])
    .groupby('hol2')
    .count()
)
holidays

Unnamed: 0_level_0,holidays
hol2,Unnamed: 1_level_1
2013-01,1
2013-02,3
2013-03,1
2013-05,2
2013-06,1
2013-08,1
2013-09,3
2013-10,2
2013-12,1
2014-01,3


In [30]:
# Attach the monthly holiday count to the sales table, aligned on the month key.
df = (
    df.set_index('SalesMonth')
      .assign(hdays=holidays['holidays'])                      # index-aligned on 'YYYY-MM'
      .assign(hdays=lambda frame: frame['hdays'].fillna(0))    # months with no holiday get 0
      .reset_index()
)
df

Unnamed: 0,SalesMonth,SalesQty,hdays
0,2013-05,222,2.0
1,2013-06,340,1.0
2,2013-07,404,0.0
3,2013-08,254,1.0
4,2013-09,286,3.0
5,2013-10,267,2.0
6,2013-11,155,0.0
7,2013-12,183,1.0
8,2014-01,259,3.0
9,2014-02,31,1.0


In [31]:
# Train / test split by month. 'YYYY-MM' strings compare chronologically, so
# plain string comparison against `ref_month` is enough.
train = df[df['SalesMonth'] < ref_month]
# Keep only the first `pred_month` rows from `ref_month` onward.
test = df[df['SalesMonth'] >= ref_month].head(pred_month)
train

Unnamed: 0,SalesMonth,SalesQty,hdays
0,2013-05,222,2.0
1,2013-06,340,1.0
2,2013-07,404,0.0
3,2013-08,254,1.0
4,2013-09,286,3.0
5,2013-10,267,2.0
6,2013-11,155,0.0
7,2013-12,183,1.0
8,2014-01,259,3.0
9,2014-02,31,1.0


In [32]:
# Display the test split (rows from `ref_month` onward, limited to `pred_month` rows).
test

Unnamed: 0,SalesMonth,SalesQty,hdays
12,2014-05,278,2.0
13,2014-06,24,1.0


In [33]:
# Pull the modelling columns out of the split frames.
# NOTE(review): presumably `y` is the regression target and `x` the regressor; the model fit is not visible here.
y = train['SalesQty']       # monthly sales quantity of the training rows
x = train['hdays']          # monthly holiday count of the training rows
forecastX = test['hdays']   # monthly holiday count of the test rows