In [15]:
import pandas as pd
import numpy as np

# 과제1. 시계열 데이터 문법 구글링해보기

#### pd.to_datetime
- 날짜/시간을 나타내는 문자열을 datetime 자료형으로 변환한다. 리스트를 넘기면 DatetimeIndex를, Series를 넘기면 (아래 예시처럼) datetime64 자료형의 Series를 반환한다.

In [10]:
# Four strings, four different date layouts.
date = pd.Series(["2022, 1, 1", "2022/02/19", "2023.02.18", "19941010"])
# pandas >= 2.0 infers ONE format from the first element and raises ValueError
# when later elements do not match it, so each element is parsed on its own.
# (On pandas >= 2.0, pd.to_datetime(date, format="mixed") is an equivalent spelling.)
date_parsed = date.apply(pd.to_datetime)
date_parsed

0   2022-01-01
1   2022-02-19
2   2023-02-18
3   1994-10-10
dtype: datetime64[ns]

#### pd.date_range
- 모든 날짜 및 시간을 일일이 입력할 필요 없이 시작일과 종료일 또는 시작일과 기간을 입력하면 범위 내의 인덱스를 생성

In [12]:
# Daily index from the start date through the end date, both inclusive;
# the two bounds deliberately use different separators to show that either parses.
pd.date_range(start="2021/1/1", end="2021.2.3")

DatetimeIndex(['2021-01-01', '2021-01-02', '2021-01-03', '2021-01-04',
               '2021-01-05', '2021-01-06', '2021-01-07', '2021-01-08',
               '2021-01-09', '2021-01-10', '2021-01-11', '2021-01-12',
               '2021-01-13', '2021-01-14', '2021-01-15', '2021-01-16',
               '2021-01-17', '2021-01-18', '2021-01-19', '2021-01-20',
               '2021-01-21', '2021-01-22', '2021-01-23', '2021-01-24',
               '2021-01-25', '2021-01-26', '2021-01-27', '2021-01-28',
               '2021-01-29', '2021-01-30', '2021-01-31', '2021-02-01',
               '2021-02-02', '2021-02-03'],
              dtype='datetime64[ns]', freq='D')

#### shift
- 시계열 데이터의 인덱스는 시간이나 날짜이므로 '날짜 이동' 등의 다양한 연산이 가능하다.
- 인덱스는 그대로 두고 데이터만 이동시킬 수 있다.

#### freq="M"
- 각 달의 마지막 날 (pandas 2.2부터는 "M" 대신 "ME" 별칭 사용을 권장)

In [16]:
# Reproducible example series: five standard-normal draws (legacy global seed 0)
# indexed by the last calendar day of five consecutive months of 2023.
np.random.seed(0)
# pd.offsets.MonthEnd() is the offset behind the "M" alias. The alias itself is
# deprecated since pandas 2.2 (renamed "ME"); the offset object works on every version.
month_ends = pd.date_range("2023-1-1", periods=5, freq=pd.offsets.MonthEnd())
ex1 = pd.Series(np.random.randn(5), index=month_ends)
ex1

2023-01-31    1.764052
2023-02-28    0.400157
2023-03-31    0.978738
2023-04-30    2.240893
2023-05-31    1.867558
Freq: M, dtype: float64

In [18]:
# Move the values one step later; the index stays put and the first slot becomes NaN.
ex1.shift(periods=1)

2023-01-31         NaN
2023-02-28    1.764052
2023-03-31    0.400157
2023-04-30    0.978738
2023-05-31    2.240893
Freq: M, dtype: float64

In [17]:
# Move the values two steps later; the first two slots become NaN.
ex1.shift(periods=2)

2023-01-31         NaN
2023-02-28         NaN
2023-03-31    1.764052
2023-04-30    0.400157
2023-05-31    0.978738
Freq: M, dtype: float64

In [19]:
# A negative period moves the values one step earlier; the last slot becomes NaN.
ex1.shift(periods=-1)

2023-01-31    0.400157
2023-02-28    0.978738
2023-03-31    2.240893
2023-04-30    1.867558
2023-05-31         NaN
Freq: M, dtype: float64

#### dt 접근자
- datetime 자료형 시리즈에는 dt 접근자가 있다. 
- datetime 자료형이 가진 유용한 속성, 메서드를 사용 가능하다.

In [25]:
# Twenty consecutive days starting 2022-12-21, held as Series *values*
# (not as the index) so that the .dt accessor can be used on them.
s = pd.Series(
    pd.date_range(start="2022-12-21", periods=20, freq="D")
)
s

0    2022-12-21
1    2022-12-22
2    2022-12-23
3    2022-12-24
4    2022-12-25
5    2022-12-26
6    2022-12-27
7    2022-12-28
8    2022-12-29
9    2022-12-30
10   2022-12-31
11   2023-01-01
12   2023-01-02
13   2023-01-03
14   2023-01-04
15   2023-01-05
16   2023-01-06
17   2023-01-07
18   2023-01-08
19   2023-01-09
dtype: datetime64[ns]

In [26]:
# Calendar year of each timestamp, as an integer Series aligned with s.
s.dt.year

0     2022
1     2022
2     2022
3     2022
4     2022
5     2022
6     2022
7     2022
8     2022
9     2022
10    2022
11    2023
12    2023
13    2023
14    2023
15    2023
16    2023
17    2023
18    2023
19    2023
dtype: int64

In [27]:
# Day of the week as an integer, Monday=0 ... Sunday=6
# (dayofweek and weekday are two names for the same accessor).
s.dt.dayofweek

0     2
1     3
2     4
3     5
4     6
5     0
6     1
7     2
8     3
9     4
10    5
11    6
12    0
13    1
14    2
15    3
16    4
17    5
18    6
19    0
dtype: int64

#### strftime 
- 문자열 형태로 시계열 데이터를 반환

In [29]:
# Render each timestamp as text using Korean year/month/day suffixes;
# the result is a Series of plain strings (object dtype), no longer datetimes.
s.dt.strftime(date_format="%Y년 %m월 %d일")

0     2022년 12월 21일
1     2022년 12월 22일
2     2022년 12월 23일
3     2022년 12월 24일
4     2022년 12월 25일
5     2022년 12월 26일
6     2022년 12월 27일
7     2022년 12월 28일
8     2022년 12월 29일
9     2022년 12월 30일
10    2022년 12월 31일
11    2023년 01월 01일
12    2023년 01월 02일
13    2023년 01월 03일
14    2023년 01월 04일
15    2023년 01월 05일
16    2023년 01월 06일
17    2023년 01월 07일
18    2023년 01월 08일
19    2023년 01월 09일
dtype: object

# 과제2. 고객 장바구니 데이터로 시계열 데이터 분석하기

##### csv 파일을 불러올 때 인코딩 에러가 나면 아래 코드를 시도해보자.
- , encoding='cp949'
- , encoding='euc-kr'
- , encoding='latin_1'
- , encoding='utf-16'

In [41]:
# Load the raw invoice lines. latin_1 maps every byte to a character, so the
# read cannot fail on encoding, but non-Latin-1 text would be silently mis-decoded.
sales = pd.read_csv(
    filepath_or_buffer='sales_data.csv',
    encoding='latin_1',
)
sales

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,12/1/2010 8:26,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,12/1/2010 8:26,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
...,...,...,...,...,...,...,...,...
541904,581587,22613,PACK OF 20 SPACEBOY NAPKINS,12,12/9/2011 12:50,0.85,12680.0,France
541905,581587,22899,CHILDREN'S APRON DOLLY GIRL,6,12/9/2011 12:50,2.10,12680.0,France
541906,581587,23254,CHILDRENS CUTLERY DOLLY GIRL,4,12/9/2011 12:50,4.15,12680.0,France
541907,581587,23255,CHILDRENS CUTLERY CIRCUS PARADE,4,12/9/2011 12:50,4.15,12680.0,France


In [47]:
# Convert the InvoiceDate strings to datetime64 so the column can serve as a time index.
# The rows shown in this notebook are month/day/year with an unpadded 24-hour time
# (e.g. "12/1/2010 8:26"); stating the format removes day/month guessing and the
# cost of inferring a layout over the whole column.
# NOTE(review): assumes every row of the CSV follows this layout - a row that does
# not will raise ValueError instead of being guessed at. Confirm against the full file.
# Re-running the cell is safe: an already-converted column is returned unchanged.
sales['InvoiceDate'] = pd.to_datetime(sales['InvoiceDate'], format='%m/%d/%Y %H:%M')

In [48]:
# Index the invoice lines by timestamp so that date-string slicing,
# between_time and resample can be used on the frame.
# The index is sorted chronologically because partial-string slicing such as
# .loc['2011-07':'2011-08'] raises KeyError on a non-monotonic DatetimeIndex
# in pandas >= 2.0. mergesort is stable, so lines that share a timestamp keep
# the order they had in the file.
sales_df = sales.set_index('InvoiceDate').sort_index(kind='mergesort')
sales_df

Unnamed: 0_level_0,InvoiceNo,StockCode,Description,Quantity,UnitPrice,CustomerID,Country
InvoiceDate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2010-12-01 08:26:00,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2.55,17850.0,United Kingdom
2010-12-01 08:26:00,536365,71053,WHITE METAL LANTERN,6,3.39,17850.0,United Kingdom
2010-12-01 08:26:00,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2.75,17850.0,United Kingdom
2010-12-01 08:26:00,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,3.39,17850.0,United Kingdom
2010-12-01 08:26:00,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,3.39,17850.0,United Kingdom
...,...,...,...,...,...,...,...
2011-12-09 12:50:00,581587,22613,PACK OF 20 SPACEBOY NAPKINS,12,0.85,12680.0,France
2011-12-09 12:50:00,581587,22899,CHILDREN'S APRON DOLLY GIRL,6,2.10,12680.0,France
2011-12-09 12:50:00,581587,23254,CHILDRENS CUTLERY DOLLY GIRL,4,4.15,12680.0,France
2011-12-09 12:50:00,581587,23255,CHILDRENS CUTLERY CIRCUS PARADE,4,4.15,12680.0,France


In [59]:
# Sales during the hot summer months: every line from July through August 2011.
# Partial date strings select whole months, and the end bound is inclusive.
sales_df.loc[slice('2011-07', '2011-08')]

Unnamed: 0_level_0,InvoiceNo,StockCode,Description,Quantity,UnitPrice,CustomerID,Country
InvoiceDate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2011-07-01 08:16:00,558638,84836,ZINC METAL HEART DECORATION,12,1.25,16317.0,United Kingdom
2011-07-01 08:16:00,558638,71459,HANGING JAM JAR T-LIGHT HOLDER,24,0.85,16317.0,United Kingdom
2011-07-01 08:16:00,558638,22784,LANTERN CREAM GAZEBO,3,4.95,16317.0,United Kingdom
2011-07-01 08:16:00,558638,23145,ZINC T-LIGHT HOLDER STAR LARGE,12,0.95,16317.0,United Kingdom
2011-07-01 08:16:00,558638,22674,FRENCH TOILET SIGN BLUE METAL,12,1.25,16317.0,United Kingdom
...,...,...,...,...,...,...,...
2011-08-31 17:31:00,C565077,23196,VINTAGE LEAF MAGNETIC NOTEPAD,-3,1.45,17451.0,United Kingdom
2011-08-31 17:31:00,C565077,22189,CREAM HEART CARD HOLDER,-1,3.95,17451.0,United Kingdom
2011-08-31 17:31:00,C565077,23239,SET OF 4 KNICK KNACK TINS POPPIES,-1,4.15,17451.0,United Kingdom
2011-08-31 17:31:00,C565077,23197,SKETCHBOOK MAGNETIC SHOPPING LIST,-12,1.45,17451.0,United Kingdom


In [57]:
# Sales in the after-work window: lines stamped between 18:00 and 20:00 on any date
# (both ends inclusive by default), returned in chronological order.
sales_df.between_time(start_time='18:00', end_time='20:00').sort_index()

Unnamed: 0_level_0,InvoiceNo,StockCode,Description,Quantity,UnitPrice,CustomerID,Country
InvoiceDate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2010-12-02 18:06:00,536835,22655,VINTAGE RED KITCHEN CABINET,1,295.00,13145.0,United Kingdom
2010-12-02 18:08:00,536836,21231,SWEETHEART CERAMIC TRINKET BOX,20,1.25,18168.0,United Kingdom
2010-12-02 18:08:00,536836,21232,STRAWBERRY CERAMIC TRINKET BOX,20,1.25,18168.0,United Kingdom
2010-12-02 18:08:00,536836,22060,LARGE CAKE STAND HANGING HEARTS,2,9.95,18168.0,United Kingdom
2010-12-02 18:08:00,536836,37449,CERAMIC CAKE STAND + HANGING CAKES,2,9.95,18168.0,United Kingdom
...,...,...,...,...,...,...,...
2011-12-08 19:57:00,581473,22108,PING! MICROWAVE PLATE,1,1.25,12748.0,United Kingdom
2011-12-08 19:57:00,581473,23295,SET OF 12 MINI LOAF BAKING CASES,1,0.83,12748.0,United Kingdom
2011-12-08 19:57:00,581473,23296,SET OF 6 TEA TIME BAKING CASES,1,1.25,12748.0,United Kingdom
2011-12-08 19:57:00,581473,20718,RED RETROSPOT SHOPPER BAG,1,1.25,12748.0,United Kingdom


In [64]:
# Weekly totals; each 'W' bin is labelled by the Sunday that ends the week.
# Numeric columns are selected explicitly: pandas >= 2.0 no longer drops text
# columns silently in resample().sum(). NOTE(review): the sums of UnitPrice and
# CustomerID are kept only to match the original output; they are not meaningful totals.
# Open question from the author: why is the week ending 2011-01-02 all zeros?
# Most likely no invoice falls in 2010-12-27..2011-01-02 (TODO confirm against the data);
# sum() over an empty bin returns 0 because min_count defaults to 0.
sales_df.select_dtypes(include='number').resample('W').sum()

Unnamed: 0_level_0,Quantity,UnitPrice,CustomerID
InvoiceDate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2010-12-05,79062,38727.19,122249393.0
2010-12-12,118448,144262.85,153871225.0
2010-12-19,105446,51191.06,111863691.0
2010-12-26,39272,26339.75,28713439.0
2011-01-02,0,0.0,0.0
2011-01-09,73491,60913.67,81041033.0
2011-01-16,85626,34727.42,73351916.0
2011-01-23,67969,38492.16,73526553.0
2011-01-30,69312,32429.42,87941705.0
2011-02-06,67613,28663.97,79426930.0
