# Pandas

참조: https://pandas.pydata.org/pandas-docs/stable/getting_started/10min.html

In [None]:
import datetime
import os
import re
import tempfile

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Show which library versions produced the outputs recorded in this notebook.
print("pandas ver=%s" % pd.__version__)
print("numpy ver=%s" % np.__version__)

# Series, DataFrame

* Series: 1차원 데이터(리스트 형태)를 표현하는 구조
* DataFrame: 2차원 데이터(테이블 형태)를 표현하는 구조

In [2]:
# --- Series: a 1-D labelled array; np.nan marks a missing entry ---
s = pd.Series([1, 3, 5, np.nan, 6, 8])
print(s)

# --- DataFrame built from a NumPy array ---
# Row labels: six consecutive days starting on 2013-01-01.
index = pd.date_range(start='20130101', periods=6)
print(index)

# Column labels.
columns = ['A', 'B', 'C', 'D']
print(columns)

# 6 x 4 array of standard-normal random numbers (not seeded, so the values
# change on every run).
data = np.random.randn(6, 4)
print(data)

df = pd.DataFrame(data=data, index=index, columns=columns)
print(df)

# --- DataFrame built from a dictionary ---
# Each key becomes a column. Scalar values ('A', 'B', 'F') are repeated to
# match the four rows supplied by the array-like values.
df2 = pd.DataFrame(dict(
  A=1.,
  B=pd.Timestamp('20130102'),
  C=pd.Series(1, index=list(range(4)), dtype='float32'),
  D=np.full(4, 3, dtype='int32'),
  E=pd.Categorical(["test", "train"] * 2),
  F='foo',
))
print(df2)

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64
DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')
['A', 'B', 'C', 'D']
[[-0.24875324 -0.74637377 -1.8905423  -2.83093815]
 [-0.84161378  0.7467383  -1.42163788 -0.51535809]
 [-0.56275031 -1.98035779 -0.78772683  2.37222896]
 [-1.0211662  -0.91041359 -1.09541084  0.41318597]
 [ 0.9608215   0.51908166 -1.70831879 -1.15418108]
 [-0.42120775  0.29764625 -0.06090697 -0.27875479]]
                   A         B         C         D
2013-01-01 -0.248753 -0.746374 -1.890542 -2.830938
2013-01-02 -0.841614  0.746738 -1.421638 -0.515358
2013-01-03 -0.562750 -1.980358 -0.787727  2.372229
2013-01-04 -1.021166 -0.910414 -1.095411  0.413186
2013-01-05  0.960822  0.519082 -1.708319 -1.154181
2013-01-06 -0.421208  0.297646 -0.060907 -0.278755
     A          B    C  D      E    F
0  1.0 2013-01-02  1.0  3   test  foo
1  1.0 

## CSV Read/Write

In [3]:
# CSV round trip: write a DataFrame to disk, then read it back.
df = pd.DataFrame({
  'A': 1.,
  'B': pd.Timestamp('20130102'),
  'C': pd.Series(1, index=list(range(4)), dtype='float32'),
  'D': np.array([3] * 4, dtype='int32'),
  'E': pd.Categorical(["test", "train", "test", "train"]),
  'F': 'foo'
})
print(df)

# Build the path from the platform's temp directory rather than hard-coding
# "/tmp", which does not exist on every operating system.
path_csv = os.path.join(tempfile.gettempdir(), "foo.csv")
# index=False: do not write the row labels as an extra column.
df.to_csv(path_csv, index=False)

# header=0: the first line of the file holds the column names.
# CSV is plain text, so column types are inferred again on read.
df = pd.read_csv(path_csv, header=0)
print(df)

     A          B    C  D      E    F
0  1.0 2013-01-02  1.0  3   test  foo
1  1.0 2013-01-02  1.0  3  train  foo
2  1.0 2013-01-02  1.0  3   test  foo
3  1.0 2013-01-02  1.0  3  train  foo
     A           B    C  D      E    F
0  1.0  2013-01-02  1.0  3   test  foo
1  1.0  2013-01-02  1.0  3  train  foo
2  1.0  2013-01-02  1.0  3   test  foo
3  1.0  2013-01-02  1.0  3  train  foo


## JSON Read/Write 

In [4]:
# JSON round trip: write a DataFrame to disk, then read it back.
df = pd.DataFrame([['a', 'b'], ['c', 'd']], index=['row 1', 'row 2'], columns=['col 1', 'col 2'])
print(df)

# Build the path from the platform's temp directory rather than hard-coding
# "/tmp", which does not exist on every operating system.
path_json = os.path.join(tempfile.gettempdir(), "foo.json")

# orient='records' writes one JSON object per row and drops the row labels:
#   [{"col 1":"a","col 2":"b"},{"col 1":"c","col 2":"d"}]
# orient='index' would instead key each row by its label:
#   {"row 1":{"col 1":"a","col 2":"b"},"row 2":{"col 1":"c","col 2":"d"}}
df.to_json(path_json, orient='records')

# Read with the same orient that was used for writing. The row labels were not
# stored, so the result gets a default 0..n-1 index.
df = pd.read_json(path_json, orient='records')
print(df)

      col 1 col 2
row 1     a     b
row 2     c     d
  col 1 col 2
0     a     b
1     c     d


# DataFrame 정보보기

In [5]:
# Rebuild the numeric frame from the array, row labels and column labels
# created earlier (df has been reassigned by the CSV/JSON cells since then).
df = pd.DataFrame(data, index=index, columns=columns)
print(df)

# Column information of the mixed-type frame
print(df2.dtypes)
# info() writes its report to stdout itself and returns None, so it must not
# be wrapped in print() -- that only appends a stray "None" line.
df2.info()

                   A         B         C         D
2013-01-01 -0.248753 -0.746374 -1.890542 -2.830938
2013-01-02 -0.841614  0.746738 -1.421638 -0.515358
2013-01-03 -0.562750 -1.980358 -0.787727  2.372229
2013-01-04 -1.021166 -0.910414 -1.095411  0.413186
2013-01-05  0.960822  0.519082 -1.708319 -1.154181
2013-01-06 -0.421208  0.297646 -0.060907 -0.278755
A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object
<class 'pandas.core.frame.DataFrame'>
Int64Index: 4 entries, 0 to 3
Data columns (total 6 columns):
A    4 non-null float64
B    4 non-null datetime64[ns]
C    4 non-null float32
D    4 non-null int32
E    4 non-null category
F    4 non-null object
dtypes: category(1), datetime64[ns](1), float32(1), float64(1), int32(1), object(1)
memory usage: 260.0+ bytes
None


In [6]:
# Peek at the data: the first five rows, then the last three.
print(df.head(5))
print(df.tail(n=3))

# The building blocks of a DataFrame, one per line: row labels, column labels,
# the underlying values, and the type of that values object.
print(df.index, df.columns, df.values, type(df.values), sep='\n')

                   A         B         C         D
2013-01-01 -0.248753 -0.746374 -1.890542 -2.830938
2013-01-02 -0.841614  0.746738 -1.421638 -0.515358
2013-01-03 -0.562750 -1.980358 -0.787727  2.372229
2013-01-04 -1.021166 -0.910414 -1.095411  0.413186
2013-01-05  0.960822  0.519082 -1.708319 -1.154181
                   A         B         C         D
2013-01-04 -1.021166 -0.910414 -1.095411  0.413186
2013-01-05  0.960822  0.519082 -1.708319 -1.154181
2013-01-06 -0.421208  0.297646 -0.060907 -0.278755
DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')
Index(['A', 'B', 'C', 'D'], dtype='object')
[[-0.24875324 -0.74637377 -1.8905423  -2.83093815]
 [-0.84161378  0.7467383  -1.42163788 -0.51535809]
 [-0.56275031 -1.98035779 -0.78772683  2.37222896]
 [-1.0211662  -0.91041359 -1.09541084  0.41318597]
 [ 0.9608215   0.51908166 -1.70831879 -1.15418108]
 [-0.42120775  0.29764625 -0

# DataFrame 조작하기

## Selection

In [7]:
# Row slicing by position: the first two rows
print(df[:2])

# One column, selected by name
print(df['A'].head(5))

# Several columns, selected with a list of names
print(df[list('AB')].head(5))

                   A         B         C         D
2013-01-01 -0.248753 -0.746374 -1.890542 -2.830938
2013-01-02 -0.841614  0.746738 -1.421638 -0.515358
2013-01-01   -0.248753
2013-01-02   -0.841614
2013-01-03   -0.562750
2013-01-04   -1.021166
2013-01-05    0.960822
Freq: D, Name: A, dtype: float64
                   A         B
2013-01-01 -0.248753 -0.746374
2013-01-02 -0.841614  0.746738
2013-01-03 -0.562750 -1.980358
2013-01-04 -1.021166 -0.910414
2013-01-05  0.960822  0.519082


In [8]:
# Label-based selection with the .loc indexer

# A single cell: row label, column label
print(df.loc['2013-01-01', 'A'])

# A single row
print(df.loc['2013-01-01'])

# A row slice by label -- both end labels are included
print(df.loc['2013-01-01':'2013-01-03', :])

# A single column
print(df.loc[:, 'A'].head(5))

# Several columns
print(df.loc[:, list('AB')].head(5))

# A row slice combined with several columns
print(df.loc['2013-01-01':'2013-01-03', list('AB')])

-0.248753240478704
A   -0.248753
B   -0.746374
C   -1.890542
D   -2.830938
Name: 2013-01-01 00:00:00, dtype: float64
                   A         B         C         D
2013-01-01 -0.248753 -0.746374 -1.890542 -2.830938
2013-01-02 -0.841614  0.746738 -1.421638 -0.515358
2013-01-03 -0.562750 -1.980358 -0.787727  2.372229
2013-01-01   -0.248753
2013-01-02   -0.841614
2013-01-03   -0.562750
2013-01-04   -1.021166
2013-01-05    0.960822
Freq: D, Name: A, dtype: float64
                   A         B
2013-01-01 -0.248753 -0.746374
2013-01-02 -0.841614  0.746738
2013-01-03 -0.562750 -1.980358
2013-01-04 -1.021166 -0.910414
2013-01-05  0.960822  0.519082
                   A         B
2013-01-01 -0.248753 -0.746374
2013-01-02 -0.841614  0.746738
2013-01-03 -0.562750 -1.980358


In [9]:
# Position-based selection with the .iloc indexer

# A single cell: row position, column position
print(df.iloc[0, 0])

# A single row
print(df.iloc[0])

# A row slice by position -- the end position is excluded
print(df.iloc[:3])

# A single column
print(df.iloc[:, 0].head(5))

# Several columns
print(df.iloc[:, :2].head(5))

# A row slice combined with a column slice
print(df.iloc[:3, :2])

# Explicit lists of row and column positions
print(df.iloc[[1, 3], [0, 1]])

-0.248753240478704
A   -0.248753
B   -0.746374
C   -1.890542
D   -2.830938
Name: 2013-01-01 00:00:00, dtype: float64
                   A         B         C         D
2013-01-01 -0.248753 -0.746374 -1.890542 -2.830938
2013-01-02 -0.841614  0.746738 -1.421638 -0.515358
2013-01-03 -0.562750 -1.980358 -0.787727  2.372229
2013-01-01   -0.248753
2013-01-02   -0.841614
2013-01-03   -0.562750
2013-01-04   -1.021166
2013-01-05    0.960822
Freq: D, Name: A, dtype: float64
                   A         B
2013-01-01 -0.248753 -0.746374
2013-01-02 -0.841614  0.746738
2013-01-03 -0.562750 -1.980358
2013-01-04 -1.021166 -0.910414
2013-01-05  0.960822  0.519082
                   A         B
2013-01-01 -0.248753 -0.746374
2013-01-02 -0.841614  0.746738
2013-01-03 -0.562750 -1.980358
                   A         B
2013-01-02 -0.841614  0.746738
2013-01-04 -1.021166 -0.910414


In [10]:
# Selection by condition

# Condition on one column: keeps only the rows where it holds
print(df[df['A'] > 0])

# Condition on the whole frame: the shape is kept and every cell that fails
# the condition is shown as NaN
print(df[df > 0])

# Selection with a boolean mask
df2 = df.copy()
df2['E'] = ['one', 'one', 'two', 'three', 'four', 'three']
# Named `mask` rather than `index`: `index` already holds the DatetimeIndex
# that other cells use to build frames, and must not be replaced by a
# boolean Series.
mask = df2['E'].isin(['two', 'four'])
print(mask)
# Reuse the mask instead of evaluating the same isin() a second time.
print(df2[mask])

                   A         B         C         D
2013-01-05  0.960822  0.519082 -1.708319 -1.154181
                   A         B   C         D
2013-01-01       NaN       NaN NaN       NaN
2013-01-02       NaN  0.746738 NaN       NaN
2013-01-03       NaN       NaN NaN  2.372229
2013-01-04       NaN       NaN NaN  0.413186
2013-01-05  0.960822  0.519082 NaN       NaN
2013-01-06       NaN  0.297646 NaN       NaN
2013-01-01    False
2013-01-02    False
2013-01-03     True
2013-01-04    False
2013-01-05     True
2013-01-06    False
Freq: D, Name: E, dtype: bool
                   A         B         C         D     E
2013-01-03 -0.562750 -1.980358 -0.787727  2.372229   two
2013-01-05  0.960822  0.519082 -1.708319 -1.154181  four


## Setting

In [11]:
# Adding columns from a Series. Assignment aligns on the row labels: s1 starts
# on 2013-01-02, so the 2013-01-01 row gets NaN.
s1 = pd.Series([1,2,3,4,5,6], index=pd.date_range('20130102', periods=6))
df['F'] = s1
df['G'] = s1
print(df)

# Dropping a column (returns a new frame)
df = df.drop(columns=['G'])
print(df)

# Setting a single value by label
df.at['2013-01-02', 'F'] = 0


# Setting a single value by position (row 2, last column)
df.iat[2,-1] = 1

print(df)

# Setting values from an array: every row except the first.
# .loc is label-based, so the rows are addressed through df.index[1:]; an
# integer slice such as .loc[1:] raises TypeError on a DatetimeIndex in
# pandas >= 1.0.
df.loc[df.index[1:], 'F'] = np.array([5] * (len(df) - 1))
print(df)

# Setting values by condition: flip the sign of every positive cell
df2 = df.copy()
df2[df2 > 0] = -df2
print(df2)

                   A         B         C         D    F    G
2013-01-01 -0.248753 -0.746374 -1.890542 -2.830938  NaN  NaN
2013-01-02 -0.841614  0.746738 -1.421638 -0.515358  1.0  1.0
2013-01-03 -0.562750 -1.980358 -0.787727  2.372229  2.0  2.0
2013-01-04 -1.021166 -0.910414 -1.095411  0.413186  3.0  3.0
2013-01-05  0.960822  0.519082 -1.708319 -1.154181  4.0  4.0
2013-01-06 -0.421208  0.297646 -0.060907 -0.278755  5.0  5.0
                   A         B         C         D    F
2013-01-01 -0.248753 -0.746374 -1.890542 -2.830938  NaN
2013-01-02 -0.841614  0.746738 -1.421638 -0.515358  1.0
2013-01-03 -0.562750 -1.980358 -0.787727  2.372229  2.0
2013-01-04 -1.021166 -0.910414 -1.095411  0.413186  3.0
2013-01-05  0.960822  0.519082 -1.708319 -1.154181  4.0
2013-01-06 -0.421208  0.297646 -0.060907 -0.278755  5.0
                   A         B         C         D    F
2013-01-01 -0.248753 -0.746374 -1.890542 -2.830938  NaN
2013-01-02 -0.841614  0.746738 -1.421638 -0.515358  0.0
2013-01-03 -0

## Missing Value

In [12]:
# Row labels: six consecutive days starting on 2013-01-01.
index = pd.date_range(start='20130101', periods=6)

# Keep the first four days and add a new column 'E', which starts out as NaN.
df1 = df.reindex(index=index[:4], columns=df.columns.tolist() + ['E'])
# The label slice includes both ends, so 'E' is filled for the first two days.
df1.loc[index[0]:index[1], 'E'] = 1
print(df1)

# Drop every row that has at least one missing value
print(df1.dropna(how='any'))

# Replace missing values with a constant
print(df1.fillna(5))

# Boolean frame that is True wherever a value is missing
print(df1.isna())

                   A         B         C         D    F    E
2013-01-01 -0.248753 -0.746374 -1.890542 -2.830938  NaN  1.0
2013-01-02 -0.841614  0.746738 -1.421638 -0.515358  5.0  1.0
2013-01-03 -0.562750 -1.980358 -0.787727  2.372229  5.0  NaN
2013-01-04 -1.021166 -0.910414 -1.095411  0.413186  5.0  NaN
                   A         B         C         D    F    E
2013-01-02 -0.841614  0.746738 -1.421638 -0.515358  5.0  1.0
                   A         B         C         D    F    E
2013-01-01 -0.248753 -0.746374 -1.890542 -2.830938  5.0  1.0
2013-01-02 -0.841614  0.746738 -1.421638 -0.515358  5.0  1.0
2013-01-03 -0.562750 -1.980358 -0.787727  2.372229  5.0  5.0
2013-01-04 -1.021166 -0.910414 -1.095411  0.413186  5.0  5.0
                A      B      C      D      F      E
2013-01-01  False  False  False  False   True  False
2013-01-02  False  False  False  False  False  False
2013-01-03  False  False  False  False  False   True
2013-01-04  False  False  False  False  False   True


## Operation

In [13]:
# Summary statistics for every column
print(df.describe())

# Mean of each column
print(df.mean(axis=0))

# Mean of each row
print(df.mean(axis=1))

# Arithmetic between a DataFrame and a Series.
# shift(2) moves the values two rows down, leaving NaN in the first two slots.
s = pd.Series([1, 3, 5, np.nan, 6, 8], index=index).shift(periods=2)
print(s)
# Subtract s from every column, matching on the row labels.
print(df.sub(s, axis=0))

              A         B         C         D    F
count  6.000000  6.000000  6.000000  6.000000  5.0
mean  -0.355778 -0.345613 -1.160757 -0.332303  5.0
std    0.702963  1.049448  0.671351  1.721934  0.0
min   -1.021166 -1.980358 -1.890542 -2.830938  5.0
25%   -0.771898 -0.869404 -1.636649 -0.994475  5.0
50%   -0.491979 -0.224364 -1.258524 -0.397056  5.0
75%   -0.291867  0.463723 -0.864648  0.240201  5.0
max    0.960822  0.746738 -0.060907  2.372229  5.0
A   -0.355778
B   -0.345613
C   -1.160757
D   -0.332303
F    5.000000
dtype: float64
2013-01-01   -1.429152
2013-01-02    0.593626
2013-01-03    0.808279
2013-01-04    0.477239
2013-01-05    0.723481
2013-01-06    0.907355
Freq: D, dtype: float64
2013-01-01    NaN
2013-01-02    NaN
2013-01-03    1.0
2013-01-04    3.0
2013-01-05    5.0
2013-01-06    NaN
Freq: D, dtype: float64
                   A         B         C         D    F
2013-01-01       NaN       NaN       NaN       NaN  NaN
2013-01-02       NaN       NaN       NaN       NaN

In [14]:
# Transpose: swap rows and columns
print(df.transpose())

   2013-01-01  2013-01-02  2013-01-03  2013-01-04  2013-01-05  2013-01-06
A   -0.248753   -0.841614   -0.562750   -1.021166    0.960822   -0.421208
B   -0.746374    0.746738   -1.980358   -0.910414    0.519082    0.297646
C   -1.890542   -1.421638   -0.787727   -1.095411   -1.708319   -0.060907
D   -2.830938   -0.515358    2.372229    0.413186   -1.154181   -0.278755
F         NaN    5.000000    5.000000    5.000000    5.000000    5.000000


In [15]:
# Sort along an axis: order the column labels in descending order
print(df.sort_index(axis='columns', ascending=False))

# Sort the rows by the values in column 'B'
print(df.sort_values('B'))

              F         D         C         B         A
2013-01-01  NaN -2.830938 -1.890542 -0.746374 -0.248753
2013-01-02  5.0 -0.515358 -1.421638  0.746738 -0.841614
2013-01-03  5.0  2.372229 -0.787727 -1.980358 -0.562750
2013-01-04  5.0  0.413186 -1.095411 -0.910414 -1.021166
2013-01-05  5.0 -1.154181 -1.708319  0.519082  0.960822
2013-01-06  5.0 -0.278755 -0.060907  0.297646 -0.421208
                   A         B         C         D    F
2013-01-03 -0.562750 -1.980358 -0.787727  2.372229  5.0
2013-01-04 -1.021166 -0.910414 -1.095411  0.413186  5.0
2013-01-01 -0.248753 -0.746374 -1.890542 -2.830938  NaN
2013-01-06 -0.421208  0.297646 -0.060907 -0.278755  5.0
2013-01-05  0.960822  0.519082 -1.708319 -1.154181  5.0
2013-01-02 -0.841614  0.746738 -1.421638 -0.515358  5.0


In [16]:
# apply: run a function over the data

# Column by column: running total, then the range (max - min) of each column
print(df.apply(np.cumsum, axis=0))
print(df.apply(lambda col: col.max() - col.min(), axis=0))

# Row by row: running total, then the range (max - min) of each row
print(df.apply(np.cumsum, axis=1))
print(df.apply(lambda row: row.max() - row.min(), axis=1))

                   A         B         C         D     F
2013-01-01 -0.248753 -0.746374 -1.890542 -2.830938   NaN
2013-01-02 -1.090367  0.000365 -3.312180 -3.346296   5.0
2013-01-03 -1.653117 -1.979993 -4.099907 -0.974067  10.0
2013-01-04 -2.674284 -2.890407 -5.195318 -0.560881  15.0
2013-01-05 -1.713462 -2.371325 -6.903637 -1.715062  20.0
2013-01-06 -2.134670 -2.073679 -6.964544 -1.993817  25.0
A    1.981988
B    2.727096
C    1.829635
D    5.203167
F    0.000000
dtype: float64
                   A         B         C         D         F
2013-01-01 -0.248753 -0.995127 -2.885669 -5.716607       NaN
2013-01-02 -0.841614 -0.094875 -1.516513 -2.031871  2.968129
2013-01-03 -0.562750 -2.543108 -3.330835 -0.958606  4.041394
2013-01-04 -1.021166 -1.931580 -3.026991 -2.613805  2.386195
2013-01-05  0.960822  1.479903 -0.228416 -1.382597  3.617403
2013-01-06 -0.421208 -0.123562 -0.184468 -0.463223  4.536777
2013-01-01    2.582185
2013-01-02    6.421638
2013-01-03    6.980358
2013-01-04    6.0954

In [17]:
# Histogramming: count how often each value occurs.
# Ten random integers in the range 0..6 (the upper bound 7 is excluded).
s = pd.Series(np.random.randint(low=0, high=7, size=10))
print(s)
print(s.value_counts())

0    6
1    6
2    6
3    1
4    1
5    6
6    6
7    4
8    6
9    4
dtype: int64
6    6
4    2
1    2
dtype: int64


In [18]:
# String methods are exposed through the .str accessor; the missing value
# stays NaN.
s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'])
print(s.str.lower())

0       a
1       b
2       c
3    aaba
4    baca
5     NaN
6    caba
7     dog
8     cat
dtype: object


## Merge

상세 참조자료: https://pandas.pydata.org/pandas-docs/stable/user_guide/merging.html

In [19]:
# Concat: simply glues DataFrames together, so always check that the result
# is what you intended.
df = pd.DataFrame(np.random.randn(10, 3))
# Take explicit copies: `b` is relabelled below, and modifying a slice of `df`
# in place is ambiguous (it may or may not write through to `df`).
a = df[:2].copy()
b = df[3:5].copy()
print(a)
print(b)

# Row-wise concatenation: first keeping the original row labels, then
# renumbering them 0..n-1
print(pd.concat([a, b]))
print(pd.concat([a, b], ignore_index=True))

# Column-wise concatenation aligns on the row labels, so give `b` distinct
# column names and the same 0..1 row labels as `a` first.
b.columns = [3, 4, 5]
b = b.reset_index(drop=True)
print(pd.concat([a, b], axis=1))

          0         1         2
0 -0.825634 -1.295592 -2.054984
1  0.798575 -1.324423 -0.050040
          0         1         2
3 -1.688734 -0.229391 -0.468013
4 -1.310736  0.913415  1.382516
          0         1         2
0 -0.825634 -1.295592 -2.054984
1  0.798575 -1.324423 -0.050040
3 -1.688734 -0.229391 -0.468013
4 -1.310736  0.913415  1.382516
          0         1         2
0 -0.825634 -1.295592 -2.054984
1  0.798575 -1.324423 -0.050040
2 -1.688734 -0.229391 -0.468013
3 -1.310736  0.913415  1.382516
          0         1         2         3         4         5
0 -0.825634 -1.295592 -2.054984 -1.688734 -0.229391 -0.468013
1  0.798575 -1.324423 -0.050040 -1.310736  0.913415  1.382516


In [20]:
# Append: add a row to a DataFrame -- the same thing as a row-wise concat.
df = pd.DataFrame(np.random.randn(3, 3), columns=['A', 'B', 'C'])
print(df)

# DataFrame.append() was deprecated in pandas 1.4 and removed in 2.0, so the
# row is appended with pd.concat instead. The row is a Series; to_frame().T
# turns it into a one-row DataFrame with the same columns.
s = df.iloc[1]
df_appended = pd.concat([df, s.to_frame().T], ignore_index=True)
print(df_appended)

          A         B         C
0  0.081917 -1.950212  2.411885
1  0.105589  0.168567 -0.053157
2 -0.784855 -0.088158  0.295081
          A         B         C
0  0.081917 -1.950212  2.411885
1  0.105589  0.168567 -0.053157
2 -0.784855 -0.088158  0.295081
3  0.105589  0.168567 -0.053157


In [21]:
# Join: works much like a SQL join -- rows are matched on the 'key' column.
left = pd.DataFrame({'key': ['foo', 'bar'], 'lval': [1, 2]})
right = pd.DataFrame({'key': ['foo', 'bar'], 'rval': [4, 5]})
print(left)
print(right)
print(left.merge(right, on='key'))

   key  lval
0  foo     1
1  bar     2
   key  rval
0  foo     4
1  bar     5
   key  lval  rval
0  foo     1     4
1  bar     2     5


In [22]:
# GroupBy: works much like SQL GROUP BY.
df = pd.DataFrame({
  'A' : ['foo', 'bar', 'foo', 'bar'],
  'B' : ['one', 'one', 'two', 'three'],
  'C' : np.random.randn(4),
  'D' : np.random.randn(4)
})
print(df)

# Select the numeric columns explicitly before summing. Older pandas silently
# dropped the string column 'B' from the result; pandas 2.x sums it as well,
# which concatenates the strings.
sum_by_a = df.groupby('A')[['C', 'D']].sum()
print(sum_by_a)

# Grouping by two columns gives a result indexed by (A, B) pairs.
sum_by_ab = df.groupby(['A', 'B'])[['C', 'D']].sum()
print(sum_by_ab)

     A      B         C         D
0  foo    one -1.273316 -0.198317
1  bar    one  2.053831  1.109291
2  foo    two  0.480657 -0.367713
3  bar  three -1.263757 -1.559587
            C         D
A                      
bar  0.790074 -0.450296
foo -0.792659 -0.566030
                  C         D
A   B                        
bar one    2.053831  1.109291
    three -1.263757 -1.559587
foo one   -1.273316 -0.198317
    two    0.480657 -0.367713


## Reshaping

In [23]:
# Stack / unstack: move the inner index level into the columns and back.
# Two-level row index: ('one'|'two') x ('a'|'b').
index = pd.MultiIndex.from_product([['one', 'two'], ['a', 'b']])
s = pd.Series(np.arange(1.0, 5.0), index=index)
print(s)
df = s.unstack()
print(df)
print(df.stack())

# Pivot table: rows keyed by (A, B), one column per distinct value of C,
# cells taken from D. Combinations that never occur show up as NaN.
df = pd.DataFrame({
    'A': ['one', 'one', 'two', 'three'] * 3,
    'B': ['A', 'B', 'C'] * 4,
    'C': ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'] * 2,
    'D': np.random.randn(12),
    'E': np.random.randn(12),
})
print(df)
print(df.pivot_table(values='D', index=['A', 'B'], columns=['C']))

one  a    1.0
     b    2.0
two  a    3.0
     b    4.0
dtype: float64
       a    b
one  1.0  2.0
two  3.0  4.0
one  a    1.0
     b    2.0
two  a    3.0
     b    4.0
dtype: float64
        A  B    C         D         E
0     one  A  foo -0.675779 -0.063768
1     one  B  foo -1.013232 -0.714525
2     two  C  foo  0.375405 -1.900144
3   three  A  bar -1.745550 -0.627609
4     one  B  bar -0.782500 -1.501190
5     one  C  bar -0.428608  0.362804
6     two  A  foo -0.260621  0.178342
7   three  B  foo -0.410908  0.884395
8     one  C  foo -0.655849 -0.633740
9     one  A  bar  1.275894 -1.099731
10    two  B  bar -1.441008 -0.370465
11  three  C  bar  0.028793 -0.169769
C             bar       foo
A     B                    
one   A  1.275894 -0.675779
      B -0.782500 -1.013232
      C -0.428608 -0.655849
three A -1.745550       NaN
      B       NaN -0.410908
      C  0.028793       NaN
two   A       NaN -0.260621
      B -1.441008       NaN
      C       NaN  0.375405


## Time Series

Time Series 데이터를 다룰 때 필요한 편리한 기능을 제공합니다.

상세 참조자료: https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html

In [24]:
# Several ways to build a DatetimeIndex: from a string, a numpy datetime64 and
# a standard-library datetime in one call
# (`datetime` is imported in the import cell at the top of the notebook).
dti = pd.to_datetime(['1/1/2018', np.datetime64('2018-01-01'), datetime.datetime(2018, 1, 1)])
print(dti)

# Hourly range. Lowercase 'h' is the hourly alias; uppercase 'H' is deprecated
# since pandas 2.2.
dti = pd.date_range('2018-01-01', periods=2, freq='h')
print(dti)

# Converting to datetimes.
# All strings passed in one call use the same format: pandas >= 2.0 infers the
# format from the first element and rejects elements that do not match it.
print(pd.to_datetime(pd.Series(['Jul 31, 2009', 'Jan 10, 2010', None])))
print(pd.to_datetime(['2005/11/23', '2010/12/31']))
print(pd.to_datetime([1349720105, 1349806505], unit='s')) # Epoch timestamp in seconds
print(pd.to_datetime([1349720105100, 1349720105200], unit='ms')) # Epoch timestamp in milliseconds

# Error handling during conversion.
# errors='raise' (the default) raises on unparseable input; parse failures are
# ValueError subclasses, so only that is caught here.
try:
    pd.to_datetime(['2009/07/31', 'asd'], errors='raise')
except ValueError as e:
    print("ERROR: {}".format(e))
# errors='coerce' turns unparseable entries into NaT instead of raising.
# The result is printed so the effect is visible; errors='ignore' is
# deprecated since pandas 2.2.
print(pd.to_datetime(['2009/07/31', 'asd'], errors='coerce'))

# Attach a time zone to the naive timestamps
dti = dti.tz_localize('UTC')
print(dti)

# Convert to another time zone
dti = dti.tz_convert('Asia/Seoul')
print(dti)

DatetimeIndex(['2018-01-01', '2018-01-01', '2018-01-01'], dtype='datetime64[ns]', freq=None)
DatetimeIndex(['2018-01-01 00:00:00', '2018-01-01 01:00:00'], dtype='datetime64[ns]', freq='H')
0   2009-07-31
1   2010-01-10
2          NaT
dtype: datetime64[ns]
DatetimeIndex(['2005-11-23', '2010-12-31'], dtype='datetime64[ns]', freq=None)
DatetimeIndex(['2012-10-08 18:15:05', '2012-10-09 18:15:05'], dtype='datetime64[ns]', freq=None)
DatetimeIndex(['2012-10-08 18:15:05.100000', '2012-10-08 18:15:05.200000'], dtype='datetime64[ns]', freq=None)
ERROR: ('Unknown string format:', 'asd')
DatetimeIndex(['2018-01-01 00:00:00+00:00', '2018-01-01 01:00:00+00:00'], dtype='datetime64[ns, UTC]', freq='H')
DatetimeIndex(['2018-01-01 09:00:00+09:00', '2018-01-01 10:00:00+09:00'], dtype='datetime64[ns, Asia/Seoul]', freq='H')


In [25]:
# Resampling a time series to a different frequency.
# Five daily observations with the values 0..4.
idx = pd.date_range('2018-01-01', periods=5, freq='D')
ts = pd.Series(range(len(idx)), index=idx)
print(ts)

# Upsampling to 8-hour bins: bins without an observation become NaN.
# Lowercase 'h' is the hourly alias; uppercase 'H' is deprecated since
# pandas 2.2.
ts_8h = ts.resample('8h').mean()
print(ts_8h)

# Downsampling to 2-day bins: each bin holds the mean of its observations.
ts_2d = ts.resample('2D').mean()
print(ts_2d)

2018-01-01    0
2018-01-02    1
2018-01-03    2
2018-01-04    3
2018-01-05    4
Freq: D, dtype: int64
2018-01-01 00:00:00    0.0
2018-01-01 08:00:00    NaN
2018-01-01 16:00:00    NaN
2018-01-02 00:00:00    1.0
2018-01-02 08:00:00    NaN
2018-01-02 16:00:00    NaN
2018-01-03 00:00:00    2.0
2018-01-03 08:00:00    NaN
2018-01-03 16:00:00    NaN
2018-01-04 00:00:00    3.0
2018-01-04 08:00:00    NaN
2018-01-04 16:00:00    NaN
2018-01-05 00:00:00    4.0
Freq: 8H, dtype: float64
2018-01-01    0.5
2018-01-03    2.5
2018-01-05    4.0
Freq: 2D, dtype: float64


In [26]:
## Requires pandas 0.24.1 or newer
print(pd.__version__)

# Compare versions as numeric tuples. Comparing the raw strings is
# lexicographic and gives wrong answers, e.g. "0.9.0" >= "0.24.1" is True.
# Only the leading "major.minor[.patch]" numbers are used; suffixes such as
# "rc1" or ".dev0" are ignored.
_version_match = re.match(r'(\d+)\.(\d+)(?:\.(\d+))?', pd.__version__)
pandas_version = tuple(int(part or 0) for part in _version_match.groups())

if pandas_version >= (0, 24, 1):
    # Date arithmetic
    friday = pd.Timestamp('2018-01-05')
    print(friday.day_name())

    # Add 1 day
    saturday = friday + pd.Timedelta('1 day')
    print(saturday.day_name())

    # Add 1 business day (Friday --> Monday)
    monday = friday + pd.offsets.BDay()
    print(monday.day_name())

0.24.1
Friday
Saturday
Monday
