# Pandas

참조: https://pandas.pydata.org/pandas-docs/stable/getting_started/10min.html

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

print("pandas ver={}".format(pd.__version__))
print("numpy ver={}".format(np.__version__))

pandas ver=0.24.1
numpy ver=1.16.1


# Series, DataFrame

* Series: 1차원 데이터(리스트 형태)를 표현하는 구조
* DataFrame: 2차원 데이터(테이블 형태)를 표현하는 구조

In [2]:
# Series
s = pd.Series([1,3,5,np.nan,6,8])
print(s)

# DataFrame by Numpy Array
index = pd.date_range('20130101', periods=6)
print(index)

columns = list('ABCD')
print(columns)

data = np.random.randn(6, 4)
print(data)

df = pd.DataFrame(data, index=index, columns=columns)
print(df)

# DataFrome by Dictonary
df2 = pd.DataFrame({
  'A': 1.,
  'B': pd.Timestamp('20130102'),
  'C': pd.Series(1, index=list(range(4)), dtype='float32'),
  'D': np.array([3] * 4, dtype='int32'),
  'E': pd.Categorical(["test", "train", "test", "train"]),
  'F': 'foo'
})
print(df2)

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64
DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')
['A', 'B', 'C', 'D']
[[-1.37416915 -2.27666159 -1.92710789  0.39225219]
 [-1.43430714 -1.17877381 -0.43712799 -0.15658588]
 [-0.14737309 -0.07075856  0.74947336  0.00773585]
 [-0.32495741 -1.10050982 -0.42845456 -0.37720834]
 [ 0.35053715 -1.09262979  1.52373942 -0.07198803]
 [ 0.05565495  0.28766312 -0.9130464   0.87188832]]
                   A         B         C         D
2013-01-01 -1.374169 -2.276662 -1.927108  0.392252
2013-01-02 -1.434307 -1.178774 -0.437128 -0.156586
2013-01-03 -0.147373 -0.070759  0.749473  0.007736
2013-01-04 -0.324957 -1.100510 -0.428455 -0.377208
2013-01-05  0.350537 -1.092630  1.523739 -0.071988
2013-01-06  0.055655  0.287663 -0.913046  0.871888
     A          B    C  D      E    F
0  1.0 2013-01-02  1.0  3   test  foo
1  1.0 

# DataFrame 살펴보기

In [3]:
# # DataFrame 컬럼 정보
print(df2.dtypes)
print(df2.info())

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object
<class 'pandas.core.frame.DataFrame'>
Int64Index: 4 entries, 0 to 3
Data columns (total 6 columns):
A    4 non-null float64
B    4 non-null datetime64[ns]
C    4 non-null float32
D    4 non-null int32
E    4 non-null category
F    4 non-null object
dtypes: category(1), datetime64[ns](1), float32(1), float64(1), int32(1), object(1)
memory usage: 260.0+ bytes
None


In [4]:
# 데이터 살펴보기
print(df.head())
print(df.tail(3))

# DataFrame 요소 살펴보기
print(df.index)
print(df.columns)
print(df.values)
print(type(df.values))

                   A         B         C         D
2013-01-01 -1.374169 -2.276662 -1.927108  0.392252
2013-01-02 -1.434307 -1.178774 -0.437128 -0.156586
2013-01-03 -0.147373 -0.070759  0.749473  0.007736
2013-01-04 -0.324957 -1.100510 -0.428455 -0.377208
2013-01-05  0.350537 -1.092630  1.523739 -0.071988
                   A         B         C         D
2013-01-04 -0.324957 -1.100510 -0.428455 -0.377208
2013-01-05  0.350537 -1.092630  1.523739 -0.071988
2013-01-06  0.055655  0.287663 -0.913046  0.871888
DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')
Index(['A', 'B', 'C', 'D'], dtype='object')
[[-1.37416915 -2.27666159 -1.92710789  0.39225219]
 [-1.43430714 -1.17877381 -0.43712799 -0.15658588]
 [-0.14737309 -0.07075856  0.74947336  0.00773585]
 [-0.32495741 -1.10050982 -0.42845456 -0.37720834]
 [ 0.35053715 -1.09262979  1.52373942 -0.07198803]
 [ 0.05565495  0.28766312 -0

# DataFrame 조작하기

## Selection

In [5]:
# 행 슬라이싱
print(df[0:2])

# 열 선택
print(df['A'].head())

# 열 다중선택
print(df[['A', 'B']].head())

                   A         B         C         D
2013-01-01 -1.374169 -2.276662 -1.927108  0.392252
2013-01-02 -1.434307 -1.178774 -0.437128 -0.156586
2013-01-01   -1.374169
2013-01-02   -1.434307
2013-01-03   -0.147373
2013-01-04   -0.324957
2013-01-05    0.350537
Freq: D, Name: A, dtype: float64
                   A         B
2013-01-01 -1.374169 -2.276662
2013-01-02 -1.434307 -1.178774
2013-01-03 -0.147373 -0.070759
2013-01-04 -0.324957 -1.100510
2013-01-05  0.350537 -1.092630


In [6]:
# Label 을 이용한 선택 .loc()

# 셀 선택
print(df.loc['2013-01-01', 'A'])

# 행 선택
print(df.loc['2013-01-01'])

# 행 슬라이싱
print(df.loc['2013-01-01':'2013-01-03'])

# 열 선택
print(df.loc[:, 'A'].head())

# 열 다중 선택
print(df.loc[:, ['A', 'B']].head())

# 행, 열 다중 선택
print(df.loc['2013-01-01':'2013-01-03', ['A', 'B']])

-1.3741691506673388
A   -1.374169
B   -2.276662
C   -1.927108
D    0.392252
Name: 2013-01-01 00:00:00, dtype: float64
                   A         B         C         D
2013-01-01 -1.374169 -2.276662 -1.927108  0.392252
2013-01-02 -1.434307 -1.178774 -0.437128 -0.156586
2013-01-03 -0.147373 -0.070759  0.749473  0.007736
2013-01-01   -1.374169
2013-01-02   -1.434307
2013-01-03   -0.147373
2013-01-04   -0.324957
2013-01-05    0.350537
Freq: D, Name: A, dtype: float64
                   A         B
2013-01-01 -1.374169 -2.276662
2013-01-02 -1.434307 -1.178774
2013-01-03 -0.147373 -0.070759
2013-01-04 -0.324957 -1.100510
2013-01-05  0.350537 -1.092630
                   A         B
2013-01-01 -1.374169 -2.276662
2013-01-02 -1.434307 -1.178774
2013-01-03 -0.147373 -0.070759


In [7]:
# Position을 이용한 선택 .iloc()

# 셀 선택
print(df.iloc[0, 0])

# 행 선택
print(df.iloc[0])

# 행 슬라이싱
print(df.iloc[0:3])

# 열 선택
print(df.iloc[:, 0].head())

# 열 다중 선택
print(df.iloc[:, 0:2].head())

# 행, 열 다중 선택
print(df.iloc[0:3, 0:2])

print(df.iloc[[1, 3], [0, 1]])

-1.3741691506673388
A   -1.374169
B   -2.276662
C   -1.927108
D    0.392252
Name: 2013-01-01 00:00:00, dtype: float64
                   A         B         C         D
2013-01-01 -1.374169 -2.276662 -1.927108  0.392252
2013-01-02 -1.434307 -1.178774 -0.437128 -0.156586
2013-01-03 -0.147373 -0.070759  0.749473  0.007736
2013-01-01   -1.374169
2013-01-02   -1.434307
2013-01-03   -0.147373
2013-01-04   -0.324957
2013-01-05    0.350537
Freq: D, Name: A, dtype: float64
                   A         B
2013-01-01 -1.374169 -2.276662
2013-01-02 -1.434307 -1.178774
2013-01-03 -0.147373 -0.070759
2013-01-04 -0.324957 -1.100510
2013-01-05  0.350537 -1.092630
                   A         B
2013-01-01 -1.374169 -2.276662
2013-01-02 -1.434307 -1.178774
2013-01-03 -0.147373 -0.070759
                   A         B
2013-01-02 -1.434307 -1.178774
2013-01-04 -0.324957 -1.100510


In [8]:
# 조건을 이용한 선택

# 열 단위
print(df[df['A'] > 0])

# DataFrame 단위
print(df[df > 0])

# Boolean 리스트를 이용
df2 = df.copy()
df2['E'] = ['one', 'one', 'two', 'three', 'four', 'three']
index = df2['E'].isin(['two','four'])
print(index)
print(df2[df2['E'].isin(['two','four'])])

                   A         B         C         D
2013-01-05  0.350537 -1.092630  1.523739 -0.071988
2013-01-06  0.055655  0.287663 -0.913046  0.871888
                   A         B         C         D
2013-01-01       NaN       NaN       NaN  0.392252
2013-01-02       NaN       NaN       NaN       NaN
2013-01-03       NaN       NaN  0.749473  0.007736
2013-01-04       NaN       NaN       NaN       NaN
2013-01-05  0.350537       NaN  1.523739       NaN
2013-01-06  0.055655  0.287663       NaN  0.871888
2013-01-01    False
2013-01-02    False
2013-01-03     True
2013-01-04    False
2013-01-05     True
2013-01-06    False
Freq: D, Name: E, dtype: bool
                   A         B         C         D     E
2013-01-03 -0.147373 -0.070759  0.749473  0.007736   two
2013-01-05  0.350537 -1.092630  1.523739 -0.071988  four


## Setting

In [9]:
# 값 설정
s1 = pd.Series([1,2,3,4,5,6], index=pd.date_range('20130102', periods=6))
df['F'] = s1
print(df)

# Label에 의한 값 설정
df.at['2013-01-02', 'F'] = 0

# Position에 의한 값 설정
df.iat[2,-1] = 1

print(df)

# 배열을 이용한 값 설정
df.loc[1:, 'F'] = np.array([5] * (len(df) - 1))
print(df)

# 조건을 통한 값 설정
df2 = df.copy()
df2[df2 > 0] = -df2
print(df2)

                   A         B         C         D    F
2013-01-01 -1.374169 -2.276662 -1.927108  0.392252  NaN
2013-01-02 -1.434307 -1.178774 -0.437128 -0.156586  1.0
2013-01-03 -0.147373 -0.070759  0.749473  0.007736  2.0
2013-01-04 -0.324957 -1.100510 -0.428455 -0.377208  3.0
2013-01-05  0.350537 -1.092630  1.523739 -0.071988  4.0
2013-01-06  0.055655  0.287663 -0.913046  0.871888  5.0
                   A         B         C         D    F
2013-01-01 -1.374169 -2.276662 -1.927108  0.392252  NaN
2013-01-02 -1.434307 -1.178774 -0.437128 -0.156586  0.0
2013-01-03 -0.147373 -0.070759  0.749473  0.007736  1.0
2013-01-04 -0.324957 -1.100510 -0.428455 -0.377208  3.0
2013-01-05  0.350537 -1.092630  1.523739 -0.071988  4.0
2013-01-06  0.055655  0.287663 -0.913046  0.871888  5.0
                   A         B         C         D    F
2013-01-01 -1.374169 -2.276662 -1.927108  0.392252  NaN
2013-01-02 -1.434307 -1.178774 -0.437128 -0.156586  5.0
2013-01-03 -0.147373 -0.070759  0.749473  0.0077

## Missing Value

In [10]:
index = pd.date_range('20130101', periods=6)
df1 = df.reindex(index=index[0:4], columns=list(df.columns) + ['E'])
df1.loc[index[0]:index[1],'E'] = 1
print(df1)

# 결측치 제거하기
print(df1.dropna(how='any'))

# 결측치 채워넣기
print(df1.fillna(value=5))

# 결측치 위치보기
print(df1.isnull())

                   A         B         C         D    F    E
2013-01-01 -1.374169 -2.276662 -1.927108  0.392252  NaN  1.0
2013-01-02 -1.434307 -1.178774 -0.437128 -0.156586  5.0  1.0
2013-01-03 -0.147373 -0.070759  0.749473  0.007736  5.0  NaN
2013-01-04 -0.324957 -1.100510 -0.428455 -0.377208  5.0  NaN
                   A         B         C         D    F    E
2013-01-02 -1.434307 -1.178774 -0.437128 -0.156586  5.0  1.0
                   A         B         C         D    F    E
2013-01-01 -1.374169 -2.276662 -1.927108  0.392252  5.0  1.0
2013-01-02 -1.434307 -1.178774 -0.437128 -0.156586  5.0  1.0
2013-01-03 -0.147373 -0.070759  0.749473  0.007736  5.0  5.0
2013-01-04 -0.324957 -1.100510 -0.428455 -0.377208  5.0  5.0
                A      B      C      D      F      E
2013-01-01  False  False  False  False   True  False
2013-01-02  False  False  False  False  False  False
2013-01-03  False  False  False  False  False   True
2013-01-04  False  False  False  False  False   True


## Operation

In [11]:
# DataFrame 통계확인
print(df.describe())

# 열 별 통계
print(df.mean())

# 행 별 통계
print(df.mean(1))

# DataFrame 연산
s = pd.Series([1,3,5,np.nan,6,8], index=index).shift(2)
print(s)
print(df.sub(s, axis='index'))

              A         B         C         D    F
count  6.000000  6.000000  6.000000  6.000000  5.0
mean  -0.479102 -0.905278 -0.238754  0.111016  5.0
std    0.751217  0.910850  1.221763  0.449923  0.0
min   -1.434307 -2.276662 -1.927108 -0.377208  5.0
25%   -1.111866 -1.159208 -0.794067 -0.135436  5.0
50%   -0.236165 -1.096570 -0.432791 -0.032126  5.0
75%    0.004898 -0.326226  0.454991  0.296123  5.0
max    0.350537  0.287663  1.523739  0.871888  5.0
A   -0.479102
B   -0.905278
C   -0.238754
D    0.111016
F    5.000000
dtype: float64
2013-01-01   -1.296422
2013-01-02    0.358641
2013-01-03    1.107816
2013-01-04    0.553774
2013-01-05    1.141932
2013-01-06    1.060432
Freq: D, dtype: float64
2013-01-01    NaN
2013-01-02    NaN
2013-01-03    1.0
2013-01-04    3.0
2013-01-05    5.0
2013-01-06    NaN
Freq: D, dtype: float64
                   A         B         C         D    F
2013-01-01       NaN       NaN       NaN       NaN  NaN
2013-01-02       NaN       NaN       NaN       NaN

In [12]:
# DataFrame Transpose(전치, 열과행치환)
print(df.T)

   2013-01-01  2013-01-02  2013-01-03  2013-01-04  2013-01-05  2013-01-06
A   -1.374169   -1.434307   -0.147373   -0.324957    0.350537    0.055655
B   -2.276662   -1.178774   -0.070759   -1.100510   -1.092630    0.287663
C   -1.927108   -0.437128    0.749473   -0.428455    1.523739   -0.913046
D    0.392252   -0.156586    0.007736   -0.377208   -0.071988    0.871888
F         NaN    5.000000    5.000000    5.000000    5.000000    5.000000


In [13]:
# 축별로 정렬
print(df.sort_index(axis=1, ascending=False))

# 값별로 정렬
print(df.sort_values(by='B'))

              F         D         C         B         A
2013-01-01  NaN  0.392252 -1.927108 -2.276662 -1.374169
2013-01-02  5.0 -0.156586 -0.437128 -1.178774 -1.434307
2013-01-03  5.0  0.007736  0.749473 -0.070759 -0.147373
2013-01-04  5.0 -0.377208 -0.428455 -1.100510 -0.324957
2013-01-05  5.0 -0.071988  1.523739 -1.092630  0.350537
2013-01-06  5.0  0.871888 -0.913046  0.287663  0.055655
                   A         B         C         D    F
2013-01-01 -1.374169 -2.276662 -1.927108  0.392252  NaN
2013-01-02 -1.434307 -1.178774 -0.437128 -0.156586  5.0
2013-01-04 -0.324957 -1.100510 -0.428455 -0.377208  5.0
2013-01-05  0.350537 -1.092630  1.523739 -0.071988  5.0
2013-01-03 -0.147373 -0.070759  0.749473  0.007736  5.0
2013-01-06  0.055655  0.287663 -0.913046  0.871888  5.0


In [14]:
# apply: 데이터에 함수 적용

# 열별 적용
print(df.apply(np.cumsum))
print(df.apply(lambda x: x.max() - x.min()))

# 행별 적용
print(df.apply(np.cumsum, axis=1))
print(df.apply(lambda x: x.max() - x.min(), axis=1))

                   A         B         C         D     F
2013-01-01 -1.374169 -2.276662 -1.927108  0.392252   NaN
2013-01-02 -2.808476 -3.455435 -2.364236  0.235666   5.0
2013-01-03 -2.955849 -3.526194 -1.614763  0.243402  10.0
2013-01-04 -3.280807 -4.626704 -2.043217 -0.133806  15.0
2013-01-05 -2.930270 -5.719334 -0.519478 -0.205794  20.0
2013-01-06 -2.874615 -5.431670 -1.432524  0.666094  25.0
A    1.784844
B    2.564325
C    3.450847
D    1.249097
F    0.000000
dtype: float64
                   A         B         C         D         F
2013-01-01 -1.374169 -3.650831 -5.577939 -5.185686       NaN
2013-01-02 -1.434307 -2.613081 -3.050209 -3.206795  1.793205
2013-01-03 -0.147373 -0.218132  0.531342  0.539078  5.539078
2013-01-04 -0.324957 -1.425467 -1.853922 -2.231130  2.768870
2013-01-05  0.350537 -0.742093  0.781647  0.709659  5.709659
2013-01-06  0.055655  0.343318 -0.569728  0.302160  5.302160
2013-01-01    2.668914
2013-01-02    6.434307
2013-01-03    5.147373
2013-01-04    6.1005

In [15]:
# Histogramming: 데이터 값 세기
s = pd.Series(np.random.randint(0, 7, size=10))
print(s)
print(s.value_counts())

0    5
1    4
2    4
3    1
4    5
5    2
6    3
7    3
8    4
9    2
dtype: int64
4    3
5    2
3    2
2    2
1    1
dtype: int64


In [16]:
# 문자열 함수
s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'])
print(s.str.lower())

0       a
1       b
2       c
3    aaba
4    baca
5     NaN
6    caba
7     dog
8     cat
dtype: object


## Merge

상세 참조자료: https://pandas.pydata.org/pandas-docs/stable/user_guide/merging.html

In [17]:
# Concat: 단순히 두 DataFrame을 연결만 하기 때문에 의도한 결과와 같은지 확인이 필요합니다.
df = pd.DataFrame(np.random.randn(10, 3))
a = df[:2]
b = df[3:5]
print(a)
print(b)

# 행 기반 연결
print(pd.concat([a, b]))
print(pd.concat([a, b], ignore_index=True))

# 열 기반 연결
b.columns = [3, 4, 5]
b.reset_index(inplace=True, drop=True)
print(pd.concat([a, b], axis=1))

          0         1         2
0  0.745094 -0.290568 -0.027353
1  0.746181 -1.307533  0.182484
          0         1         2
3  0.130199 -0.219391  0.132051
4  0.240965 -0.313058  0.663209
          0         1         2
0  0.745094 -0.290568 -0.027353
1  0.746181 -1.307533  0.182484
3  0.130199 -0.219391  0.132051
4  0.240965 -0.313058  0.663209
          0         1         2
0  0.745094 -0.290568 -0.027353
1  0.746181 -1.307533  0.182484
2  0.130199 -0.219391  0.132051
3  0.240965 -0.313058  0.663209
          0         1         2         3         4         5
0  0.745094 -0.290568 -0.027353  0.130199 -0.219391  0.132051
1  0.746181 -1.307533  0.182484  0.240965 -0.313058  0.663209


In [18]:
# Append: 행 기반 concat 과 같습니다.
df = pd.DataFrame(np.random.randn(3, 3), columns=['A', 'B', 'C'])
print(df)

s = df.iloc[1]
print(df.append(s, ignore_index=True))

          A         B         C
0 -0.210185 -0.717555 -1.561112
1  2.265995 -1.367731  0.442846
2  0.380776 -1.993455  1.143662
          A         B         C
0 -0.210185 -0.717555 -1.561112
1  2.265995 -1.367731  0.442846
2  0.380776 -1.993455  1.143662
3  2.265995 -1.367731  0.442846


In [19]:
# Join: SQL 에서 지원하는 방식과 유사
left = pd.DataFrame({'key': ['foo', 'bar'], 'lval': [1, 2]})
right = pd.DataFrame({'key': ['foo', 'bar'], 'rval': [4, 5]})
print(left)
print(right)
print(pd.merge(left, right, on= 'key'))

   key  lval
0  foo     1
1  bar     2
   key  rval
0  foo     4
1  bar     5
   key  lval  rval
0  foo     1     4
1  bar     2     5


In [20]:
# GroupBy: SQL 에서 지원되는 방식과 유사
df = pd.DataFrame({
  'A' : ['foo', 'bar', 'foo', 'bar'],
  'B' : ['one', 'one', 'two', 'three'],
  'C' : np.random.randn(4),
  'D' : np.random.randn(4)
})
print(df)

print(df.groupby('A').sum())
print(df.groupby(['A','B']).sum())

     A      B         C         D
0  foo    one  2.310304 -1.602232
1  bar    one -1.181152  1.301205
2  foo    two  0.237871 -0.351101
3  bar  three -0.065693 -1.019618
            C         D
A                      
bar -1.246845  0.281587
foo  2.548176 -1.953334
                  C         D
A   B                        
bar one   -1.181152  1.301205
    three -0.065693 -1.019618
foo one    2.310304 -1.602232
    two    0.237871 -0.351101


## Reshaping

In [21]:
# Stack
index = pd.MultiIndex.from_tuples([('one', 'a'), ('one', 'b'),
                                   ('two', 'a'), ('two', 'b')])
s = pd.Series(np.arange(1.0, 5.0), index=index)
print(s)
df = s.unstack()
print(df)
print(df.stack())

# Pivot
df = pd.DataFrame({'A' : ['one', 'one', 'two', 'three'] * 3,
                   'B' : ['A', 'B', 'C'] * 4,
                   'C' : ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'] * 2,
                   'D' : np.random.randn(12),
                   'E' : np.random.randn(12)})
print(df)
print(pd.pivot_table(df, values='D', index=['A', 'B'], columns=['C']))

one  a    1.0
     b    2.0
two  a    3.0
     b    4.0
dtype: float64
       a    b
one  1.0  2.0
two  3.0  4.0
one  a    1.0
     b    2.0
two  a    3.0
     b    4.0
dtype: float64
        A  B    C         D         E
0     one  A  foo -0.406878 -0.950462
1     one  B  foo  0.030918 -2.100638
2     two  C  foo -0.721355  1.415724
3   three  A  bar -0.927376 -0.818047
4     one  B  bar  0.207421 -0.698892
5     one  C  bar  0.215073  1.833818
6     two  A  foo -0.713368  0.106301
7   three  B  foo  1.537568 -3.130458
8     one  C  foo  0.090697 -0.944782
9     one  A  bar  1.570254  0.224201
10    two  B  bar  1.701008  0.086360
11  three  C  bar  0.144171  0.287091
C             bar       foo
A     B                    
one   A  1.570254 -0.406878
      B  0.207421  0.030918
      C  0.215073  0.090697
three A -0.927376       NaN
      B       NaN  1.537568
      C  0.144171       NaN
two   A       NaN -0.713368
      B  1.701008       NaN
      C       NaN -0.721355


## Time Series

Time Series 데이터를 다룰 때 필요한 편리한 기능을 제공합니다.

상세참조자료: https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html

In [23]:
import datetime

# 다양한 방법으로 DatetimeIndex 생성
dti = pd.to_datetime(['1/1/2018', np.datetime64('2018-01-01'), datetime.datetime(2018, 1, 1)])
print(dti)

dti = pd.date_range('2018-01-01', periods=2, freq='H')
print(dti)

# DateTimeIndex 로 변환
print(pd.to_datetime(pd.Series(['Jul 31, 2009', '2010-01-10', None])))
print(pd.to_datetime(['2005/11/23', '2010.12.31']))
print(pd.to_datetime([1349720105, 1349806505], unit='s')) # Epoch Timestamp Sec
print(pd.to_datetime([1349720105100, 1349720105200], unit='ms')) # Epoch Timestamp Millisec

# DateTimeIndex 로 변환시 에러 처리
try:
    pd.to_datetime(['2009/07/31', 'asd'], errors='raise')
except Exception as e:
    print("ERROR: {}".format(e))
pd.to_datetime(['2009/07/31', 'asd'], errors='ignore')
  
# Timezone 설정
dti = dti.tz_localize('UTC')
print(dti)

# Timezone 변경
dti = dti.tz_convert('Asia/Seoul')
print(dti)

DatetimeIndex(['2018-01-01', '2018-01-01', '2018-01-01'], dtype='datetime64[ns]', freq=None)
DatetimeIndex(['2018-01-01 00:00:00', '2018-01-01 01:00:00'], dtype='datetime64[ns]', freq='H')
0   2009-07-31
1   2010-01-10
2          NaT
dtype: datetime64[ns]
DatetimeIndex(['2005-11-23', '2010-12-31'], dtype='datetime64[ns]', freq=None)
DatetimeIndex(['2012-10-08 18:15:05', '2012-10-09 18:15:05'], dtype='datetime64[ns]', freq=None)
DatetimeIndex(['2012-10-08 18:15:05.100000', '2012-10-08 18:15:05.200000'], dtype='datetime64[ns]', freq=None)
ERROR: ('Unknown string format:', 'asd')
DatetimeIndex(['2018-01-01 00:00:00+00:00', '2018-01-01 01:00:00+00:00'], dtype='datetime64[ns, UTC]', freq='H')
DatetimeIndex(['2018-01-01 09:00:00+09:00', '2018-01-01 10:00:00+09:00'], dtype='datetime64[ns, Asia/Seoul]', freq='H')


In [24]:
# 시간단위 샘플링
idx = pd.date_range('2018-01-01', periods=5, freq='D')
ts = pd.Series(range(len(idx)), index=idx)
print(ts)
print(ts.resample('8H').mean())
print(ts.resample('2D').mean())

2018-01-01    0
2018-01-02    1
2018-01-03    2
2018-01-04    3
2018-01-05    4
Freq: D, dtype: int64
2018-01-01 00:00:00    0.0
2018-01-01 08:00:00    NaN
2018-01-01 16:00:00    NaN
2018-01-02 00:00:00    1.0
2018-01-02 08:00:00    NaN
2018-01-02 16:00:00    NaN
2018-01-03 00:00:00    2.0
2018-01-03 08:00:00    NaN
2018-01-03 16:00:00    NaN
2018-01-04 00:00:00    3.0
2018-01-04 08:00:00    NaN
2018-01-04 16:00:00    NaN
2018-01-05 00:00:00    4.0
Freq: 8H, dtype: float64
2018-01-01    0.5
2018-01-03    2.5
2018-01-05    4.0
Freq: 2D, dtype: float64


In [26]:
## Version 0.24.1 이상
print(pd.__version__)
if pd.__version__ >= "0.24.1":
    # 시간 계산
    friday = pd.Timestamp('2018-01-05')
    print(friday.day_name())

    # Add 1 day
    saturday = friday + pd.Timedelta('1 day')
    print(saturday.day_name())

    # Add 1 business day (Friday --> Monday)
    monday = friday + pd.offsets.BDay()
    print(monday.day_name())

0.24.1
Friday
Saturday
Monday


# Read/Write

In [27]:
# CSV
df = pd.DataFrame({
  'A': 1.,
  'B': pd.Timestamp('20130102'),
  'C': pd.Series(1, index=list(range(4)), dtype='float32'),
  'D': np.array([3] * 4, dtype='int32'),
  'E': pd.Categorical(["test", "train", "test", "train"]),
  'F': 'foo'
})
print(df)
path_csv = "/tmp/foo.csv"
df.to_csv(path_csv, index=False)

df = pd.read_csv(path_csv, header=0)
print(df)

     A          B    C  D      E    F
0  1.0 2013-01-02  1.0  3   test  foo
1  1.0 2013-01-02  1.0  3  train  foo
2  1.0 2013-01-02  1.0  3   test  foo
3  1.0 2013-01-02  1.0  3  train  foo
     A           B    C  D      E    F
0  1.0  2013-01-02  1.0  3   test  foo
1  1.0  2013-01-02  1.0  3  train  foo
2  1.0  2013-01-02  1.0  3   test  foo
3  1.0  2013-01-02  1.0  3  train  foo
