### Pandas
- Series and DataFrame
- 분석을 위한 전처리

In [9]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings(action="ignore")

### Series 클래스
- 넘파이 1차원 배열과 비슷
- series = index + value

In [5]:
# NumPy vector holding mixed Python objects.
# `np.object` was only an alias of the builtin `object`; it was deprecated in
# NumPy 1.20 and removed in 1.24, so the builtin is used directly.
ary = np.array([1,2,3,4,'ruby'], dtype=object)
print(ary)
print(ary.dtype)

[1 2 3 4 'ruby']
object


In [13]:
# pandas Series: values plus an index.
# `np.object` was only an alias of the builtin `object`; it was deprecated in
# NumPy 1.20 and removed in 1.24, so the builtin is used directly.
ary = pd.Series([1,2,3,4,'ruby'], dtype=object)
print(ary)
print(ary.values)
print(type(ary.values))
print(ary.index)
print(type(ary.index))

0       1
1       2
2       3
3       4
4    ruby
dtype: object
[1 2 3 4 'ruby']
<class 'numpy.ndarray'>
RangeIndex(start=0, stop=5, step=1)
<class 'pandas.core.indexes.range.RangeIndex'>


In [21]:
def seriesInfo(ary) :
    """Print a Series, then its values and index, each followed by its type."""
    print('index + value : \n', ary)
    # `values` and `index` are reported the same way: the object, then its type.
    for caption, attr_name in (('value', 'values'), ('index', 'index')):
        member = getattr(ary, attr_name)
        print(f'{caption} : ', member)
        print(f'{caption} type : ', type(member))

- 인덱스의 라벨은 정수, 문자, 날짜, 시간으로 변경 가능

In [22]:
# int32 Series whose index uses string labels instead of default positions
ary = pd.Series(
    data=[1, 2, 3, 4, 5],
    index=['강남', '서초', '방배', '동작', '사당'],
    dtype=np.int32,
)

In [23]:
# Print the labelled Series together with its values, index, and their types
seriesInfo(ary)

index + value : 
 강남    1
서초    2
방배    3
동작    4
사당    5
dtype: int32
value :  [1 2 3 4 5]
value type :  <class 'numpy.ndarray'>
index :  Index(['강남', '서초', '방배', '동작', '사당'], dtype='object')
index type :  <class 'pandas.core.indexes.base.Index'>


In [24]:
# Give the index a name; it is printed as a header above the index labels
ary.index.name='구별'
seriesInfo(ary)

index + value : 
 구별
강남    1
서초    2
방배    3
동작    4
사당    5
dtype: int32
value :  [1 2 3 4 5]
value type :  <class 'numpy.ndarray'>
index :  Index(['강남', '서초', '방배', '동작', '사당'], dtype='object', name='구별')
index type :  <class 'pandas.core.indexes.base.Index'>


In [27]:
# A single label yields one value; a list of labels ([[...]]) yields a sub-Series
for selector in ('서초', ['강남', '방배']):
    print(ary[selector])

2
구별
강남    1
방배    3
dtype: int32


In [28]:
# Iterate over (index label, value) pairs
for idx, value in ary.items():
    print(f'idx : {idx}, value : {value}')

idx : 강남, value : 1
idx : 서초, value : 2
idx : 방배, value : 3
idx : 동작, value : 4
idx : 사당, value : 5


In [29]:
# Iterate over the index labels only
for idx in ary.index:
    print(f'idx : {idx}')

idx : 강남
idx : 서초
idx : 방배
idx : 동작
idx : 사당


In [36]:
# Iterate over the values only
for value in ary.values:
    print(f'value : {value}')

value : 1.0
value : 2.0
value : 4.0


In [31]:
# Build a Series from a range: 10 up to (but excluding) 21 in steps of 2
ary = pd.Series(range(10, 21,2))
seriesInfo(ary)

index + value : 
 0    10
1    12
2    14
3    16
4    18
5    20
dtype: int64
value :  [10 12 14 16 18 20]
value type :  <class 'numpy.ndarray'>
index :  RangeIndex(start=0, stop=6, step=1)
index type :  <class 'pandas.core.indexes.range.RangeIndex'>


In [37]:
# Build a Series from a dict: keys become the index labels, values the data
ary = pd.Series(
    dict(c=1, b=5, a=-8, k=10),
    dtype=np.float64,
)
seriesInfo(ary)

index + value : 
 c     1.0
b     5.0
a    -8.0
k    10.0
dtype: float64
value :  [ 1.  5. -8. 10.]
value type :  <class 'numpy.ndarray'>
index :  Index(['c', 'b', 'a', 'k'], dtype='object')
index type :  <class 'pandas.core.indexes.base.Index'>


In [38]:
# Arithmetic is applied to the values; the index stays unchanged
ary*10

c     10.0
b     50.0
a    -80.0
k    100.0
dtype: float64

- fancy indexing & boolean indexing

In [41]:
# Positional fancy indexing: `.iloc` selects by position explicitly.
# Plain `ary[[0,2]]` on this string-labelled Series relies on the fallback that
# treats integer keys as positions, which pandas has deprecated.
print('fancy [0,2] indexing : \n{}'.format(ary.iloc[[0,2]]))

fancy [0,2] indexing : 
c    1.0
a   -8.0
dtype: float64


In [46]:
# Boolean indexing: keep only the values that are multiples of 2
print(f'boolean ary % 2 == 0 :\n{ary[ary % 2 == 0]}')

boolean ary % 2 == 0 :
a    -8.0
k    10.0
dtype: float64


### 시간, 날짜 불러오기

In [54]:
from datetime import date, datetime, timedelta
from dateutil.parser import parse

In [56]:
# Start date, printed together with the following day
strDate = date(year=2020, month=2, day=25)
print(strDate, strDate + timedelta(days=1), sep='\n')

2020-02-25
2020-02-26


In [66]:
# Ten days of normally distributed data (mean 50, standard deviation 5),
# truncated to integers and indexed by consecutive dates starting at strDate
fac01 = pd.Series(
    data=list(map(int, np.random.normal(50, 5, 10))),
    index=[strDate + timedelta(days=offset) for offset in range(10)],
)
seriesInfo(fac01)

index + value : 
 2020-02-25    43
2020-02-26    54
2020-02-27    53
2020-02-28    35
2020-02-29    47
2020-03-01    42
2020-03-02    50
2020-03-03    52
2020-03-04    48
2020-03-05    57
dtype: int64
value :  [43 54 53 35 47 42 50 52 48 57]
value type :  <class 'numpy.ndarray'>
index :  Index([2020-02-25, 2020-02-26, 2020-02-27, 2020-02-28, 2020-02-29, 2020-03-01,
       2020-03-02, 2020-03-03, 2020-03-04, 2020-03-05],
      dtype='object')
index type :  <class 'pandas.core.indexes.base.Index'>


In [65]:
# Ten days of normally distributed data (mean 70, standard deviation 8),
# truncated to integers and indexed by consecutive dates starting at strDate
fac02 = pd.Series(
    data=list(map(int, np.random.normal(70, 8, 10))),
    index=[strDate + timedelta(days=offset) for offset in range(10)],
)
seriesInfo(fac02)

index + value : 
 2020-02-25    60
2020-02-26    82
2020-02-27    47
2020-02-28    79
2020-02-29    67
2020-03-01    65
2020-03-02    68
2020-03-03    78
2020-03-04    71
2020-03-05    76
dtype: int64
value :  [60 82 47 79 67 65 68 78 71 76]
value type :  <class 'numpy.ndarray'>
index :  Index([2020-02-25, 2020-02-26, 2020-02-27, 2020-02-28, 2020-02-29, 2020-03-01,
       2020-03-02, 2020-03-03, 2020-03-04, 2020-03-05],
      dtype='object')
index type :  <class 'pandas.core.indexes.base.Index'>


In [67]:
# Add the two Series; the result holds one sum per date label
fac01 + fac02

2020-02-25    103
2020-02-26    136
2020-02-27    100
2020-02-28    114
2020-02-29    114
2020-03-01    107
2020-03-02    118
2020-03-03    130
2020-03-04    119
2020-03-05    133
dtype: int64

In [70]:
# Cast the index to built-in containers: a set and a list of the date labels
print({*fac01.index})
print([*fac01.index])

{datetime.date(2020, 2, 26), datetime.date(2020, 3, 4), datetime.date(2020, 2, 25), datetime.date(2020, 3, 5), datetime.date(2020, 2, 29), datetime.date(2020, 3, 2), datetime.date(2020, 2, 28), datetime.date(2020, 2, 27), datetime.date(2020, 3, 3), datetime.date(2020, 3, 1)}
[datetime.date(2020, 2, 25), datetime.date(2020, 2, 26), datetime.date(2020, 2, 27), datetime.date(2020, 2, 28), datetime.date(2020, 2, 29), datetime.date(2020, 3, 1), datetime.date(2020, 3, 2), datetime.date(2020, 3, 3), datetime.date(2020, 3, 4), datetime.date(2020, 3, 5)]


In [71]:
# Iterating over the index yields each date label in turn
for idx in fac01.index :
    print(idx)

2020-02-25
2020-02-26
2020-02-27
2020-02-28
2020-02-29
2020-03-01
2020-03-02
2020-03-03
2020-03-04
2020-03-05


- DataFrame

In [None]:
# Daily box office ranking provided by the Korean Film Council (KOBIS open API)
# NOTE(review): the API key is embedded in the URL below — consider loading it from configuration instead of committing it
movie_url = "http://www.kobis.or.kr/kobisopenapi/webservice/rest/boxoffice/searchDailyBoxOfficeList.json?key=430156241533f1d058c603178cc3ca0e&targetDt=20210225"