# Pandas
* https://pandas.pydata.org/
* Python Data Analysis Library
* R의 data.frame을 본떠서 설계한 DataFrame 기반
* 고성능 NumPy 기반
* 간편한 데이타 구조 및 분석 도구
* SQL 처럼 테이블에 쿼리, 조인 수행
* 각 열의 타입이 다르거나 문자 허용
* NumPy + Dict
    * Record + Column
    * 2차원 관계형 DBMS 테이블 처럼 사용
    * Series, DataFrame
![](https://www.bedrockdata.com/hs-fs/hubfs/pandas-logo.png?width=263&name=pandas-logo.png)

## Module Import

In [2]:
import pandas as pd

## Pandas Data Structure
* Series :  1D
* DataFrame : 2D
* Panel : 3D (pandas 0.25에서 제거됨 — MultiIndex를 가진 DataFrame 또는 xarray 사용 권장)

## Series
* 일련의 객체를 담을 수 있는 1차원 배열 자료구조
* Index로 요소 접근
    * Integer Index
    * Label Index

### Series 생성자
* `pd.Series([data, index, dtype, name, copy])`
    * data : list, NumPy ndarray, dictionary

In [3]:
obj = pd.Series([4,7,-5,3])
obj

0    4
1    7
2   -5
3    3
dtype: int64

### Series 주요 속성
* 데이타 접근 : 인덱스
* values : 실제 데이타, NumPy ndarray
* index : 데이타를 접근하기 위한 색인
    * 숫자(기본) : RangeIndex
* dtype
* shape
* size
* ndim

In [4]:
obj.values

array([ 4,  7, -5,  3], dtype=int64)

In [5]:
obj.index

RangeIndex(start=0, stop=4, step=1)

In [7]:
obj[0], obj[1]

(4, 7)

In [17]:
obj.shape, obj.dtype, obj.size, obj.ndim

((4,), dtype('int64'), 4, 1)

### Index 지정
* 생성자의 `index` 인자로 지정
* `index` 속성에 직접 대입하여 지정
* 레이블 인덱스를 지정해도 숫자(위치) 인덱스를 함께 사용 가능 (pandas 2.1 이상에서는 위치 접근에 `iloc` 권장)

In [12]:
obj2 = pd.Series([4,7,-5,3], index=['d', 'b', 'a', 'c'])
obj2

d    4
b    7
a   -5
c    3
dtype: int64

In [10]:
obj2.index

Index(['d', 'b', 'a', 'c'], dtype='object')

In [11]:
obj2['a'], obj2[2]

(-5, -5)

In [13]:
obj.index = ['a', 'b', 'c', 'd']
obj

a    4
b    7
c   -5
d    3
dtype: int64

In [15]:
obj.index

Index(['a', 'b', 'c', 'd'], dtype='object')

In [14]:
obj['a'], obj[0]

(4, 4)

In [58]:
obj2 = pd.Series(1)
obj2

0    1
dtype: int64

In [59]:
obj2 = pd.Series(1, index=[0,1,2,3,4])
obj2

0    1
1    1
2    1
3    1
4    1
dtype: int64

In [60]:
obj2.index

Int64Index([0, 1, 2, 3, 4], dtype='int64')

In [61]:
obj3 = pd.Series(10, index=pd.RangeIndex(5))
obj3

0    10
1    10
2    10
3    10
4    10
dtype: int64

### Dictionary로 Series 생성
* Key : index
* value : value

In [22]:
obj3 = pd.Series({"lee": 27, "kim": 25, "park": 30, "choi": 19})
obj3

lee     27
kim     25
park    30
choi    19
dtype: int64

In [24]:
print(obj3.index)
print(obj3.values)

Index(['lee', 'kim', 'park', 'choi'], dtype='object')
[27 25 30 19]


### 인덱스로 데이타 선택
* 하나의 요소 선택 : 인덱스 1개
    * `data[1]`, `data['a']`
* 여러 요소 선택 : 인덱스의 배열
    * `data[['a', 'b']]`
* bool indexing
    * `data[data>2]`

In [9]:
print(obj[2], obj2[2], obj2['a'])

-5 -5 -5


In [14]:
obj[[1,3]]

1    7
3    3
dtype: int64

In [12]:
obj2[['a', 'b']]

a   -5
b    7
dtype: int64

In [21]:
obj2[obj2>2]

d    4
b    7
c    3
dtype: int64

## DataFrame
* 엑셀 스프레드 시트 형식의 자료 구조
* 2차원 테이블 데이타 구조
* row와 column에 대한 색인
    * 색인(Index)의 모양이 같은 Series를 담고 있는 파이썬 dictionary

## DataFrame 생성자
* `pd.DataFrame([data, index, columns, dtype, copy])`
### 주요속성
* index
* columns
* values
* dtypes
* T

## List로 생성

In [25]:
df = pd.DataFrame([['Lee', 27], ['Kim', 24], ['Park', 31], ['Choi', 15]])
df

Unnamed: 0,0,1
0,Lee,27
1,Kim,24
2,Park,31
3,Choi,15


In [28]:
df2 = pd.DataFrame([['Lee', 27], ['Kim', 24], ['Park', 31], ['Choi', 15]], index=['a', 'b', 'c', 'd'], columns=['name', 'age'])
df2

Unnamed: 0,name,age
a,Lee,27
b,Kim,24
c,Park,31
d,Choi,15


In [67]:
df2['name'][0], df2['age'][1]

('Lee', 24)

In [65]:
df2.index

Index(['a', 'b', 'c', 'd'], dtype='object')

In [66]:
df2.columns

Index(['name', 'age'], dtype='object')

In [69]:
df2.dtypes, 

(name    object
 age      int64
 dtype: object,)

In [83]:
df2.values

array([['Lee', 27],
       ['Kim', 24],
       ['Park', 31],
       ['Choi', 15]], dtype=object)

## Series로 생성

In [31]:
name = pd.Series(['Lee', 'Kim', 'Park'])
age = pd.Series([27,24,15])
df3 = pd.DataFrame({'name':name, 'age': age})
df3

Unnamed: 0,name,age
0,Lee,27
1,Kim,24
2,Park,15


In [32]:
df3.columns

Index(['name', 'age'], dtype='object')

### columns, 순서 재배치

In [33]:
df4 = pd.DataFrame({'name':name, 'age':age}, columns=['age', 'name'])
df4

Unnamed: 0,age,name
0,27,Lee
1,24,Kim
2,15,Park


In [34]:
df4.columns

Index(['age', 'name'], dtype='object')

### Dictionary로 생성

In [38]:
df5 = pd.DataFrame({})
df5

In [39]:
df6 = pd.DataFrame({'name':['Lee', 'Kim', 'Park'],
                   'age': [27, 24, 15]})
df6

Unnamed: 0,name,age
0,Lee,27
1,Kim,24
2,Park,15


In [98]:
df =  pd.DataFrame({'name':['Lee', 'Kim', 'Park'],
                   'age': [27, 24, 15],
                   'addrees': ['Seoul', 'Busan', 'Gwangju']},
                  index=['one', 'two', 'three'])
df

Unnamed: 0,name,age,addrees
one,Lee,27,Seoul
two,Kim,24,Busan
three,Park,15,Gwangju


## 컬럼 접근
* column index
    * multi column(Fancy) Indexing
* property
* _행 접근과 혼동 주의_ : 아래 두 방식은 `df[...]` 형태이지만 컬럼이 아니라 행을 선택
    * Boolean Indexing
    * Slicing


In [99]:
df['name'], df['age']

(one       Lee
 two       Kim
 three    Park
 Name: name, dtype: object, one      27
 two      24
 three    15
 Name: age, dtype: int64)

In [100]:
df.name, df.age

(one       Lee
 two       Kim
 three    Park
 Name: name, dtype: object, one      27
 two      24
 three    15
 Name: age, dtype: int64)

In [101]:
df[['name', 'age']]

Unnamed: 0,name,age
one,Lee,27
two,Kim,24
three,Park,15


In [107]:
df[ [True, False, True]]

Unnamed: 0,name,age,addrees
one,Lee,27,Seoul
three,Park,15,Gwangju


In [111]:
df['age'] > 20

one       True
two       True
three    False
Name: age, dtype: bool

In [112]:
df[ df['age'] > 20]

Unnamed: 0,name,age,addrees
one,Lee,27,Seoul
two,Kim,24,Busan


In [110]:
df[0:2]

Unnamed: 0,name,age,addrees
one,Lee,27,Seoul
two,Kim,24,Busan


## 행(row) 접근
* ix[n] : deprecated (pandas 1.0에서 제거됨, `loc`/`iloc` 사용)
* iloc[n] : 숫자 인덱스로 접근 
* loc[label] : 레이블 인덱스로 접근

In [77]:
df.iloc[0]


name    Lee
age      27
Name: one, dtype: object

In [78]:
df.iloc[1]

name    Kim
age      24
Name: two, dtype: object

In [117]:
df.iloc[0:2]

Unnamed: 0,name,age,addrees
one,Lee,27,Seoul
two,Kim,24,Busan


In [113]:
df.iloc[0,1]

27

In [118]:
df.iloc[:, :2]

Unnamed: 0,name,age
one,Lee,27
two,Kim,24
three,Park,15


In [114]:
df.loc['two']

name         Kim
age           24
addrees    Busan
Name: two, dtype: object

In [116]:
df.loc['two', 'age']

24

In [119]:
df.loc[:'three']

Unnamed: 0,name,age,addrees
one,Lee,27,Seoul
two,Kim,24,Busan
three,Park,15,Gwangju


In [120]:
df.loc[:, 'name':'age']

Unnamed: 0,name,age
one,Lee,27
two,Kim,24
three,Park,15


## 하나의 값(Scalar) 접근
* `df.at[label, column]` : 레이블 인덱스로 접근
* `df.iat[i, j]` : 숫자(위치) 인덱스로 접근
* 속도가 빠르다

## Columns, Index Name
* df.columns.name
* df.index.name

In [80]:
df

Unnamed: 0,name,age
one,Lee,27
two,Kim,24
three,Park,15


In [81]:
df.index.name = 'number'
df.columns.name = 'attribute'
df

attribute,name,age
number,Unnamed: 1_level_1,Unnamed: 2_level_1
one,Lee,27
two,Kim,24
three,Park,15


## Data 일부 보기
* `df.head([n])`
* `df.tail([n])`

In [82]:
df.head(2)

attribute,name,age
number,Unnamed: 1_level_1,Unnamed: 2_level_1
one,Lee,27
two,Kim,24


### 컬럼 추가

In [28]:
friends['job'] = 'student'
friends

Unnamed: 0,name,age,job
0,Lee,27,student
1,Kim,24,student
2,Park,15,student


### 연산과 함께 컬럼 추가

In [None]:
friends['adult'] = friends['age'] > 19

In [None]:
friends

In [None]:
friends.loc[friends['age'] > 19]

## Data 삭제
* `df.drop(labels, axis=0, index=None, columns=None, inplace=False)`
    * labels : 삭제할 index 또는 column 레이블
    * axis = 0
        * 0 : row
        * 1 : column
    * inplace : True이면 원본 변경 (기본 값 False)
* del 

## 정렬
* `df.sort_index(axis, ascending)` 
    * 반환 : 정렬된 DataFrame
    * axis = 0
        * 0 : index(행 정렬, 기본 값)
        * 1 : columns(열 정렬)
    * ascending = True
        * True : 오름차순(기본 값)
        * False : 내림차순
* `df.sort_values(by='key')`:
    * by='key' : 'key' column을 기준으로 정렬

In [90]:
df.sort_index(ascending=False)

attribute,name,age
number,Unnamed: 1_level_1,Unnamed: 2_level_1
two,Kim,24
three,Park,15
one,Lee,27


In [88]:
df.sort_index(axis=1)

attribute,age,name
number,Unnamed: 1_level_1,Unnamed: 2_level_1
one,27,Lee
two,24,Kim
three,15,Park


In [91]:
df.sort_values(by='age')

attribute,name,age
number,Unnamed: 1_level_1,Unnamed: 2_level_1
three,Park,15
two,Kim,24
one,Lee,27


In [94]:
df.sort_values(by='age', ascending=False)

attribute,name,age
number,Unnamed: 1_level_1,Unnamed: 2_level_1
one,Lee,27
two,Kim,24
three,Park,15


## NaN
* df.isnull()
* df.isna()
    * pd.isna(df)
* df.notnull()
* df.notna()
* df.dropna(axis=0, how='any', thresh=None, subset=None, inplace=False)
    * axis : {0,1}
    * how : {'any', 'all'}
    * thresh : NA가 아닌 최소 갯수
        * 예) thresh=2 : NA가 아닌 값이 2개 이상 있는 행만 남김 (axis=0 기준)
* df.fillna(value, method, axis, inplace, limit, downcast)
    * value : 채워 넣을 값
    * method : { 'backfill', 'bfill', 'pad', 'ffill', None}
        * backfill == bfill
        * pad == ffill
    

## Index
* Numeric Index
    * RangeIndex
    * Int64Index
    * UInt64Index
    * Float64Index
* CategoricalIndex
* IntervalIndex
* MultiIndex
* DatetimeIndex
* TimedeltaIndex
* PeriodIndex

In [46]:
rIdx = pd.RangeIndex(10)
rIdx

RangeIndex(start=0, stop=10, step=1)

In [47]:
df = pd.DataFrame({})
df.index

Index([], dtype='object')

In [49]:
dateIdx = pd.date_range('20200101', periods=6)
dateIdx

DatetimeIndex(['2020-01-01', '2020-01-02', '2020-01-03', '2020-01-04',
               '2020-01-05', '2020-01-06'],
              dtype='datetime64[ns]', freq='D')

# Pandas Dtypes
* `pd.Timestamp()`
* `pd.Categorical()`

# File I/O
* `read_csv()`
* `read_excel()`

## Concat

## Merge

## Groupby

## Pivot