# Pandas 기초

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

### Pandas 개념
- pandas 의 DataFrame 은 칼럼명을 키로, 칼럼 데이터를 값으로 갖는 dict 를 확장한 개념으로 볼 수 있다
- 2차원 테이블 형태를 주로 다룬다
- 주로 칼럼 위주로 처리가 일어난다
- pandas 데이터에는 항상 칼럼명과 인덱스가 붙는다
- 데이터타입 종류
    - pd.DataFrame
    - pd.Series

In [2]:
# Column-oriented input: every key becomes a column label and every list
# holds that column's values, one entry per row.
d = dict(a=[1, 2], b=[11, 12], c=[21, 22])
df = pd.DataFrame(data=d)
df

Unnamed: 0,a,b,c
0,1,11,21
1,2,12,22


In [4]:
df['b']

0    11
1    12
Name: b, dtype: int64

In [8]:
type(df), type(df['a'])

(pandas.core.frame.DataFrame, pandas.core.series.Series)

In [5]:
df2 = pd.DataFrame(np.arange(12).reshape(4,3))
df2

Unnamed: 0,0,1,2
0,0,1,2
1,3,4,5
2,6,7,8
3,9,10,11


In [7]:
df2[2] # selects the column whose label is 2 (a label lookup, not a row position)

0     2
1     5
2     8
3    11
Name: 2, dtype: int32

In [13]:
df2.values

array([[ 0,  1,  2],
       [ 3,  4,  5],
       [ 6,  7,  8],
       [ 9, 10, 11]])

### Series

In [46]:
s = pd.Series([11,12,13,14])
s

0    11
1    12
2    13
3    14
dtype: int64

In [47]:
s.index

RangeIndex(start=0, stop=4, step=1)

In [48]:
s.index = range(100,104)

In [49]:
s

100    11
101    12
102    13
103    14
dtype: int64

In [50]:
s.name

In [51]:
s.name='test'
s

100    11
101    12
102    13
103    14
Name: test, dtype: int64

In [53]:
s[100]

11

In [54]:
s[[100,102]]

100    11
102    13
Name: test, dtype: int64

In [15]:
s[[True,False, False,True]]

100    11
103    14
Name: test, dtype: int64

In [16]:
s.name, s.index.name

('test', None)

In [18]:
s.index.name = 'num'
s.name, s.index.name

('test', 'num')

### DataFrame

In [15]:
d = pd.DataFrame([[1,2,3],[4,5,6],[7,8,9]])
d

Unnamed: 0,0,1,2
0,1,2,3
1,4,5,6
2,7,8,9


In [16]:
d.columns

RangeIndex(start=0, stop=3, step=1)

In [17]:
d.index

RangeIndex(start=0, stop=3, step=1)

In [2]:
d = pd.DataFrame([[1,2,3],[4,5,6],[7,8,9]], columns=['col1','col2','col3'])
d

Unnamed: 0,col1,col2,col3
0,1,2,3
1,4,5,6
2,7,8,9


In [3]:
d.columns

Index(['col1', 'col2', 'col3'], dtype='object')

In [4]:
d.index

RangeIndex(start=0, stop=3, step=1)

In [5]:
d.columns = ['a','b','c'] # list('abc')

In [6]:
d

Unnamed: 0,a,b,c
0,1,2,3
1,4,5,6
2,7,8,9


In [7]:
d.index=['row1','row2','row3']
d

Unnamed: 0,a,b,c
row1,1,2,3
row2,4,5,6
row3,7,8,9


In [8]:
d.index = range(3)
d

Unnamed: 0,a,b,c
0,1,2,3
1,4,5,6
2,7,8,9


In [9]:
d['c']

0    3
1    6
2    9
Name: c, dtype: int64

In [10]:
d['c'][1], d['b'][2]

(6, 8)

- numpy 식으로 인덱싱/슬라이싱 => d.iloc[행번호,열번호]

In [11]:
d

Unnamed: 0,a,b,c
0,1,2,3
1,4,5,6
2,7,8,9


In [20]:
d.iloc[1,2], d.iloc[2,1]

(6, 8)

In [26]:
d.iloc[0]

a    1
b    2
c    3
Name: 0, dtype: int64

In [27]:
d.iloc[0,2]

3

In [28]:
d.iloc[:1,1:]

Unnamed: 0,b,c
0,2,3


- Series 를 DataFrame 으로 변환하면 s.name 은 칼럼명, s.index.name 은 d.index.name 이 된다
- 아래 출력은 앞에서 만든 Series s 를 d = pd.DataFrame(s) 로 변환한 d 에 대한 것이다

In [21]:
d.index.name

'num'

In [22]:
d.columns

Index(['test'], dtype='object')

In [23]:
d.index

RangeIndex(start=100, stop=104, step=1, name='num')

In [29]:
d = pd.DataFrame({'name': ['hong', 'jang', 'dooley'],
                 'age': [22,33,11],
                 'address': ['seoul', 'daejeon', 'etc']})
d

Unnamed: 0,address,age,name
0,seoul,22,hong
1,daejeon,33,jang
2,etc,11,dooley


In [12]:
d = pd.DataFrame([['hong',22,'seoul'], ['jang',33,'daejeon'], ['dooley',11,'etc']],
                columns=['name','age','address'])
d

Unnamed: 0,name,age,address
0,hong,22,seoul
1,jang,33,daejeon
2,dooley,11,etc


In [32]:
# Build the frame from row records first; it starts with default 0..2 column labels.
d = pd.DataFrame(data=[
    ['hong', 22, 'seoul'],
    ['jang', 33, 'daejeon'],
    ['dooley', 11, 'etc'],
])
# Then replace the default labels by assigning to the columns attribute.
d.columns = ['name', 'age', 'address']
d

Unnamed: 0,name,age,address
0,hong,22,seoul
1,jang,33,daejeon
2,dooley,11,etc


- 칼럼 하나를 뽑아내면 Series 가 된다.
- 칼럼명은 Series 의 name 이 된다

In [33]:
d['name']

0      hong
1      jang
2    dooley
Name: name, dtype: object

In [34]:
type(d['name'])

pandas.core.series.Series

In [35]:
d.shape

(3, 3)

In [36]:
d.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 3 columns):
name       3 non-null object
age        3 non-null int64
address    3 non-null object
dtypes: int64(1), object(2)
memory usage: 152.0+ bytes


In [37]:
d.describe()

Unnamed: 0,age
count,3.0
mean,22.0
std,11.0
min,11.0
25%,16.5
50%,22.0
75%,27.5
max,33.0


- 팬시 색인은 칼럼을 선택하고, 불리언 색인은 레코드를 선택한다
- 범위를 지정하면 레코드가 선택된다

In [23]:
d

Unnamed: 0,name,age,address
0,hong,22,seoul
1,jang,33,daejeon
2,dooley,11,etc


In [40]:
d[['name','age']] # 팬시 색인

Unnamed: 0,name,age
0,hong,22
1,jang,33
2,dooley,11


In [41]:
d.iloc[:2]

Unnamed: 0,name,age,address
0,hong,22,seoul
1,jang,33,daejeon


In [42]:
d[:2] # 범위(슬라이싱) 지정

Unnamed: 0,name,age,address
0,hong,22,seoul
1,jang,33,daejeon


In [16]:
d[[True, True, False]]

Unnamed: 0,name,age,address
0,hong,22,seoul
1,jang,33,daejeon


In [14]:
d[d['age']>20] # select * from d where age>20

Unnamed: 0,name,age,address
0,hong,22,seoul
1,jang,33,daejeon


In [15]:
d['age']>20

0     True
1     True
2    False
Name: age, dtype: bool

### 칼럼명으로 칼럼 지정

In [26]:
d

Unnamed: 0,name,age,address
0,hong,22,seoul
1,jang,33,daejeon
2,dooley,11,etc


In [44]:
d['name']

0      hong
1      jang
2    dooley
Name: name, dtype: object

In [45]:
d.name

0      hong
1      jang
2    dooley
Name: name, dtype: object

In [27]:
d.age

0    22
1    33
2    11
Name: age, dtype: int64

In [28]:
d.address

0      seoul
1    daejeon
2        etc
Name: address, dtype: object

In [38]:
d.name[:2]

0    hong
1    jang
Name: name, dtype: object

In [39]:
d['name'][:2]

0    hong
1    jang
Name: name, dtype: object

### NULL 처리
- None 또는 np.nan 으로 널값을 할당할 수 있다
- 널값을 가리키는 여러 표기: None, null, nan, NaN, np.nan, na

In [17]:
# The same table in CSV form; the middle field of the first row is empty:
#   0,,2
#   3,4,5
d = pd.DataFrame(data=[
    [0, None, 2],  # None marks the missing value; np.nan could be used instead
    [3, 4, 5],
])
d

Unnamed: 0,0,1,2
0,0,,2
1,3,4.0,5


In [18]:
d[1][0] # d.iloc[0,1]

nan

In [19]:
type(d[1][0])

numpy.float64

In [49]:
# Set row label 1 of column label 2 to a missing value with ONE indexing call.
# The chained form d[2][1] = None first extracts d[2] as a separate Series and
# then assigns into that object, so pandas may warn (SettingWithCopyWarning)
# and, with copy-on-write enabled, d itself is left unchanged.
# The integer column becomes float so that it can hold NaN.
d.loc[1, 2] = None
d

Unnamed: 0,0,1,2
0,0,,2.0
1,3,4.0,


In [50]:
d.iloc[1,0] = None
d

Unnamed: 0,0,1,2
0,0.0,,2.0
1,,4.0,


In [51]:
d.isna()

Unnamed: 0,0,1,2
0,False,True,False
1,True,False,True


In [52]:
d.isnull()

Unnamed: 0,0,1,2
0,False,True,False
1,True,False,True


In [54]:
d.notnull()

Unnamed: 0,0,1,2
0,True,False,True
1,False,True,False


### DataFrame 의 인덱스/칼럼에 이름 부여

In [58]:
d = pd.DataFrame([[0,1],[2,3],[4,5]])
d

Unnamed: 0,0,1
0,0,1
1,2,3
2,4,5


In [59]:
d.index.name = 'num'
d.columns.name = 'cols'
d

cols,0,1
num,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0,1
1,2,3
2,4,5


In [60]:
d.columns = ['col1','col2']
d.index = [100,101,102]
d

Unnamed: 0,col1,col2
100,0,1
101,2,3
102,4,5


In [61]:
d.index.name = 'nums'
d.columns.name = 'cols'
d

cols,col1,col2
nums,Unnamed: 1_level_1,Unnamed: 2_level_1
100,0,1
101,2,3
102,4,5


### 항목 찾기

In [64]:
d.loc[101,'col2']

3

In [67]:
d.iloc[1,1]

3

In [69]:
d.iloc[:2,1:]

cols,col2
nums,Unnamed: 1_level_1
100,1
101,3


In [70]:
d.iloc[0] # 결과는 Series

cols
col1    0
col2    1
Name: 100, dtype: int64

### 인덱스

In [71]:
d = pd.DataFrame(np.arange(12).reshape(4,3))
d

Unnamed: 0,0,1,2
0,0,1,2
1,3,4,5
2,6,7,8
3,9,10,11


In [72]:
display(d.columns, d.index)

RangeIndex(start=0, stop=3, step=1)

RangeIndex(start=0, stop=4, step=1)

In [2]:
# 4x3 table of 0..11 with explicit row labels and column labels
# instead of the default 0-based ranges.
d = pd.DataFrame(
    data=np.arange(12).reshape(4, 3),
    index=[101, 102, 103, 104],
    columns=['col1', 'col2', 'col3'],
)
d

Unnamed: 0,col1,col2,col3
101,0,1,2
102,3,4,5
103,6,7,8
104,9,10,11


In [78]:
display(d.columns, d.index)

Index(['col1', 'col2', 'col3'], dtype='object')

Int64Index([101, 102, 103, 104], dtype='int64')

In [79]:
d.index=list('abcd')
d

Unnamed: 0,col1,col2,col3
a,0,1,2
b,3,4,5
c,6,7,8
d,9,10,11


In [80]:
d.index

Index(['a', 'b', 'c', 'd'], dtype='object')

### reindex()
- 기존의 인덱스를 지정한 순서대로 재배치한다 (원본은 바뀌지 않고 새 DataFrame 을 돌려준다)
- 기존에 없던 라벨을 지정하면 해당 행/열이 NaN (또는 fill_value) 으로 채워져 추가된다

In [82]:
d.reindex(list('dcba'))

Unnamed: 0,col1,col2,col3
d,9,10,11
c,6,7,8
b,3,4,5
a,0,1,2


In [85]:
d

Unnamed: 0,col1,col2,col3
a,0,1,2
b,3,4,5
c,6,7,8
d,9,10,11


In [86]:
d.reindex(list('abcde'))

Unnamed: 0,col1,col2,col3
a,0.0,1.0,2.0
b,3.0,4.0,5.0
c,6.0,7.0,8.0
d,9.0,10.0,11.0
e,,,


In [87]:
d.reindex(list('abcde'), fill_value=0)

Unnamed: 0,col1,col2,col3
a,0,1,2
b,3,4,5
c,6,7,8
d,9,10,11
e,0,0,0


In [88]:
d.reindex(columns=['col2','col3','col4'])

Unnamed: 0,col2,col3,col4
a,1,2,
b,4,5,
c,7,8,
d,10,11,
