## DataFrame load

In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('http://bit.ly/ds-korean-idol')

### Select by data type(select_dtypes)

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15 entries, 0 to 14
Data columns (total 8 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   이름       15 non-null     object 
 1   그룹       14 non-null     object 
 2   소속사      15 non-null     object 
 3   성별       15 non-null     object 
 4   생년월일     15 non-null     object 
 5   키        13 non-null     float64
 6   혈액형      15 non-null     object 
 7   브랜드평판지수  15 non-null     int64  
dtypes: float64(1), int64(1), object(6)
memory usage: 1.1+ KB


### 1-1 Select 'object' type in column

In [4]:
df.select_dtypes(include = 'object')

Unnamed: 0,이름,그룹,소속사,성별,생년월일,혈액형
0,지민,방탄소년단,빅히트,남자,1995-10-13,A
1,지드래곤,빅뱅,YG,남자,1988-08-18,A
2,강다니엘,,커넥트,남자,1996-12-10,A
3,뷔,방탄소년단,빅히트,남자,1995-12-30,AB
4,화사,마마무,RBW,여자,1995-07-23,A
5,정국,방탄소년단,빅히트,남자,1997-09-01,A
6,민현,뉴이스트,플레디스,남자,1995-08-09,O
7,소연,아이들,큐브,여자,1998-08-26,B
8,진,방탄소년단,빅히트,남자,1992-12-04,O
9,하성운,핫샷,스타크루이엔티,남자,1994-03-22,A


In [7]:
df.select_dtypes(exclude = 'object')

Unnamed: 0,키,브랜드평판지수
0,173.6,10523260
1,177.0,9916947
2,180.0,8273745
3,178.0,8073501
4,162.1,7650928
5,178.0,5208335
6,182.3,4989792
7,,4668615
8,179.2,4570308
9,167.1,4036489


In [9]:
num_cols = df.select_dtypes(exclude = 'object').columns

In [10]:
df[num_cols]

Unnamed: 0,키,브랜드평판지수
0,173.6,10523260
1,177.0,9916947
2,180.0,8273745
3,178.0,8073501
4,162.1,7650928
5,178.0,5208335
6,182.3,4989792
7,,4668615
8,179.2,4570308
9,167.1,4036489


# one-hot-encoding

In [11]:
df.head()

Unnamed: 0,이름,그룹,소속사,성별,생년월일,키,혈액형,브랜드평판지수
0,지민,방탄소년단,빅히트,남자,1995-10-13,173.6,A,10523260
1,지드래곤,빅뱅,YG,남자,1988-08-18,177.0,A,9916947
2,강다니엘,,커넥트,남자,1996-12-10,180.0,A,8273745
3,뷔,방탄소년단,빅히트,남자,1995-12-30,178.0,AB,8073501
4,화사,마마무,RBW,여자,1995-07-23,162.1,A,7650928


In [12]:
blood_map = {
    'A' : 0,
    'B': 1,
    'AB': 2,
    '0': 3

}

In [13]:
df['혈액형_code'] = df['혈액형'].map(blood_map)

In [14]:
df.head()

Unnamed: 0,이름,그룹,소속사,성별,생년월일,키,혈액형,브랜드평판지수,혈액형_code
0,지민,방탄소년단,빅히트,남자,1995-10-13,173.6,A,10523260,0.0
1,지드래곤,빅뱅,YG,남자,1988-08-18,177.0,A,9916947,0.0
2,강다니엘,,커넥트,남자,1996-12-10,180.0,A,8273745,0.0
3,뷔,방탄소년단,빅히트,남자,1995-12-30,178.0,AB,8073501,2.0
4,화사,마마무,RBW,여자,1995-07-23,162.1,A,7650928,0.0


In [15]:
df['혈액형_code'].value_counts()

0.0    7
2.0    2
1.0    2
Name: 혈액형_code, dtype: int64

In [16]:
df['혈액형_code']

0     0.0
1     0.0
2     0.0
3     2.0
4     0.0
5     0.0
6     NaN
7     1.0
8     NaN
9     0.0
10    0.0
11    1.0
12    2.0
13    NaN
14    NaN
Name: 혈액형_code, dtype: float64

In [17]:
pd.get_dummies(df['혈액형_code'])

Unnamed: 0,0.0,1.0,2.0
0,1,0,0
1,1,0,0
2,1,0,0
3,0,0,1
4,1,0,0
5,1,0,0
6,0,0,0
7,0,1,0
8,0,0,0
9,1,0,0


In [19]:
#use prefix 
pd.get_dummies(df['혈액형_code'], prefix = '혈액형')

Unnamed: 0,혈액형_0.0,혈액형_1.0,혈액형_2.0
0,1,0,0
1,1,0,0
2,1,0,0
3,0,0,1
4,1,0,0
5,1,0,0
6,0,0,0
7,0,1,0
8,0,0,0
9,1,0,0
