# 데이터 전처리 : 범주형 데이터

In [1]:
# 데이터 생성
import pandas as pd
import numpy as np
data = pd.DataFrame({
    '메뉴': ['[인기]아이펠치킨','닭강정','간장치킨','마늘치킨','파닭','승일양념치킨','양념반후라이드반','황금후라이드','[베스트]풀잎치킨'],
    '가격': [16000,15000,14000,14000,14000,13000,13000,12000,9900],
    '호수' : [11,12,9,9,11,10,10,10,10],
    '칼로리' : [1200.0,1500.0,1600.0,1800.0,1300.0,1400.0,1300.0,1000.0,1000.0],
    '할인율' : [0.5,0.2,0.2,0.2,0.2,0.2,0.2,0.2,np.nan],
    '할인가' : [8000.0,12000.0,11200.0,11200.0,11200.0,10400.0,10400.0,9600.0,np.nan],
    '원산지' : ['국내산','브라질','국내산','국내산','브라질','국내산','국내산','국내산','국내산'],
    '살찔까요' : ['no','yes','yes','yes','yes','yes','yes','no','no'],
    '고민' : ['무조건먹자','먹지말자','먹지말자','먹지말자','먹지말자','먹지말자','먹지말자','무조건먹자','무조건먹자']
})
data.to_csv('final_modudak.csv', index=False)


# 데이터 변경
data.loc[2,'원산지'] = '미국'

# 활용할 데이터 선택
df = data[['가격', '호수', '칼로리', '원산지', '살찔까요']]
df

Unnamed: 0,가격,호수,칼로리,원산지,살찔까요
0,16000,11,1200.0,국내산,no
1,15000,12,1500.0,브라질,yes
2,14000,9,1600.0,미국,yes
3,14000,9,1800.0,국내산,yes
4,14000,11,1300.0,브라질,yes
5,13000,10,1400.0,국내산,yes
6,13000,10,1300.0,국내산,yes
7,12000,10,1000.0,국내산,no
8,9900,10,1000.0,국내산,no


# 레이블 인코딩

le.fit_transform(df['컬럼명']) *한번에 fit과 transform 변환

In [2]:
# 레이블(label) 인코딩 
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
le.fit(df['원산지'])
le.transform(df['원산지'])

array([0, 2, 1, 0, 2, 0, 0, 0, 0])

In [3]:
# 데이터 변환
df['원산지'] = le.fit_transform(df['원산지'])
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['원산지'] = le.fit_transform(df['원산지'])


Unnamed: 0,가격,호수,칼로리,원산지,살찔까요
0,16000,11,1200.0,0,no
1,15000,12,1500.0,2,yes
2,14000,9,1600.0,1,yes
3,14000,9,1800.0,0,yes
4,14000,11,1300.0,2,yes
5,13000,10,1400.0,0,yes
6,13000,10,1300.0,0,yes
7,12000,10,1000.0,0,no
8,9900,10,1000.0,0,no


 # 여러 개의 column을 레이블 인코딩하는 경우
 for in 반복문
자료형에 담긴 자료들을 하나씩 꺼내고 싶을때 사용한다.
이렇게 반복문을 활용해서 실행을 하게 되면 컬럼이 아무리 많아도 반복문을 통해서 한번에 인코딩 할 수 있다.

In [4]:
# 한 번에 레이블 인코딩
from sklearn.preprocessing import LabelEncoder

cols = df.select_dtypes(include='object').columns

for col in cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = le.fit_transform(df[col])


Unnamed: 0,가격,호수,칼로리,원산지,살찔까요
0,16000,11,1200.0,0,0
1,15000,12,1500.0,2,1
2,14000,9,1600.0,1,1
3,14000,9,1800.0,0,1
4,14000,11,1300.0,2,1
5,13000,10,1400.0,0,1
6,13000,10,1300.0,0,1
7,12000,10,1000.0,0,0
8,9900,10,1000.0,0,0


# 원핫(one-hot) 인코딩
각 카테고리별로 컬럼을 만들어서 해당되는 카테고리의 컬럼은 1, 나머지 컬럼은 0으로 인코딩

In [5]:
# 활용할 데이터 선택
df = data[['가격', '호수', '칼로리', '원산지', '살찔까요']].copy()

# 원핫인코딩
# 데이터프레임을 입력받기 때문에 대괄호를 2번 써야 합니다. (df[['원산지']])
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder(sparse=False)
cat = ohe.fit_transform(df[['원산지']])

# 카테고리
ohe.categories_

# 피처(컬럼)이름과 카테고리
ohe.get_feature_names_out()

# 데이터프레임으로 변환
df_cat = pd.DataFrame(cat, columns=ohe.get_feature_names_out())
df_cat

# 데이터 프레임 합치기
df = pd.concat([df,df_cat],axis=1)
df

# 기존 컬럼 삭제
df = df.drop(['원산지'], axis=1)
df


Unnamed: 0,가격,호수,칼로리,살찔까요,원산지_국내산,원산지_미국,원산지_브라질
0,16000,11,1200.0,no,1.0,0.0,0.0
1,15000,12,1500.0,yes,0.0,0.0,1.0
2,14000,9,1600.0,yes,0.0,1.0,0.0
3,14000,9,1800.0,yes,1.0,0.0,0.0
4,14000,11,1300.0,yes,0.0,0.0,1.0
5,13000,10,1400.0,yes,1.0,0.0,0.0
6,13000,10,1300.0,yes,1.0,0.0,0.0
7,12000,10,1000.0,no,1.0,0.0,0.0
8,9900,10,1000.0,no,1.0,0.0,0.0


# 여러 개 컬럼 한 번에 원핫 인코딩

In [7]:
# 활용할 데이터 선택
df = data[['가격', '호수', '칼로리', '원산지', '살찔까요']].copy()

# 원핫인코딩
cols = df.select_dtypes(include='object').columns
ohe = OneHotEncoder(sparse=False)
cat = ohe.fit_transform(df[cols])
df_cat = pd.DataFrame(cat, columns=ohe.get_feature_names_out())

# 데이터프레임 합치기
df = pd.concat([df, df_cat], axis=1)

# 컬럼 삭제
df = df.drop(cols, axis=1)
df

Unnamed: 0,가격,호수,칼로리,원산지_국내산,원산지_미국,원산지_브라질,살찔까요_no,살찔까요_yes
0,16000,11,1200.0,1.0,0.0,0.0,1.0,0.0
1,15000,12,1500.0,0.0,0.0,1.0,0.0,1.0
2,14000,9,1600.0,0.0,1.0,0.0,0.0,1.0
3,14000,9,1800.0,1.0,0.0,0.0,0.0,1.0
4,14000,11,1300.0,0.0,0.0,1.0,0.0,1.0
5,13000,10,1400.0,1.0,0.0,0.0,0.0,1.0
6,13000,10,1300.0,1.0,0.0,0.0,0.0,1.0
7,12000,10,1000.0,1.0,0.0,0.0,1.0,0.0
8,9900,10,1000.0,1.0,0.0,0.0,1.0,0.0


# 원핫인코딩: 판다스 활용

In [8]:
# 활용할 데이터 선택
df = data[['가격', '호수', '칼로리', '원산지', '살찔까요']].copy()

# 판다스를 활용한 원핫 인코딩
df = pd.get_dummies(df)
df


Unnamed: 0,가격,호수,칼로리,원산지_국내산,원산지_미국,원산지_브라질,살찔까요_no,살찔까요_yes
0,16000,11,1200.0,1,0,0,1,0
1,15000,12,1500.0,0,0,1,0,1
2,14000,9,1600.0,0,1,0,0,1
3,14000,9,1800.0,1,0,0,0,1
4,14000,11,1300.0,0,0,1,0,1
5,13000,10,1400.0,1,0,0,0,1
6,13000,10,1300.0,1,0,0,0,1
7,12000,10,1000.0,1,0,0,1,0
8,9900,10,1000.0,1,0,0,1,0
