## Machine learning - scikit-learn in Python
* 데이터 생성
https://scikit-learn.org/stable/

1. 데이터 전처리 : 범주형 데이터

레이블(Label) 인코딩, 원핫(One-hot) 인코딩
2. 데이터 전처리 : 수치형 데이터

표준화(Standard) 스케일, 민맥스(MinMax) 스케일
3. 머신러닝을 통해 예측

모델 선택 – 학습 fit() – 예측 predict()

지도 학습 (Supervised Learning): 예 - 분류 (Classification)는 다양한 피처와 분류 결정값인 레이블(label)을 학습한 후, 별도의 데이터 세트에서 미지의 레이블을 예측한다.
학습 데이터 세트와 별도로 주어지는 데이터 세트가 존재

In [1]:
# Build the sample chicken-menu table that the rest of the notebook works on
import pandas as pd
import numpy as np

# One list per column; the nine entries of every list line up by menu item.
# The last item has no discount, so its rate and discounted price are NaN.
_menu_columns = {
    '메뉴': [
        '[인기]아이펠치킨', '닭강정', '간장치킨', '마늘치킨', '파닭',
        '승일양념치킨', '양념반후라이드반', '황금후라이드', '[베스트]풀잎치킨',
    ],
    '가격': [16000, 15000, 14000, 14000, 14000, 13000, 13000, 12000, 9900],
    '호수': [11, 12, 9, 9, 11, 10, 10, 10, 10],
    '칼로리': [1200.0, 1500.0, 1600.0, 1800.0, 1300.0, 1400.0, 1300.0, 1000.0, 1000.0],
    '할인율': [0.5, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, np.nan],
    '할인가': [8000.0, 12000.0, 11200.0, 11200.0, 11200.0, 10400.0, 10400.0, 9600.0, np.nan],
    '원산지': ['국내산', '브라질', '국내산', '국내산', '브라질', '국내산', '국내산', '국내산', '국내산'],
    '살찔까요': ['no', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'no', 'no'],
    '고민': [
        '무조건먹자', '먹지말자', '먹지말자', '먹지말자', '먹지말자',
        '먹지말자', '먹지말자', '무조건먹자', '무조건먹자',
    ],
}
data = pd.DataFrame(_menu_columns)

# Save a copy to disk without the row index, then display the frame
data.to_csv('final_modudak.csv', index=False)
data

Unnamed: 0,메뉴,가격,호수,칼로리,할인율,할인가,원산지,살찔까요,고민
0,[인기]아이펠치킨,16000,11,1200.0,0.5,8000.0,국내산,no,무조건먹자
1,닭강정,15000,12,1500.0,0.2,12000.0,브라질,yes,먹지말자
2,간장치킨,14000,9,1600.0,0.2,11200.0,국내산,yes,먹지말자
3,마늘치킨,14000,9,1800.0,0.2,11200.0,국내산,yes,먹지말자
4,파닭,14000,11,1300.0,0.2,11200.0,브라질,yes,먹지말자
5,승일양념치킨,13000,10,1400.0,0.2,10400.0,국내산,yes,먹지말자
6,양념반후라이드반,13000,10,1300.0,0.2,10400.0,국내산,yes,먹지말자
7,황금후라이드,12000,10,1000.0,0.2,9600.0,국내산,no,무조건먹자
8,[베스트]풀잎치킨,9900,10,1000.0,,,국내산,no,무조건먹자


In [2]:
# Modify one value:
# set the origin of row 2 (간장치킨) to '미국' so the column has three categories
data.at[2, '원산지'] = '미국'

In [3]:
# Work on an independent copy holding only the columns used for preprocessing
selected = ['가격', '호수', '칼로리', '원산지', '살찔까요']
df = data[selected].copy()
df

Unnamed: 0,가격,호수,칼로리,원산지,살찔까요
0,16000,11,1200.0,국내산,no
1,15000,12,1500.0,브라질,yes
2,14000,9,1600.0,미국,yes
3,14000,9,1800.0,국내산,yes
4,14000,11,1300.0,브라질,yes
5,13000,10,1400.0,국내산,yes
6,13000,10,1300.0,국내산,yes
7,12000,10,1000.0,국내산,no
8,9900,10,1000.0,국내산,no


## 데이터 전처리: 범주형 데이터
숫자 형태로 인코딩 하는 두가지 방법

* 레이블 인코딩 (e.g., 국내산 -> 0, 미국 -> 1, 브라질 -> 2)
* 원핫 인코딩

In [4]:
df.info()  # check each column's dtype and non-null count

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9 entries, 0 to 8
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   가격      9 non-null      int64  
 1   호수      9 non-null      int64  
 2   칼로리     9 non-null      float64
 3   원산지     9 non-null      object 
 4   살찔까요    9 non-null      object 
dtypes: float64(1), int64(2), object(2)
memory usage: 488.0+ bytes


## 레이블 (label) 인코딩

In [5]:
# 레이블(label) 인코딩 
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
le.fit(df['원산지']) #레이블 인코딩을 적용하기
le.transform(df['원산지'])

array([0, 2, 1, 0, 2, 0, 0, 0, 0])

In [6]:
# fit_transform
# fit and transform in a single call -> same result as the separate fit()/transform() calls in the cell just above
le.fit_transform(df['원산지'])

array([0, 2, 1, 0, 2, 0, 0, 0, 0])

In [7]:
# Overwrite the string column with its integer codes
origin_codes = le.fit_transform(df['원산지'])
df['원산지'] = origin_codes
df

Unnamed: 0,가격,호수,칼로리,원산지,살찔까요
0,16000,11,1200.0,0,no
1,15000,12,1500.0,2,yes
2,14000,9,1600.0,1,yes
3,14000,9,1800.0,0,yes
4,14000,11,1300.0,2,yes
5,13000,10,1400.0,0,yes
6,13000,10,1300.0,0,yes
7,12000,10,1000.0,0,no
8,9900,10,1000.0,0,no


In [8]:
# Encode the yes/no column too, using a fresh encoder for this column
le = LabelEncoder()
fat_codes = le.fit_transform(df['살찔까요'])
df['살찔까요'] = fat_codes
df

Unnamed: 0,가격,호수,칼로리,원산지,살찔까요
0,16000,11,1200.0,0,0
1,15000,12,1500.0,2,1
2,14000,9,1600.0,1,1
3,14000,9,1800.0,0,1
4,14000,11,1300.0,2,1
5,13000,10,1400.0,0,1
6,13000,10,1300.0,0,1
7,12000,10,1000.0,0,0
8,9900,10,1000.0,0,0


## 레이블 인코딩 (심화)
* 한 번에 인코딩하기

In [9]:
# Start again from the un-encoded source frame: copy only the columns needed
selected = ['가격', '호수', '칼로리', '원산지', '살찔까요']
df = data[selected].copy()
df

Unnamed: 0,가격,호수,칼로리,원산지,살찔까요
0,16000,11,1200.0,국내산,no
1,15000,12,1500.0,브라질,yes
2,14000,9,1600.0,미국,yes
3,14000,9,1800.0,국내산,yes
4,14000,11,1300.0,브라질,yes
5,13000,10,1400.0,국내산,yes
6,13000,10,1300.0,국내산,yes
7,12000,10,1000.0,국내산,no
8,9900,10,1000.0,국내산,no


In [10]:
# Pick the columns whose dtype is object: ['원산지', '살찔까요']

# Option 1: list the column names by hand
cols = ['원산지', '살찔까요']

# Option 2: let pandas find them (this assignment replaces the list from option 1)
cols = df.select_dtypes(include='object').columns

## for in 반복문

* 자료형에 담긴 자료들을 하나씩 꺼내고 싶을때 사용한다.
* 이렇게 반복문을 활용해서 실행을 하게 되면 컬럼이 아무리 많아도 반복문을 통해서 한번에 인코딩 할 수 있다.

# 한 번에 레이블 인코딩 - LabelEncoder는 sklearn의 전처리 도구 중 하나로, 문자형 데이터를 숫자로 변환해줍니다.

* for col in cols: → cols 안에 있는 각 열(column)에 대해 반복 작업을 수행합니다. (col은 현재 처리 중인 열 이름)

* le = LabelEncoder() → 매 열마다 새로운 인코더 객체를 만들어, 해당 열의 고유값에 맞는 숫자 매핑을 수행합니다.

* df[col] = le.fit_transform(df[col]) → 실제로 인코딩된 값을 원래 데이터프레임의 열에 덮어쓰기합니다.

In [11]:
# Label-encode every selected column in one pass
from sklearn.preprocessing import LabelEncoder

for col in cols:
    # A fresh encoder per column, so each column gets its own category-to-integer mapping
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])

In [12]:
# Inspect the encoded result
df

Unnamed: 0,가격,호수,칼로리,원산지,살찔까요
0,16000,11,1200.0,0,0
1,15000,12,1500.0,2,1
2,14000,9,1600.0,1,1
3,14000,9,1800.0,0,1
4,14000,11,1300.0,2,1
5,13000,10,1400.0,0,1
6,13000,10,1300.0,0,1
7,12000,10,1000.0,0,0
8,9900,10,1000.0,0,0


In [13]:
# Look up the object-dtype columns (before encoding these were ['원산지', '살찔까요'])
# Manual alternative: cols = ['원산지', '살찔까요']
# df has already been label-encoded at this point, so the lookup comes back empty
df.select_dtypes(include='object').columns

Index([], dtype='object')

In [14]:
# Rebind cols to the object-dtype columns of the current df
cols = df.select_dtypes(include='object').columns

In [15]:
# Display the current frame
df

Unnamed: 0,가격,호수,칼로리,원산지,살찔까요
0,16000,11,1200.0,0,0
1,15000,12,1500.0,2,1
2,14000,9,1600.0,1,1
3,14000,9,1800.0,0,1
4,14000,11,1300.0,2,1
5,13000,10,1400.0,0,1
6,13000,10,1300.0,0,1
7,12000,10,1000.0,0,0
8,9900,10,1000.0,0,0


## 4. 원핫(one-hot) 인코딩
* 각 카테고리별로 컬럼을 만들어서 해당되는 카테고리의 컬럼은 1, 나머지 컬럼은 0으로 인코딩

In [16]:
# Reset df to a fresh copy of the columns used in this section
selected = ['가격', '호수', '칼로리', '원산지', '살찔까요']
df = data[selected].copy()
df

Unnamed: 0,가격,호수,칼로리,원산지,살찔까요
0,16000,11,1200.0,국내산,no
1,15000,12,1500.0,브라질,yes
2,14000,9,1600.0,미국,yes
3,14000,9,1800.0,국내산,yes
4,14000,11,1300.0,브라질,yes
5,13000,10,1400.0,국내산,yes
6,13000,10,1300.0,국내산,yes
7,12000,10,1000.0,국내산,no
8,9900,10,1000.0,국내산,no


In [17]:
# One-hot encoding
# OneHotEncoder expects 2-D input, so select the column with double brackets:
# df[['원산지']] is a DataFrame, whereas df['원산지'] would be a 1-D Series.
from sklearn.preprocessing import OneHotEncoder

# The `sparse=` keyword was renamed to `sparse_output=` in scikit-learn 1.2 and
# removed in 1.4, so passing it breaks on current releases. Keeping the encoder's
# default (sparse) output and densifying with .toarray() works on old and new versions.
ohe = OneHotEncoder()
cat = ohe.fit_transform(df[['원산지']]).toarray()
cat  # dense array: one row per sample, one column per category

array([[1., 0., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.]])

In [18]:
# Categories learned by the encoder
ohe.categories_  # the trailing underscore is part of the attribute name

[array(['국내산', '미국', '브라질'], dtype=object)]

In [19]:
# Output column names, built from the feature (column) name and each category
ohe.get_feature_names_out()

array(['원산지_국내산', '원산지_미국', '원산지_브라질'], dtype=object)

In [20]:
# Wrap the encoded array in a DataFrame, labelled with the encoder's feature names
onehot_columns = ohe.get_feature_names_out()
df_cat = pd.DataFrame(data=cat, columns=onehot_columns)
df_cat
# (the one-hot encoded columns)

Unnamed: 0,원산지_국내산,원산지_미국,원산지_브라질
0,1.0,0.0,0.0
1,0.0,0.0,1.0
2,0.0,1.0,0.0
3,1.0,0.0,0.0
4,0.0,0.0,1.0
5,1.0,0.0,0.0
6,1.0,0.0,0.0
7,1.0,0.0,0.0
8,1.0,0.0,0.0


In [21]:
# Append the one-hot columns to the right of the original frame
df = pd.concat([df, df_cat], axis='columns')
df

Unnamed: 0,가격,호수,칼로리,원산지,살찔까요,원산지_국내산,원산지_미국,원산지_브라질
0,16000,11,1200.0,국내산,no,1.0,0.0,0.0
1,15000,12,1500.0,브라질,yes,0.0,0.0,1.0
2,14000,9,1600.0,미국,yes,0.0,1.0,0.0
3,14000,9,1800.0,국내산,yes,1.0,0.0,0.0
4,14000,11,1300.0,브라질,yes,0.0,0.0,1.0
5,13000,10,1400.0,국내산,yes,1.0,0.0,0.0
6,13000,10,1300.0,국내산,yes,1.0,0.0,0.0
7,12000,10,1000.0,국내산,no,1.0,0.0,0.0
8,9900,10,1000.0,국내산,no,1.0,0.0,0.0


In [22]:
# Remove the original string column now that its one-hot columns exist
df = df.drop(columns=['원산지'])
df

Unnamed: 0,가격,호수,칼로리,살찔까요,원산지_국내산,원산지_미국,원산지_브라질
0,16000,11,1200.0,no,1.0,0.0,0.0
1,15000,12,1500.0,yes,0.0,0.0,1.0
2,14000,9,1600.0,yes,0.0,1.0,0.0
3,14000,9,1800.0,yes,1.0,0.0,0.0
4,14000,11,1300.0,yes,0.0,0.0,1.0
5,13000,10,1400.0,yes,1.0,0.0,0.0
6,13000,10,1300.0,yes,1.0,0.0,0.0
7,12000,10,1000.0,no,1.0,0.0,0.0
8,9900,10,1000.0,no,1.0,0.0,0.0


In [23]:
# Display the current frame
df

Unnamed: 0,가격,호수,칼로리,살찔까요,원산지_국내산,원산지_미국,원산지_브라질
0,16000,11,1200.0,no,1.0,0.0,0.0
1,15000,12,1500.0,yes,0.0,0.0,1.0
2,14000,9,1600.0,yes,0.0,1.0,0.0
3,14000,9,1800.0,yes,1.0,0.0,0.0
4,14000,11,1300.0,yes,0.0,0.0,1.0
5,13000,10,1400.0,yes,1.0,0.0,0.0
6,13000,10,1300.0,yes,1.0,0.0,0.0
7,12000,10,1000.0,no,1.0,0.0,0.0
8,9900,10,1000.0,no,1.0,0.0,0.0


## 5. 원핫(one-hot) 인코딩(심화)
* 여러 개 컬럼 한 번에 인코딩

※TIP
Jupyter Notebook으로 학습을 진행하실 경우
코드 에러시 Jupyter Notebook내 하단에서 에러 위치를 확인 할수 있습니다.
또한 코드에 ctrl + / 로 주석처리를 하며 데이터 값, 옵션 값 등을 출력해서 확인하면 빠르게 에러를 찾을 수 있습니다.

In [24]:
# Reset df to a fresh copy of the columns used in this section
selected = ['가격', '호수', '칼로리', '원산지', '살찔까요']
df = data[selected].copy()
df

Unnamed: 0,가격,호수,칼로리,원산지,살찔까요
0,16000,11,1200.0,국내산,no
1,15000,12,1500.0,브라질,yes
2,14000,9,1600.0,미국,yes
3,14000,9,1800.0,국내산,yes
4,14000,11,1300.0,브라질,yes
5,13000,10,1400.0,국내산,yes
6,13000,10,1300.0,국내산,yes
7,12000,10,1000.0,국내산,no
8,9900,10,1000.0,국내산,no


In [25]:
# One-hot encode every object-dtype column at once
cols = df.select_dtypes(include='object').columns

# The `sparse=` keyword was renamed to `sparse_output=` in scikit-learn 1.2 and
# removed in 1.4, so passing it breaks on current releases. Keeping the encoder's
# default (sparse) output and densifying with .toarray() works on old and new versions.
ohe = OneHotEncoder()
cat = ohe.fit_transform(df[cols]).toarray()

# Label the encoded columns with the encoder's generated feature names
df_cat = pd.DataFrame(cat, columns=ohe.get_feature_names_out())
df_cat

Unnamed: 0,원산지_국내산,원산지_미국,원산지_브라질,살찔까요_no,살찔까요_yes
0,1.0,0.0,0.0,1.0,0.0
1,0.0,0.0,1.0,0.0,1.0
2,0.0,1.0,0.0,0.0,1.0
3,1.0,0.0,0.0,0.0,1.0
4,0.0,0.0,1.0,0.0,1.0
5,1.0,0.0,0.0,0.0,1.0
6,1.0,0.0,0.0,0.0,1.0
7,1.0,0.0,0.0,1.0,0.0
8,1.0,0.0,0.0,1.0,0.0


In [26]:
# Append the encoded columns to the right of the original frame
df = pd.concat([df, df_cat], axis='columns')
df.head()

Unnamed: 0,가격,호수,칼로리,원산지,살찔까요,원산지_국내산,원산지_미국,원산지_브라질,살찔까요_no,살찔까요_yes
0,16000,11,1200.0,국내산,no,1.0,0.0,0.0,1.0,0.0
1,15000,12,1500.0,브라질,yes,0.0,0.0,1.0,0.0,1.0
2,14000,9,1600.0,미국,yes,0.0,1.0,0.0,0.0,1.0
3,14000,9,1800.0,국내산,yes,1.0,0.0,0.0,0.0,1.0
4,14000,11,1300.0,브라질,yes,0.0,0.0,1.0,0.0,1.0


In [27]:
# Remove the original categorical columns that were just encoded
df = df.drop(columns=cols)
df

Unnamed: 0,가격,호수,칼로리,원산지_국내산,원산지_미국,원산지_브라질,살찔까요_no,살찔까요_yes
0,16000,11,1200.0,1.0,0.0,0.0,1.0,0.0
1,15000,12,1500.0,0.0,0.0,1.0,0.0,1.0
2,14000,9,1600.0,0.0,1.0,0.0,0.0,1.0
3,14000,9,1800.0,1.0,0.0,0.0,0.0,1.0
4,14000,11,1300.0,0.0,0.0,1.0,0.0,1.0
5,13000,10,1400.0,1.0,0.0,0.0,0.0,1.0
6,13000,10,1300.0,1.0,0.0,0.0,0.0,1.0
7,12000,10,1000.0,1.0,0.0,0.0,1.0,0.0
8,9900,10,1000.0,1.0,0.0,0.0,1.0,0.0


## 원핫인코딩: 판다스 활용

In [28]:
# Reset df to a fresh copy of the columns used in this section
selected = ['가격', '호수', '칼로리', '원산지', '살찔까요']
df = data[selected].copy()
df.head()

Unnamed: 0,가격,호수,칼로리,원산지,살찔까요
0,16000,11,1200.0,국내산,no
1,15000,12,1500.0,브라질,yes
2,14000,9,1600.0,미국,yes
3,14000,9,1800.0,국내산,yes
4,14000,11,1300.0,브라질,yes


In [29]:
# One-hot encoding with pandas: object-dtype columns are expanded, numeric ones are kept
# dtype=int pins the indicator columns to 0/1 integers; pandas >= 2.0 would
# otherwise return True/False columns.
df = pd.get_dummies(df, dtype=int)
df

Unnamed: 0,가격,호수,칼로리,원산지_국내산,원산지_미국,원산지_브라질,살찔까요_no,살찔까요_yes
0,16000,11,1200.0,1,0,0,1,0
1,15000,12,1500.0,0,0,1,0,1
2,14000,9,1600.0,0,1,0,0,1
3,14000,9,1800.0,1,0,0,0,1
4,14000,11,1300.0,0,0,1,0,1
5,13000,10,1400.0,1,0,0,0,1
6,13000,10,1300.0,1,0,0,0,1
7,12000,10,1000.0,1,0,0,1,0
8,9900,10,1000.0,1,0,0,1,0


## 데이터 전처리 : 수치형 데이터

## 표준화 (StandardScaler) - 각 피처를 평균이 0, 분산이 1이 되도록 변환 (분포의 모양 자체가 정규분포로 바뀌는 것은 아님)
z = (x−μ) / σ

In [30]:
# The one-hot encoded data (first rows)
df.head()

Unnamed: 0,가격,호수,칼로리,원산지_국내산,원산지_미국,원산지_브라질,살찔까요_no,살찔까요_yes
0,16000,11,1200.0,1,0,0,1,0
1,15000,12,1500.0,0,0,1,0,1
2,14000,9,1600.0,0,1,0,0,1
3,14000,9,1800.0,1,0,0,0,1
4,14000,11,1300.0,0,0,1,0,1


In [31]:
# StandardScaler (standardization) on a single column
from sklearn.preprocessing import StandardScaler

price = df[['가격']]  # double brackets keep the input 2-D
scaler = StandardScaler()
scaler.fit_transform(price)

array([[ 1.54246993],
       [ 0.94150762],
       [ 0.34054531],
       [ 0.34054531],
       [ 0.34054531],
       [-0.260417  ],
       [-0.260417  ],
       [-0.86137931],
       [-2.12340016]])

In [32]:
# Standardize several numeric columns at once and write them back into df
cols = ['가격', '호수', '칼로리']
scaler = StandardScaler()
standardized = scaler.fit_transform(df[cols])
df[cols] = standardized
df

Unnamed: 0,가격,호수,칼로리,원산지_국내산,원산지_미국,원산지_브라질,살찔까요_no,살찔까요_yes
0,1.54247,0.848875,-0.57792,1,0,0,1,0
1,0.941508,1.940285,0.622376,0,0,1,0,1
2,0.340545,-1.333946,1.022475,0,1,0,0,1
3,0.340545,-1.333946,1.822672,1,0,0,0,1
4,0.340545,0.848875,-0.177822,0,0,1,0,1
5,-0.260417,-0.242536,0.222277,1,0,0,0,1
6,-0.260417,-0.242536,-0.177822,1,0,0,0,1
7,-0.861379,-0.242536,-1.378118,1,0,0,1,0
8,-2.1234,-0.242536,-1.378118,1,0,0,1,0


정규화 (MinMaxScaler)

모든 값을 0~1 사이 값으로 변환: x' = (x − min) / (max − min) (데이터 개수가 아니라 값의 범위가 줄어듦)

In [33]:
# MinMaxScaler (normalization), applied here to columns that are already standardized.
# Standardizing only, normalizing only, or normalizing standardized data are all
# possible; the choice is up to the analyst.
from sklearn.preprocessing import MinMaxScaler

cols = ['가격', '호수', '칼로리']
scaler = MinMaxScaler()
normalized = scaler.fit_transform(df[cols])
df[cols] = normalized
df

Unnamed: 0,가격,호수,칼로리,원산지_국내산,원산지_미국,원산지_브라질,살찔까요_no,살찔까요_yes
0,1.0,0.666667,0.25,1,0,0,1,0
1,0.836066,1.0,0.625,0,0,1,0,1
2,0.672131,0.0,0.75,0,1,0,0,1
3,0.672131,0.0,1.0,1,0,0,0,1
4,0.672131,0.666667,0.375,0,0,1,0,1
5,0.508197,0.333333,0.5,1,0,0,0,1
6,0.508197,0.333333,0.375,1,0,0,0,1
7,0.344262,0.333333,0.0,1,0,0,1,0
8,0.0,0.333333,0.0,1,0,0,1,0


## 사이킷런에서 제공하는 데이터셋
이번시간 정리
1. 피처 이름 확인
dataset.feature_names

2. 타겟 확인
dataset.target

3. 데이터 확인
dataset.data[:2]  ← 앞의 2개 행(샘플)만 확인할 때

4. 데이터 프레임 만들기
-변수명= pd.DataFrame(data=dataset.data , columns=dataset.feature_names)

5. 타겟 추가
cancer_df['target'] = dataset.target

6. 데이터셋 확인
load_로 시작하는 함수를 이용해 데이터를 불러올 수 있다.
import sklearn.datasets
sklearn.datasets.__all__  ← all 앞뒤로 언더바(_)가 연달아 2개씩
※TIP
입문자일수록 셀마다 변수명.head()로 하나씩 확인하면서 넘어가는 것을 추천드립니다.

## 유방암 데이터로 연습

In [34]:
# Load one of the datasets bundled with scikit-learn (breast cancer)
from sklearn.datasets import load_breast_cancer
dataset = load_breast_cancer()
dataset

{'data': array([[1.799e+01, 1.038e+01, 1.228e+02, ..., 2.654e-01, 4.601e-01,
         1.189e-01],
        [2.057e+01, 1.777e+01, 1.329e+02, ..., 1.860e-01, 2.750e-01,
         8.902e-02],
        [1.969e+01, 2.125e+01, 1.300e+02, ..., 2.430e-01, 3.613e-01,
         8.758e-02],
        ...,
        [1.660e+01, 2.808e+01, 1.083e+02, ..., 1.418e-01, 2.218e-01,
         7.820e-02],
        [2.060e+01, 2.933e+01, 1.401e+02, ..., 2.650e-01, 4.087e-01,
         1.240e-01],
        [7.760e+00, 2.454e+01, 4.792e+01, ..., 0.000e+00, 2.871e-01,
         7.039e-02]]),
 'target': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0,
        1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0,
        1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1,
        1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0

In [35]:
# Check the feature names
dataset.feature_names

array(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean smoothness', 'mean compactness', 'mean concavity',
       'mean concave points', 'mean symmetry', 'mean fractal dimension',
       'radius error', 'texture error', 'perimeter error', 'area error',
       'smoothness error', 'compactness error', 'concavity error',
       'concave points error', 'symmetry error',
       'fractal dimension error', 'worst radius', 'worst texture',
       'worst perimeter', 'worst area', 'worst smoothness',
       'worst compactness', 'worst concavity', 'worst concave points',
       'worst symmetry', 'worst fractal dimension'], dtype='<U23')

In [36]:
# Check the target values
dataset.target

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0,
       1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0,
       1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0,
       0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1,
       1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0,

In [37]:
# Check the data: only the first two rows
dataset.data[:2]

array([[1.799e+01, 1.038e+01, 1.228e+02, 1.001e+03, 1.184e-01, 2.776e-01,
        3.001e-01, 1.471e-01, 2.419e-01, 7.871e-02, 1.095e+00, 9.053e-01,
        8.589e+00, 1.534e+02, 6.399e-03, 4.904e-02, 5.373e-02, 1.587e-02,
        3.003e-02, 6.193e-03, 2.538e+01, 1.733e+01, 1.846e+02, 2.019e+03,
        1.622e-01, 6.656e-01, 7.119e-01, 2.654e-01, 4.601e-01, 1.189e-01],
       [2.057e+01, 1.777e+01, 1.329e+02, 1.326e+03, 8.474e-02, 7.864e-02,
        8.690e-02, 7.017e-02, 1.812e-01, 5.667e-02, 5.435e-01, 7.339e-01,
        3.398e+00, 7.408e+01, 5.225e-03, 1.308e-02, 1.860e-02, 1.340e-02,
        1.389e-02, 3.532e-03, 2.499e+01, 2.341e+01, 1.588e+02, 1.956e+03,
        1.238e-01, 1.866e-01, 2.416e-01, 1.860e-01, 2.750e-01, 8.902e-02]])

In [38]:
# Turn the feature matrix into a DataFrame, using the feature names as column labels
cancer_df = pd.DataFrame(dataset.data, columns=dataset.feature_names)
cancer_df

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,0.2419,0.07871,...,25.380,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890
1,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,0.1812,0.05667,...,24.990,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902
2,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,0.2069,0.05999,...,23.570,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,0.2597,0.09744,...,14.910,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300
4,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,0.1809,0.05883,...,22.540,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,0.1726,0.05623,...,25.450,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115
565,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,0.1752,0.05533,...,23.690,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637
566,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,0.1590,0.05648,...,18.980,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820
567,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,0.2397,0.07016,...,25.740,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400


In [39]:
# Add the target values as a new 'target' column
cancer_df['target'] = dataset.target
cancer_df.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0


In [40]:
# List what sklearn.datasets exports (the bundled loaders are the load_* names)
import sklearn.datasets
sklearn.datasets.__all__  # note the double underscores on both sides of `all`

['clear_data_home',
 'dump_svmlight_file',
 'fetch_20newsgroups',
 'fetch_20newsgroups_vectorized',
 'fetch_lfw_pairs',
 'fetch_lfw_people',
 'fetch_olivetti_faces',
 'fetch_species_distributions',
 'fetch_california_housing',
 'fetch_covtype',
 'fetch_rcv1',
 'fetch_kddcup99',
 'fetch_openml',
 'get_data_home',
 'load_boston',
 'load_diabetes',
 'load_digits',
 'load_files',
 'load_iris',
 'load_breast_cancer',
 'load_linnerud',
 'load_sample_image',
 'load_sample_images',
 'load_svmlight_file',
 'load_svmlight_files',
 'load_wine',
 'make_biclusters',
 'make_blobs',
 'make_circles',
 'make_classification',
 'make_checkerboard',
 'make_friedman1',
 'make_friedman2',
 'make_friedman3',
 'make_gaussian_quantiles',
 'make_hastie_10_2',
 'make_low_rank_matrix',
 'make_moons',
 'make_multilabel_classification',
 'make_regression',
 'make_s_curve',
 'make_sparse_coded_signal',
 'make_sparse_spd_matrix',
 'make_sparse_uncorrelated',
 'make_spd_matrix',
 'make_swiss_roll']

# 2. 당뇨병 데이터


In [41]:
# Load the diabetes dataset (this rebinds `dataset`, replacing the breast-cancer data)
from sklearn.datasets import load_diabetes
dataset = load_diabetes()

In [45]:
# Turn the feature matrix into a DataFrame, using the feature names as column labels
diabetes_df = pd.DataFrame(dataset.data, columns=dataset.feature_names)
diabetes_df.head()

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
0,0.038076,0.05068,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019908,-0.017646
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.06833,-0.092204
2,0.085299,0.05068,0.044451,-0.005671,-0.045599,-0.034194,-0.032356,-0.002592,0.002864,-0.02593
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022692,-0.009362
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031991,-0.046641


In [47]:
# Add the target values as a new 'target' column
diabetes_df['target']=dataset.target
diabetes_df.head()

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,target
0,0.038076,0.05068,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019908,-0.017646,151.0
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.06833,-0.092204,75.0
2,0.085299,0.05068,0.044451,-0.005671,-0.045599,-0.034194,-0.032356,-0.002592,0.002864,-0.02593,141.0
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022692,-0.009362,206.0
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031991,-0.046641,135.0


## 이번 시간 정리
1. train_test_split
-사이킷런의 train_test_split 함수(sklearn.model_selection 모듈)를 활용하면 train set(학습 데이터셋)과 test set(테스트 데이터셋)을 쉽게 랜덤하게 나눌 수 있다.

2. test_size=0.3
-학습용으로 70%를 두고 30% 데이터는 테스트용으로 둔다.
- X_train, X_test, y_train, y_test 순서로 데이터 4개가 만들어짐 (순서 알아두기)

3. random_state
-반복 연산시 일정하게 섞기 위해 사용하며 random값을 고정하여 동일한 결과를 얻을 수 있다.

In [48]:
# 1) Hold-out split

# train_test_split lives in sklearn.model_selection
from sklearn.model_selection import train_test_split

features = cancer_df.drop(columns='target')
labels = cancer_df['target']

# 30% of the rows are held out for testing; random_state fixes the shuffle.
# The random split exists so the model can be evaluated on data it was not trained on.
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.3, random_state=1004)

In [49]:
# Check the training features (X)
X_train.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
350,11.66,17.07,73.7,421.0,0.07561,0.0363,0.008306,0.01162,0.1671,0.05731,...,13.28,19.74,83.61,542.5,0.09958,0.06476,0.03046,0.04262,0.2731,0.06825
384,13.28,13.72,85.79,541.8,0.08363,0.08575,0.05077,0.02864,0.1617,0.05594,...,14.24,17.37,96.59,623.7,0.1166,0.2685,0.2866,0.09173,0.2736,0.0732
7,13.71,20.83,90.2,577.9,0.1189,0.1645,0.09366,0.05985,0.2196,0.07451,...,17.06,28.14,110.6,897.0,0.1654,0.3682,0.2678,0.1556,0.3196,0.1151
557,9.423,27.88,59.26,271.3,0.08123,0.04971,0.0,0.0,0.1742,0.06059,...,10.49,34.24,66.5,330.6,0.1073,0.07158,0.0,0.0,0.2475,0.06969
49,13.49,22.3,86.91,561.0,0.08752,0.07698,0.04751,0.03384,0.1809,0.05718,...,15.15,31.82,99.0,698.8,0.1162,0.1711,0.2282,0.1282,0.2871,0.06917


In [50]:
# Check the training targets (y)
y_train.head()

350    1
384    1
7      0
557    1
49     1
Name: target, dtype: int64

In [51]:
# Data sizes, in the order X_train, X_test, y_train, y_test
# (a shape with nothing after the comma, e.g. (398,), is one-dimensional)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((398, 30), (171, 30), (398,), (171,))

In [52]:
# 2) Decision tree (DecisionTree)

# Machine learning (classification)
from sklearn.tree import DecisionTreeClassifier

# Choose the model.
# random_state is fixed (same seed as the train/test split) because the tree
# shuffles candidate features while searching for splits, so an unseeded model
# can give slightly different predictions from run to run.
# NOTE: the output recorded in this notebook was produced without a seed and may differ slightly.
model = DecisionTreeClassifier(random_state=1004)
# Train on the training split
model.fit(X_train, y_train)
# Predict labels for the held-out test split
pred = model.predict(X_test)
pred

array([1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0,
       0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0,
       1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0,
       1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1,
       1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1,
       1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1,
       1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0])

In [53]:
# 3) Evaluation (accuracy)

# accuracy_score takes the true labels first, then the predictions
from sklearn.metrics import accuracy_score
accuracy_score(y_true=y_test, y_pred=pred)
# a score around 0.9 means roughly 9 out of 10 test samples are classified correctly

0.9064327485380117

In [54]:
# 1) Hold-out split for the diabetes data

from sklearn.model_selection import train_test_split

features = diabetes_df.drop(columns='target')
targets = diabetes_df['target']

# 30% of the rows are held out for testing; random_state fixes the shuffle
X_train, X_test, y_train, y_test = train_test_split(
    features, targets, test_size=0.3, random_state=1004)

In [55]:
# Check the training features (X)
X_train.head()

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
232,0.012648,0.05068,0.000261,-0.011409,0.03971,0.057245,-0.039719,0.056081,0.024053,0.032059
81,0.012648,0.05068,-0.022373,-0.029771,0.010815,0.028435,-0.021311,0.034309,-0.00608,-0.001078
227,0.067136,0.05068,-0.029918,0.057449,-0.000193,-0.015719,0.074412,-0.050564,-0.038459,0.007207
334,-0.060003,0.05068,-0.047163,-0.022885,-0.071743,-0.057681,-0.006584,-0.039493,-0.062913,-0.054925
83,-0.038207,-0.044642,0.009961,-0.046985,-0.059359,-0.052983,-0.010266,-0.039493,-0.015998,-0.042499


In [56]:
# Check the training targets (y)
y_train.head()

232    259.0
81      51.0
227    108.0
334     72.0
83     210.0
Name: target, dtype: float64

In [57]:
# Data sizes, in the order X_train, X_test, y_train, y_test
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((309, 10), (133, 10), (309,), (133,))

In [58]:
# 2) Linear regression (LinearRegression)
# Machine learning (regression)
from sklearn.linear_model import LinearRegression

# Choose the model and train it in one step (fit returns the fitted estimator)
model = LinearRegression().fit(X_train, y_train)
# Predict target values for the held-out test split
pred = model.predict(X_test)
pred

array([128.42584096, 135.0870178 , 154.78166657, 106.09435851,
       149.54313889, 165.32784413, 140.28955883, 167.95778147,
        69.4503107 , 189.42664051, 144.9847161 , 150.11273979,
       211.16078839, 108.59060526, 160.38284395, 154.09983174,
       198.78261008, 101.15867269, 165.82927955, 196.03539157,
       147.16334011, 108.25672705, 207.28165472, 127.53118326,
       181.85939663, 228.49956035,  86.59826865, 155.35000645,
       182.49645024, 182.96618563, 207.62865615, 199.81160132,
       157.36018064,  61.82420828,  88.11382212,  95.75044874,
       134.03529608,  85.95868919, 177.696808  , 249.40294124,
       171.77254108, 204.7152808 , 239.12264861,  89.25163253,
       244.20493803, 180.3631026 , 161.10419087, 119.1476007 ,
       102.82444201, 153.35453384, 129.06516018, 208.80060788,
       107.93804188, 138.812454  , 166.04360408,  86.04372546,
       181.03333757, 117.98035042,  79.04611318, 127.20684737,
       194.06146795, 205.13560483, 233.85068191,  48.53

In [60]:
# 3) Evaluation (MSE)

# mean_squared_error takes the true values first, then the predictions.
# It is an error measure (lower is better), not an accuracy.
from sklearn.metrics import mean_squared_error
mean_squared_error(y_true=y_test, y_pred=pred)

3229.4396061827333