In [54]:
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.model_selection import cross_val_score, cross_validate
import multiprocessing

from sklearn.svm import SVC, SVR
from sklearn.manifold import TSNE
from sklearn.linear_model import SGDRegressor, SGDClassifier
from sklearn.datasets import load_boston, load_breast_cancer,load_iris
from sklearn.datasets import load_wine
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
from sklearn import metrics

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from warnings import filterwarnings
# Suppress every warning for the rest of the session. This also hides
# library deprecation and data-conversion warnings raised by later cells.
filterwarnings('ignore')

In [None]:
#### 나이브 베이즈 분류기

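- 특성이 이진(0/1) 값이면 베르누이(BernoulliNB), 연속형 값이면 가우시안(GaussianNB), 횟수(카운트) 값이면 다항(MultinomialNB) 분류기를 사용한다.
- 클래스가 주어졌을 때 모든 특성이 서로 독립이라고 가정한다 (따라서 특성 간 상관관계·다중공선성을 확인해 주어야 함).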


In [2]:
# Toy "go outside?" dataset, written as one (weather, temperature, played)
# record per observation and then unzipped into three parallel lists.
_records = [
    ('Sunny', 'Hot', 'No'),
    ('Sunny', 'Hot', 'No'),
    ('Overcast', 'Hot', 'Yes'),
    ('Rainy', 'Mild', 'Yes'),
    ('Rainy', 'Cool', 'Yes'),
    ('Rainy', 'Cool', 'No'),
    ('Overcast', 'Cool', 'Yes'),
    ('Sunny', 'Mild', 'No'),
    ('Sunny', 'Cool', 'Yes'),
    ('Rainy', 'Mild', 'Yes'),
    ('Sunny', 'Mild', 'Yes'),
    ('Overcast', 'Mild', 'Yes'),
    ('Overcast', 'Hot', 'Yes'),
    ('Rainy', 'Mild', 'No'),
]
weather, temp, play = (list(column) for column in zip(*_records))

In [5]:
# Combine the three parallel lists into one frame: two categorical feature
# columns followed by the label column, one row per observation.
df = pd.DataFrame({'날씨': weather, '온도': temp, '외출여부': play})
df

Unnamed: 0,날씨,온도,외출여부
0,Sunny,Hot,No
1,Sunny,Hot,No
2,Overcast,Hot,Yes
3,Rainy,Mild,Yes
4,Rainy,Cool,Yes
5,Rainy,Cool,No
6,Overcast,Cool,Yes
7,Sunny,Mild,No
8,Sunny,Cool,Yes
9,Rainy,Mild,Yes


### 특성데이터와 라벨데이터 추출


In [8]:
# The label is the last column of df; every other column is a feature.
label_column = df.columns[-1]
x_data = df.drop(columns=label_column)
y_data = df[label_column]

### 문자열 인코딩 label

In [11]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import make_column_transformer

In [13]:
# One-hot encode the two categorical feature columns, selected by name.
one_hot_columns = ['날씨', '온도']
ct = make_column_transformer((OneHotEncoder(), one_hot_columns))

In [15]:
# Feature names exist only after the transformer has been fitted. This cell
# comes before the fit_transform cell in file order, so fit explicitly here
# to keep "Restart & Run All" working.
ct.fit(df)

# Newer scikit-learn releases expose get_feature_names_out(); older ones only
# provide get_feature_names(). Use whichever the installed version offers.
if hasattr(ct, 'get_feature_names_out'):
    feature_names = list(ct.get_feature_names_out())
else:
    feature_names = ct.get_feature_names()
feature_names

['onehotencoder__x0_Overcast',
 'onehotencoder__x0_Rainy',
 'onehotencoder__x0_Sunny',
 'onehotencoder__x1_Cool',
 'onehotencoder__x1_Hot',
 'onehotencoder__x1_Mild']

In [14]:
# Learn the categories from df, then encode df into its one-hot matrix.
# Only the columns registered on the transformer are encoded.
result = ct.fit(df).transform(df)
result

array([[0., 0., 1., 0., 1., 0.],
       [0., 0., 1., 0., 1., 0.],
       [1., 0., 0., 0., 1., 0.],
       [0., 1., 0., 0., 0., 1.],
       [0., 1., 0., 1., 0., 0.],
       [0., 1., 0., 1., 0., 0.],
       [1., 0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0., 1.],
       [0., 0., 1., 1., 0., 0.],
       [0., 1., 0., 0., 0., 1.],
       [0., 0., 1., 0., 0., 1.],
       [1., 0., 0., 0., 0., 1.],
       [1., 0., 0., 0., 1., 0.],
       [0., 1., 0., 0., 0., 1.]])

### 모델 적용

In [16]:
# Chain the one-hot encoder with a Bernoulli naive Bayes classifier and train
# it on the categorical features; fit() returns the pipeline itself.
model_pipe = make_pipeline(ct, BernoulliNB()).fit(x_data, y_data)
model_pipe

Pipeline(memory=None,
         steps=[('columntransformer',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('onehotencoder',
                                                  OneHotEncoder(categories='auto',
                                                                drop=None,
                                                                dtype=<class 'numpy.float64'>,
                                                                handle_unknown='error',
                                                                sparse=True),
                                                  ['날씨', '온도'])],
                                   verbose=False)),
                ('bernoullinb',
                 BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None,
                             fit_prior=True))],


In [20]:
# 테스트용 데이터프레임 만들기
x_test = pd.DataFrame([['Sunny','Hot']], columns=['날씨','온도'])
x_test

Unnamed: 0,날씨,온도
0,Sunny,Hot


In [21]:
# Predict the label for the single test row; the pipeline applies the fitted
# one-hot encoding before the classifier.
model_pipe.predict(x_test)

array(['No'], dtype='<U3')

## 연습문제
- wine 데이터셋 나이브베이즈를 이용하여 test[0] 데이터에 대한 분류를 출력하고, score를 구하시오

In [55]:
from sklearn.datasets import load_wine

In [56]:
# Load the wine dataset and list the fields available on the returned object.
wine = load_wine()
wine.keys()

dict_keys(['data', 'target', 'target_names', 'DESCR', 'feature_names'])

In [57]:
# Names of the target classes.
wine.target_names

array(['class_0', 'class_1', 'class_2'], dtype='<U7')

In [58]:
# Feature matrix and integer class labels of the wine dataset.
# NOTE: this rebinds x_data / y_data, which held the weather data earlier.
x_data, y_data = wine.data, wine.target

In [59]:
# Hold out 20% of the samples for testing; the fixed seed keeps the split
# reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x_data,
    y_data,
    test_size=0.2,
    random_state=11,
)

In [74]:
# Wrap the training features in a frame labelled with the dataset's feature
# names so they can be inspected.
x_df = pd.DataFrame(data=x_train, columns=wine['feature_names'])
x_df

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
0,13.86,1.35,2.27,16.0,98.0,2.98,3.15,0.22,1.85,7.22,1.01,3.55,1045.0
1,13.56,1.73,2.46,20.5,116.0,2.96,2.78,0.20,2.45,6.25,0.98,3.03,1120.0
2,13.78,2.76,2.30,22.0,90.0,1.35,0.68,0.41,1.03,9.58,0.70,1.68,615.0
3,13.58,2.58,2.69,24.5,105.0,1.55,0.84,0.39,1.54,8.66,0.74,1.80,750.0
4,11.03,1.51,2.20,21.5,85.0,2.46,2.17,0.52,2.01,1.90,1.71,2.87,407.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
137,14.75,1.73,2.39,11.4,91.0,3.10,3.69,0.43,2.81,5.40,1.25,2.73,1150.0
138,12.72,1.81,2.20,18.8,86.0,2.20,2.53,0.26,1.77,3.90,1.16,3.14,714.0
139,12.00,1.51,2.42,22.0,86.0,1.45,1.25,0.50,1.63,3.60,1.05,2.65,450.0
140,12.00,0.92,2.00,19.0,86.0,2.42,2.26,0.30,1.43,2.50,1.38,3.12,278.0


In [75]:
# Training labels as a single-column frame, for inspection.
y_df = pd.DataFrame({'target': y_train})
y_df

Unnamed: 0,target
0,0
1,0
2,2
3,2
4,1
...,...
137,0
138,1
139,1
140,1


In [78]:
### Apply the model
# Standardise the features, then fit a Gaussian naive Bayes classifier.
# Fit on the raw training arrays rather than x_df / y_df:
#   * y_df is an (n, 1) DataFrame, whereas classifiers expect a 1-D label
#     vector (the resulting warning is hidden by the global warning filter);
#   * the pipeline is scored and queried with NumPy arrays later, so fitting
#     on arrays keeps the input type consistent between fit and predict.
model_pipe = make_pipeline(StandardScaler(), GaussianNB())
model_pipe.fit(x_train, y_train)

Pipeline(memory=None,
         steps=[('standardscaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('gaussiannb', GaussianNB(priors=None, var_smoothing=1e-09))],
         verbose=False)

In [80]:
# Score the fitted pipeline on the held-out test split.
model_pipe.score(x_test, y_test)

1.0

In [79]:
# Predict the class of one test sample and show it beside the true label.
# The slice keeps the sample 2-D (one row), as predict() expects.
i = 0
model_pipe.predict(x_test[i:i + 1]), y_test[i]

(array([2]), 2)

### GridSearchCV 사용

In [89]:
# Tune GaussianNB's var_smoothing with a cross-validated grid search scored by
# macro-averaged F1, trying 10 evenly spaced values between 0.001 and 1.
model_pipe = make_pipeline(StandardScaler(), GaussianNB())
param_grid = {"gaussiannb__var_smoothing": np.linspace(0.001, 1, 10)}
gridS = GridSearchCV(model_pipe, param_grid, scoring='f1_macro')
gridS.fit(x_train, y_train)

print('최적의 파라미터 :',gridS.best_params_)
print('예측값:',gridS.best_estimator_.predict([x_test[0]]))
# best_score_ is the mean cross-validated score over the training data only.
print('score:',gridS.best_score_)
# Also evaluate the refitted best estimator on the held-out test split, so the
# result can be compared with the untuned pipeline's test score.
print('test score:', gridS.score(x_test, y_test))

최적의 파라미터 : {'gaussiannb__var_smoothing': 0.112}
예측값: [2]
score: 0.9724120461581762
