# Classification 예시


In [None]:
import numpy as np
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
from matplotlib import style
import warnings
# Silence every warning for the rest of the session. This keeps the cell
# output clean, but it also hides warnings raised by the libraries used in
# later cells.
warnings.filterwarnings('ignore')

## 와인 등급 분류데이터


**Data Set Characteristics:**  

    :Number of Instances: 178

    :Number of Attributes: 13

    :Attribute Information (in order):
        - Alcohol    
        - Malic acid      
        - Ash
        - Alcalinity of ash
        - Magnesium
        - Total phenols
        - Flavanoids
        - Nonflavonoid phenols
        - Proanthocyanins
        - Color intensity
        - Hue
        - OD280/OD315 of diluted wines
        - Proline
      - class:
        - class_0
        - class_1
        - class_2

In [None]:
import pandas as pd
from sklearn.datasets import load_wine

# Load the wine dataset and gather the feature columns plus the class label
# (stored under 'target') into a single DataFrame.
dataset = load_wine()
dataframe = pd.DataFrame(
    data=dataset.data,
    columns=dataset.feature_names,
).assign(target=dataset.target)

In [None]:
# Preview the first three rows (feature columns followed by 'target').
dataframe.head(3)

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,target
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0,0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0,0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0,0


In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for every column.
dataframe.describe()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,target
count,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0
mean,13.000618,2.336348,2.366517,19.494944,99.741573,2.295112,2.02927,0.361854,1.590899,5.05809,0.957449,2.611685,746.893258,0.938202
std,0.811827,1.117146,0.274344,3.339564,14.282484,0.625851,0.998859,0.124453,0.572359,2.318286,0.228572,0.70999,314.907474,0.775035
min,11.03,0.74,1.36,10.6,70.0,0.98,0.34,0.13,0.41,1.28,0.48,1.27,278.0,0.0
25%,12.3625,1.6025,2.21,17.2,88.0,1.7425,1.205,0.27,1.25,3.22,0.7825,1.9375,500.5,0.0
50%,13.05,1.865,2.36,19.5,98.0,2.355,2.135,0.34,1.555,4.69,0.965,2.78,673.5,1.0
75%,13.6775,3.0825,2.5575,21.5,107.0,2.8,2.875,0.4375,1.95,6.2,1.12,3.17,985.0,2.0
max,14.83,5.8,3.23,30.0,162.0,3.88,5.08,0.66,3.58,13.0,1.71,4.0,1680.0,2.0


In [None]:
from sklearn.model_selection import train_test_split

# Features are every column except the label column; the label is 'target'.
X = dataframe.drop(columns='target')
y = dataframe['target']

# 80% train / 20% test; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1234,
)

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression classifier created with the library's default settings.
# NOTE(review): the features fed to this model later are unscaled, and all
# warnings are silenced at the top of the notebook, so a solver convergence
# warning would go unnoticed here -- confirm the fit actually converges.
lr = LogisticRegression()


In [None]:
# Train on the training split, then predict labels for the held-out test split.
# fit() returns the estimator itself, so the two steps can be chained.
lr_y_pred = lr.fit(X_train, y_train).predict(X_test)

In [None]:
# Show how many samples ended up in the test split.
np.shape(y_test)

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix for the logistic regression predictions
# (rows = true label, columns = predicted label).
# The label order is taken from the fitted model and passed explicitly, so the
# matrix always has one row/column per known class -- even if some class is
# missing from both y_test and the predictions -- and therefore always matches
# the row/column names given to the DataFrame.
lr_classes = lr.classes_
lr_class_names = [str(label) for label in lr_classes]
confusion_df = pd.DataFrame(
    confusion_matrix(y_test, lr_y_pred, labels=lr_classes),
    columns=lr_class_names,
    index=lr_class_names,
)
confusion_df

In [None]:
# Display the labels predicted by the logistic regression for the test split.
lr_y_pred

In [None]:
# Print accuracy, precision, recall and f1-score.
# average="macro" evaluates a multi-class model by computing the metric for
# each label separately and then averaging those per-label scores.
from sklearn import metrics

lr_accuracy = metrics.accuracy_score(y_test, lr_y_pred)
print(f" logistic regression accuracy: {lr_accuracy:.4f}")

lr_precision = metrics.precision_score(y_test, lr_y_pred, average="macro")
print(f" logistic regression precision: {lr_precision:.4f}")

lr_recall = metrics.recall_score(y_test, lr_y_pred, average="macro")
print(f" logistic regression recall: {lr_recall:.4f}")

lr_f1 = metrics.f1_score(y_test, lr_y_pred, average="macro")
print(f" logistic regression f1 score: {lr_f1:.4f}")

In [None]:
# Weights (coefficients) and intercepts of the fitted logistic regression
print(lr.coef_, lr.intercept_)

In [None]:
# ROC
from sklearn.metrics import roc_curve

# ROC curve for label 1 only: label 1 is the positive class and every other
# label is treated as 0. The score is the predicted-probability column at
# index 1.
# (The *_rf variable names are kept unchanged so any later cell that uses
# them still works.)
is_label_1 = np.where(y_test == 1, 1, 0)
label_1_scores = lr.predict_proba(X_test)[:, 1]
fpr_rf, tpr_rf, thresholds_rf = roc_curve(is_label_1, label_1_scores)

plt.plot(fpr_rf, tpr_rf, label="ROC")
plt.xlabel('FPR')
plt.ylabel('TPR (Recall)')

# "lower right" is the named form of legend location code 4.
plt.legend(loc="lower right")

### Scaling 처리 후 학습 진행

**실습문제**  
StandardScaler 적용 후 학습 진행

In [None]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# Exercise steps
# NOTE(review): the original list has no step 2 -- confirm whether a step was
# dropped or the numbering is simply off.
# 1. Apply the scaler to the X features
# 3. Train LogisticRegression on the train dataset
# 4. Predict with LogisticRegression on the test dataset
# 5. Check the confusion matrix
# 6. Check accuracy, precision, recall, and f1 score

## DecisionTree

In [None]:
# Decision Tree
from sklearn.tree import DecisionTreeClassifier

# max_depth=5 limits how deep the tree may grow.
# random_state is pinned so repeated runs build the same tree (tree
# construction involves randomness), matching the seeded train/test split and
# the seeded random forest used elsewhere in this notebook.
dt = DecisionTreeClassifier(max_depth=5, random_state=0)

In [None]:
# Train the decision tree on the training split and predict the test split.
# fit() returns the estimator itself, so the two steps can be chained.
y_pred = dt.fit(X_train, y_train).predict(X_test)

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix for the decision tree predictions
# (rows = true label, columns = predicted label).
# The label order is taken from the fitted tree and passed explicitly, so the
# matrix always has one row/column per known class and always matches the
# row/column names given to the DataFrame.
dt_classes = dt.classes_
dt_class_names = [str(label) for label in dt_classes]
confusion_df = pd.DataFrame(
    confusion_matrix(y_test, y_pred, labels=dt_classes),
    columns=dt_class_names,
    index=dt_class_names,
)
confusion_df

In [None]:
# Print accuracy, precision, recall and f1-score.
# average="macro" evaluates a multi-class model by computing the metric for
# each label separately and then averaging those per-label scores.
from sklearn import metrics

dt_accuracy = metrics.accuracy_score(y_test, y_pred)
print(f" Decision Tree accuracy: {dt_accuracy:.4f}")

dt_precision = metrics.precision_score(y_test, y_pred, average="macro")
print(f" Decision Tree precision: {dt_precision:.4f}")

dt_recall = metrics.recall_score(y_test, y_pred, average="macro")
print(f" Decision Tree recall: {dt_recall:.4f}")

dt_f1 = metrics.f1_score(y_test, y_pred, average="macro")
print(f" Decision Tree f1 score: {dt_f1:.4f}")

### Scaling 처리 후 학습 진행

In [None]:
from sklearn.preprocessing import StandardScaler

# StandardScaler rescales each feature to mean 0 and variance 1.
# It is fitted on the training split only; the same fitted transform is then
# applied to both the training and the test split.
scaler = StandardScaler().fit(X_train)

X_train_scaled, X_test_scaled = (
    scaler.transform(split) for split in (X_train, X_test)
)

In [None]:
# Re-train the same decision tree object on the scaled training features and
# predict the scaled test features (this replaces the earlier unscaled fit).
y_pred = dt.fit(X_train_scaled, y_train).predict(X_test_scaled)

In [None]:
# Confusion matrix for the decision tree trained on scaled features
# (rows = true label, columns = predicted label).
# The label order is taken from the fitted tree and passed explicitly, so the
# matrix always has one row/column per known class and always matches the
# row/column names given to the DataFrame.
dt_scaled_classes = dt.classes_
dt_scaled_class_names = [str(label) for label in dt_scaled_classes]
confusion_df = pd.DataFrame(
    confusion_matrix(y_test, y_pred, labels=dt_scaled_classes),
    columns=dt_scaled_class_names,
    index=dt_scaled_class_names,
)
confusion_df

In [None]:
# Print accuracy, precision, recall and f1-score.
# average="macro" evaluates a multi-class model by computing the metric for
# each label separately and then averaging those per-label scores.
from sklearn import metrics

dt_scaled_accuracy = metrics.accuracy_score(y_test, y_pred)
print(f" Decision Tree(with Scaling) accuracy: {dt_scaled_accuracy:.4f}")

dt_scaled_precision = metrics.precision_score(y_test, y_pred, average="macro")
print(f" Decision Tree(with Scaling) precision: {dt_scaled_precision:.4f}")

dt_scaled_recall = metrics.recall_score(y_test, y_pred, average="macro")
print(f" Decision Tree(with Scaling) recall: {dt_scaled_recall:.4f}")

dt_scaled_f1 = metrics.f1_score(y_test, y_pred, average="macro")
print(f" Decision Tree(with Scaling) f1 score: {dt_scaled_f1:.4f}")

### Depth를 다르게 주어 진행

In [None]:
# Exercise
# Try different max_depth values for the DecisionTree and compare the results

### Random Forest

In [None]:
# Ensemble learning technique that compensates for the overfitting weakness of a decision tree
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=20, max_depth=5,random_state=0)
# n_estimators = number of decision trees inside the random forest
# max_depth = maximum depth

In [None]:
# Train the random forest on the training split and predict the test split.
# fit() returns the estimator itself, so the two steps can be chained.
y_pred = clf.fit(X_train, y_train).predict(X_test)

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix for the random forest predictions
# (rows = true label, columns = predicted label).
# The label order is taken from the fitted forest and passed explicitly, so
# the matrix always has one row/column per known class and always matches the
# row/column names given to the DataFrame.
rf_classes = clf.classes_
rf_class_names = [str(label) for label in rf_classes]
confusion_df = pd.DataFrame(
    confusion_matrix(y_test, y_pred, labels=rf_classes),
    columns=rf_class_names,
    index=rf_class_names,
)
confusion_df

In [None]:
# Print accuracy, precision, recall and f1-score.
# average="macro" evaluates a multi-class model by computing the metric for
# each label separately and then averaging those per-label scores.
from sklearn import metrics

rf_accuracy = metrics.accuracy_score(y_test, y_pred)
print(f" Random Forest accuracy: {rf_accuracy:.4f}")

rf_precision = metrics.precision_score(y_test, y_pred, average="macro")
print(f" Random Forest precision: {rf_precision:.4f}")

rf_recall = metrics.recall_score(y_test, y_pred, average="macro")
print(f" Random Forest recall: {rf_recall:.4f}")

rf_f1 = metrics.f1_score(y_test, y_pred, average="macro")
print(f" Random Forest f1 score: {rf_f1:.4f}")