# 인공 신경망_분류

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix

#파일 불러오기
df_raw = pd.read_csv("../data/유방암.csv", engine='python', encoding='cp949')
df_raw.head()

df_raw.isnull().sum(axis = 0) #결측치 확인 및 처리

# diagnosis 변수 "양성" or "음성"을 1과 0으로 바꾼다.
df_raw["diagnosis"] = np.where(df_raw["diagnosis"] == "양성", 0, 1)
df_raw.head()

# 목표 변수 diagnosis를 y로 설정하고 나머지 변수들을 x로 나눈다.
df_raw_y = df_raw["diagnosis"]
df_raw_x = df_raw.drop("diagnosis", axis = 1, inplace = False)
df_train_x, df_test_x, df_train_y, df_test_y = train_test_split(df_raw_x, df_raw_y, test_size = 0.3, random_state = 1234)

nn_uncustomized = MLPClassifier(random_state=1234)
nn_uncustomized.fit(df_train_x, df_train_y)
print("Accuracy on training set: {:.3f}".format(nn_uncustomized.score(df_train_x, df_train_y)))
print("Accuracy on test set: {:.3f}".format(nn_uncustomized.score(df_test_x, df_test_y)))

## 임의로 데이터를 트레이닝 6 : 테스트 4으로 나누었을 때,
## 트레이닝 데이터셋의 정확도는 92.4%
## 테스트 데이터셋의 정확도는 89.6%로 측정된다.

nn_uncustomized

# 은닉층(Hidden Layer)
## 은닉층 개수와 층별 노드 개수를 지정하여 은닉층의 복잡도를 결정한다.
## => 은닉층이 복잡할수록 모델의 과대적합 경향이 나타난다.
## Hidden Layer(은닉층) 변경

train_accuracy = []
test_accuracy = []
para_hidden = [20 * hidden for hidden in range(1, 9)]

for v_hidden in para_hidden:
    nn = MLPClassifier(hidden_layer_sizes=v_hidden, random_state=1234)
    nn.fit(df_train_x, df_train_y)
    train_accuracy.append(nn.score(df_train_x, df_train_y))
    test_accuracy.append(nn.score(df_test_x, df_test_y))

df_accuracy_hidden = pd.DataFrame()
df_accuracy_hidden["HiddenLayer"] = para_hidden
df_accuracy_hidden["TrainAccuracy"] = train_accuracy
df_accuracy_hidden["TestAccuracy"] = test_accuracy

df_accuracy_hidden.round(3)

## HiddenLayer가 100일 때 최대의 정확도가 측정된다.

plt.plot(para_hidden, train_accuracy, linestyle = "-", label = "Train Accuracy")
plt.plot(para_hidden, test_accuracy, linestyle = "--", label = "Test Accuracy")
plt.ylabel("accuracy")
plt.xlabel("Hidden Layer")
plt.legend()

## Activation(활성화 함수) 변경

train_accuracy = []
test_accuracy = []
para_function = ["logistic", "tanh", "relu"]

for v_function in para_function:
    nn = MLPClassifier(activation=v_function, random_state=1234)
    nn.fit(df_train_x, df_train_y)
    train_accuracy.append(nn.score(df_train_x, df_train_y))
    test_accuracy.append(nn.score(df_test_x, df_test_y))

df_accuracy_function = pd.DataFrame()
df_accuracy_function["ActivationFunction"] = para_function
df_accuracy_function["TrainAccuracy"] = train_accuracy
df_accuracy_function["TestAccuracy"] = test_accuracy

df_accuracy_function.round(3)

# 활성화 함수(Activation Function)

## 입력층 또는 이전의 은닉층의 정보를 다음 층으로의 출려값을 결정하는 함수
## 3가지의 활성화 함수를 적용했을 때, tanh 함수가 가장 정확도가 높았다.

plt.plot(para_function, train_accuracy, linestyle = "-", label = "Train Accuracy")
plt.plot(para_function, test_accuracy, linestyle = "--", label = "Test Accuracy")
plt.ylabel("accuracy")
plt.xlabel("Activation Function")
plt.legend()

v_feature_name = df_train_x.columns
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df_raw_x)
df_scaled = pd.DataFrame(df_scaled, columns=v_feature_name)
df_scaled.head()

## 설명변수의 구간을 동일하게 변경(스케일링)한다.
## => 트레이닝 셋과 테스트 셋의 사이즈를 맞추어 준다.

df_scaled_train_x, df_scaled_test_x = train_test_split(df_scaled, test_size = 0.3, random_state = 1234)
print("train data X size : {}".format(df_scaled_train_x.shape))
print("test data X size : {}".format(df_scaled_test_x.shape))

nn_scaled = MLPClassifier(random_state=1234)
nn_scaled.fit(df_scaled_train_x, df_train_y)

print("Accuracy on training set: {:.3f}".format(nn_scaled.score(df_scaled_train_x, df_train_y)))
print("Accuracy on test set: {:.3f}".format(nn_scaled.score(df_scaled_test_x, df_test_y)))

## 스케일링 이후에 정확도를 측정하면 이전의 정확도에 비해
## 증가한 정확도를 얻을 수 있다.

## 스케일링 이후에 은닉층을 다시 변경하여 적용한다.

train_accuracy = []
test_accuracy = []
para_hidden = [20 * hidden for hidden in range(1, 11)]

for v_hidden in para_hidden:
    nn = MLPClassifier(hidden_layer_sizes=(v_hidden, v_hidden), random_state=1234)
    nn.fit(df_scaled_train_x, df_train_y)
    train_accuracy.append(nn.score(df_scaled_train_x, df_train_y))
    test_accuracy.append(nn.score(df_scaled_test_x, df_test_y))

df_accuracy_hidden = pd.DataFrame()
df_accuracy_hidden["HiddenLayer"] = para_hidden
df_accuracy_hidden["TrainAccuracy"] = train_accuracy
df_accuracy_hidden["TestAccuracy"] = test_accuracy

df_accuracy_hidden.round(3)

plt.plot(para_hidden, train_accuracy, linestyle = "-", label = "Train Accuracy")
plt.plot(para_hidden, test_accuracy, linestyle = "--", label = "Test Accuracy")
plt.ylabel("accuracy")
plt.xlabel("Hidden Layer")
plt.legend()

train_accuracy = []
test_accuracy = []
para_function = ["logistic", "tanh", "relu"]

for v_function in para_function:
    nn = MLPClassifier(activation=v_function, hidden_layer_sizes = (120,120), random_state=1234)
    nn.fit(df_train_x, df_train_y)
    train_accuracy.append(nn.score(df_scaled_train_x, df_train_y))
    test_accuracy.append(nn.score(df_scaled_test_x, df_test_y))

df_accuracy_function = pd.DataFrame()
df_accuracy_function["ActivationFunction"] = para_function
df_accuracy_function["TrainAccuracy"] = train_accuracy
df_accuracy_function["TestAccuracy"] = test_accuracy

df_accuracy_function.round(3)

## relu 활성화 함수의 정확도가 가장 높게 나왔다.

plt.plot(para_function, train_accuracy, linestyle = "-", label = "Train Accuracy")
plt.plot(para_function, test_accuracy, linestyle = "--", label = "Test Accuracy")
plt.ylabel("accuracy")
plt.xlabel("Activation Function")   
plt.legend()

# 가중치 최적화
## 신경망 모델의 학습 결과에 따른  손실함수 값을 최소화하는 하이퍼 파라미터 탐색 및 최적화

train_accuracy = []
test_accuracy = []
para_solver = ["lbfgs", "sgd", "adam"]

for v_solver in para_solver:
    nn = MLPClassifier(solver = v_solver, activation="relu", hidden_layer_sizes = (120,120), random_state=1234)
    nn.fit(df_train_x, df_train_y)
    train_accuracy.append(nn.score(df_scaled_train_x, df_train_y))
    test_accuracy.append(nn.score(df_scaled_test_x, df_test_y))

df_accuracy_solver = pd.DataFrame()
df_accuracy_solver["Solver"] = para_solver
df_accuracy_solver["TrainAccuracy"] = train_accuracy
df_accuracy_solver["TestAccuracy"] = test_accuracy

df_accuracy_solver.round(3)

## => "adam" 모델이 가장 정확도가 높은 것을 확인했다.

plt.plot(para_solver, train_accuracy, linestyle = "-", label = "Train Accuracy")
plt.plot(para_solver, test_accuracy, linestyle = "--", label = "Test Accuracy")
plt.ylabel("accuracy")
plt.xlabel("Solver")
plt.legend()

# 미니배치
## 전체 학습 데이터를 배치 크기로 등분하여 각 배치 데이터를 순차적으로 실행하는 방법

train_accuracy = []
test_accuracy = []
para_batch = [20 * batch for batch in range(1, 10)]

for v_batch in para_batch:
    nn = MLPClassifier(batch_size=v_batch, solver = "adam", activation="relu", hidden_layer_sizes = (120,120), random_state=1234)
    nn.fit(df_train_x, df_train_y)
    train_accuracy.append(nn.score(df_scaled_train_x, df_train_y))
    test_accuracy.append(nn.score(df_scaled_test_x, df_test_y))

df_accuracy_batch = pd.DataFrame()
df_accuracy_batch["Mini Batch"] = para_batch
df_accuracy_batch["TrainAccuracy"] = train_accuracy
df_accuracy_batch["TestAccuracy"] = test_accuracy

df_accuracy_batch.round(3)

## => mini batch의 크기를 20으로 했을 때 가장 높은 정확도를 기록했다.

plt.plot(para_batch, train_accuracy, linestyle = "-", label = "Train Accuracy")
plt.plot(para_batch, test_accuracy, linestyle = "--", label = "Test Accuracy")
plt.ylabel("accuracy")
plt.xlabel("Mini Batch Size")
plt.legend()

df_accuracy_batch.round(3)

plt.plot(para_batch, train_accuracy, linestyle = "-", label = "Train Accuracy")
plt.plot(para_batch, test_accuracy, linestyle = "--", label = "Test Accuracy")
plt.ylabel("accuracy")
plt.xlabel("Mini Batch Size")
plt.legend()

nn_final = MLPClassifier(hidden_layer_sizes=(120, 120), activation="relu", solver="adam",
                        batch_size=20, random_state= 1234)
nn_final.fit(df_scaled_train_x, df_train_y)
y_pred = nn_final.predict(df_scaled_test_x)
print("Accuracy on training set: {:.3f}".format(nn_final.score(df_scaled_train_x, df_train_y)))
print("Accuracy on test set: {:.3f}".format(nn_final.score(df_scaled_test_x, df_test_y)))
print("Confusion matrix: \n{}".format(confusion_matrix(df_test_y, y_pred)))

# 결론 : 트레이닝 데이터 정확도 : 100.0%
# 테스트 데이터 정확도 : 97.9%
# 정분류율 = (24 + 70) / (24 + 70 + 0 + 2) = 94.79%

# =======================================

In [3]:
# KNN(K-Nearest Neighbors) 분석
## 거리 기반으로 이웃을 결정하며 새로운 사건이 발생했을 때 가장 근접한 k-이웃의 값을 평균해서 예측하거나 빈도가 많은 클래스로 분류하는 탐색적 기법

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix

#파일 불러오기
df_raw = pd.read_csv("../data/IRIS.csv", engine='python', encoding='cp949')
df_raw.head()

df_raw.isnull().sum(axis = 0) #결측치 확인 및 처리

# diagnosis 변수 "양성" or "음성"을 1과 0으로 바꾼다.
df_raw["c"] = np.where(df_raw["SPECIES"] == "setosa", 0, 1)
df_raw["SPECIE"] = np.where(df_raw["SPECIES"] == "versicolor", 1, 2)
df_raw.head()

# 목표 변수 diagnosis를 y로 설정하고 나머지 변수들을 x로 나눈다.
df_raw_y = df_raw["species"]
df_raw_x = df_raw.drop("species", axis = 1, inplace = False)
df_train_x, df_test_x, df_train_y, df_test_y = train_test_split(df_raw_x, df_raw_y, test_size = 0.3, random_state = 1234)

# default 값으로 분류한 결과
## 트레이닝 데이터 정확도 : 92.2%
## 테스트 데이터 정확도 : 90.6%

knn_uncustomized = KNeighborsClassifier()
knn_uncustomized.fit(df_train_x, df_train_y)
print("Accuracy on training set: {:.3f}".format(nn_uncustomized.score(df_train_x, df_train_y)))
print("Accuracy on test set: {:.3f}".format(nn_uncustomized.score(df_test_x, df_test_y)))

knn_uncustomized

# n_neighbors : 이웃 수
## 이웃 수가 작으면 과대적합의 위험이 증가한다.
## 이웃 수가 크면 과소적합될 가능성이 증가한다.

train_accuracy = []
test_accuracy = []
para_n_neighbors = [i for i in range(3, 6)]

for v_n_neighbors in para_n_neighbors:
    knn = KNeighborsClassifier(n_neighbors = v_n_neighbors)
    knn.fit(df_train_x, df_train_y)
    train_accuracy.append(knn.score(df_train_x, df_train_y))
    test_accuracy.append(knn.score(df_test_x, df_test_y))

df_accuracy_neighbors = pd.DataFrame()
df_accuracy_neighbors["Neighbors"] = para_n_neighbors
df_accuracy_neighbors["TrainAccuracy"] = train_accuracy
df_accuracy_neighbors["TestAccuracy"] = test_accuracy
df_accuracy_neighbors.round(3)

## 가장 높은 정확도를 기록한 neighbors 값은 7이다.

plt.plot(para_n_neighbors, train_accuracy, linestyle = "-", label = "Train Accuracy")
plt.plot(para_n_neighbors, test_accuracy, linestyle = "--", label = "Test Accuracy")
plt.ylabel("accuracy")
plt.xlabel("n_neighbors")
plt.legend()

# Weights : 예측에 사용되는 가중 함수
## uniform : 균일한 가중치
## disance : 거리의 역수로 가중치 부여, 가까운 이웃들 간에는 큰 영향값을 가진다.

train_accuracy = []
test_accuracy = []
para_n_neighbors = [i for i in range(3, 31)] * 2 
para_weights = (["uniform"] * 28) + (["distance"] * 28)

for (v_n_neighbors, v_weights) in zip(para_n_neighbors, para_weights):
    knn = KNeighborsClassifier(n_neighbors = v_n_neighbors, weights=v_weights)
    knn.fit(df_train_x, df_train_y)
    train_accuracy.append(knn.score(df_train_x, df_train_y))
    test_accuracy.append(knn.score(df_test_x, df_test_y))

df_accuracy_weights = pd.DataFrame()
df_accuracy_weights["Neighbors"] = para_n_neighbors
df_accuracy_weights["Weights"] = para_weights
df_accuracy_weights["TrainAccuracy"] = train_accuracy
df_accuracy_weights["TestAccuracy"] = test_accuracy
df_accuracy_weights.round(3)

## uniform 일 때는 이웃 수가 7일 때 정확도가 가장 높다
## distance 일 때는 6과 10일 때 가장 높다는 것을 확인할 수 있다.

df_accuracy_weights_pivot = df_accuracy_weights.pivot(index = "Neighbors", columns = "Weights",
                                                     values=["TrainAccuracy", "TestAccuracy"])
df_accuracy_weights_pivot

level0 = df_accuracy_weights_pivot.columns.get_level_values(0)
level1 = df_accuracy_weights_pivot.columns.get_level_values(1)
df_accuracy_weights_pivot.columns = level0 + "_" + level1
df_accuracy_weights_pivot.head()

sns.lineplot(data = df_accuracy_weights_pivot)

# 거리 계산 방법별 이웃 수 증가에 따른 정확도 변화

train_accuracy = []
test_accuracy = []
para_n_neighbors = [i for i in range(3, 31)] * 3
para_metric = ["minkowski"] * 28 + ["euclidean"] * 28 + ["manhattan"] * 28

for (v_n_neighbors, v_metric) in zip(para_n_neighbors, para_metric):
    knn = KNeighborsClassifier(n_neighbors = v_n_neighbors, weights="uniform", metric=v_metric)
    knn.fit(df_train_x, df_train_y)
    train_accuracy.append(knn.score(df_train_x, df_train_y))
    test_accuracy.append(knn.score(df_test_x, df_test_y))

df_accuracy_metric = pd.DataFrame()
df_accuracy_metric["Neighbors"] = para_n_neighbors
df_accuracy_metric["Metric"] = para_metric
df_accuracy_metric["TrainAccuracy"] = train_accuracy
df_accuracy_metric["TestAccuracy"] = test_accuracy
df_accuracy_metric.groupby("Metric").max().round(3)

## 거리 계산 방법 3가지의 최대 정확도 값을 보았을 때
## 모두 같은 값을  갖는다.
## 트레이닝 정확도를 보았을 때 manhattan 방법이 가장 높은 정확도를 가지기 때문에 manhattan 방법을 채택했다.

df_accuracy_metric_pivot = df_accuracy_metric.pivot(index = "Neighbors", columns = "Metric",
                                                     values=["TrainAccuracy", "TestAccuracy"])
level0 = df_accuracy_metric_pivot.columns.get_level_values(0)
level1 = df_accuracy_metric_pivot.columns.get_level_values(1)
df_accuracy_metric_pivot.columns = level0 + "_" + level1
sns.lineplot(data = df_accuracy_metric_pivot)

## Scale 변경

v_feature_name = df_train_x.columns
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df_raw_x)
df_scaled = pd.DataFrame(df_scaled, columns=v_feature_name)
df_scaled.head()

df_scaled_train_x, df_scaled_test_x = train_test_split(df_scaled, test_size = 0.4, random_state =1234)
print("train data X size : {}".format(df_scaled_train_x.shape))
print("test data X size : {}".format(df_scaled_test_x.shape))

knn_scaled = KNeighborsClassifier()
knn_scaled.fit(df_scaled_train_x, df_train_y)

print("Accuracy on training set: {:.3f}".format(knn_scaled.score(df_scaled_train_x, df_train_y)))
print("Accuracy on test set: {:.3f}".format(knn_scaled.score(df_scaled_test_x, df_test_y)))

## 스케일링을 한 이후에 정확도는
## 트레이닝 95.8%, 테스트 92.2%이다.

## 이전의 92.2% 90.6%보다 향상되었다는 것을 알 수 있다.

train_accuracy = []
test_accuracy = []
para_n_neighbors = [i for i in range(3, 31)]

for v_n_neighbors in para_n_neighbors:
    knn = KNeighborsClassifier(n_neighbors = v_n_neighbors)
    knn.fit(df_scaled_train_x, df_train_y)
    train_accuracy.append(knn.score(df_scaled_train_x, df_train_y))
    test_accuracy.append(knn.score(df_scaled_test_x, df_test_y))

df_accuracy_neighbors = pd.DataFrame()
df_accuracy_neighbors["Neighbors"] = para_n_neighbors
df_accuracy_neighbors["TrainAccuracy"] = train_accuracy
df_accuracy_neighbors["TestAccuracy"] = test_accuracy
df_accuracy_neighbors.round(3)

## neighbors의 값을 3부터 30까지 변경하면서 정확도를 비교했을 때,
## n_neighbors가 7일 때 test 정확도가 가장 높다는 것을 확인했다.

plt.plot(para_n_neighbors, train_accuracy, linestyle = "-", label = "Train Accuracy")
plt.plot(para_n_neighbors, test_accuracy, linestyle = "--", label = "Test Accuracy")
plt.ylabel("accuracy")
plt.xlabel("n_neighbors")
plt.legend()

train_accuracy = []
test_accuracy = []
para_n_neighbors = [i for i in range(3, 31)] * 2 
para_weights = (["uniform"] * 28) + (["distance"] * 28)

for (v_n_neighbors, v_weights) in zip(para_n_neighbors, para_weights):
    knn = KNeighborsClassifier(n_neighbors = v_n_neighbors, weights=v_weights)
    knn.fit(df_scaled_train_x, df_train_y)
    train_accuracy.append(knn.score(df_scaled_train_x, df_train_y))
    test_accuracy.append(knn.score(df_scaled_test_x, df_test_y))

df_accuracy_weights = pd.DataFrame()
df_accuracy_weights["Neighbors"] = para_n_neighbors
df_accuracy_weights["Weights"] = para_weights
df_accuracy_weights["TrainAccuracy"] = train_accuracy
df_accuracy_weights["TestAccuracy"] = test_accuracy
df_accuracy_weights.round(3)

df_accuracy_weights_pivot = df_accuracy_weights.pivot(index = "Neighbors", columns = "Weights",
                                                     values=["TrainAccuracy", "TestAccuracy"])
df_accuracy_weights_pivot

level0 = df_accuracy_weights_pivot.columns.get_level_values(0)
level1 = df_accuracy_weights_pivot.columns.get_level_values(1)
df_accuracy_weights_pivot.columns = level0 + "_" + level1
df_accuracy_weights_pivot.head()

sns.lineplot(data = df_accuracy_weights_pivot)

# 거리 계산 방법별 정확도

train_accuracy = []
test_accuracy = []
para_n_neighbors = [i for i in range(3, 31)] * 3
para_metric = ["minkowski"] * 28 + ["euclidean"] * 28 + ["manhattan"] * 28

for (v_n_neighbors, v_metric) in zip(para_n_neighbors, para_metric):
    knn = KNeighborsClassifier(n_neighbors = v_n_neighbors, weights="uniform", metric=v_metric)
    knn.fit(df_scaled_train_x, df_train_y)
    train_accuracy.append(knn.score(df_scaled_train_x, df_train_y))
    test_accuracy.append(knn.score(df_scaled_test_x, df_test_y))

df_accuracy_metric = pd.DataFrame()
df_accuracy_metric["Neighbors"] = para_n_neighbors
df_accuracy_metric["Metric"] = para_metric
df_accuracy_metric["TrainAccuracy"] = train_accuracy
df_accuracy_metric["TestAccuracy"] = test_accuracy
df_accuracy_metric.round(3)

df_accuracy_metric.groupby("Metric").max().round(3)

## 세 가지 방법의 최대 정확도 값을 구했을 때
## manhattan 방법이 가장 높은 정확도를 기록했다는 것을 알 수 있다.

df_accuracy_metric_pivot = df_accuracy_metric.pivot(index = "Neighbors", columns = "Metric",
                                                     values=["TrainAccuracy", "TestAccuracy"])
level0 = df_accuracy_metric_pivot.columns.get_level_values(0)
level1 = df_accuracy_metric_pivot.columns.get_level_values(1)
df_accuracy_metric_pivot.columns = level0 + "_" + level1
sns.lineplot(data = df_accuracy_metric_pivot)

knn_model = KNeighborsClassifier(n_neighbors=5, weights = "uniform", metric = "manhattan")
knn_model.fit(df_train_x, df_train_y)

y_pred = knn_model.predict(df_test_x)

print("train data accuracy: {0:.3f}".format(knn_model.score(df_train_x, df_train_y)))
print("test data accuracy: {0:.3f}".format(knn_model.score(df_test_x, df_test_y)))
print("Confusion matrix: \n{}".format(confusion_matrix(df_test_y, y_pred)))

# 결론 :
## 트레이닝 정확도는 94.3%
## 테스트 정확도는 93.8%
## 정분류율은 (30 + 90) / (30 + 90 + 1 + 7) * 100 = 93.75%

KeyError: 'species'