# 1. 당뇨병 데이터를 가지고 머신러닝 5가지 분류를 수행

(SVM, LR, RF, DT, KNN)

## 데이터 불러오기

In [3]:
import numpy as np
import pandas as pd
url = 'https://raw.githubusercontent.com/MyungKyuYi/AI-class/main/diabetes.csv'
df = pd.read_csv(url)
df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


## X, y 데이터 분할하기

In [6]:
X = df.drop('Outcome', axis = 1)
y = df['Outcome']

## test data set, train data set 분할

In [9]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.25,random_state=0)

## 머신러닝 5가지 분류 수행

In [11]:
# 라이브러리 불러오기
from sklearn.svm import SVC # SVC
from sklearn.linear_model import LogisticRegression # LR
from sklearn.ensemble import RandomForestClassifier # RF
from sklearn.tree import DecisionTreeClassifier # DT
from sklearn.neighbors import KNeighborsClassifier # KNN
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

# 모델들을 list를 통해 model에 입력
model = [SVC(random_state=0), LogisticRegression(random_state=0), 
         RandomForestClassifier(random_state=0), DecisionTreeClassifier(random_state=0), KNeighborsClassifier()]

# print 할때 모델들 구분을 위해 모델 이름을 적은 list 생성
model_name = ["SVC", "LR", "RF", "DT", "KNN"]

# for문을 통해 model에 저장된 각 요소들을 idk에는 index, function에는 model들을 차례로 넣는다.
# 반복될 때마다 new_model에 SVC, LR, RF, DT, KNN 모델들이 각각 들어가서 훈련을 하고 각 모델 별로 점수를 측정한다.
for idk, function in enumerate(model):
    new_model = function
    new_model.fit(X_train, y_train)
    pred = new_model.predict(X_test)
    print("모델 이름 = ", model_name[idk], ", 모델 accuracy_score = ", accuracy_score(y_test,pred))
    print ("confusion_matrix = ", confusion_matrix(y_test, pred))

모델 이름 =  SVC , 모델 accuracy_score =  0.7708333333333334
confusion_matrix =  [[119  11]
 [ 33  29]]
모델 이름 =  LR , 모델 accuracy_score =  0.7916666666666666
confusion_matrix =  [[115  15]
 [ 25  37]]
모델 이름 =  RF , 모델 accuracy_score =  0.7708333333333334
confusion_matrix =  [[116  14]
 [ 30  32]]
모델 이름 =  DT , 모델 accuracy_score =  0.71875
confusion_matrix =  [[102  28]
 [ 26  36]]
모델 이름 =  KNN , 모델 accuracy_score =  0.7552083333333334
confusion_matrix =  [[109  21]
 [ 26  36]]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


# 2. 동일한 데이터로 딥러닝 분류 수행하라. 
(dense layer 만 사용)

## 데이터 불러오기

In [16]:
import numpy as np
import pandas as pd
url = 'https://raw.githubusercontent.com/MyungKyuYi/AI-class/main/diabetes.csv'
df2 = pd.read_csv(url)
df2

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


## 데이터의 행과 열의 개수 확인하기(input_shape를 결정하기 위해)

In [19]:
df2.shape

(768, 9)

## feature data, target data 나누기

In [22]:
X2 = df2.drop('Outcome', axis = 1)
y2 = df2['Outcome']

## 1단계. 원 핫 인코딩 수행

In [25]:
Y2 = pd.get_dummies(y2).values
# 원 핫 인코딩으로 인해 target과 output 차원이 달라질 때 해결 방안 
# 1. target을 1차원 데이터로 변환한다. 모델 출력 차원 (None, 1), target 차원 (None, 2)
# 2. 모델의 출력층을 target에 맞게 수정한다.
# Y2 = np.argmax(Y2, axis=1) # 1번의 경우

## 2단계. 딥러닝 입력을 위해 numpy 변환

In [28]:
X2 = X2.values

## 3단계. test data set, train data set, val data set나누기

In [31]:
from sklearn.model_selection import train_test_split
# (train data set + val data set)와 test data set을 8:2로 나누기
X_temp, X_test2, y_temp, y_test2 = train_test_split(X2,Y2,test_size=0.2,random_state=0) 
# train data set와 val data set 8:2로 나누기
X_train2, X_val, y_train2, y_val = train_test_split(X_temp, y_temp, test_size=0.2, random_state=42)

## 4단계. 모델 만들기

In [34]:
# 라이브러리 import
from keras.models import Sequential
from keras.layers import Dense

# 순차모델 생성
# 모델과 target을 맞춰주기 위해 출력층을 2개로, loss를 categorical_crossentropy로 설정
model = Sequential() 

model.add(Dense(32,input_shape=(8,),activation='relu'))  # 입력에 들어가는 데이터는 8개이므로 input_shape는 8
model.add(Dense(32,activation='relu')) 
model.add(Dense(2,activation='softmax'))
# model.add(Dense(1,activation='sigmoid')) # 이진 분류 이므로 sigmoid 사용
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy']) 

# model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])  # 이진 분류이므로 binary_crossentropy 사용

model.summary() # 모델 정보 확인

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


## 5단계. 모델 학습 및 평가

In [36]:
model_history=model.fit(x=X_train2, y=y_train2, epochs=30, batch_size=32, validation_data=(X_val, y_val))

# 모델 평가
test_loss, test_acc = model.evaluate(X_test2, y_test2)
print(f"테스트 정확도 : {test_acc}")

Epoch 1/30
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - accuracy: 0.3965 - loss: 4.4952 - val_accuracy: 0.5203 - val_loss: 1.2618
Epoch 2/30
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.5193 - loss: 1.7485 - val_accuracy: 0.5203 - val_loss: 1.1228
Epoch 3/30
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.4942 - loss: 1.8600 - val_accuracy: 0.5041 - val_loss: 2.2687
Epoch 4/30
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.5119 - loss: 1.7482 - val_accuracy: 0.6341 - val_loss: 0.8044
Epoch 5/30
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.5642 - loss: 1.2458 - val_accuracy: 0.4390 - val_loss: 2.1873
Epoch 6/30
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.5542 - loss: 1.6119 - val_accuracy: 0.4146 - val_loss: 2.8299
Epoch 7/30
[1m16/16[0m [32m━━━━━━━━━

# 3. 해당 데이터에서 Outcome을 삭제하고 BMI를 예측하는 회귀를 수행하라

## 데이터 불러오기

In [39]:
import numpy as np
import pandas as pd
url = 'https://raw.githubusercontent.com/MyungKyuYi/AI-class/main/diabetes.csv'
df3 = pd.read_csv(url)
df3

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


## Outcome 삭제하기

In [41]:
df3 = df3.drop('Outcome', axis = 1)

## feature, target data 분리

In [43]:
X3 = df3.drop(['BMI'], axis=1) # outcome 삭제, bmi는 예측요소이므로 제거
y3 = df3['BMI'] # 예측하고자 하는 대상은 bmi

## train, test data set 분리

In [45]:
from sklearn.model_selection import train_test_split

X_train3, X_test3, y_train3, y_test3=train_test_split(X3, y3, test_size=0.2, shuffle=True, random_state=12)
print(X_train3.shape, y_train3.shape)
print(X_test3.shape, y_test3.shape)

(614, 7) (614,)
(154, 7) (154,)


## 모델 생성 및 정확도 측정

In [47]:
# 라이브러리 불러오기
from sklearn.svm import SVR # SVR
from sklearn.linear_model import LinearRegression # LR
from sklearn.ensemble import RandomForestRegressor # RF
from sklearn.tree import DecisionTreeRegressor # DT
from sklearn.neighbors import KNeighborsRegressor # KNN
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_squared_error

# 모델들을 list를 통해 model에 입력
model = [SVR(), LinearRegression(), 
         RandomForestRegressor(), DecisionTreeRegressor(), KNeighborsRegressor()]

# print 할때 모델들 구분을 위해 모델 이름을 적은 list 생성
model_name = ["SVR", "LR", "RF", "DT", "KNN"]

# for문을 통해 model에 저장된 각 요소들을 idk에는 index, function에는 model들을 차례로 넣는다.
# 반복될 때마다 new_model에 SVC, LR, RF, DT, KNN 모델들이 각각 들어가서 훈련을 하고 각 모델 별로 점수를 측정한다.
for idk, function in enumerate(model):
    new_model = function
    new_model.fit(X_train3, y_train3)
    pred = new_model.predict(X_test3)
    print("모델 이름 = ", model_name[idk], ", 모델 평균제곱근오차 = ", mean_squared_error(pred, y_test3))


모델 이름 =  SVR , 모델 평균제곱근오차 =  44.56015600885398
모델 이름 =  LR , 모델 평균제곱근오차 =  51.57886634430023
모델 이름 =  RF , 모델 평균제곱근오차 =  42.059193110389614
모델 이름 =  DT , 모델 평균제곱근오차 =  94.87253246753247
모델 이름 =  KNN , 모델 평균제곱근오차 =  43.73737662337661


## 4. 3번과 동일하지만 dense layer만 사용한 신경망으로 회귀를 수행하라

## 데이터 불러오기

In [50]:
import numpy as np
import pandas as pd
url = 'https://raw.githubusercontent.com/MyungKyuYi/AI-class/main/diabetes.csv'
df4 = pd.read_csv(url)
df4

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


## Outcome 삭제하기

In [52]:
df4 = df4.drop('Outcome', axis = 1)

## feature, target 데이터 분리하기

In [54]:
X4 = df4.drop('BMI',axis=1)
y4 = df4['BMI']

## 원 핫 인코딩 수행

In [56]:
Y4 = pd.get_dummies(y4).values

## Numpy로 변환

In [58]:
X4 = X4.values

## test data set, train data set, val data set나누기

In [60]:
from sklearn.model_selection import train_test_split
# (train data set + val data set)와 test data set을 8:2로 나누기
X_temp4, X_test4, y_temp4, y_test4 = train_test_split(X4,Y4,test_size=0.2,random_state=0) 
# train data set와 val data set 8:2로 나누기
X_train4, X_val4, y_train4, y_val4 = train_test_split(X_temp4, y_temp4, test_size=0.2, random_state=42)

In [61]:
X_train4.shape, y_train4.shape, X_test4.shape, y_test4.shape

((491, 7), (491, 248), (154, 7), (154, 248))

In [62]:
# 라이브러리 import
from keras.models import Sequential
from keras.layers import Dense

# 순차모델 생성
model = Sequential() 

model.add(Dense(32,input_shape=(7,),activation='relu'))  # 입력에 들어가는 데이터는 7개이므로 input_shape는 7
model.add(Dense(32,activation='relu')) 
model.add(Dense(1)) # 회귀는 출력층 노드 1개

model.compile(optimizer='rmsprop', loss='mse', metrics=['mse']) 


model.summary() # 모델 정보 확인

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [63]:
model_history=model.fit(x=X_train4, y=y_train4, epochs=30, batch_size=32, validation_data=(X_val4, y_val4))
# 모델 평가
test_loss, test_mse = model.evaluate(X_test4, y_test4)
print(f"테스트 mse : {test_mse}")

Epoch 1/30
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - loss: 251.1282 - mse: 251.1282 - val_loss: 8.3095 - val_mse: 8.3095
Epoch 2/30
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 7.1332 - mse: 7.1332 - val_loss: 4.3371 - val_mse: 4.3371
Epoch 3/30
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 4.6740 - mse: 4.6740 - val_loss: 5.4596 - val_mse: 5.4596
Epoch 4/30
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 3.6993 - mse: 3.6993 - val_loss: 2.1469 - val_mse: 2.1469
Epoch 5/30
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 3.2091 - mse: 3.2091 - val_loss: 4.0070 - val_mse: 4.0070
Epoch 6/30
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 3.9474 - mse: 3.9474 - val_loss: 4.7491 - val_mse: 4.7491
Epoch 7/30
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 3.15