## 1. 머신러닝 분류

### (1). 데이터 불러오기

In [1]:
import numpy as np
import pandas as pd
url = 'C:/dataset/abalone.csv'
df = pd.read_csv(url, index_col=0)
df


Unnamed: 0_level_0,Sex,Length,Diameter,Height,Whole_weight,Shucked_weight,Viscera_weight,Shell_weight,Rings
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,M,0.455,0.365,0.095,0.5140,0.2245,0.1010,0.1500,15
1,M,0.350,0.265,0.090,0.2255,0.0995,0.0485,0.0700,7
2,F,0.530,0.420,0.135,0.6770,0.2565,0.1415,0.2100,9
3,M,0.440,0.365,0.125,0.5160,0.2155,0.1140,0.1550,10
4,I,0.330,0.255,0.080,0.2050,0.0895,0.0395,0.0550,7
...,...,...,...,...,...,...,...,...,...
4172,F,0.565,0.450,0.165,0.8870,0.3700,0.2390,0.2490,11
4173,M,0.590,0.440,0.135,0.9660,0.4390,0.2145,0.2605,10
4174,M,0.600,0.475,0.205,1.1760,0.5255,0.2875,0.3080,9
4175,F,0.625,0.485,0.150,1.0945,0.5310,0.2610,0.2960,10


### (2). 범주형 데이터를 처리하기 위해 원-핫 인코딩

In [2]:
df = pd.get_dummies(df, columns=['Sex'])

### (3). feature, target data set 분리

In [3]:
X = df.drop('Rings', axis=1) # 문제는 X
y = df['Rings'] # 답은 y

### (4). cut함수 사용전 'Ring'의 최소값, 최대값 확인하기

In [4]:
max = df['Rings'].max()
min = df['Rings'].min()
print("Rings의 최대값 : ", max)
print("Rings의 최소값 : ", min)

Rings의 최대값 :  29
Rings의 최소값 :  1


### (5). cut함수를 통해 연속형을 범주형으로 수정

회귀가 아닌 분류이기 때문에 cut함수를 통해 y값을 범주형으로 바꿔주되 
Rings의 최대 최소값을 확인하여 0-5, 5-10, 10-15, 15-20, 20-25, 25-30를 
각각 0, 1, 2, 3, 4, 5로 수정해준다

In [5]:
Y = pd.cut(y, bins=[0, 5, 10, 15, 20, 25, 30], labels=[0, 1, 2, 3, 4, 5])
Y

id
0       2
1       1
2       1
3       1
4       1
       ..
4172    2
4173    1
4174    1
4175    1
4176    2
Name: Rings, Length: 4177, dtype: category
Categories (6, int64): [0 < 1 < 2 < 3 < 4 < 5]

### (6). test, train data set 분리 

In [6]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y,test_size=0.25,random_state=0)

### (7). 머신러닝 5가지 분류 수행

In [7]:
# 라이브러리 불러오기
from sklearn.svm import SVC # SVC
from sklearn.linear_model import LogisticRegression # LR
from sklearn.ensemble import RandomForestClassifier # RF
from sklearn.tree import DecisionTreeClassifier # DT
from sklearn.neighbors import KNeighborsClassifier # KNN
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

# 모델들을 list를 통해 model에 입력
model = [SVC(random_state=0), LogisticRegression(random_state=0), 
         RandomForestClassifier(random_state=0), DecisionTreeClassifier(random_state=0), KNeighborsClassifier()]

# print 할때 모델들 구분을 위해 모델 이름을 적은 list 생성
model_name = ["SVC", "LR", "RF", "DT", "KNN"]

# for문을 통해 model에 저장된 각 요소들을 idk에는 index, function에는 model들을 차례로 넣는다.
# 반복될 때마다 new_model에 SVC, LR, RF, DT, KNN 모델들이 각각 들어가서 훈련을 하고 각 모델 별로 점수를 측정한다.
for idk, function in enumerate(model):
    new_model = function
    new_model.fit(X_train, Y_train)
    pred = new_model.predict(X_test)
    print("모델 이름 = ", model_name[idk], ", 모델 accuracy_score = ", accuracy_score(Y_test,pred))
    print ("confusion_matrix = ", confusion_matrix(Y_test, pred))

모델 이름 =  SVC , 모델 accuracy_score =  0.6602870813397129
confusion_matrix =  [[ 29  31   0   0   0   0]
 [  6 560  41   0   0   0]
 [  0 218 101   0   0   0]
 [  0  22  30   0   0   0]
 [  0   2   4   0   0   0]
 [  0   0   1   0   0   0]]
모델 이름 =  LR , 모델 accuracy_score =  0.6708133971291866
confusion_matrix =  [[ 27  33   0   0   0   0]
 [  5 557  45   0   0   0]
 [  0 202 116   1   0   0]
 [  0  19  32   1   0   0]
 [  0   0   4   2   0   0]
 [  0   0   1   0   0   0]]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


모델 이름 =  RF , 모델 accuracy_score =  0.6851674641148325
confusion_matrix =  [[ 40  20   0   0   0   0]
 [ 10 522  72   3   0   0]
 [  0 162 148   9   0   0]
 [  0  12  34   6   0   0]
 [  0   0   5   1   0   0]
 [  0   0   1   0   0   0]]
모델 이름 =  DT , 모델 accuracy_score =  0.6086124401913876
confusion_matrix =  [[ 38  22   0   0   0   0]
 [ 18 455 114  18   2   0]
 [  0 153 131  30   2   3]
 [  0  14  25  12   0   1]
 [  0   0   4   2   0   0]
 [  0   0   1   0   0   0]]
모델 이름 =  KNN , 모델 accuracy_score =  0.6564593301435406
confusion_matrix =  [[ 35  25   0   0   0   0]
 [ 10 515  80   2   0   0]
 [  0 175 130  14   0   0]
 [  0  16  30   6   0   0]
 [  0   0   2   4   0   0]
 [  0   0   1   0   0   0]]


## 2. 머신러닝 회귀

### (1). 데이터 불러오기

In [8]:
import numpy as np
import pandas as pd
url = 'C:/dataset/abalone.csv'
df2 = pd.read_csv(url, index_col=0)
df2


Unnamed: 0_level_0,Sex,Length,Diameter,Height,Whole_weight,Shucked_weight,Viscera_weight,Shell_weight,Rings
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,M,0.455,0.365,0.095,0.5140,0.2245,0.1010,0.1500,15
1,M,0.350,0.265,0.090,0.2255,0.0995,0.0485,0.0700,7
2,F,0.530,0.420,0.135,0.6770,0.2565,0.1415,0.2100,9
3,M,0.440,0.365,0.125,0.5160,0.2155,0.1140,0.1550,10
4,I,0.330,0.255,0.080,0.2050,0.0895,0.0395,0.0550,7
...,...,...,...,...,...,...,...,...,...
4172,F,0.565,0.450,0.165,0.8870,0.3700,0.2390,0.2490,11
4173,M,0.590,0.440,0.135,0.9660,0.4390,0.2145,0.2605,10
4174,M,0.600,0.475,0.205,1.1760,0.5255,0.2875,0.3080,9
4175,F,0.625,0.485,0.150,1.0945,0.5310,0.2610,0.2960,10


### (2). 범주형 데이터를 처리하기 위해 원-핫 인코딩

In [9]:
df2 = pd.get_dummies(df2, columns=['Sex'])

### (3). feature와 target 데이터 분리

In [10]:
X2 = df.drop('Rings', axis=1) # 문제는 X
y2 = df['Rings'] # 답은 y

### (4). train, test data set 분리

해당 문제는 분류가 아닌 회귀 즉, 특정 값을 예측하는 문제이므로 cut함수를 통해
y 값을 범주형으로 변경할 필요가 없다.

In [11]:
from sklearn.model_selection import train_test_split
X_train2, X_test2, Y_train2, Y_test2 = train_test_split(X2, y2,test_size=0.25,random_state=0)

In [12]:
# 라이브러리 불러오기
from sklearn.svm import SVR # SVR
from sklearn.linear_model import LinearRegression # LR
from sklearn.ensemble import RandomForestRegressor # RF
from sklearn.tree import DecisionTreeRegressor # DT
from sklearn.neighbors import KNeighborsRegressor # KNN
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_squared_error

# 모델들을 list를 통해 model에 입력
model = [SVR(), LinearRegression(), 
         RandomForestRegressor(), DecisionTreeRegressor(), KNeighborsRegressor()]

# print 할때 모델들 구분을 위해 모델 이름을 적은 list 생성
model_name = ["SVR", "LR", "RF", "DT", "KNN"]

# for문을 통해 model에 저장된 각 요소들을 idk에는 index, function에는 model들을 차례로 넣는다.
# 반복될 때마다 new_model에 SVC, LR, RF, DT, KNN 모델들이 각각 들어가서 훈련을 하고 각 모델 별로 점수를 측정한다.
for idk, function in enumerate(model):
    new_model = function
    new_model.fit(X_train2, Y_train2)
    pred = new_model.predict(X_test2)
    print("모델 이름 = ", model_name[idk], ", 모델 평균제곱근오차 = ", mean_squared_error(pred, Y_test2))

모델 이름 =  SVR , 모델 평균제곱근오차 =  5.226147248116119
모델 이름 =  LR , 모델 평균제곱근오차 =  4.803405133845133
모델 이름 =  RF , 모델 평균제곱근오차 =  4.765753205741627
모델 이름 =  DT , 모델 평균제곱근오차 =  8.993301435406698
모델 이름 =  KNN , 모델 평균제곱근오차 =  5.0401531100478465


## 3. 딥러닝 분류

### (1). 데이터 불러오기

In [13]:
import numpy as np
import pandas as pd

url = 'https://raw.githubusercontent.com/MyungKyuYi/AI-class/main/abalone.csv' 
df3 = pd.read_csv(url, index_col=0)
df3


Unnamed: 0_level_0,Sex,Length,Diameter,Height,Whole_weight,Shucked_weight,Viscera_weight,Shell_weight,Rings
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,M,0.455,0.365,0.095,0.5140,0.2245,0.1010,0.1500,15
1,M,0.350,0.265,0.090,0.2255,0.0995,0.0485,0.0700,7
2,F,0.530,0.420,0.135,0.6770,0.2565,0.1415,0.2100,9
3,M,0.440,0.365,0.125,0.5160,0.2155,0.1140,0.1550,10
4,I,0.330,0.255,0.080,0.2050,0.0895,0.0395,0.0550,7
...,...,...,...,...,...,...,...,...,...
4172,F,0.565,0.450,0.165,0.8870,0.3700,0.2390,0.2490,11
4173,M,0.590,0.440,0.135,0.9660,0.4390,0.2145,0.2605,10
4174,M,0.600,0.475,0.205,1.1760,0.5255,0.2875,0.3080,9
4175,F,0.625,0.485,0.150,1.0945,0.5310,0.2610,0.2960,10


In [14]:
df3.shape

(4177, 9)

### (2). 범주형 데이터를 처리하기 위해 원-핫 인코딩

In [15]:
df3 = pd.get_dummies(df3, columns=['Sex']) # 원 핫 인코딩으로 인해 input_size 2증가

### (3). feature, target data set 분리

In [16]:
X3 = df3.drop('Rings', axis=1) # 문제는 X
y3 = df3['Rings'] # 답은 y

### (4). cut으로 자를 범위를 정하기 위해 Rings의 최대 최소값 측정

In [17]:
max = df3['Rings'].max()
min = df3['Rings'].min()
print("Rings의 최대값 : ", max)
print("Rings의 최소값 : ", min)

Rings의 최대값 :  29
Rings의 최소값 :  1


### (5). cut 함수를 통해 y3을 0-5는 0, 5-10은 1, 10-15는 2, 15-20은 3, 20-25는 4, 25-30은 5로 범주형데이터로 수정

In [18]:
Y3 = pd.cut(y3, bins=[0, 5, 10, 15, 20, 25, 30], labels=[0, 1, 2, 3, 4, 5])

### (6). target을 one-hot 인코팅

In [19]:
Y3 = pd.get_dummies(Y3).values

### (7). 딥러닝에 집어넣기 위해 numpy로 변환, float32로 변환

In [20]:
X3 = X3.values
X3 = X3.astype(np.float32) # 처리하지 못하는 데이터 형 대신 float32로 수정

### (8). test data set, train data set, val data set나누기

In [21]:
from sklearn.model_selection import train_test_split
# (train data set + val data set)와 test data set을 8:2로 나누기
X_temp3, X_test3, y_temp3, y_test3 = train_test_split(X3,Y3,test_size=0.2,random_state=0) 
# train data set와 val data set 8:2로 나누기
X_train3, X_val3, y_train3, y_val3 = train_test_split(X_temp3, y_temp3, test_size=0.2, random_state=42)

### (9). 모델 생성

In [22]:
# 라이브러리 import
from keras.models import Sequential
from keras.layers import Dense

# 순차모델 생성
model = Sequential() 

model.add(Dense(32,input_shape=(10,),activation='relu'))  # 입력에 들어가는 데이터는 10개이므로 input_shape는 8
model.add(Dense(32,activation='relu')) 
model.add(Dense(6,activation='softmax')) # 0, 1, 2, 3, 4, 5로 분류
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy']) 

model.summary() # 모델 정보 확인

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


### (9). 훈련 및 정확도 측정

In [23]:

model_history=model.fit(x=X_train3, y=y_train3, epochs=30, batch_size=32, validation_data=(X_val3, y_val3))

# 모델 평가
test_loss, test_acc = model.evaluate(X_test3, y_test3)
print(f"테스트 정확도 : {test_acc}")

Epoch 1/30
[1m84/84[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 8ms/step - accuracy: 0.5427 - loss: 1.4846 - val_accuracy: 0.5964 - val_loss: 0.9608
Epoch 2/30
[1m84/84[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6224 - loss: 0.9339 - val_accuracy: 0.5949 - val_loss: 0.9009
Epoch 3/30
[1m84/84[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.6461 - loss: 0.8630 - val_accuracy: 0.6472 - val_loss: 0.8460
Epoch 4/30
[1m84/84[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.6368 - loss: 0.8545 - val_accuracy: 0.6547 - val_loss: 0.8177
Epoch 5/30
[1m84/84[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.6459 - loss: 0.8257 - val_accuracy: 0.6398 - val_loss: 0.8159
Epoch 6/30
[1m84/84[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.6502 - loss: 0.8201 - val_accuracy: 0.6547 - val_loss: 0.7781
Epoch 7/30
[1m84/84[0m [32m━━━━━━━━━━

## 4. 딥러닝 회귀

### 1. 데이터 불러오기

In [25]:
import numpy as np
import pandas as pd

url = 'https://raw.githubusercontent.com/MyungKyuYi/AI-class/main/abalone.csv' 
df4 = pd.read_csv(url, index_col=0)
df4


Unnamed: 0_level_0,Sex,Length,Diameter,Height,Whole_weight,Shucked_weight,Viscera_weight,Shell_weight,Rings
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,M,0.455,0.365,0.095,0.5140,0.2245,0.1010,0.1500,15
1,M,0.350,0.265,0.090,0.2255,0.0995,0.0485,0.0700,7
2,F,0.530,0.420,0.135,0.6770,0.2565,0.1415,0.2100,9
3,M,0.440,0.365,0.125,0.5160,0.2155,0.1140,0.1550,10
4,I,0.330,0.255,0.080,0.2050,0.0895,0.0395,0.0550,7
...,...,...,...,...,...,...,...,...,...
4172,F,0.565,0.450,0.165,0.8870,0.3700,0.2390,0.2490,11
4173,M,0.590,0.440,0.135,0.9660,0.4390,0.2145,0.2605,10
4174,M,0.600,0.475,0.205,1.1760,0.5255,0.2875,0.3080,9
4175,F,0.625,0.485,0.150,1.0945,0.5310,0.2610,0.2960,10


### (2). 범주형 데이터 처리를 위한 원-핫 인코딩

In [26]:
df4 = pd.get_dummies(df4, columns=['Sex'])

### (3). feature, target data 분리

In [27]:
X4 = df4.drop('Rings', axis=1) # 문제는 X
y4 = df4['Rings'] # 답은 y

### (4). 원-핫 인코딩

In [29]:
Y4 = pd.get_dummies(y4).values

### (5). 딥러닝에 집어넣기 위해 numpy로 변환, float32로 변환

In [30]:
X4 = X4.values
X4 = X4.astype(np.float32) # 처리하지 못하는 데이터 형 대신 float32로 수정

### (6). test data set, train data set, val data set나누기

In [31]:
from sklearn.model_selection import train_test_split
# (train data set + val data set)와 test data set을 8:2로 나누기
X_temp4, X_test4, y_temp4, y_test4 = train_test_split(X4,Y4,test_size=0.2,random_state=0) 
# train data set와 val data set 8:2로 나누기
X_train4, X_val4, y_train4, y_val4 = train_test_split(X_temp4, y_temp4, test_size=0.2, random_state=42)

### (7). 모델 생성

In [32]:
# 라이브러리 import
from keras.models import Sequential
from keras.layers import Dense

# 순차모델 생성

model = Sequential() 

model.add(Dense(32,input_shape=(10,),activation='relu'))  # 입력에 들어가는 데이터는 10개이므로 input_shape는 8
model.add(Dense(32,activation='relu')) 
model.add(Dense(1))
model.compile(optimizer='rmsprop', loss='mse', metrics=['mse']) 

model.summary() # 모델 정보 확인

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


### (8). 모델 학습 및 평가

In [33]:
model_history=model.fit(x=X_train4, y=y_train4, epochs=30, batch_size=32, validation_data=(X_val4, y_val4))
# 모델 평가
test_loss, test_mse = model.evaluate(X_test4, y_test4)
print(f"테스트 mse : {test_mse}")

Epoch 1/30
[1m84/84[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 0.0370 - mse: 0.0370 - val_loss: 0.0350 - val_mse: 0.0350
Epoch 2/30
[1m84/84[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.0349 - mse: 0.0349 - val_loss: 0.0347 - val_mse: 0.0347
Epoch 3/30
[1m84/84[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.0349 - mse: 0.0349 - val_loss: 0.0346 - val_mse: 0.0346
Epoch 4/30
[1m84/84[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.0348 - mse: 0.0348 - val_loss: 0.0346 - val_mse: 0.0346
Epoch 5/30
[1m84/84[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.0347 - mse: 0.0347 - val_loss: 0.0347 - val_mse: 0.0347
Epoch 6/30
[1m84/84[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.0347 - mse: 0.0347 - val_loss: 0.0345 - val_mse: 0.0345
Epoch 7/30
[1m84/84[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.0346 -