# 전처리

In [2]:
from google.colab import drive
drive.mount('/content/drive')
import os
os.chdir("/content/drive/MyDrive/Colab Notebooks")

Mounted at /content/drive


# 2022-06-21

In [None]:
# 머신러닝 핵심 알고리즘 - 1. 로지스틱 회귀모델
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
data = pd.read_csv("breast-cancer-wisconsin.csv")

x = data[data.columns[:10]]
y = data[["Class"]]

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state = 42, stratify = y)

from sklearn.preprocessing import MinMaxScaler
scaler_minmax = MinMaxScaler()
scaler_minmax.fit(x_train)
x_scaled_train = scaler_minmax.transform(x_train)
x_scaled_test = scaler_minmax.transform(x_test)

from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(x_scaled_train, y_train)
pred_train = model.predict(x_scaled_train)
pred_test = model.predict(x_scaled_test)

from sklearn.metrics import confusion_matrix
confusion_train = confusion_matrix(y_train, pred_train)
print(confusion_train)

from sklearn.metrics import classification_report
report = classification_report(y_train, pred_train)
print(report)

from sklearn.model_selection import GridSearchCV
param_grid = {"C": [0.001, 0.01, 0.1, 1, 10, 100]}
grid_search = GridSearchCV(LogisticRegression(), param_grid, cv = 5, return_train_score = True)
grid_search.fit(x_scaled_train, y_train)
print(f"가장 높은 점수, 파라미터 : {grid_search.best_score_}, {grid_search.best_params_}")

from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
param_distrib = {"C": randint(low = 0.001, high = 100)}
random_search = RandomizedSearchCV(LogisticRegression(), param_distrib, n_iter = 100, cv = 5, return_train_score = True)
random_search.fit(x_scaled_train, y_train)
print(f"가장 높은 점수, 파라미터 : {random_search.best_score_}, {random_search.best_params_}")

[[328   5]
 [  9 170]]
              precision    recall  f1-score   support

           0       0.97      0.98      0.98       333
           1       0.97      0.95      0.96       179

    accuracy                           0.97       512
   macro avg       0.97      0.97      0.97       512
weighted avg       0.97      0.97      0.97       512

가장 높은 점수, 파라미터 : 0.972606129830573, {'C': 10}
가장 높은 점수, 파라미터 : 0.9745478774033887, {'C': 13}


In [None]:
# 머신러닝 핵심 알고리즘 - 2. K - Nearest Neighbor(KNN)
# 회귀문제와 분류문제 둘다에 쓸 수 있다.
# sklearn.neighbors 안의 KNeighborsClassifier 와 KNeighborsRegressor가 바로 그것이다.
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
data = pd.read_csv("breast-cancer-wisconsin.csv")

# 분류문제
x = data[data.columns[:10]]
y = data[["Class"]]

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state = 42, stratify = y)

from sklearn.preprocessing import MinMaxScaler
scaler_minmax = MinMaxScaler()
scaler_minmax.fit(x_train)
x_scaled_train = scaler_minmax.transform(x_train)
x_scaled_test = scaler_minmax.transform(x_test)

from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
model = KNeighborsClassifier()
model.fit(x_scaled_train, y_train)
pred_train = model.predict(x_scaled_train)
model.score(x_scaled_test, y_test)

from sklearn.model_selection import GridSearchCV
param_grid = {"n_neighbors": [1, 3, 5, 7, 9, 11]}
grid_search = GridSearchCV(KNeighborsClassifier(), param_grid, cv = 5)
grid_search.fit(x_scaled_train, y_train)
print(f"가장 높은 점수, 파라미터 : {grid_search.best_score_}, {grid_search.best_params_}")
print(f"테스트 데이터 점수 : {grid_search.score(x_scaled_test, y_test)}")

from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
param_distrib = {"n_neighbors": randint(low = 1, high = 20)}
random_search = RandomizedSearchCV(KNeighborsClassifier(), param_distrib, cv = 5, n_iter = 20)
random_search.fit(x_scaled_train, y_train)
print(f"가장 높은 점수, 파라미터 : {random_search.best_score_}, {random_search.best_params_}")
print(f"테스트 데이터 점수 : {random_search.score(x_scaled_test, y_test)}")


# 회귀문제
data2 = pd.read_csv("house_price.csv")
x = data2[data2.columns[:5]]
y = data2[["house_value"]]

x_train, x_test, y_train, y_test = train_test_split(x, y, random_state = 42)
scaler_minmax.fit(x_train)
x_scaled_train = scaler_minmax.transform(x_train)
x_scaled_test = scaler_minmax.transform(x_test)

param_grid = {"n_neighbors": [25, 30, 35, 40, 45, 50, 55, 60]}
grid_search = GridSearchCV(KNeighborsRegressor(), param_grid, cv = 5)
grid_search.fit(x_scaled_train, y_train)
print(f"가장 높은 점수, 파라미터 : {grid_search.best_score_}, {grid_search.best_params_}")
print(f"테스트 데이터 점수 : {grid_search.score(x_scaled_test, y_test)}")

param_distrib = {"n_neighbors": randint(low = 1, high = 20)}
random_search = RandomizedSearchCV(KNeighborsRegressor(), param_distrib, cv = 5, n_iter = 20)
random_search.fit(x_scaled_train, y_train)
print(f"가장 높은 점수, 파라미터 : {random_search.best_score_}, {random_search.best_params_}")
print(f"테스트 데이터 점수 : {random_search.score(x_scaled_test, y_test)}")

가장 높은 점수, 파라미터 : 0.9823910146582906, {'n_neighbors': 3}
테스트 데이터 점수 : 0.9532163742690059
가장 높은 점수, 파라미터 : 0.9823910146582906, {'n_neighbors': 3}
테스트 데이터 점수 : 0.9532163742690059
가장 높은 점수, 파라미터 : 0.621494422568473, {'n_neighbors': 35}
테스트 데이터 점수 : 0.6280997554371417
가장 높은 점수, 파라미터 : 0.6191282216089761, {'n_neighbors': 19}
테스트 데이터 점수 : 0.6275082445425657


In [None]:
# 머신러닝 핵심 알고리즘 - 3. Naive Bayes
# 회귀문제와 분류문제 둘다에 쓸 수 있다.
# 분류는 sklearn.naive_bayes 안의 GaussianNB
# 회귀는 sklearn.linear_model 안의 BayesianRidge가 적합하다.

# 분류문제
data = pd.read_csv("breast-cancer-wisconsin.csv")

x = data[data.columns[:10]]
y = data[["Class"]]

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state = 42, stratify = y)

from sklearn.preprocessing import MinMaxScaler
scaler_minmax = MinMaxScaler()
scaler_minmax.fit(x_train)
x_scaled_train = scaler_minmax.transform(x_train)
x_scaled_test = scaler_minmax.transform(x_test)

from sklearn.naive_bayes import GaussianNB
model = GaussianNB()
model.fit(x_scaled_train, y_train)
pred_train = model.predict(x_scaled_train)
print(f"기본 하이퍼파라미터 테스트 데이터 점수 : {model.score(x_scaled_test, y_test)}")

# 회귀문제
data2 = pd.read_csv("house_price.csv")
x = data2[data2.columns[:5]]
y = data2[["house_value"]]

x_train, x_test, y_train, y_test = train_test_split(x, y, random_state = 42)
scaler_minmax.fit(x_train)
x_scaled_train = scaler_minmax.transform(x_train)
x_scaled_test = scaler_minmax.transform(x_test)

from sklearn.linear_model import BayesianRidge
param_grid = {"alpha_1": [1e-07, 1e-06, 1e-05, 1e-04, 1e-03, 1e-02, 1e-01, 1, 2, 3, 4], "lambda_1": [1e-07, 1e-06, 1e-05, 1e-04, 1e-03, 1e-02, 1e-01, 1, 2, 3, 4]}
grid_search = GridSearchCV(BayesianRidge(), param_grid, cv = 5)
grid_search.fit(x_scaled_train, y_train)
print(f"가장 높은 점수, 파라미터 : {grid_search.best_score_}, {grid_search.best_params_}")
print(f"테스트 데이터 점수 : {grid_search.score(x_scaled_test, y_test)}")

param_distrib = {"alpha_1": randint(low = 1e-06, high = 10), "alpha_1": randint(low = 1e-06, high = 10)}
random_search = RandomizedSearchCV(BayesianRidge(), param_distrib, cv = 5, n_iter = 20)
random_search.fit(x_scaled_train, y_train)
print(f"가장 높은 점수, 파라미터 : {random_search.best_score_}, {random_search.best_params_}")
print(f"테스트 데이터 점수 : {random_search.score(x_scaled_test, y_test)}")

기본 하이퍼파라미터 테스트 데이터 점수 : 0.9590643274853801
가장 높은 점수, 파라미터 : 0.5702758458286303, {'alpha_1': 4, 'lambda_1': 1e-07}
테스트 데이터 점수 : 0.5826111202230753
가장 높은 점수, 파라미터 : 0.5702758458751108, {'alpha_1': 9}
테스트 데이터 점수 : 0.5826111181974689


In [None]:
# 머신러닝 핵심 알고리즘 - 4. 인공신경망
# 회귀문제와 분류문제 둘다에 쓸 수 있다.
# sklearn.neural_network 안의 MLPClassifier 와 MLPRegressor 바로 그것이다.

# 분류문제
data = pd.read_csv("breast-cancer-wisconsin.csv")

x = data[data.columns[:10]]
y = data[["Class"]]

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state = 42, stratify = y)

from sklearn.preprocessing import MinMaxScaler
scaler_minmax = MinMaxScaler()
scaler_minmax.fit(x_train)
x_scaled_train = scaler_minmax.transform(x_train)
x_scaled_test = scaler_minmax.transform(x_test)

from sklearn.neural_network import MLPClassifier
model = MLPClassifier()
model.fit(x_scaled_train, y_train)
pred_train = model.predict(x_scaled_train)
print(f"기본 하이퍼파라미터 테스트 데이터 점수 : {model.score(x_scaled_test, y_test)}")

# from sklearn.model_selection import GridSearchCV
# param_grid = {"hidden_layer_sizes": [10, 30, 50, 100], "solver": ["sgd", "adam"], "activation": ["tanh", "relu"]}
# grid_search = GridSearchCV(MLPClassifier(), param_grid, cv = 5)
# grid_search.fit(x_scaled_train, y_train)
# print(f"가장 높은 점수, 파라미터 : {grid_search.best_score_}, {grid_search.best_params_}")
# print(f"테스트 데이터 점수 : {grid_search.score(x_scaled_test, y_test)}")

# from sklearn.model_selection import RandomizedSearchCV
# from scipy.stats import randint
# param_distrib = {"hidden_layer_sizes": randint(low = 10, high = 100), "solver": ["sgd", "adam"], "activation": ["tanh", "relu"]}
# random_search = RandomizedSearchCV(MLPClassifier(), param_distrib, cv = 5, n_iter = 10)
# random_search.fit(x_scaled_train, y_train)
# print(f"가장 높은 점수, 파라미터 : {random_search.best_score_}, {random_search.best_params_}")
# print(f"테스트 데이터 점수 : {random_search.score(x_scaled_test, y_test)}")


# 회귀문제
data2 = pd.read_csv("house_price.csv")
x = data2[data2.columns[:5]]
y = data2[["house_value"]]

x_train, x_test, y_train, y_test = train_test_split(x, y, random_state = 42)
scaler_minmax.fit(x_train)
x_scaled_train = scaler_minmax.transform(x_train)
x_scaled_test = scaler_minmax.transform(x_test)

from sklearn.neural_network import MLPRegressor
model = MLPRegressor(hidden_layer_sizes = (64, 64, 64), activation = "relu", max_iter = 2000, random_state = 42)
model.fit(x_scaled_train, y_train)
pred_train = model.predict(x_scaled_train)

기본 하이퍼파라미터 테스트 데이터 점수 : 0.9649122807017544


# 2022-06-23

In [None]:
# 머신러닝 핵심 알고리즘 - 5. 서포트 벡터머신
# 회귀문제와 분류문제 둘다에 쓸 수 있다.
# sklearn.svm 안의 SVC 와 SVR 바로 그것이다.

# 분류문제

import pandas as pd
import warnings
warnings.filterwarnings("ignore")
data = pd.read_csv("breast-cancer-wisconsin.csv")
# data.head()

x = data[data.columns[1:10]]
y = data[["Class"]]

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state = 42, stratify = y)

from sklearn.preprocessing import MinMaxScaler
scaler_minmax = MinMaxScaler()
scaler_minmax.fit(x_train)
x_scaled_train = scaler_minmax.transform(x_train)
x_scaled_test = scaler_minmax.transform(x_test)

from sklearn.svm import SVC
model = SVC()
model.fit(x_scaled_train, y_train)
pred_train = model.predict(x_scaled_train)
pred_test = model.predict(x_scaled_test)
print(f"훈련데이터 점수 : {model.score(x_scaled_train, y_train)}")
print(f"테스트데이터 점수 : {model.score(x_scaled_test, y_test)}")

from sklearn.metrics import confusion_matrix
confusion_test = confusion_matrix(y_test, pred_test)
# print(confusion_test)
from sklearn.metrics import classification_report
report = classification_report(y_test, pred_test)
# print(report)

from sklearn.model_selection import GridSearchCV
param_grid = {"C": [0.001, 0.01, 0.1, 1, 10, 100], "gamma": [0.001, 0.01, 0.1, 1, 10, 100], "kernel": ["rbf", "linear"]}
grid_search = GridSearchCV(SVC(), param_grid, cv = 5)
grid_search.fit(x_scaled_train, y_train)
print(f"가장 좋은 파라미터 : {grid_search.best_params_}")
print(f"가장 좋은 점수 : {grid_search.best_score_}")

from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
param_distrib = {"C": randint(low = 0.001, high = 100), "gamma": randint(low = 0.001, high = 100), "kernel": ["rbf", "linear"]}
random_search = RandomizedSearchCV(SVC(), param_distrib, cv = 5, n_iter = 20)
random_search.fit(x_scaled_train, y_train)
print(f"가장 좋은 파라미터 : {random_search.best_params_}")
print(f"가장 좋은 점수 : {random_search.best_score_}")


# 회귀문제
data2 = pd.read_csv("house_price.csv")
x = data2[data2.columns[:5]]
y = data2[["house_value"]]

x_train, x_test, y_train, y_test = train_test_split(x, y, random_state = 42)
scaler_minmax.fit(x_train)
x_scaled_train = scaler_minmax.transform(x_train)
x_scaled_test = scaler_minmax.transform(x_test)

from sklearn.svm import SVR
model = SVR(kernel = "poly")
model.fit(x_scaled_train, y_train)
print(f"훈련데이터 점수 : {model.score(x_scaled_train, y_train)}")
print(f"테스트데이터 점수 : {model.score(x_scaled_test, y_test)}")

훈련데이터 점수 : 0.984375
테스트데이터 점수 : 0.9649122807017544
가장 좋은 파라미터 : {'C': 1, 'gamma': 0.001, 'kernel': 'linear'}
가장 좋은 점수 : 0.9745669141442985
가장 좋은 파라미터 : {'C': 2, 'gamma': 90, 'kernel': 'linear'}
가장 좋은 점수 : 0.9745669141442985
훈련데이터 점수 : 0.4411532001585847
테스트데이터 점수 : 0.45698485085656304


# 2022-06-24

In [4]:
# 머신러닝 핵심 알고리즘 - 6. Decision Tree
# 회귀문제와 분류문제 둘다에 쓸 수 있다.
# sklearn.tree 안의 DecisionTreeClassifier 와 DecisionTreeRegressor 바로 그것이다.

# 분류문제

import pandas as pd
import warnings
warnings.filterwarnings("ignore")
data = pd.read_csv("breast-cancer-wisconsin.csv")
# data.head()

x = data[data.columns[1:10]]
y = data[["Class"]]

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state = 42, stratify = y)

from sklearn.preprocessing import MinMaxScaler
scaler_minmax = MinMaxScaler()
scaler_minmax.fit(x_train)
x_scaled_train = scaler_minmax.transform(x_train)
x_scaled_test = scaler_minmax.transform(x_test)

from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier()
model.fit(x_scaled_train, y_train)
pred_train = model.predict(x_scaled_train)
pred_test = model.predict(x_scaled_test)
print(f"훈련데이터 점수 : {model.score(x_scaled_train, y_train)}")
print(f"테스트데이터 점수 : {model.score(x_scaled_test, y_test)}")

from sklearn.metrics import confusion_matrix
confusion_test = confusion_matrix(y_test, pred_test)
# print(confusion_test)
from sklearn.metrics import classification_report
report = classification_report(y_test, pred_test)
# print(report)

from sklearn.model_selection import GridSearchCV
param_grid = {"min_samples_leaf": range(2, 20, 2), "max_depth": range(1, 50, 2)}
grid_search = GridSearchCV(DecisionTreeClassifier(), param_grid, cv = 5)
grid_search.fit(x_scaled_train, y_train)
print(f"가장 좋은 파라미터 : {grid_search.best_params_}")
print(f"가장 좋은 점수 : {grid_search.best_score_}")

from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
param_distrib = {"min_samples_leaf": randint(low = 1, high = 50), "max_depth": randint(low = 1, high = 20)}
random_search = RandomizedSearchCV(DecisionTreeClassifier(), param_distrib, cv = 5, n_iter = 20)
random_search.fit(x_scaled_train, y_train)
print(f"가장 좋은 파라미터 : {random_search.best_params_}")
print(f"가장 좋은 점수 : {random_search.best_score_}")


# 회귀문제
data2 = pd.read_csv("house_price.csv")
x = data2[data2.columns[:5]]
y = data2[["house_value"]]

x_train, x_test, y_train, y_test = train_test_split(x, y, random_state = 42)
scaler_minmax.fit(x_train)
x_scaled_train = scaler_minmax.transform(x_train)
x_scaled_test = scaler_minmax.transform(x_test)

from sklearn.tree import DecisionTreeRegressor
model = DecisionTreeRegressor()
model.fit(x_scaled_train, y_train)
print(f"훈련데이터 점수 : {model.score(x_scaled_train, y_train)}")
print(f"테스트데이터 점수 : {model.score(x_scaled_test, y_test)}")

from sklearn.model_selection import GridSearchCV
param_grid = {"min_samples_leaf": range(2, 20, 2), "max_depth": range(1, 50, 2)}
grid_search = GridSearchCV(DecisionTreeRegressor(), param_grid, cv = 5)
grid_search.fit(x_scaled_train, y_train)
print(f"가장 좋은 파라미터 : {grid_search.best_params_}")
print(f"가장 좋은 점수 : {grid_search.best_score_}")

from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
param_distrib = {"min_samples_leaf": randint(low = 1, high = 50), "max_depth": randint(low = 1, high = 20)}
random_search = RandomizedSearchCV(DecisionTreeRegressor(), param_distrib, cv = 5, n_iter = 20)
random_search.fit(x_scaled_train, y_train)
print(f"가장 좋은 파라미터 : {random_search.best_params_}")
print(f"가장 좋은 점수 : {random_search.best_score_}")

훈련데이터 점수 : 1.0
테스트데이터 점수 : 0.9590643274853801
가장 좋은 파라미터 : {'max_depth': 17, 'min_samples_leaf': 2}
가장 좋은 점수 : 0.9569388920616791
가장 좋은 파라미터 : {'max_depth': 8, 'min_samples_leaf': 1}
가장 좋은 점수 : 0.9608604606891301
훈련데이터 점수 : 1.0
테스트데이터 점수 : 0.30850389391214716
가장 좋은 파라미터 : {'max_depth': 9, 'min_samples_leaf': 18}
가장 좋은 점수 : 0.5792661683673163
가장 좋은 파라미터 : {'max_depth': 15, 'min_samples_leaf': 46}
가장 좋은 점수 : 0.584687936692273


In [9]:
# 머신러닝 핵심 알고리즘 - 7. Random Forest
# 회귀문제와 분류문제 둘다에 쓸 수 있다.
# sklearn.ensemble 안의 RandomForestClassifier 와 RandomForestRegressor 바로 그것이다.

# 분류문제

import pandas as pd
import warnings
warnings.filterwarnings("ignore")
data = pd.read_csv("breast-cancer-wisconsin.csv")
# data.head()

x = data[data.columns[1:10]]
y = data[["Class"]]

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state = 42, stratify = y)

from sklearn.preprocessing import MinMaxScaler
scaler_minmax = MinMaxScaler()
scaler_minmax.fit(x_train)
x_scaled_train = scaler_minmax.transform(x_train)
x_scaled_test = scaler_minmax.transform(x_test)

from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier()
model.fit(x_scaled_train, y_train)
pred_train = model.predict(x_scaled_train)
pred_test = model.predict(x_scaled_test)
print(f"훈련데이터 점수 : {model.score(x_scaled_train, y_train)}")
print(f"테스트데이터 점수 : {model.score(x_scaled_test, y_test)}")

from sklearn.metrics import confusion_matrix
confusion_test = confusion_matrix(y_test, pred_test)
# print(confusion_test)
from sklearn.metrics import classification_report
report = classification_report(y_test, pred_test)
# print(report)

# help(RandomForestClassifier)

from sklearn.model_selection import GridSearchCV
param_grid = {"n_estimators": range(100, 1001, 100), "max_features": ["auto", "log2"]}
grid_search = GridSearchCV(RandomForestClassifier(), param_grid, cv = 5)
grid_search.fit(x_scaled_train, y_train)
print(f"가장 좋은 파라미터 : {grid_search.best_params_}")
print(f"가장 좋은 점수 : {grid_search.best_score_}")

from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
param_distrib = {"n_estimators": randint(low = 100, high = 1000), "max_features": ["auto", "log2"]}
random_search = RandomizedSearchCV(RandomForestClassifier(), param_distrib, cv = 5, n_iter = 20)
random_search.fit(x_scaled_train, y_train)
print(f"가장 좋은 파라미터 : {random_search.best_params_}")
print(f"가장 좋은 점수 : {random_search.best_score_}")


# 회귀문제
data2 = pd.read_csv("house_price.csv")
x = data2[data2.columns[:5]]
y = data2[["house_value"]]

x_train, x_test, y_train, y_test = train_test_split(x, y, random_state = 42)
scaler_minmax.fit(x_train)
x_scaled_train = scaler_minmax.transform(x_train)
x_scaled_test = scaler_minmax.transform(x_test)

from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor()
model.fit(x_scaled_train, y_train)
print(f"훈련데이터 점수 : {model.score(x_scaled_train, y_train)}")
print(f"테스트데이터 점수 : {model.score(x_scaled_test, y_test)}")

# for i in range(100, 500, 100):
#     model = RandomForestRegressor(n_estimators = i)
#     model.fit(x_scaled_train, y_train)
#     print("Hey")
#     print(f"훈련데이터 점수 : {model.score(x_scaled_train, y_train)}")
#     print(f"테스트데이터 점수 : {model.score(x_scaled_test, y_test)}")

from sklearn.model_selection import GridSearchCV
param_grid = {"n_estimators": range(100, 500, 100)}
grid_search = GridSearchCV(RandomForestRegressor(), param_grid, cv = 5)
grid_search.fit(x_scaled_train, y_train)
print(f"가장 좋은 파라미터 : {grid_search.best_params_}")
print(f"가장 좋은 점수 : {grid_search.best_score_}")

from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
param_distrib = {"n_estimators": randint(low = 100, high = 5500)}
random_search = RandomizedSearchCV(RandomForestRegressor(), param_distrib, cv = 5, n_iter = 5)
random_search.fit(x_scaled_train, y_train)
print(f"가장 좋은 파라미터 : {random_search.best_params_}")
print(f"가장 좋은 점수 : {random_search.best_score_}")

훈련데이터 점수 : 1.0
테스트데이터 점수 : 0.9649122807017544
훈련데이터 점수 : 0.9458032578290736
테스트데이터 점수 : 0.6221515067450613
Hey
훈련데이터 점수 : 0.9452424753324289
테스트데이터 점수 : 0.6251573034882381
Hey
훈련데이터 점수 : 0.947368268190133
테스트데이터 점수 : 0.6235303567620897
Hey
훈련데이터 점수 : 0.9476565133389108
테스트데이터 점수 : 0.6256947644006485
Hey
훈련데이터 점수 : 0.9477011535374694
테스트데이터 점수 : 0.6253613164318899


In [12]:
# 머신러닝 핵심 알고리즘 - 8. 투표기반 앙상블
# 회귀문제와 분류문제 둘다에 쓸 수 있다.
# sklearn.ensemble 안의 VotingClassifier 와 VotingRegressor 바로 그것이다.

# 분류문제

import pandas as pd
import warnings
warnings.filterwarnings("ignore")
data = pd.read_csv("breast-cancer-wisconsin.csv")
# data.head()

x = data[data.columns[1:10]]
y = data[["Class"]]

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state = 42, stratify = y)

from sklearn.preprocessing import MinMaxScaler
scaler_minmax = MinMaxScaler()
scaler_minmax.fit(x_train)
x_scaled_train = scaler_minmax.transform(x_train)
x_scaled_test = scaler_minmax.transform(x_test)

from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier

model_svc = SVC(random_state = 42, probability = True)
model_logistic = LogisticRegression(random_state = 42)
model_rf = RandomForestClassifier(random_state = 42)
voting_classifier = VotingClassifier([("svc", model_svc), ("lg", model_logistic), ("rf", model_rf)], voting = "soft")
voting_classifier.fit(x_scaled_train, y_train)
print(f"훈련데이터 점수 : {voting_classifier.score(x_scaled_train, y_train)}")
print(f"테스트데이터 점수 : {voting_classifier.score(x_scaled_test, y_test)}")

# 회귀문제
data2 = pd.read_csv("house_price.csv")
x = data2[data2.columns[:5]]
y = data2[["house_value"]]

x_train, x_test, y_train, y_test = train_test_split(x, y, random_state = 42)
scaler_minmax.fit(x_train)
x_scaled_train = scaler_minmax.transform(x_train)
x_scaled_test = scaler_minmax.transform(x_test)

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, VotingRegressor
model_linear = LinearRegression()
model_rf = RandomForestRegressor(random_state = 42)
voting_regressor = VotingRegressor([("linear", model_linear), ("rf", model_rf)])
voting_regressor.fit(x_scaled_train, y_train)
print(f"훈련데이터 점수 : {voting_regressor.score(x_scaled_train, y_train)}")
print(f"테스트데이터 점수 : {voting_regressor.score(x_scaled_test, y_test)}")

훈련데이터 점수 : 0.98828125
테스트데이터 점수 : 0.9649122807017544
훈련데이터 점수 : 0.8128809262725605
테스트데이터 점수 : 0.6269061036317223


In [14]:
# 머신러닝 핵심 알고리즘 - 9. 앙상블 배깅
# 회귀문제와 분류문제 둘다에 쓸 수 있다.
# sklearn.ensemble 안의 BaggingClassifier 와 BaggingRegressor 바로 그것이다.

# 분류문제

import pandas as pd
import warnings
warnings.filterwarnings("ignore")
data = pd.read_csv("breast-cancer-wisconsin.csv")
# data.head()

x = data[data.columns[1:10]]
y = data[["Class"]]

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state = 42, stratify = y)

from sklearn.preprocessing import MinMaxScaler
scaler_minmax = MinMaxScaler()
scaler_minmax.fit(x_train)
x_scaled_train = scaler_minmax.transform(x_train)
x_scaled_test = scaler_minmax.transform(x_test)

from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier
model = BaggingClassifier(base_estimator = SVC(), n_estimators = 10, random_state = 42)
model.fit(x_scaled_train, y_train)
print(f"훈련데이터 점수 : {model.score(x_scaled_train, y_train)}")
print(f"테스트데이터 점수 : {model.score(x_scaled_test, y_test)}")

# 회귀문제
data2 = pd.read_csv("house_price.csv")
x = data2[data2.columns[:5]]
y = data2[["house_value"]]

x_train, x_test, y_train, y_test = train_test_split(x, y, random_state = 42)
scaler_minmax.fit(x_train)
x_scaled_train = scaler_minmax.transform(x_train)
x_scaled_test = scaler_minmax.transform(x_test)

from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import BaggingRegressor
model = BaggingRegressor(base_estimator = KNeighborsRegressor(), random_state = 42, n_estimators = 10)
model.fit(x_scaled_train, y_train)
print(f"훈련데이터 점수 : {model.score(x_scaled_train, y_train)}")
print(f"테스트데이터 점수 : {model.score(x_scaled_test, y_test)}")

훈련데이터 점수 : 0.982421875
테스트데이터 점수 : 0.9649122807017544
훈련데이터 점수 : 0.7305248851529351
테스트데이터 점수 : 0.6020446422233702


In [17]:
# 머신러닝 핵심 알고리즘 - 10. 앙상블 부스팅
# 회귀문제와 분류문제 둘다에 쓸 수 있다.
# sklearn.ensemble 안의 AdaBoostClassifier 와 GradientBoostingClassifier가 분류
# AdaBoostRegressor, GradientBoostingRegressor가 회귀

# 분류문제

import pandas as pd
import warnings
warnings.filterwarnings("ignore")
data = pd.read_csv("breast-cancer-wisconsin.csv")
# data.head()

x = data[data.columns[1:10]]
y = data[["Class"]]

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state = 42, stratify = y)

from sklearn.preprocessing import MinMaxScaler
scaler_minmax = MinMaxScaler()
scaler_minmax.fit(x_train)
x_scaled_train = scaler_minmax.transform(x_train)
x_scaled_test = scaler_minmax.transform(x_test)

from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
model_ada = AdaBoostClassifier(n_estimators = 100, random_state = 42)
model_grad = GradientBoostingClassifier(random_state = 42)
model_ada.fit(x_scaled_train, y_train)
model_grad.fit(x_scaled_train, y_train)
print(f"훈련데이터 점수 : {model_ada.score(x_scaled_train, y_train)}")
print(f"테스트데이터 점수 : {model_ada.score(x_scaled_test, y_test)}")
print(f"훈련데이터 점수 : {model_grad.score(x_scaled_train, y_train)}")
print(f"테스트데이터 점수 : {model_grad.score(x_scaled_test, y_test)}")

# 회귀문제
data2 = pd.read_csv("house_price.csv")
x = data2[data2.columns[:5]]
y = data2[["house_value"]]

x_train, x_test, y_train, y_test = train_test_split(x, y, random_state = 42)
scaler_minmax.fit(x_train)
x_scaled_train = scaler_minmax.transform(x_train)
x_scaled_test = scaler_minmax.transform(x_test)

from sklearn.ensemble import AdaBoostRegressor, GradientBoostingRegressor
model_ada = AdaBoostRegressor(n_estimators = 100, random_state = 42)
model_grad = GradientBoostingRegressor(random_state = 42)
model_ada.fit(x_scaled_train, y_train)
model_grad.fit(x_scaled_train, y_train)
print(f"훈련데이터 점수 : {model_ada.score(x_scaled_train, y_train)}")
print(f"테스트데이터 점수 : {model_ada.score(x_scaled_test, y_test)}")
print(f"훈련데이터 점수 : {model_grad.score(x_scaled_train, y_train)}")
print(f"테스트데이터 점수 : {model_grad.score(x_scaled_test, y_test)}")

훈련데이터 점수 : 1.0
테스트데이터 점수 : 0.9532163742690059
훈련데이터 점수 : 1.0
테스트데이터 점수 : 0.9649122807017544
훈련데이터 점수 : 0.4542136350769076
테스트데이터 점수 : 0.4484649534271251
훈련데이터 점수 : 0.6528129290117282
테스트데이터 점수 : 0.6253634291616165


In [None]:
# 머신러닝 핵심 알고리즘 - 11. 앙상블 스태킹
# 회귀문제와 분류문제 둘다에 쓸 수 있다.
# sklearn.ensemble 안의 StackingClassifier 와 StackingRegressor 바로 그것이다.

# 분류문제



In [None]:
# 머신러닝 핵심 알고리즘 - 12. 선형회귀모델
# Only 회귀문제에만 적용가능.
# sklearn.linear_model 안의 LinearRegression 을 사용하면 된다.



In [None]:
# 머신러닝 핵심 알고리즘 - 13. 릿지회귀모델
# Only 회귀문제에만 적용가능.
# sklearn.linear_model 안의 Ridge 를 사용하면 된다.
# 하이퍼파라미터 알파가 0이면 LinearRegression과 동일.



In [None]:
# 머신러닝 핵심 알고리즘 - 14. 라쏘회귀모델
# Only 회귀문제에만 적용가능.
# sklearn.linear_model 안의 Lasso 를 사용하면 된다.


In [None]:
# 머신러닝 핵심 알고리즘 - 15. 엘라스틱넷
# Only 회귀문제에만 적용가능.
# sklearn.linear_model 안의 ElasticNet 를 사용하면 된다.


In [None]:
# 머신러닝 핵심 알고리즘 - 16. 군집분석
# 비지도학습 알고리즘
# sklearn.cluster 안의 KMeans 를 보통 많이 사용한다.
# 코드는 생략하겠다. 군집분석이 시험에 나오면 포기하자.


In [None]:
# 머신러닝 핵심 알고리즘 - 17. DBSCAN
# 비지도학습 알고리즘
# sklearn.cluster 안의 DBSCAN 을 사용하면 된다.
# 코드는 생략하겠다. DBSCAN이 시험에 나오면 포기하자.


In [None]:
# 머신러닝 핵심 알고리즘 - 18. 연관규칙분석
# 비지도학습 알고리즘
# apyori 안의 apriori 를 사용하면 된다.
# 코드는 생략하겠다. 연관규칙분석이 시험에 나오면 포기하자.
