In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')  # 사이킷런 1.2 부터는 보스턴 주택가격 데이터가 없어진다는 warning 메시지 출력 제거

In [2]:
data_url = "http://lib.stat.cmu.edu/datasets/boston"
raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
target = raw_df.values[1::2, 2]

In [3]:
boston_df = pd.DataFrame(
    data=data,
    columns=['CRIM','ZN','INDUS','CHAS','NOX','RM','AGE','DIS','RAD','TAX','PTRATIO','B','LSTAT']
)

boston_df['PRICE']=target
boston_df

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,PRICE
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.0900,1.0,296.0,15.3,396.90,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.90,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.90,5.33,36.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0.0,0.573,6.593,69.1,2.4786,1.0,273.0,21.0,391.99,9.67,22.4
502,0.04527,0.0,11.93,0.0,0.573,6.120,76.7,2.2875,1.0,273.0,21.0,396.90,9.08,20.6
503,0.06076,0.0,11.93,0.0,0.573,6.976,91.0,2.1675,1.0,273.0,21.0,396.90,5.64,23.9
504,0.10959,0.0,11.93,0.0,0.573,6.794,89.3,2.3889,1.0,273.0,21.0,393.45,6.48,22.0


#데이터 분할

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

X_train, X_test, y_train, y_test = train_test_split(
    boston_df.drop('PRICE', axis=1),
    boston_df['PRICE'],
    test_size=0.2,
    random_state=42
)

#모델 훈련

In [5]:
lr_reg = LinearRegression()
lr_reg.fit(X_train, y_train)

#예측

In [6]:
y_pred = lr_reg.predict(X_test)
y_pred[:3]

array([28.99672362, 36.02556534, 14.81694405])

#평가

In [7]:
from sklearn.metrics import mean_squared_error, r2_score

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print("MSE : {:.3f}".format(mse))
print("RMSE : {:.3f}".format(rmse))

print()
print("R2 : {:.3f}".format(r2))
print("Score : {:.3f}".format(lr_reg.score(X_test, y_test)))

MSE : 24.291
RMSE : 4.929

R2 : 0.669
Score : 0.669


In [8]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(
    LinearRegression(),
    X_train, y_train,
    cv=5,
    scoring='neg_mean_squared_error'
)

mse_scores = scores * (-1)
rmse_scores = np.sqrt(mse_scores)
mean_rmse = rmse_scores.mean()

mean_rmse

4.829493065236907

In [9]:
boston_df.describe()['PRICE']

count    506.000000
mean      22.532806
std        9.197104
min        5.000000
25%       17.025000
50%       21.200000
75%       25.000000
max       50.000000
Name: PRICE, dtype: float64

#다항회귀를 활용한 보스턴 주택가격

In [10]:
from sklearn.preprocessing import PolynomialFeatures

sample = np.arange(6).reshape(3,2)
print("1차 다항식 feature : \n{}".format(sample))

1차 다항식 feature : 
[[0 1]
 [2 3]
 [4 5]]


In [11]:
poly = PolynomialFeatures(degree=2) # degree는 다항식의 차수
sample_ftr = poly.fit_transform(sample)

sample_ftr

array([[ 1.,  0.,  1.,  0.,  0.,  1.],
       [ 1.,  2.,  3.,  4.,  6.,  9.],
       [ 1.,  4.,  5., 16., 20., 25.]])

In [12]:
boston_poly = PolynomialFeatures(degree=2, include_bias=False)

X_poly_2 = boston_poly.fit_transform(X_train)
X_poly_2.shape, X_train.shape

((404, 104), (404, 13))

In [13]:
lr_reg_poly = LinearRegression().fit(X_poly_2,y_train)

In [14]:
X_test.shape

(102, 13)

In [15]:
X_poly_test_2 = boston_poly.transform(X_test)

In [16]:
y_test_pred = lr_reg_poly.predict(X_poly_test_2)

test_mse = mean_squared_error(y_test, y_test_pred)
test_rmse = np.sqrt(test_mse)
test_r2  = r2_score(y_test, y_test_pred)

print("Test MSE : {:.3f}".format(test_mse))
print("Test RMSE : {:.3f}".format(test_rmse))
print("Test R2 : {:.3f}".format(test_r2))

Test MSE : 14.257
Test RMSE : 3.776
Test R2 : 0.806


In [17]:
y_pred = lr_reg_poly.predict(X_poly_2)

mse  = mean_squared_error(y_train, y_pred)
rmse = np.sqrt(mse)
r2   = r2_score(y_train, y_pred)

print("MSE : {:.3f} / RMSE : {:.3f} / R2 : {:.3f}".format(mse, rmse, r2))

MSE : 5.131 / RMSE : 2.265 / R2 : 0.941


#규제 선형회귀

In [37]:
def show_evaluate(y, y_pred, label=None):
  mse = mean_squared_error(y, y_pred)
  rmse = np.sqrt(test_mse)
  r2  = r2_score(y, y_pred)

  print("{} MSE : {:.3f}".format(label, mse))
  print("{} RMSE : {:.3f}".format(label, rmse))
  print("{} R2 : {:.3f}".format(label,r2))

In [19]:
boston_poly = PolynomialFeatures(degree=3, include_bias=False)

X_poly_3 = boston_poly.fit_transform(X_train)
X_poly_3.shape, X_train.shape

((404, 559), (404, 13))

In [20]:
lr = LinearRegression().fit(X_poly_3,y_train)

In [38]:
X_test_poly_3 = boston_poly.transform(X_test)
show_evaluate(y_train, lr.predict(X_poly_3), "train")
show_evaluate(y_test, lr.predict(X_test_poly_3),"test")

train MSE : 0.000
train RMSE : 3.776
train R2 : 1.000
test MSE : 129847.995
test RMSE : 3.776
test R2 : -1769.644


위 선형 회귀의 결과는 과대적합!, 가중치가 너무 크다
#Ridge

In [42]:
from sklearn.linear_model import Ridge

ridge_a_10 = Ridge(alpha=10)
ridge_a_10.fit(X_poly_3, y_train)

In [43]:
ridge_10_train_pred = ridge_a_10.predict(X_poly_3)
ridge_10_test_pred = ridge_a_10.predict(X_test_poly_3)

show_evaluate(y_train, ridge_10_train_pred, "Train")
show_evaluate(y_test, ridge_10_test_pred, "Test")

Train MSE : 3.316
Train RMSE : 3.776
Train R2 : 0.962
Test MSE : 169.674
Test RMSE : 3.776
Test R2 : -1.314


In [51]:
ridge_a_10.coef_.max()

0.06836341511069377

#Lasso

In [39]:
from sklearn.linear_model import Lasso

lasso_a_10 = Lasso(alpha=10).fit(X_poly_3, y_train)

In [40]:
lasso_10_train_pred = lasso_a_10.predict(X_poly_3)
lasso_10_test_pred = lasso_a_10.predict(X_test_poly_3)

show_evaluate(y_train, lasso_10_train_pred, "Train")
show_evaluate(y_test, lasso_10_test_pred, "Test")

Train MSE : 4.366
Train RMSE : 3.776
Train R2 : 0.950
Test MSE : 13.241
Test RMSE : 3.776
Test R2 : 0.819


In [46]:
lasso_a_10.coef_.max()

0.0033840619872613402

#LogisticRegression

In [52]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression

cancer = load_breast_cancer()

In [53]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# StandardScaler( )로 평균이 0, 분산 1로 데이터 분포도 변환
scaler = StandardScaler()
data_scaled = scaler.fit_transform(cancer.data)

X_train , X_test, y_train , y_test = train_test_split(data_scaled, cancer.target, test_size=0.3, random_state=0)

In [54]:
lr_clf = LogisticRegression()
lr_clf.fit(X_train, y_train)
lr_preds = lr_clf.predict(X_test)

In [57]:
from sklearn.metrics import accuracy_score, roc_auc_score

print("정확도 : {}".format(accuracy_score(lr_preds,y_test)))
print("ROC AUC : {}".format(roc_auc_score(lr_preds,y_test)))

정확도 : 0.9766081871345029
ROC AUC : 0.9781669150521609
