In [45]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge, Lasso

In [46]:
# Remote location of the Boston data set that the next cell downloads.
URL = 'http://lib.stat.cmu.edu/datasets/boston'

In [47]:
# Load the raw whitespace-delimited file.
# - skiprows=22 drops the first 22 lines of the file (presumably a descriptive
#   preamble -- confirm against the file itself).
# - header=None: the data rows carry no column names, so pandas numbers them.
# - sep is a regular expression and must be a raw string: '\s' inside a normal
#   string literal is an invalid escape sequence and triggers a warning on
#   modern Python versions.
bostonDF = pd.read_csv(URL, skiprows=22, header=None, sep=r'\s+')

In [48]:
# Inspect the column labels and the row index of the raw frame
bostonDF.columns, bostonDF.index

(Int64Index([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], dtype='int64'),
 RangeIndex(start=0, stop=1012, step=1))

In [49]:
# The raw frame has two consecutive rows per record (1012 rows -> 506 records):
# even-positioned rows are kept in full, odd-positioned rows contribute only
# their first three columns.
df1 = bostonDF.iloc[0::2, :].reset_index(drop=True)
df2 = bostonDF.iloc[1::2, 0:3].reset_index(drop=True)

In [50]:
# Place the two halves side by side so that each record becomes a single row.
bostonDF = pd.concat((df1, df2), axis='columns')

# Summarize the combined frame (row count, columns, dtypes, null counts).
bostonDF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       506 non-null    float64
 1   1       506 non-null    float64
 2   2       506 non-null    float64
 3   3       506 non-null    float64
 4   4       506 non-null    float64
 5   5       506 non-null    float64
 6   6       506 non-null    float64
 7   7       506 non-null    float64
 8   8       506 non-null    float64
 9   9       506 non-null    float64
 10  10      506 non-null    float64
 11  0       506 non-null    float64
 12  1       506 non-null    float64
 13  2       506 non-null    float64
dtypes: float64(14)
memory usage: 55.5 KB


In [51]:
# Replace the positional column labels with names: thirteen feature columns
# followed by MEDV, which is used as the target further down.
bostonDF.columns = [
    'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
    'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT',
    'MEDV',
]

In [52]:
# Display the renamed frame; the rendered output elides the middle rows.
bostonDF

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.0900,1.0,296.0,15.3,396.90,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.90,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.90,5.33,36.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0.0,0.573,6.593,69.1,2.4786,1.0,273.0,21.0,391.99,9.67,22.4
502,0.04527,0.0,11.93,0.0,0.573,6.120,76.7,2.2875,1.0,273.0,21.0,396.90,9.08,20.6
503,0.06076,0.0,11.93,0.0,0.573,6.976,91.0,2.1675,1.0,273.0,21.0,396.90,5.64,23.9
504,0.10959,0.0,11.93,0.0,0.573,6.794,89.3,2.3889,1.0,273.0,21.0,393.45,6.48,22.0


In [53]:
# Features are every column except the last; the last column is the target.
input_data, input_target = bostonDF.iloc[:, :-1], bostonDF.iloc[:, -1]

In [54]:
# Confirm the dimensions of the feature matrix and of the target vector.
input_data.shape, input_target.shape

((506, 13), (506,))

In [55]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
train_X, test_X, train_y, test_y = train_test_split(
    input_data, input_target, test_size=0.2, random_state=42
)

### LinearRegression

In [56]:
# Create the model object
lr = LinearRegression()

In [57]:
# Train the model on the training split
lr.fit(train_X, train_y)

LinearRegression()

In [58]:
# Evaluate the fitted model on both the training and the held-out split.
print(
    'train set score : ', lr.score(train_X, train_y),
    '\ntest set score : ', lr.score(test_X, test_y),
)

train set score :  0.7508856358979673 
test set score :  0.6687594935356294


In [59]:
# Average of the fitted coefficients (signs cancel, so this is not a magnitude).
lr.coef_.mean()

-0.9719984492158389

### Ridge

In [60]:
# Ridge regression with the default regularization strength (alpha = 1.0).
# fit() returns the estimator itself, so construction and training are chained.
ridge = Ridge().fit(train_X, train_y)
print('<alpha = 1.0>')
# The labels printed below read "training set score" / "test set score".
print(
    '훈련 세트 점수 : ', ridge.score(train_X, train_y),
    '\n테스트 세트 점수 : ', ridge.score(test_X, test_y),
)
# Mean of the fitted coefficients, shown as the cell output.
ridge.coef_.mean()

<alpha = 1.0>
훈련 세트 점수 :  0.7487825119234439 
테스트 세트 점수 :  0.6662221670168519


-0.3885169968048653

In [61]:
# Ridge regression with a stronger penalty: alpha = 10 (the default is 1.0).
# fit() returns the estimator itself, so construction and training are chained.
ridge = Ridge(alpha=10).fit(train_X, train_y)
print('<alpha = 10>')
# The labels printed below read "training set score" / "test set score".
print(
    '훈련 세트 점수 : ', ridge.score(train_X, train_y),
    '\n테스트 세트 점수 : ', ridge.score(test_X, test_y),
)
# Mean of the fitted coefficients, shown as the cell output.
ridge.coef_.mean()

<alpha = 10>
훈련 세트 점수 :  0.7417486732924389 
테스트 세트 점수 :  0.6638882305470701


0.13062053104061497

In [62]:
# Ridge regression with a much stronger penalty: alpha = 100 (the default is 1.0).
# fit() returns the estimator itself, so construction and training are chained.
ridge = Ridge(alpha=100).fit(train_X, train_y)
print('<alpha = 100>')
# The labels printed below read "training set score" / "test set score".
print(
    '훈련 세트 점수 : ', ridge.score(train_X, train_y),
    '\n테스트 세트 점수 : ', ridge.score(test_X, test_y),
)
# Mean of the fitted coefficients, shown as the cell output.
ridge.coef_.mean()

<alpha = 100>
훈련 세트 점수 :  0.7230438194128697 
테스트 세트 점수 :  0.6800123820451881


0.025557554243519243

### Lasso

In [63]:
# Lasso regression with the default regularization strength (alpha = 1.0).
# fit() returns the estimator itself, so construction and training are chained.
lasso = Lasso().fit(train_X, train_y)
print('<alpha = 1.0>')
# The labels printed below read "training set score" / "test set score".
print(
    '훈련 세트 점수 : ', lasso.score(train_X, train_y),
    '\n테스트 세트 점수 : ', lasso.score(test_X, test_y),
)
# Mean of the fitted coefficients, shown as the cell output.
lasso.coef_.mean()

<alpha = 1.0>
훈련 세트 점수 :  0.6959153721670908 
테스트 세트 점수 :  0.6671453631686305


-0.020709413895453936

In [64]:
# Lasso regression with a weaker penalty: alpha = 0.1.
# fit() returns the estimator itself, so construction and training are chained.
lasso = Lasso(alpha=0.1).fit(train_X, train_y)
print('<alpha=0.1>')
# The labels printed below read "training set score" / "test set score".
print(
    '훈련 세트 점수 : ', lasso.score(train_X, train_y),
    '\n테스트 세트 점수 : ', lasso.score(test_X, test_y),
)
# Mean of the fitted coefficients, shown as the cell output.
lasso.coef_.mean()

<alpha=0.1>
훈련 세트 점수 :  0.7382419735910873 
테스트 세트 점수 :  0.6569712802223936


0.22472465460412247

In [65]:
# Lasso regression with a very weak penalty: alpha = 0.01.
# fit() returns the estimator itself, so construction and training are chained.
lasso = Lasso(alpha=0.01).fit(train_X, train_y)
print('<alpha=0.01>')
# The labels printed below read "training set score" / "test set score".
print(
    '훈련 세트 점수 : ', lasso.score(train_X, train_y),
    '\n테스트 세트 점수 : ', lasso.score(test_X, test_y),
)
# Mean of the fitted coefficients, shown as the cell output.
lasso.coef_.mean()

<alpha=0.01>
훈련 세트 점수 :  0.7504622986421127 
테스트 세트 점수 :  0.6685929109949444


-0.7250427652821959

# 

# 다양한 교차검증
---
- model_selection 모듈
    - cross_val_score()
    - cross_validate()
    - cross_val_predict()

In [67]:
# 모듈 로딩
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
import numpy as np
import pandas as pd

## [1] 데이터 로딩
---

In [68]:
# Load the iris data set via scikit-learn's load_iris().
iris = load_iris()

In [69]:
# load_iris() returns a Bunch: a dict-like container whose keys can also be
# read as attributes.
data, target = iris.data, iris.target
featureName = iris.feature_names  # column names
className = iris.target_names

In [70]:
# Inspect the feature (column) names and the class labels.
featureName, className

(['sepal length (cm)',
  'sepal width (cm)',
  'petal length (cm)',
  'petal width (cm)'],
 array(['setosa', 'versicolor', 'virginica'], dtype='<U10'))

## [2] 모델 생성
---

In [71]:
# max_iter : upper bound on the number of solver iterations allowed for
# convergence (a cap, not a fixed number of training epochs)
lrModel = LogisticRegression(max_iter=500)

In [73]:
# Train and validate the model with cross-validation on the iris data.
# cv=5 spells out the default: the data is split into five folds and one
# score is returned per fold.
result = cross_val_score(lrModel, data, target, cv=5)

In [74]:
# Accuracy of each of the 5 fitted models (one value per fold)
result

array([0.96666667, 1.        , 0.93333333, 0.96666667, 1.        ])

In [76]:
# Repeat the evaluation with 7 folds. cross_validate returns a dict of per-fold
# arrays; return_train_score=True adds the score on each training portion.
allResult = cross_validate(
    lrModel, data, target, cv=7, return_train_score=True
)

In [78]:
# Tabulate the cross-validation results: one row per fold, one column per key.
resultDF = pd.DataFrame(allResult)
resultDF

Unnamed: 0,fit_time,score_time,test_score,train_score
0,0.079532,0.001438,0.954545,0.96875
1,0.071125,0.001001,1.0,0.96875
2,0.061548,0.0,0.909091,0.976562
3,0.063382,0.001067,0.952381,0.976744
4,0.050866,0.0,0.952381,0.992248
5,0.066249,0.001714,1.0,0.968992
6,0.096876,0.000999,1.0,0.976744


In [79]:
## Splitter 객체 생성 - KFold()
from sklearn.model_selection import KFold, StratifiedKFold

In [82]:
# Splitter objects give explicit control over how the folds are drawn.
# shuffle=True randomizes the rows before splitting, so a fixed random_state is
# needed for the folds (and therefore the scores below) to be repeatable.
kSplitter = KFold(n_splits=7, shuffle=True, random_state=42)
# StratifiedKFold keeps the class proportions of each fold close to those of
# the full data set.
sSplitter = StratifiedKFold(n_splits=7, shuffle=True, random_state=42)

# parameter -------------------------------------------------
# return_train_score : whether to also report the score on the training portion
# return_estimator   : whether to return the fitted model of each fold
# NOTE: only sSplitter is passed below; kSplitter is created but not used in
# this cell.
allResult = cross_validate(lrModel, data, target,
                           return_train_score=True,
                           return_estimator=True,
                           cv=sSplitter)

In [83]:
# Tabulate the stratified cross-validation results: one row per fold, including
# the fitted estimator returned for each fold.
resultDF = pd.DataFrame(allResult)
resultDF

Unnamed: 0,fit_time,score_time,estimator,test_score,train_score
0,0.073069,0.0,LogisticRegression(max_iter=500),1.0,0.976562
1,0.058125,0.002567,LogisticRegression(max_iter=500),0.954545,0.96875
2,0.067316,0.002556,LogisticRegression(max_iter=500),0.954545,0.96875
3,0.065158,0.0,LogisticRegression(max_iter=500),1.0,0.968992
4,0.067973,0.000516,LogisticRegression(max_iter=500),0.904762,0.984496
5,0.052991,0.000996,LogisticRegression(max_iter=500),1.0,0.968992
6,0.06734,0.0,LogisticRegression(max_iter=500),0.952381,0.984496
