In [2]:
from sklearn.datasets import load_diabetes
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import numpy as np

## [1] 검증 데이터셋+for 문

In [3]:
l_eta0=[0.0001, 0.001, 0.01, 0.1] 
l_alpha=[0.0001, 0.001, 0.01, 0.1 ,1, 10, 100] 

In [4]:
X, y = load_diabetes(return_X_y=True, as_frame=True)
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y,
                                                    test_size=0.2, 
                                                    random_state=1234)

X_train, X_val, y_train, y_val = train_test_split(X_train, 
                                                    y_train,
                                                    test_size=0.25, 
                                                    random_state=1234)
best_R2=0
for eta0 in l_eta0:
    for alpha in l_alpha:
        reg = SGDRegressor(penalty='l2', max_iter=100000, learning_rate='constant', eta0=eta0,alpha=alpha, random_state=1234)
        reg = reg.fit(X_train, y_train)
        current_R2=reg.score(X_val,y_val)
        if current_R2>best_R2:
            best_R2=current_R2
            best_eta0=eta0
            best_alpha=alpha
reg = SGDRegressor(penalty='l2', max_iter=100000, learning_rate='constant', eta0=best_eta0,alpha=best_alpha, random_state=1234)      
reg = reg.fit(X_train, y_train)
test_R2=reg.score(X_test,y_test)


print('최적 러닝레이트:',best_eta0)
print('최적 정규화 계수:',best_alpha)
print('검증 데이터셋 R2:',best_R2)
print('테스트 데이터셋 R2:',test_R2)


최적 러닝레이트: 0.0001
최적 정규화 계수: 0.0001
검증 데이터셋 R2: 0.53495862464611
테스트 데이터셋 R2: 0.4731219201091438


## [2] GridSearchCV

In [5]:
from sklearn.model_selection import KFold, GridSearchCV, cross_val_score

X, y = load_diabetes(return_X_y=True, as_frame=True)
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y,
                                                    test_size=0.2, 
                                                    random_state=1234)

param_grid={'alpha':l_alpha, 'eta0':l_eta0}
kfold=KFold(n_splits=4, shuffle=True, random_state=1234)  ## KFold 객체 https://scikit-learn.org/dev/modules/generated/sklearn.model_selection.KFold.html 참고
grid_search=GridSearchCV(SGDRegressor(penalty='l2', max_iter=100000, learning_rate='constant', random_state=1234), param_grid=param_grid, cv=kfold)
grid_search.fit(X_train,y_train)
y_pred_test = grid_search.predict(X_test)
test_MAE=np.abs(y_pred_test - y_test).mean()
print('최적 러닝레이트:',grid_search.best_params_['eta0'])
print('최적 정규화 계수:',grid_search.best_params_['alpha'])
print('최적 모델의 교차검증 스코어', grid_search.best_score_)
print('테스트 데이터셋 R2:',grid_search.score(X_test,y_test))


최적 러닝레이트: 0.0001
최적 정규화 계수: 0.0001
최적 모델의 교차검증 스코어 0.4859305335296367
테스트 데이터셋 R2: 0.4723402669032044


### 실습 1. tips data에서 tip을 나머지 피처로 릿지회귀 하는 문제에서, 최적의 하이퍼파라미터를 찾으세요. (4-겹 교차검증을 통해  l_alpha, l_eta0 중에서 최적의 alpha와 eta0를 찾으세요.)

In [6]:
import pandas as pd

tips = pd.read_csv("examples/tips.csv")
tips.head()
tips['smoker']=tips['smoker'].map({'No':0,"Yes":1})
tips['day']=tips['day'].map({'Thur':0,"Fri":1,"Sat":2,"Sun":3})
tips['time']=tips['time'].map({'Lunch':0,"Dinner":1})
X_train, X_test, y_train, y_test = train_test_split(tips.drop(columns="tip"),tips.tip,test_size=0.2,random_state=1234)

## [3] 결측치 처리 객체 (SimpleImputer)

In [7]:
from sklearn.impute import SimpleImputer

X, y = load_diabetes(return_X_y=True, as_frame=True)
X.iloc[np.random.randint(0,X.shape[0],10),np.random.randint(0,X.shape[1],3)]=np.nan
X.isna().sum()

age     0
sex    10
bmi     0
bp      0
s1      0
s2      0
s3     10
s4      0
s5     10
s6      0
dtype: int64

In [8]:
X=X.fillna(X.mean())
X.isna().sum()

age    0
sex    0
bmi    0
bp     0
s1     0
s2     0
s3     0
s4     0
s5     0
s6     0
dtype: int64

In [9]:
X, y = load_diabetes(return_X_y=True, as_frame=True)
X.iloc[np.random.randint(0,X.shape[0],10),np.random.randint(0,X.shape[1],3)]=np.nan
print(X.isna().sum())

age     0
sex    10
bmi     0
bp      0
s1     10
s2      0
s3      0
s4     10
s5      0
s6      0
dtype: int64


In [10]:
X_mean=X.mean()

In [38]:
X_mean

age   -2.511817e-19
sex   -2.905421e-04
bmi   -2.245564e-16
bp    -4.797570e-17
s1     1.223156e-04
s2     3.918434e-17
s3    -5.777179e-18
s4    -5.464707e-04
s5     9.293722e-17
s6     1.130318e-17
dtype: float64

In [11]:
imputer=SimpleImputer()
imputer.fit(X)
X=imputer.transform(X)
np.sum(X==np.nan)

0

In [39]:
Y=np.full_like(X, np.nan)
Y

array([[nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       ...,
       [nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan]])

In [40]:
Y=imputer.transform(Y)
Y



array([[-2.51181680e-19, -2.90542052e-04, -2.24556422e-16, ...,
        -5.46470690e-04,  9.29372215e-17,  1.13031756e-17],
       [-2.51181680e-19, -2.90542052e-04, -2.24556422e-16, ...,
        -5.46470690e-04,  9.29372215e-17,  1.13031756e-17],
       [-2.51181680e-19, -2.90542052e-04, -2.24556422e-16, ...,
        -5.46470690e-04,  9.29372215e-17,  1.13031756e-17],
       ...,
       [-2.51181680e-19, -2.90542052e-04, -2.24556422e-16, ...,
        -5.46470690e-04,  9.29372215e-17,  1.13031756e-17],
       [-2.51181680e-19, -2.90542052e-04, -2.24556422e-16, ...,
        -5.46470690e-04,  9.29372215e-17,  1.13031756e-17],
       [-2.51181680e-19, -2.90542052e-04, -2.24556422e-16, ...,
        -5.46470690e-04,  9.29372215e-17,  1.13031756e-17]])

In [41]:
Y-np.array(X_mean)


array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

## [4] 스케일링

In [43]:
import numpy as np
from sklearn.model_selection import train_test_split

X, y = np.arange(120).reshape((30, 4)), list(range(30))
y


[0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29]

In [44]:
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.33,random_state=1234)

In [14]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(X_train)         #transform을 진행하기위해 fit을 통해 주어진 데이터를 통해 평균과, 분산을 계산해야함

X_test_scaled = scaler.transform(X_test)  #스케일링
X_test_scaled[:5, :]

array([[-1.10387637, -1.10387637, -1.10387637, -1.10387637],
       [-0.74391669, -0.74391669, -0.74391669, -0.74391669],
       [-1.46383606, -1.46383606, -1.46383606, -1.46383606],
       [-1.82379575, -1.82379575, -1.82379575, -1.82379575],
       [ 1.41584143,  1.41584143,  1.41584143,  1.41584143]])

In [46]:
X, y = np.arange(120).reshape((30, 4)), list(range(30))
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.33,random_state=1234)
X_train_scaled= scaler.fit_transform(X_train) 
X_train_scaled

array([[ 0.69592206,  0.69592206,  0.69592206,  0.69592206],
       [ 0.93589519,  0.93589519,  0.93589519,  0.93589519],
       [-1.94378231, -1.94378231, -1.94378231, -1.94378231],
       [-1.70380918, -1.70380918, -1.70380918, -1.70380918],
       [ 1.29585487,  1.29585487,  1.29585487,  1.29585487],
       [ 1.17586831,  1.17586831,  1.17586831,  1.17586831],
       [ 0.21597581,  0.21597581,  0.21597581,  0.21597581],
       [-1.3438495 , -1.3438495 , -1.3438495 , -1.3438495 ],
       [-0.02399731, -0.02399731, -0.02399731, -0.02399731],
       [ 1.05588175,  1.05588175,  1.05588175,  1.05588175],
       [-0.62393012, -0.62393012, -0.62393012, -0.62393012],
       [-0.86390325, -0.86390325, -0.86390325, -0.86390325],
       [ 0.09598925,  0.09598925,  0.09598925,  0.09598925],
       [ 1.535828  ,  1.535828  ,  1.535828  ,  1.535828  ],
       [ 0.45594894,  0.45594894,  0.45594894,  0.45594894],
       [-0.50394356, -0.50394356, -0.50394356, -0.50394356],
       [ 0.5759355 ,  0.

## [5] LabelEncoder

In [16]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
tips = pd.read_csv("examples/tips.csv")

tips['smoker']=tips['smoker'].map({'No':0,"Yes":1})
tips['day']=tips['day'].map({'Thur':0,"Fri":1,"Sat":2,"Sun":3})
tips['time']=tips['time'].map({'Lunch':0,"Dinner":1})
tips.head()

Unnamed: 0,total_bill,tip,smoker,day,time,size
0,16.99,1.01,0,3,1,2
1,10.34,1.66,0,3,1,3
2,21.01,3.5,0,3,1,3
3,23.68,3.31,0,3,1,2
4,24.59,3.61,0,3,1,4


In [48]:
import pandas as pd

tips = pd.read_csv("examples/tips.csv")
tips


Unnamed: 0,total_bill,tip,smoker,day,time,size
0,16.99,1.01,No,Sun,Dinner,2
1,10.34,1.66,No,Sun,Dinner,3
2,21.01,3.50,No,Sun,Dinner,3
3,23.68,3.31,No,Sun,Dinner,2
4,24.59,3.61,No,Sun,Dinner,4
...,...,...,...,...,...,...
239,29.03,5.92,No,Sat,Dinner,3
240,27.18,2.00,Yes,Sat,Dinner,2
241,22.67,2.00,Yes,Sat,Dinner,2
242,17.82,1.75,No,Sat,Dinner,2


In [49]:
encoder=LabelEncoder()
encoder.fit(tips.smoker)
tips.smoker=encoder.transform(tips.smoker)
tips.head()

Unnamed: 0,total_bill,tip,smoker,day,time,size
0,16.99,1.01,0,Sun,Dinner,2
1,10.34,1.66,0,Sun,Dinner,3
2,21.01,3.5,0,Sun,Dinner,3
3,23.68,3.31,0,Sun,Dinner,2
4,24.59,3.61,0,Sun,Dinner,4


In [18]:
encoder.classes_

array(['No', 'Yes'], dtype=object)

In [19]:
encoder.inverse_transform([1,1,1,1,1,1,1])

array(['Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes'], dtype=object)

In [20]:
encoder2=LabelEncoder()
encoder2.fit(tips.day)
tips.day=encoder2.transform(tips.day)
tips.head()

Unnamed: 0,total_bill,tip,smoker,day,time,size
0,16.99,1.01,0,2,Dinner,2
1,10.34,1.66,0,2,Dinner,3
2,21.01,3.5,0,2,Dinner,3
3,23.68,3.31,0,2,Dinner,2
4,24.59,3.61,0,2,Dinner,4


In [21]:
encoder2.classes_

array(['Fri', 'Sat', 'Sun', 'Thur'], dtype=object)

In [22]:
encoder3=LabelEncoder()
encoder3.fit(tips.time)
tips.time=encoder3.transform(tips.time)
tips.head()

Unnamed: 0,total_bill,tip,smoker,day,time,size
0,16.99,1.01,0,2,0,2
1,10.34,1.66,0,2,0,3
2,21.01,3.5,0,2,0,3
3,23.68,3.31,0,2,0,2
4,24.59,3.61,0,2,0,4


## [6] OneHotEncoder

In [23]:
import pandas as pd

tips = pd.read_csv("examples/tips.csv")
tips.head()

Unnamed: 0,total_bill,tip,smoker,day,time,size
0,16.99,1.01,No,Sun,Dinner,2
1,10.34,1.66,No,Sun,Dinner,3
2,21.01,3.5,No,Sun,Dinner,3
3,23.68,3.31,No,Sun,Dinner,2
4,24.59,3.61,No,Sun,Dinner,4


In [24]:
pd.get_dummies(tips.day,dtype='int')

Unnamed: 0,Fri,Sat,Sun,Thur
0,0,0,1,0
1,0,0,1,0
2,0,0,1,0
3,0,0,1,0
4,0,0,1,0
...,...,...,...,...
239,0,1,0,0
240,0,1,0,0
241,0,1,0,0
242,0,1,0,0


In [25]:
encoder=OneHotEncoder(sparse_output=False)  #sparse_output을 False로 해야 원하는 대로 얻을 수 있음, True로 하면 matrix꼴로 나오게 됌
encoder.fit(tips.day.values.reshape(-1,1))  #Series의 value만 갖고, reshape을 해줘야 함
encoder.transform(tips.day.values.reshape(-1,1))

array([[0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],


In [26]:
encoder.inverse_transform([[1,0,0,0],[0,1,0,0]])

array([['Fri'],
       ['Sat']], dtype=object)

## [7] 파이프라인 적용

### tips 데이터 파이프라인 적용 

In [50]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

In [52]:
tips = pd.read_csv("examples/tips.csv")
tips.tip=pd.cut(tips['tip'],bins=3,labels=['low','midde','high'])
tips['smoker']=tips['smoker'].map({'No':0,"Yes":1})
tips['day']=tips['day'].map({'Thur':0,"Fri":1,"Sat":2,"Sun":3})
tips['time']=tips['time'].map({'Lunch':0,"Dinner":1})


X_train, X_test, y_train, y_test = train_test_split(tips.drop(columns='tip'), tips['tip'],test_size=0.2,random_state=1234)

In [53]:
tips.iloc[0, 0] = np.nan
tips.iloc[1, 0] = np.nan
tips.iloc[2, 2] = np.nan
tips.iloc[3, 4] = np.nan
tips.iloc[4, 3] = np.nan
tips.iloc[5, 1] = np.nan
tips.iloc[5, 2] = np.nan

In [54]:
pipeline=Pipeline(steps=[('imputer',SimpleImputer()),('scaler',StandardScaler()),('basemodel',LogisticRegression(penalty='l2',random_state=1234, max_iter=10000,solver='sag'))])


In [55]:
pipeline.fit(X_train,y_train)
y_pred_test = pipeline.predict(X_test)
print('테스트 데이터셋 스코어 (정확도):',pipeline.score(X_test,y_test))


테스트 데이터셋 스코어 (정확도): 0.8571428571428571


In [56]:
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score, classification_report

print(classification_report(y_test,y_pred_test))

              precision    recall  f1-score   support

        high       0.00      0.00      0.00         1
         low       0.88      0.97      0.93        38
       midde       0.71      0.50      0.59        10

    accuracy                           0.86        49
   macro avg       0.53      0.49      0.50        49
weighted avg       0.83      0.86      0.84        49



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## [8] 파이프라인+교차검증

### breast_cancer 로지스틱 회귀 (데이터 전처리 + 하이퍼 파라미터 튜닝 파이프라이닝) 

In [58]:
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

In [59]:
l_C=[0.0001, 0.001, 0.01, 0.1 ,1, 10, 100] 

In [60]:
X, y = load_breast_cancer(return_X_y=True, as_frame=True)
X.iloc[0, 0] = np.nan
X.iloc[1, 0] = np.nan
X.iloc[2, 2] = np.nan
X.iloc[3, 4] = np.nan
X.iloc[4, 3] = np.nan
X.iloc[5, 1] = np.nan
X.iloc[5, 2] = np.nan

 


X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y,
                                                    test_size=0.2, 
                                                    random_state=1234)

pipeline=Pipeline(steps=[('imputer',SimpleImputer()),('scaler',StandardScaler()),('basemodel',LogisticRegression(penalty='l2',random_state=1234, max_iter=10000,solver='sag'))])

param_grid={'basemodel__C':l_C}
kfold=KFold(n_splits=4, shuffle=True, random_state=1234)  ## KFold 객체 https://scikit-learn.org/dev/modules/generated/sklearn.model_selection.KFold.html 참고




grid_search=GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=kfold)
grid_search.fit(X_train,y_train)
y_pred_test = grid_search.predict(X_test)



print('최적 정규화 계수:',grid_search.best_params_['basemodel__C'])
print('최적 모델의 교차검증 스코어 (정확도)', grid_search.best_score_)
print('테스트 데이터셋 스코어 (정확도):',grid_search.score(X_test,y_test))



최적 정규화 계수: 0.1
최적 모델의 교차검증 스코어 (정확도) 0.9780701754385964
테스트 데이터셋 스코어 (정확도): 0.9385964912280702


In [35]:
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score, classification_report

y_pred = grid_search.predict(X_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       1.00      0.84      0.92        45
           1       0.91      1.00      0.95        69

    accuracy                           0.94       114
   macro avg       0.95      0.92      0.93       114
weighted avg       0.94      0.94      0.94       114



### diabetes 데이터 릿지회귀 

In [36]:
from sklearn.datasets import load_diabetes
from sklearn.linear_model import SGDRegressor
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler


In [37]:
X, y = load_diabetes(return_X_y=True, as_frame=True)
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y,
                                                    test_size=0.2, 
                                                    random_state=1234)
X.iloc[0, 0] = np.nan
X.iloc[1, 0] = np.nan
X.iloc[2, 2] = np.nan
X.iloc[3, 4] = np.nan
X.iloc[4, 3] = np.nan
X.iloc[5, 1] = np.nan
X.iloc[5, 2] = np.nan

l_eta0=[0.0001, 0.001, 0.01, 0.1] 
l_alpha=[0.0001, 0.001, 0.01, 0.1 ,1, 10, 100] 



param_grid={'basemodel__alpha':l_alpha, 'basemodel__eta0':l_eta0}
kfold=KFold(n_splits=4, shuffle=True, random_state=1234)  
pipeline=Pipeline(steps=[('imputer',SimpleImputer()),('scaler',MinMaxScaler()),('basemodel',SGDRegressor(penalty='l2',random_state=1234, max_iter=100000))])
grid_search=GridSearchCV(pipeline, param_grid=param_grid, cv=kfold)
grid_search.fit(X_train,y_train)
y_pred_test = grid_search.predict(X_test)

print('최적 러닝레이트:',grid_search.best_params_['basemodel__eta0'])
print('최적 정규화 계수:',grid_search.best_params_['basemodel__alpha'])
print('최적 모델의 교차검증 스코어', grid_search.best_score_)
print('테스트 데이터셋 R2:',grid_search.score(X_test,y_test))

최적 러닝레이트: 0.1
최적 정규화 계수: 0.001
최적 모델의 교차검증 스코어 0.48603775783516373
테스트 데이터셋 R2: 0.47554565473809585


### 실습 2. iris 데이터의 피처와 타겟 사이의 관계를 로지스틱 회귀로 분석하세요.
#### SimpleImputer, StandardScaler와 회귀 객체를 파이프라인으로 만드세요.
#### GridSearchCV와 4-fold CV를 활용하여, 최적의 l_C 중에서 C를 찾으세요. 
#### l_C=[0.0001, 0.001, 0.01, 0.1 ,1, 10, 100] 

### 