In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# 1. 결측치 대체 알고리즘

## 0) DataFrame.fillna() 활용
* DataFrame.fillna(value=None, method=None, axis=None, inplace=False, limit=None, downcast=None)
    * parameter
        * value : scalar, dict, Series, or DataFrame
        * method : {'backfill', 'bfill', 'pad', 'ffill', None}, default None

## 1) SimpleImputer
* 한 특성의 통계 값을 사용하여 결측치를 대체
* 주요 parameter
    * missing_values : int, float, str, np.nan, None or pandas.NA, default=np.nan
    * strategy : str {'mean', 'medain', 'most_frequent', 'constant'} default=’mean’
    * fill_value : str or numerical value, default=None
        * strategy = 'constant'인 경우 지정
    * copybool, default=True

In [2]:
from sklearn.impute import SimpleImputer

df=pd.DataFrame(np.random.randn(10,8),columns=list('01234567'))

indices=[[1,2],[1,6],[1,7],[4,5],[4,6],[5,6],[7,2],[8,4],[9,4]]

for index in indices:
    df.iloc[index[0],index[1]]=np.nan

df

Unnamed: 0,0,1,2,3,4,5,6,7
0,0.051877,0.177566,-1.908825,-0.197848,-0.712544,-0.465912,0.928295,0.353415
1,0.508283,0.39208,,1.284454,-0.22897,0.93198,,
2,0.578315,0.521501,-0.53671,-0.290153,-1.810798,-0.091317,-1.439049,0.317503
3,0.434185,0.788339,0.456843,0.306642,1.008544,-0.891753,0.35971,1.142842
4,0.492746,-0.594298,-0.256867,0.937637,1.488651,,,-2.535786
5,0.40868,0.313152,1.382635,-1.556503,1.525769,-2.293575,,-1.315476
6,0.847734,-0.364045,0.723377,-0.274852,-0.429604,0.008002,0.144875,0.929042
7,-0.188939,-0.275927,,-0.65661,1.020099,1.28613,0.450713,1.417044
8,1.251551,0.942231,-0.718525,0.402444,,-0.443837,-0.817411,1.764483
9,-2.246375,-1.425264,0.735784,-1.058062,,0.643453,0.055378,-0.350576


In [3]:
df_tmp=df
imputer = SimpleImputer(strategy='mean')
df_simple=imputer.fit_transform(df_tmp)
df_simple=pd.DataFrame(df_simple,columns=df.columns)
df_simple

Unnamed: 0,0,1,2,3,4,5,6,7
0,0.051877,0.177566,-1.908825,-0.197848,-0.712544,-0.465912,0.928295,0.353415
1,0.508283,0.39208,-0.015286,1.284454,-0.22897,0.93198,-0.045356,0.191388
2,0.578315,0.521501,-0.53671,-0.290153,-1.810798,-0.091317,-1.439049,0.317503
3,0.434185,0.788339,0.456843,0.306642,1.008544,-0.891753,0.35971,1.142842
4,0.492746,-0.594298,-0.256867,0.937637,1.488651,-0.146314,-0.045356,-2.535786
5,0.40868,0.313152,1.382635,-1.556503,1.525769,-2.293575,-0.045356,-1.315476
6,0.847734,-0.364045,0.723377,-0.274852,-0.429604,0.008002,0.144875,0.929042
7,-0.188939,-0.275927,-0.015286,-0.65661,1.020099,1.28613,0.450713,1.417044
8,1.251551,0.942231,-0.718525,0.402444,0.232643,-0.443837,-0.817411,1.764483
9,-2.246375,-1.425264,0.735784,-1.058062,0.232643,0.643453,0.055378,-0.350576


## 2) IterativeImputer
* 다른 특성을 통해 예측하여 결측치를 대체
* ```IterativeImputer(estimator=None, *, missing_values=nan, sample_posterior=False, max_iter=10, tol=0.001, n_nearest_features=None, initial_strategy='mean', imputation_order='ascending', skip_complete=False, min_value=-inf, max_value=inf, verbose=0, random_state=None, add_indicator=False)```
* parameter 많음...^^
* **IterativeImputer 클래스는 아직 실험적이기 때문에 import enable_iterative_imputer 필요**

In [4]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

imp_mean = IterativeImputer(random_state=0)
imp_mean.fit([[7, 2, 3], [4, np.nan, 6], [10, 5, 9]])

X = [[np.nan, 2, 3], [4, np.nan, 6], [10, np.nan, 9]]
imp_mean.transform(X)

array([[ 6.95847623,  2.        ,  3.        ],
       [ 4.        ,  2.6000004 ,  6.        ],
       [10.        ,  4.99999933,  9.        ]])

## 3) MICE(Multiple Imputation by Chained Equation)

<img src="./img/mice.PNG" width="700" height="500">

* MICE는 연쇄 등식을 이용한 다중대치로, 과정은 아래와 같다
    * 1. 결측치를 다른 모든 변수를 사용하여 예측
    * 2. 모든 결측치를 채운 데이터 셋을 m(=3)개에 대해 with()를 사용하여 통계모형을 적용
    * 3. pool()을 사용하여 분석결과를 하나로 통합

In [7]:
# !pip install impyute

In [8]:
df

Unnamed: 0,0,1,2,3,4,5,6,7
0,0.051877,0.177566,-1.908825,-0.197848,-0.712544,-0.465912,0.928295,0.353415
1,0.508283,0.39208,,1.284454,-0.22897,0.93198,,
2,0.578315,0.521501,-0.53671,-0.290153,-1.810798,-0.091317,-1.439049,0.317503
3,0.434185,0.788339,0.456843,0.306642,1.008544,-0.891753,0.35971,1.142842
4,0.492746,-0.594298,-0.256867,0.937637,1.488651,,,-2.535786
5,0.40868,0.313152,1.382635,-1.556503,1.525769,-2.293575,,-1.315476
6,0.847734,-0.364045,0.723377,-0.274852,-0.429604,0.008002,0.144875,0.929042
7,-0.188939,-0.275927,,-0.65661,1.020099,1.28613,0.450713,1.417044
8,1.251551,0.942231,-0.718525,0.402444,,-0.443837,-0.817411,1.764483
9,-2.246375,-1.425264,0.735784,-1.058062,,0.643453,0.055378,-0.350576


In [9]:
from impyute.imputation.cs import mice

df_tmp=df
df_mice=mice(df_tmp.values)
df_mice=pd.DataFrame(df_mice)

df_mice

Unnamed: 0,0,1,2,3,4,5,6,7
0,0.051877,0.177566,-1.908825,-0.197848,-0.712544,-0.465912,0.928295,0.353415
1,0.508283,0.39208,-0.015286,1.284454,-0.22897,0.93198,-8.712757,2.612122
2,0.578315,0.521501,-0.53671,-0.290153,-1.810798,-0.091317,-1.439049,0.317503
3,0.434185,0.788339,0.456843,0.306642,1.008544,-0.891753,0.35971,1.142842
4,0.492746,-0.594298,-0.256867,0.937637,1.488651,-0.146314,12.215642,-2.535786
5,0.40868,0.313152,1.382635,-1.556503,1.525769,-2.293575,12.74678,-1.315476
6,0.847734,-0.364045,0.723377,-0.274852,-0.429604,0.008002,0.144875,0.929042
7,-0.188939,-0.275927,-0.015286,-0.65661,1.020099,1.28613,0.450713,1.417044
8,1.251551,0.942231,-0.718525,0.402444,0.232643,-0.443837,-0.817411,1.764483
9,-2.246375,-1.425264,0.735784,-1.058062,0.232643,0.643453,0.055378,-0.350576


## 4) KNNImputer
* KNN을 활용하여 결측치를 처리하는 알고리즘
* parameters
    * missing_values : int, float, str, np.nan or None, default=np.nan
    * n_neighbors : int, default=5
    * weights : {‘uniform’, ‘distance’} or callable, default=’uniform’
    * metric : {‘nan_euclidean’} or callable, default=’nan_euclidean’
    * copy : bool, default=True

In [10]:
from sklearn.impute import KNNImputer

In [11]:
df=pd.DataFrame(np.random.randn(10,10),columns=list('0123456789'))

indices=[[1,2],[1,6],[1,7],[4,6],[5,6],[8,4],[7,2],[9,9]]

for index in indices:
    df.iloc[index[0],index[1]]=np.nan

df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,-0.554429,-1.332588,-0.515174,0.322426,-0.35017,1.52594,0.383762,-0.19448,0.676047,-0.206176
1,1.683518,-0.120954,,-1.51643,0.423903,1.456328,,,-0.175874,-1.064695
2,1.334687,-0.364129,-1.755069,0.014619,1.087764,1.248069,-1.263989,-0.746976,0.520227,0.450508
3,0.478051,0.02186,-0.468999,-0.303238,-2.316239,-0.943462,-1.563846,-1.180564,1.644476,-0.115922
4,1.804811,-0.154562,0.60911,-0.197737,-0.182005,-0.123589,,-3.371816,-0.020058,0.951314
5,-0.881251,-0.979375,-0.739429,-0.339337,0.702966,-0.881325,,-0.536709,-0.278573,0.592155
6,0.634576,0.552915,-0.20228,0.863491,1.102989,-0.132172,-1.444918,-0.21073,0.380222,-0.191747
7,-0.009057,-0.694542,,-0.323285,0.566963,0.662263,-0.728739,1.082072,-1.017473,1.378802
8,1.474123,-1.281768,0.284401,-0.058296,,0.958816,-1.391558,-0.12026,-1.004555,-0.963299
9,1.524586,-0.027971,-1.695546,-0.484576,-0.199943,2.559832,-1.686763,-1.166181,0.692104,


In [12]:
df_tmp=df
imputer = KNNImputer(n_neighbors=4)
df_knn=imputer.fit_transform(df_tmp)
df_knn=pd.DataFrame(df_knn,columns=df.columns)
df_knn

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,-0.554429,-1.332588,-0.515174,0.322426,-0.35017,1.52594,0.383762,-0.19448,0.676047,-0.206176
1,1.683518,-0.120954,-0.639276,-1.51643,0.423903,1.456328,-1.446807,-1.351308,-0.175874,-1.064695
2,1.334687,-0.364129,-1.755069,0.014619,1.087764,1.248069,-1.263989,-0.746976,0.520227,0.450508
3,0.478051,0.02186,-0.468999,-0.303238,-2.316239,-0.943462,-1.563846,-1.180564,1.644476,-0.115922
4,1.804811,-0.154562,0.60911,-0.197737,-0.182005,-0.123589,-1.416078,-3.371816,-0.020058,0.951314
5,-0.881251,-0.979375,-0.739429,-0.339337,0.702966,-0.881325,-0.763471,-0.536709,-0.278573,0.592155
6,0.634576,0.552915,-0.20228,0.863491,1.102989,-0.132172,-1.444918,-0.21073,0.380222,-0.191747
7,-0.009057,-0.694542,-0.802988,-0.323285,0.566963,0.662263,-0.728739,1.082072,-1.017473,1.378802
8,1.474123,-1.281768,0.284401,-0.058296,0.795405,0.958816,-1.391558,-0.12026,-1.004555,-0.963299
9,1.524586,-0.027971,-1.695546,-0.484576,-0.199943,2.559832,-1.686763,-1.166181,0.692104,-0.445915


In [13]:
# !pip install missingpy

In [None]:
# from missingpy import KNNImputer
# imputer = KNNImputer()
# df_missforest = imputer.fit_transform(X)
# df_missforest=pd.DataFrame(df_missforest)
# df_missforest

## [5] MissForest
* Random Forest를 활용하여 결측치를 처리하는 알고리즘에서 
* KNN Imputer보다 성능이 우수함 

In [None]:
# # !pip install missingpy

In [14]:
# from missingpy import MissForest

# imputer = MissForest()
# df_missforest = imputer.fit_transform(X)
# df_missforest=pd.DataFrame(df_missforest)
# df_missforest

# 2. 다양한 회귀모델

 * Linear Regression, KNeighborsRegressor, DecisionTreeRegressor, RandomForestRegressor 외
 * 선형회귀 모델인 Lasso, Ridge, ElasticNet 
 * Boosting 기법을 활용한 모델인 AdaBoost / GBM(Gradient Boosting Machine) / XGBoost / LightGBM / CatBoost


In [15]:
# !pip install xgboost
# !pip install lightgbm
# !pip install catboost

In [1]:
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

from sklearn.linear_model import Lasso, Ridge, ElasticNet
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.ensemble import VotingRegressor

from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

In [8]:
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

from sklearn.metrics import * 

boston = load_boston()
df = pd.DataFrame(boston.data, columns=boston.feature_names)
df['target']=boston.target

df.tail()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,target
501,0.06263,0.0,11.93,0.0,0.573,6.593,69.1,2.4786,1.0,273.0,21.0,391.99,9.67,22.4
502,0.04527,0.0,11.93,0.0,0.573,6.12,76.7,2.2875,1.0,273.0,21.0,396.9,9.08,20.6
503,0.06076,0.0,11.93,0.0,0.573,6.976,91.0,2.1675,1.0,273.0,21.0,396.9,5.64,23.9
504,0.10959,0.0,11.93,0.0,0.573,6.794,89.3,2.3889,1.0,273.0,21.0,393.45,6.48,22.0
505,0.04741,0.0,11.93,0.0,0.573,6.03,80.8,2.505,1.0,273.0,21.0,396.9,7.88,11.9


In [9]:
x=df.drop('target', axis=1)
y=df['target']

x_train, x_val, y_train, y_val = train_test_split(x,y,test_size=0.3)

scaler = MinMaxScaler()
x_train_s = scaler.fit_transform(x_train)
x_val_s = scaler.transform(x_val)

x_train_s = pd.DataFrame(x_train_s, columns = list(x_train))
x_val_s = pd.DataFrame(x_val_s, columns = list(x_val))

## [0] 선형 회귀 분석의 4가지 가정
* 선형 회귀 분석은 아래 4가지 가정이 충족되어야 적절하다고 판단할 수 있다.

* 1) 선형성
    * 종속변수와 독립변수 간의 선형관계

* 2) 독립성
    * 독립 변수들 간의 통계적 독립성( 비 다중공선성 ) 

*  3) 등분산성
    * 잔차들의 분산이 일정

* 4) 정규성
    * 잔차들의 분포가 정규분포를 이룸

## [0] OLS(Ordinary Least Square)
* 최소 제곱 선형 회구 모델 구현
* 비용함수로는 제곱 오차합(SSE) 사용

<img src="./img/sse.PNG" width="200" height="150">

## [1] 선형 회귀 모델
* Lasso와 Ridge는 Linear Regression의 단점을 보완환 모델   
* 규제 : 부가 정보를 손실에 더해 과대적합 문제를 방지하는 방법으로, 복잡도에 대한 패널티를 유도하여 모델 파라미터의 값을 감소시킴.
* 규제 강도는 lambda에 해당하는 alpha 매개변수로 조절

### 1) Ridge Regression
* 릿지 회귀는 최소 제곱 비용함수에 가중치의 제곱합을 추가한 L2 규제 모델
* ```ridge = Ridge(alpha=1.0)```
<img src="./img/ridge.PNG" width="500" height="300">

### 2) LASSO(Least Absolute Shrinkage and Selection Operator)
* 라쏘는 최소 제곱 비용함수에 가중치의 크기를 추가한 L1 규제 모델로 희소한 모델을 만들 수 있다
* m>n이면 최대 n개의 특성을 선택하는 것이 한계
* ```lasso = Lasso(alpha=1.0)```
<img src="./img/lasso.PNG" width="500" height="300">

### 3)ElasticNet
* 릿지 회귀와 라쏘의 절충안
* ```elanet = ElasticNet(alpha=1.0, l1_ratio=0.5)```
* l1_ratio=1 이면 LASSO와 동일하지만, l1_ratio=0 이면 Ridge와 동일하지 않음
* 왜냐면, 𝜆1=alpha * l1_ratio & 𝜆2=alpha * (1-l1_ratio)/2 이기 때문
<img src="./img/elastic.PNG" width="500" height="300">


## [2]  Ensemble 
* 여러 개의 약한 분류기를 생성하고 학습시킨 뒤, 그 학습 결과를 결합으로써 과적합을 방지하고 보다 정확한 예측을 하는 기법

## [2-1]  Ensemble - Bagging(Bootstrap sample + Aggregating
* 앙상블에 있는 개별 분류기를 동일한 train data로 학습하는 것이 아니라 원본 train data에서 부트스트랩 샘플(중복을 허용한 랜덤 샘플)을 뽑아서 사용
* ex) Random Forest

In [10]:
m1=BaggingRegressor()
m1.fit(x_train, y_train)
p1=m1.predict(x_val)

print(f'RMES : {mean_squared_error(y_val, p1, squared=False)}')
print(f'MAE : {mean_absolute_error(y_val,p1)}')
print(f'MAPE : {mean_absolute_percentage_error(y_val,p1)}')

RMES : 5.321741095682285
MAE : 2.8742763157894737
MAPE : 0.14412295997533547


## [2-2]  Ensemble - Boosting
* 여러 개의 모델을 순차적으로 학습 및 예측하면서 잘못 예측한 데이터에 가중치를 부여해 오류를 개선해나가는 학습 방식
* 중복을 허용하지 않고 train data에서 랜덤 샘플을 추출하여 부분집합을 구성
* 유명한 AdaBoosts는 약한 학습기를 훈련할 때 훈련 세트를 전체 사용

In [11]:
m2_1=AdaBoostRegressor()
m2_1.fit(x_train, y_train)
p2_1=m2_1.predict(x_val)

print(f'RMES : {mean_squared_error(y_val, p2_1, squared=False)}')
print(f'MAE : {mean_absolute_error(y_val,p2_1)}')
print(f'MAPE : {mean_absolute_percentage_error(y_val,p2_1)}')

RMES : 4.993294268106011
MAE : 2.976678191328267
MAPE : 0.14554974450956065


In [12]:
m2_2=GradientBoostingRegressor()
m2_2.fit(x_train, y_train)
p2_2=m2_2.predict(x_val)

print(f'RMES : {mean_squared_error(y_val, p2_2, squared=False)}')
print(f'MAE : {mean_absolute_error(y_val,p2_2)}')
print(f'MAPE : {mean_absolute_percentage_error(y_val,p2_2)}')

RMES : 4.306161883606683
MAE : 2.496650588967645
MAPE : 0.12496382728019398


## [2-3] Ensemble - Stacking
* 개별 분류기가 예측한 데이터를 다시 train data로 새로운 분류기가 다시 예측을 수행하는 기법

In [15]:
from sklearn.linear_model import RidgeCV
from sklearn.svm import LinearSVR

estimators=[('ridge', Ridge()), ('lasso', Lasso())]
m3=StackingRegressor(estimators=estimators, final_estimator=RandomForestRegressor())
m3.fit(x_train, y_train)
p3=m3.predict(x_val)

print(f'RMES : {mean_squared_error(y_val, p3, squared=False)}')
print(f'MAE : {mean_absolute_error(y_val,p3)}')
print(f'MAPE : {mean_absolute_percentage_error(y_val,p3)}')

RMES : 5.456951780696081
MAE : 3.294065789473685
MAPE : 0.15620802989695418


## [2-4] Ensemble - Voting
* 여러 모델을 통해 얻은 예측 결과들로부터 다수결 투표를 하여 최종 결과를 예측
* 하드 보팅 & 소프트 보팅
    * 하드 보팅 : 예측 결과 중 다수 선택된 값을 최종 결과로 선정
    * 소프트 보팅 : 각 분류기별 결정 확률을 평균내어 얻은 확률이 가장 높은 값을 최종 결과로 선정

In [17]:
model1=LinearRegression()
model2=RandomForestRegressor()
model3=KNeighborsRegressor()

m4=VotingRegressor([('lr',model1),('rf',model2), ('knn',model3)])
m4.fit(x_train, y_train)
p4=m4.predict(x_val)

print(f'RMES : {mean_squared_error(y_val, p4, squared=False)}')
print(f'MAE : {mean_absolute_error(y_val,p4)}')
print(f'MAPE : {mean_absolute_percentage_error(y_val,p4)}')

RMES : 5.003572295409232
MAE : 3.131906070227881
MAPE : 0.14186888480421894


# 3. 다양한 분류모델
* 회귀모델과 비슷 :)

# 4. autoML

# +. ETC

## [1] 모델 저장 

!pip install joblib   
joblib.dump( 'model' , '경로')

## [2] Pandas Profiling - EDA

In [1]:
# pip install pandas-profiling
# pip install markupsafe==2.0.1

In [5]:
from pandas_profiling import ProfileReport

profile=ProfileReport(df, title='boston Report')

profile.to_file('boston Report.html')

Summarize dataset:   0%|          | 0/28 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

  cmap.set_bad(cmap_bad)


Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]