# 모듈 불러오기

#### 기본

In [None]:
import numpy as np
import pandas as pd
import joblib

In [None]:
# !conda install numpy 
# !conda install pandas
# !conda install scikit-learn
# !conda install scipy
# !conda install tensorflow
# !conda install matplotlib
# !conda install seaborn

# !pip install xgboost
# !pip install lightgbm
# !pip install catboost
# !pip install ipywidgets
# !jupyter nbextension enable --py widgetsnbextension
# !pip install hyperopt
# !pip install -U imbalanced-learn
# !pip install missingno

#### 전처리

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.inspection import permutation_importance

from sklearn import impute
from sklearn.impute import KNNImputer
from sklearn.experimental import enable_iterative_imputer

#### 리샘플링

In [None]:
from imblearn.over_sampling import (
    RandomOverSampler, 
    ADASYN, 
    SMOTE
)
from imblearn.under_sampling import (
    RandomUnderSampler, 
    TomekLinks, 
    CondensedNearestNeighbour, 
    OneSidedSelection, 
    EditedNearestNeighbours, 
    NeighbourhoodCleaningRule
)

#### 분석

In [None]:
from scipy.stats import skew, kurtosis
from scipy.stats import ttest_ind, f_oneway, pearsonr, chi2_gen

#### 회귀

In [None]:
from sklearn.linear_model import LinearRegression as RL
from sklearn.neighbors import KNeighborsRegressor as KNNR
from sklearn.tree import DecisionTreeRegressor as DTR
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor as RFR
from xgboost import XGBRegressor as XGBR
from lightgbm import LGBMRegressor as LGBMR
from catboost import CatBoostRegressor as CBR

from lightgbm import plot_importance as lgbm_plot_importance
from xgboost import plot_importance as xgb_plot_importance
from catboost import Pool

from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import r2_score

#### 분류

In [None]:
from sklearn.linear_model import LogisticRegression as LR
from sklearn.neighbors import KNeighborsClassifier as KNNC
from sklearn.tree import DecisionTreeClassifier as DTC
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier as RFC
from xgboost import XGBClassifier as XGBC
from lightgbm import LGBMClassifier as LGBMC
from catboost import CatBoostClassifier as CBC

from sklearn.metrics import confusion_matrix as cmatrix
from sklearn.metrics import classification_report as creport
from sklearn.metrics import recall_score as recall
from sklearn.metrics import accuracy_score as acc
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import make_scorer

#### 교차검증

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from catboost import cv
import hyperopt

from sklearn.model_selection import (
    StratifiedKFold, # 분류
    KFold, # 회귀
    # GroupKFold, 
    # RepeatedKFold, 
    # StratifiedGroupKFold, 
    # RepeatedStratifiedKFold
)

#### 시각화

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# Use a Korean-capable font for both matplotlib and seaborn figures.
plt.rc('font', family='Malgun Gothic')
sns.set(
    style='darkgrid',
    font="Malgun Gothic",
    rc={"axes.unicode_minus": False},  # keeps the minus sign from rendering as a broken glyph
)

import warnings
# Suppress every warning for the rest of the session.
warnings.filterwarnings('ignore')
%config InlineBackend.figure_format = 'retina'

# 데이터 로드

In [None]:
folder_name = '1th'

# Read the saved train/test splits and the final feature file, in that order.
x_train, y_train, x_test, y_test, x_final = (
    pd.read_csv(f'./{folder_name}/{file_name}', sep=',', encoding='utf-8')
    for file_name in (
        'x_train.csv',
        'y_train.csv',
        'x_test.csv',
        'y_test.csv',
        'test_x.csv',
    )
)

# 데이터 전처리

#### x, y 분리

In [None]:
target = '타겟컬럼명'

# Separate the target column (y) from the remaining feature columns (x).
# NOTE(review): `df` is not created in the visible cells above — confirm it is loaded first.
y = df[target]
x = df.drop(columns=target)

#### train, val, test 데이터 분리

In [None]:
# stratify=y keeps the proportions of y the same in the train and test parts.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=1, stratify=y
)
# Show the resulting shapes as the cell output.
x_train.shape, x_test.shape, y_train.shape, y_test.shape

In [None]:
# Carve a validation set out of the training data.
# Stratify on y_train so the validation split keeps the same class ratio;
# without it the validation set can drift from the stratified train/test split.
x_train, x_val, y_train, y_val = train_test_split(
    x_train, y_train, test_size=0.2, random_state=1, stratify=y_train
)
# Show the resulting shapes as the cell output.
x_train.shape, x_val.shape, y_train.shape, y_val.shape

#### 로그 스케일링

In [None]:
# np.log1p(v) already computes log(1 + v), so zeros are handled without an
# extra "+ 1" (the previous code effectively computed log(v + 2)).
# The loop variable is not named `df`, so the global `df` is no longer overwritten.
for frame in [x_train, x_test]:
    frame['특정컬럼1'] = np.log1p(frame['특정컬럼1'])

#### 결측치 분석

In [None]:
df.isna().sum() # number of missing values in each column

In [None]:
df.isna().mean() * 100 # percentage of missing values in each column

In [None]:
import missingno as msno

# Visualize where the missing values sit in each column
ax = msno.matrix(df)
plt.show()

In [None]:
# Bar chart of (1 - missing ratio) per column, i.e. the share of values present.
fig, ax = plt.subplots(figsize=(16, 6))
(1 - df.isna().mean()).abs().plot.bar(ax=ax)

# Save the figure to an image file
# from time import time, localtime
# today = localtime(time())
# fig.savefig(f'images/mlpr_{today.tm_mon}{today.tm_mday}.png', dpi=300)

#### 결측치 대치

In [None]:
target = '컬럼명'
imputer = impute.IterativeImputer(
    missing_values=np.nan,     # value that marks a missing entry
    initial_strategy='mean',   # initial fill strategy; 'most_frequent' is another option
    verbose=0,
)
# scikit-learn estimators need 2-D input, so select the column with a list
# (x_train[[target]]) instead of passing a 1-D Series.
imputed = imputer.fit_transform(x_train[[target]])
x_train.loc[:, target] = imputed.ravel()
# Reuse the imputer fitted on the training data; re-fitting on the test set
# would fill it using statistics the model never saw during training.
imputed = imputer.transform(x_test[[target]])
x_test.loc[:, target] = imputed.ravel()

In [None]:
drop_cols = ['컬럼명1', '컬럼명2']
# drop(..., inplace=True) returns None, so assigning its result wiped out `df`.
# Use the non-inplace form and keep the returned frame.
df = df.drop(columns=drop_cols)
df = df.dropna()        # drop rows that contain any missing value
df = df.dropna(axis=1)  # drop columns that contain any missing value

In [None]:
knn_imputer = KNNImputer(n_neighbors=5)
# Learn the neighbours from the training features only, then fill the test set
# with that fitted imputer (fitting on x_test itself leaks test information).
knn_imputer.fit(x_train)
x_test_filled = knn_imputer.transform(x_test)
# Keep the original row labels so the filled frame still aligns with y_test.
x_test_filled = pd.DataFrame(
    x_test_filled, columns=x_test.columns, index=x_test.index
)
x_test_filled.tail()

#### 결측치가 있었다는 신호를 주는 컬럼 생성

In [None]:
def add_missing_indicator(col):
    """Build a function that flags missing values of column ``col``.

    The returned callable takes a DataFrame and returns an integer Series
    holding 1 where ``col`` is missing and 0 elsewhere, which makes it usable
    as a callable value for ``DataFrame.assign``.
    """
    return lambda frame: frame[col].isna().astype(int)

In [None]:
# Add a 0/1 column marking the rows where '컬럼명' is missing.
df = df.assign(컬럼명_missing=add_missing_indicator('컬럼명'))

#### 열 이름 수정

In [None]:
# Tidy the column names into snake_case (original author's note: this does not handle leading/trailing whitespace)
# NOTE(review): the return value is not assigned here, so it is only displayed as the cell output.
import janitor as jn
jn.clean_names(df) 

In [None]:
# Strip surrounding whitespace from a column name and convert it to snake_case
def clean_col(name):
    """Return ``name`` trimmed, lower-cased, and with each space turned into ``_``."""
    trimmed = name.strip().lower()
    return '_'.join(trimmed.split(' '))

# Apply clean_col to every column name.
# NOTE(review): the renamed frame is not assigned back, so `df` keeps its old names — confirm this is display-only.
df.rename(columns=clean_col)

#### skew 높은 값 로그 스케일링

In [None]:
# Labels of the columns whose dtype is not object.
features_index = df.dtypes.loc[df.dtypes.ne('object')].index
# Compute the skewness of each of those columns.
skew_features = df[features_index].apply(skew)
# Keep only the columns whose skewness is greater than 1.
skew_features_top = skew_features.loc[skew_features.gt(1)]
print(skew_features_top.sort_values(ascending=False))

In [None]:
# Replace the highly skewed columns with their log1p-transformed values.
df[skew_features_top.index] = np.log1p(df[skew_features_top.index])

#### 이상치 제거 IQR

In [None]:
# np.percentile(df[col].values, 100)
# np.max(df[col].values)

def get_outlier(df, col, weight=1.5):
    """Return the index labels of IQR-based outliers in ``df[col]``.

    Args:
        df: DataFrame that holds the column to inspect.
        col: Label of a numeric column in ``df``.
        weight: Multiplier applied to the IQR to place the lower/upper fences.

    Returns:
        A two-element list ``[low_outlier_index, high_outlier_index]`` with the
        index labels of the rows below the lower fence and above the upper
        fence respectively.
    """
    tmp = df[col]
    # nanpercentile ignores missing values. Plain np.percentile returns NaN as
    # soon as one value is missing, which silently reported "no outliers".
    quantile_25, quantile_75 = np.nanpercentile(tmp.values, [25, 75])

    iqr = quantile_75 - quantile_25
    iqr_weight = iqr * weight

    low_outlier = quantile_25 - iqr_weight
    high_outlier = quantile_75 + iqr_weight

    # Comparisons against NaN are False, so missing rows are never flagged.
    low_outlier_index = tmp[tmp < low_outlier].index
    high_outlier_index = tmp[tmp > high_outlier].index

    return [low_outlier_index, high_outlier_index]

#### 리샘플링

In [None]:
# https://datascienceschool.net/03%20machine%20learning/14.02%20%EB%B9%84%EB%8C%80%EC%B9%AD%20%EB%8D%B0%EC%9D%B4%ED%84%B0%20%EB%AC%B8%EC%A0%9C.html

##### 오버 샘플링
- RandomOverSampler: random sampler
- ADASYN: Adaptive Synthetic Sampling Approach for Imbalanced Learning
- SMOTE: Synthetic Minority Over-sampling Technique

In [None]:
# Over-sample the training data with RandomOverSampler (fixed seed).
ros = RandomOverSampler(random_state=1)
x_train_resampled, y_train_resampled = ros.fit_resample(x_train, y_train)

In [None]:
# Over-sample the training data with ADASYN (fixed seed).
ada = ADASYN(random_state=1)
x_train_resampled, y_train_resampled = ada.fit_resample(x_train, y_train)

In [None]:
smote = SMOTE(random_state=1) # create the SMOTE object
x_train_resampled, y_train_resampled = smote.fit_resample(x_train, y_train)

##### 언더 샘플링 (복원추출 사용하면 안됨)
- RandomUnderSampler: random under-sampling method
- TomekLinks: Tomek’s link method
- CondensedNearestNeighbour: condensed nearest neighbour method
- OneSidedSelection: under-sampling based on one-sided selection method
- EditedNearestNeighbours: edited nearest neighbour method
- NeighbourhoodCleaningRule: neighbourhood cleaning rule

In [None]:
# This section is about under-sampling, but the cell was a copy of the SMOTE
# (over-sampling) cell. Use RandomUnderSampler, which samples without
# replacement by default.
rus = RandomUnderSampler(random_state=1)
x_train_resampled, y_train_resampled = rus.fit_resample(x_train, y_train)

In [None]:
# This section is about under-sampling, but the cell was a copy of the SMOTE
# (over-sampling) cell. Use TomekLinks to under-sample by removing Tomek links.
tomek = TomekLinks()
x_train_resampled, y_train_resampled = tomek.fit_resample(x_train, y_train)

In [None]:
# This section is about under-sampling, but the cell was a copy of the SMOTE
# (over-sampling) cell. Use EditedNearestNeighbours, one of the under-samplers
# listed for this section.
enn = EditedNearestNeighbours()
x_train_resampled, y_train_resampled = enn.fit_resample(x_train, y_train)

#### 정규화

In [None]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Normalization: fit the scaler on the training features only, then apply the
# same fitted scaler to both the training and the test features.
scaler = MinMaxScaler().fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

In [None]:
# min-max scaling: take the maximum and minimum from the training data
max_n, min_n = x_train.max(), x_train.min()

In [None]:
# Rescale both splits with the training min/max so they share one scale.
x_train = (x_train - min_n) / (max_n - min_n)
x_test = (x_test - min_n) / (max_n - min_n)

#### PFI (Permutation Feature Importance)
- 모델과 상관없이 피처 중요도를 얻을 수 있다.
- 다중공선성(선형회귀, 로지스틱회귀에 중요)이 있으면 효과 없음

In [None]:
# Permutation importance of model1 on the validation data: 10 repeats, scored with R^2.
# NOTE(review): model1 and x_val_s are not defined in the visible cells — confirm they exist
# (x_val_s is presumably the scaled x_val).
pfi1 = permutation_importance(model1, x_val_s, y_val, n_repeats=10, 
                              scoring = 'r2', random_state=20)

In [None]:
# Order the features by mean importance and draw one horizontal box per feature.
sorted_idx = pfi1.importances_mean.argsort()
plt.figure(figsize = (10, 8))
plt.boxplot(pfi1.importances[sorted_idx].T, vert=False, labels=x.columns[sorted_idx])
plt.axvline(0, color = 'r')  # red reference line at zero importance
plt.grid()
plt.show()

# 데이터 저장

In [None]:
folder_name = '1th'

# Write each split back to disk without the index column.
x_train.to_csv(f'./{folder_name}/x_train.csv', index = False)
y_train.to_csv(f'./{folder_name}/y_train.csv', index = False)
x_test.to_csv(f'./{folder_name}/x_test.csv', index = False)
y_test.to_csv(f'./{folder_name}/y_test.csv', index = False)
# x_final.csv must contain x_final; the previous code wrote y_test into it.
x_final.to_csv(f'./{folder_name}/x_final.csv', index = False)