# 패키지 불러오기

In [6]:
# 데이터 분석 라이브러리
import numpy as np
import pandas as pd

# 시각화 라이브러리
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from plotnine import *

# 모델링 라이브러리
from scipy import stats
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler, Normalizer, LabelEncoder
from sklearn.model_selection import KFold, cross_val_score, cross_val_predict, StratifiedKFold, GridSearchCV, RandomizedSearchCV, train_test_split
from category_encoders.ordinal import OrdinalEncoder
from lightgbm import LGBMClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, GradientBoostingClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier #KNN
from sklearn.naive_bayes import GaussianNB #Naive bayes
from xgboost import XGBRFClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, make_scorer, r2_score, accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_recall_curve
from pycaret.classification import *
import sklearn.metrics as metrics

# 샘플링 라이브러리
from imblearn.over_sampling import *
from sklearn.model_selection import train_test_split



# 기타 라이브러리
from tqdm import tqdm_notebook, tqdm
from datetime import datetime
import warnings
import random
import gc
import os

warnings.filterwarnings("ignore", category=RuntimeWarning)        
sns.set_style("whitegrid")

## 데이터 불러오기

In [7]:
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')
sample_submission = pd.read_csv('./data/sample_submission.csv')

## 전처리

- ID 칼럼은 분석에 필요한 데이터가 아니므로 train, test 데이터에서 ID 칼럼을 삭제해줍니다.

In [None]:
plt.figure(figsize = (16,8))
sns.countplot('age', hue = 'income', data = train)

In [11]:
#특정 컬럼에서 특정항목값의 수익이 높은 사람의 수 / 특정컬럼에서 특정항목값 총 사람의 수 => 수입이 높은 사람의 비율
def portion_income(column):
    train[column] = train[column].astype('str')
    col_lst = train[column].unique().tolist()
    df = train[[column,'income']]

    for x in tqdm_notebook(col_lst):
        locals()["high" + '_' + x] = df[(df[column] == x) & (df.income == ">50K")]
        locals()["low" + '_' + x]  = df[(df[column] == x) & (df.income == "<=50K")]
    col_name = []
    col_name_value = []
    for x in tqdm_notebook(col_lst):
        col_name.append(x)
        col_name_value.append(len(locals()["high" + '_' + x]) / (len(locals()["high" + '_' + x]) + len(locals()["low" + '_' + x])))
        #print(x , len(locals()["high" + '_' + x]) / (len(locals()["high" + '_' + x]) + len(locals()["low" + '_' + x])))
    data = pd.DataFrame({"name" : col_name, "value" : col_name_value }).sort_values(by = 'value', ascending = False)
    data.columns = [column, '{}_income_up_portion'.format(column)]

    return data

def preprocessing(train, test):
    # drop columns 
    train.drop(['id'], axis=1, inplace=True)
    test.drop(['id'], axis=1, inplace=True)
    
    # 이상치 대치
    test.loc[test['native_country']== 'Holand-Netherlands', 'native_country'] = 'United-States'
    test.loc[test['native_country']== '?', 'native_country'] = 'United-States'
    train.loc[train['native_country']== '?', 'native_country'] = 'United-States'
    
    # Work_Portion
    workclass = portion_income('workclass')
    pd.merge(train, workclass, on = 'workclass', how = 'left')
    pd.merge(test, workclass, on = 'workclass', how = 'left')
    
    # WhiteMan 가중치
    train['WhiteMan'] = 0
    test['WhiteMan'] = 0
    train.loc[(train['race'] == 'White') & (train['sex'] == 'Male'), 'WhiteMan'] = 3
    test.loc[(test['race'] == 'White') & (test['sex'] == 'Male'), 'WhiteMan'] = 3
    
    # AgeGroup 가중치
    train['AgeGroup'] = 0
    test['AgeGroup'] = 0
    train.loc[(train['age'] >= 34) & (train['age'] <= 52), 'AgeGroup'] = 3
    test.loc[(test['age'] >= 34) & (test['age'] <= 52), 'AgeGroup'] = 3
    
    # y변수 이진화
    train['income'] = train['income'].apply(lambda x: 1 if x == '>50K' else 0)
    
    # y변수 설정
    y = train['income']
    X = train.drop(['income'], axis=1)
    
    print(X.shape, y.shape)
    
    return X, y, test

In [12]:
X, y, test = preprocessing(train, test)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=9.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=9.0), HTML(value='')))


(26049, 16) (26049,)


## 원핫 인코딩
- 컬럼당 Class 개수 - 1 옵션 찾기

In [13]:
X = pd.get_dummies(X, drop_first = True, columns = ['education'])
test = pd.get_dummies(test, drop_first = True, columns = ['education'])
print(X.shape, test.shape)

(26049, 30) (6512, 30)


## 라벨 인코딩

In [14]:
# 라벨 인코더 생성
LE_encoder = OrdinalEncoder(list(X.columns))

# train, test 데이터에 인코딩 적용
X = LE_encoder.fit_transform(X, y)
test = LE_encoder.transform(test)

## Min-Max Scaler

min_max_scaler = MinMaxScaler()

output = min_max_scaler.fit_transform(X, y)
X = pd.DataFrame(output, columns=X.columns, index=list(X.index.values))
X

## Over Sampling
- __Random Over Sampling__
    - 소수 클래스의 데이터를 반복해서 넣는 것(replacement)이다. 가중치를 증가시키는 것과 비슷하다.
- __ADASYN(Adaptive Synthetic Sampling)__
    - 방법은 소수 클래스 데이터와 그 데이터에서 가장 가까운 k개의 소수 클래스 데이터 중 무작위로 선택된 데이터 사이의 직선상에 가상의 소수 클래스 데이터를 만드는 방법이다.
- __BoarderlineSMOTE__
- __SVM_SMOTE__
- __SMOTE+ENN (Synthetic Minority Over-sampling Technique + Edited Nearest Neighbours__
- __SMOTE+Tomek__

In [None]:
#X, y = BorderlineSMOTE(random_state=0).fit_sample(X, y)
#X, y = RandomOverSampler(random_state=0).fit_sample(X, y)
#X, y = ADASYN(random_state=0).fit_sample(X, y)
#X, y = SVMSMOTE(random_state=0).fit_sample(X, y)
#X, y = SMOTEENN(random_state=0).fit_sample(X, y)
#X, y = SMOTETomek(random_state=0).fit_sample(X, y)
print(X.shape)
print(y.shape)

# Auto-ML

- data: dataframe
- target: string // y 변수
- train_size: float, default = 0.7
- sampling: bool, default = True // True로 할 경우 데이터 수를 최대 25,000개로 한정하여 모델링
When the sample size exceeds 25,000 samples, pycaret will build a base estimator at various sample sizes from the original dataset.
- categorical_features: string, default = None // 범주형 변수를 수치형 변수로 인식하지 않게 지정
- categorical_imputation: string, default = ‘constant’ // 'mode'로 할 경우 결측치를 training의 최빈값으로 대치
- ordinal_features: dictionary, default = None // 순서형 변수는 {'컬럼명' : ['low' < 'medium', 'high'] 순으로 순서 정의} 값을 넣어주면 순서 고려
- high_cardinality_features: string, default = None
- high_cardinality_method: string, default = ‘frequency’
- numeric_features: string, default = None
- numeric_imputation: string, default = ‘mean’
- date_features: string, default = None
- ignore_features: string, default = None
- normalize: bool, default = False
- normalize_method: string, default = ‘zscore’ / 'minmax', 'maxabs' -절대값, 'robust' -1사분위수 ~ 3사분위수(이상치 제외), 'zscore'
- transformation: bool, default = False / 데이터를 정규분포화
- transformation_method: string, default = ‘yeo-johnson’ / 'quantile' -가우시안
- handle_unknown_categorical: bool, default = True / train에서 못본 데이터는 최빈값으로 대치
- unknown_categorical_method: string, default = ‘least_frequent’- 최신값 / 'most_frequent' - 최빈값
- pca: bool, default = False
- pca_method: string, default = ‘linear’ 'kernel', 'incremental'
- pca_components: int/float, default = 0.99 데이터 설명력 
- ignore_low_variance: bool, default = False
- combine_rare_levels: bool, default = False
When set to True, all levels in categorical features below the threshold defined in rare_level_threshold param are combined together as a single level. There must be at least two levels under the threshold for this to take effect. rare_level_threshold represents the percentile distribution of level frequency. Generally, this technique is applied to limit a sparse matrix caused by high numbers of levels in categorical features.
rare_level_threshold: float, default = 0.1
Percentile distribution below which rare categories are combined. Only comes into effect when combine_rare_levels is set to True.
bin_numeric_features: list, default = None
When a list of numeric features is passed they are transformed into categorical features using K Means, where values in each bin have the same nearest center of a 1D k-means cluster. The number of clusters are determined based on the ‘sturges’ method. It is only optimal for gaussian data and underestimates the number of bins for large non-gaussian datasets.
remove_outliers: bool, default = False
When set to True, outliers from the training data are removed using PCA linear dimensionality reduction using the Singular Value Decomposition technique.
outliers_threshold: float, default = 0.05
The percentage / proportion of outliers in the dataset can be defined using the outliers_threshold param. By default, 0.05 is used which means 0.025 of the values on each side of the distribution’s tail are dropped from training data.
remove_multicollinearity: bool, default = False
When set to True, the variables with inter-correlations higher than the threshold defined under the multicollinearity_threshold param are dropped. When two features are highly correlated with each other, the feature that is less correlated with the target variable is dropped.
multicollinearity_threshold: float, default = 0.9
Threshold used for dropping the correlated features. Only comes into effect when remove_multicollinearity is set to True.
remove_perfect_collinearity: bool, default = False
When set to True, perfect collinearity (features with correlation = 1) is removed from the dataset, When two features are 100% correlated, one of it is randomly dropped from the dataset.
create_clusters: bool, default = False
When set to True, an additional feature is created where each instance is assigned to a cluster. The number of clusters is determined using a combination of Calinski-Harabasz and Silhouette criterion.
cluster_iter: int, default = 20
Number of iterations used to create a cluster. Each iteration represents cluster size. Only comes into effect when create_clusters param is set to True.
polynomial_features: bool, default = False
When set to True, new features are created based on all polynomial combinations that exist within the numeric features in a dataset to the degree defined in
polynomial_degree param.
polynomial_degree: int, default = 2
Degree of polynomial features. For example, if an input sample is two dimensional and of the form [a, b], the polynomial features with degree = 2 are: [1, a, b, a^2, ab, b^2].
trigonometry_features: bool, default = False
When set to True, new features are created based on all trigonometric combinations that exist within the numeric features in a dataset to the degree defined in the polynomial_degree param.
polynomial_threshold: float, default = 0.1
This is used to compress a sparse matrix of polynomial and trigonometric features. Polynomial and trigonometric features whose feature importance based on the combination of Random Forest, AdaBoost and Linear correlation falls within the percentile of the defined threshold are kept in the dataset. Remaining features are dropped before further processing.
group_features: list or list of list, default = None
When a dataset contains features that have related characteristics, the group_features param can be used for statistical feature extraction. For example, if a dataset has numeric features that are related with each other (i.e ‘Col1’, ‘Col2’, ‘Col3’), a list containing the column names can be passed under group_features to extract statistical information such as the mean, median, mode and standard deviation.
group_names: list, default = None
When group_features is passed, a name of the group can be passed into the group_names param as a list containing strings. The length of a group_names list must equal to the length of group_features. When the length doesn’t match or the name is not passed, new features are sequentially named such as group_1, group_2 etc.
feature_selection: bool, default = False
When set to True, a subset of features are selected using a combination of various permutation importance techniques including Random Forest, Adaboost and Linear correlation with target variable. The size of the subset is dependent on the feature_selection_param. Generally, this is used to constrain the feature space in order to improve efficiency in modeling. When polynomial_features and feature_interaction are used, it is highly recommended to define the feature_selection_threshold param with a lower value.
feature_selection_threshold: float, default = 0.8
Threshold used for feature selection (including newly created polynomial features). A higher value will result in a higher feature space. It is recommended to do multiple trials with different values of feature_selection_threshold specially in cases where polynomial_features and feature_interaction are used. Setting a very low value may be efficient but could result in under-fitting.
feature_interaction: bool, default = False
When set to True, it will create new features by interacting (a * b) for all numeric variables in the dataset including polynomial and trigonometric features (if created). This feature is not scalable and may not work as expected on datasets with large feature space.
feature_ratio: bool, default = False
When set to True, it will create new features by calculating the ratios (a / b) of all numeric variables in the dataset. This feature is not scalable and may not work as expected on datasets with large feature space.
interaction_threshold: bool, default = 0.01
Similar to polynomial_threshold, It is used to compress a sparse matrix of newly created features through interaction. Features whose importance based on the combination of Random Forest, AdaBoost and Linear correlation falls within the percentile of the defined threshold are kept in the dataset. Remaining features
are dropped before further processing.
- fix_imbalance: bool, default = False / True 시 SMOTE가 기본값
- fix_imbalance_method: obj, default = None / 'smote'(오버샘플링), minority to majority scale - imblearn 패키지의 fit_resample method 모두 지원
- data_split_shuffle: bool, default = True / False 설정 시 데이터 분할할 때 row의 셔플 방지
- folds_shuffle: bool, default = False Cross Validation 시 row의 셔플 방지
- n_jobs: int, default = -1
- html: bool, default = True
- session_id: int, default = None
- log_experiment: bool, default = False
- experiment_name: str, default = None
- log_plots: bool, default = False
- log_profile: bool, default = False
- log_data: bool, default = False // csv형식으로 로그 기록
- silent: bool, default = False
- verbose: Boolean, default = True
- profile: bool, default = False
If set to true, a data profile for Exploratory Data Analysis will be displayed in an interactive HTML report.

In [None]:
X['income'] = y
clf = setup(data = X, target = 'income'
           ,use_gpu = True
           , fold_strategy = 'kfold'
           , train_size = 0.75
            )

In [None]:
best_3 = compare_models(sort = 'F1', n_select = 3)

In [None]:
blended = blend_models(estimator_list = best_3, fold = 5, method = 'soft')

In [None]:
pred_holdout = predict_model(blended)

In [None]:
final_model = finalize_model(blended)

In [None]:
predictions = predict_model(final_model, data = test)

In [None]:
predictions['Label'].value_counts()

In [None]:
submission = pd.read_csv('./data/sample_submission.csv')
submission['prediction'] = predictions['Label']
submission.to_csv('./submission/pycaret.csv', index = False)

In [None]:
res = pd.read_csv('./submission/pycaret.csv')
res['prediction'].value_counts()
#.bar()

!kaggle competitions submit -c kakr-4th-competition -f ./submission/pycaret.csv -m "Pycaret"

# 모델링

In [15]:
def get_clf_eval(y_test , pred):
    confusion = confusion_matrix( y_test, pred)
    accuracy = accuracy_score(y_test , pred)
    precision = precision_score(y_test , pred)
    recall = recall_score(y_test , pred)
    f1_score = metrics.f1_score(y_test, pred)
    print('오차 행렬')
    print(confusion)
    print('정확도: {0:.4f}, 정밀도: {1:.4f}, 재현율: {2:.4f}, F1_score: {3: 4f}'.format(accuracy , precision ,recall, f1_score))
    

def Find_threshold(model, X_test, y_test):
    thresholds = np.arange(0,1,0.05)
    threshold_count = []
    pred_proba=model.predict_proba(X_test)
    pred_proba_1 = pred_proba[:,1].reshape(-1,1)
    for threshold in thresholds:
        binarizer = Binarizer(threshold=threshold)
        model_cl_pred = binarizer.transform(pred_proba_1)
        threshold_count.append([threshold, metrics.f1_score(model_cl_pred, y_test)])

    MAX=thresholds[np.array(threshold_count)[:,1].argmax()]
    
    thresholds = np.arange(MAX-0.05,MAX+0.05,0.005)
    threshold_count = []
    for threshold in thresholds:
        binarizer = Binarizer(threshold=threshold)
        model_cl_pred = binarizer.transform(pred_proba_1)
        #threshold_count.append([threshold, metrics.f1_score(log_cl_pred, y_test)])
        #print('threshold: {0:.4f}, f1_score: {1:.4f}'.format(threshold,metrics.f1_score(model_cl_pred, y_test)))
        threshold_count.append([threshold, metrics.f1_score(model_cl_pred, y_test)])
    
    return np.round(thresholds[np.array(threshold_count)[:,1].argmax()],4), np.round(metrics.f1_score(Binarizer(threshold=thresholds[np.array(threshold_count)[:,1].argmax()]).transform(pred_proba_1), y_test),4)

def precision_recall_curve_plot(y_test, pred_proba_c1):
    # threshold ndarray와 이 threshold에 따른 정밀도, 재현율 ndarray 추출. 
    precisions, recalls, thresholds = precision_recall_curve( y_test, pred_proba_c1)
    
    # X축을 threshold값으로, Y축은 정밀도, 재현율 값으로 각각 Plot 수행. 정밀도는 점선으로 표시
    plt.figure(figsize=(8,6))
    threshold_boundary = thresholds.shape[0]
    plt.plot(thresholds, precisions[0:threshold_boundary], linestyle='--', label='precision')
    plt.plot(thresholds, recalls[0:threshold_boundary],label='recall')
    
    # threshold 값 X 축의 Scale을 0.1 단위로 변경
    start, end = plt.xlim()
    plt.xticks(np.round(np.arange(start, end, 0.1),2))
    
    # x축, y축 label과 legend, 그리고 grid 설정
    plt.xlabel('Threshold value'); plt.ylabel('Precision and Recall value')
    plt.legend(); plt.grid()
    plt.show()
    
#평가 지표
def metrics(y_test,pred):
    accuracy = round(accuracy_score(y_test,pred), 3)
    precision = round(precision_score(y_test,pred), 3)
    recall = round(recall_score(y_test,pred), 3)
    f1 = round(f1_score(y_test,pred), 3)
    roc_score = round(roc_auc_score(y_test,pred,average='macro'), 3)
    print('정확도 : {0:.3f}, 정밀도 : {1:.3f}, 재현율 : {2:.3f}'.format(accuracy,precision,recall))
    print('f1-score : {0:.3f}, auc : {1:.3f}'.format(f1,roc_score,recall))
    return precision, recall, f1

# BaysianOptimization
def lgb_roc_eval(num_leaves, colsample_bytree, subsample, max_depth, reg_alpha, reg_lambda, min_split_gain, min_child_weight):
    
    params = {
        "n_estimator":200,
        "learning_rate":0.02,
        'num_leaves': int(round(num_leaves)),
        'colsample_bytree': colsample_bytree, 
        'subsample': subsample,
        'max_depth': int(round(max_depth)),
        'reg_alpha': reg_alpha,
        'reg_lambda': reg_lambda, 
        'min_split_gain': min_split_gain,
        'min_child_weight': min_child_weight,
        'verbosity': -1
    }
    print("params:", params)
    lgb_model = LGBMClassifier(**params)
    lgb_model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], early_stopping_rounds=30, eval_metric="auc", verbose=100 )
    best_iter = lgb_model.best_iteration_
    print('best_iter:', best_iter)
    valid_proba = lgb_model.predict_proba(X_valid, num_iteration=best_iter)[:, 1]
    roc_preds = roc_auc_score(y_valid, valid_proba)
    print('roc_auc:', roc_preds)
    return roc_preds

## LightGBM

In [16]:
# 전체 데이터 중 80%는 학습용 데이터, 20%는 테스트용 데이터 추출
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size = 0.2, random_state = 91)

In [17]:
lgbm_clf = LGBMClassifier(boosting='gbdt', num_boost_round = 2000, learning_rate = 0.01
                          , max_depth=16, metric='binary', min_child_samples=80
                          , n_estimators=500, objective='binary', random_state=91
                          , early_stopping_rounds=500, subsample=0.8, verbose=400
                         , max_bin = 90, reg_lambda = 1, reg_alpha = 1)
                          #, reg_lambda = 0.1, reg_alpha = 0.05)

# LightGBM도 XGBoost와 동일하게 조기 중단 수행 가능. 
evals = [(X_train, y_train), (X_valid, y_valid)]

lgbm_clf.fit(X_train, y_train, early_stopping_rounds=300, eval_metric="f1", 
                 eval_set=evals, verbose=True)
preds = lgbm_clf.predict(X_valid)
precision, recall, f1 = metrics(y_valid, preds)
print(confusion_matrix(preds, y_valid))

[LightGBM] [Info] Number of positive: 5067, number of negative: 15772
[LightGBM] [Debug] Dataset::GetMultiBinFromSparseFeatures: sparse rate 0.910924
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.290702
[LightGBM] [Debug] init for col-wise cost 0.000567 seconds, init for row-wise cost 0.001500 seconds
[LightGBM] [Debug] col-wise cost 0.000113 seconds, row-wise cost 0.000064 seconds
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Debug] Using Sparse Multi-Val Bin
[LightGBM] [Info] Total Bins 505
[LightGBM] [Info] Number of data points in the train set: 20839, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.243150 -> initscore=-1.135487
[LightGBM] [Info] Start training from score -1.135487
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 10
[1]	training's binary_logloss: 0.550272	valid_1's binary_logloss: 0.543961
Training until val

# 노트북

In [18]:
NFOLDS = 5
folds = KFold(n_splits=NFOLDS)

columns = X.columns
splits = folds.split(X, y)
y_preds = np.zeros(test.shape[0])

feature_importances = pd.DataFrame()
feature_importances['feature'] = columns

In [19]:
model = LGBMClassifier(objective='binary', verbose=400, random_state=91)


for fold_n, (train_index, valid_index) in enumerate(splits):
    print('Fold: ', fold_n+1)
    X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

    evals = [(X_train, y_train), (X_valid, y_valid)]
    model.fit(X_train, y_train, eval_metric='f1', eval_set=evals, verbose=True)
    
    feature_importances[f'fold_{fold_n + 1}'] = model.feature_importances_
        
    y_preds += model.predict(test).astype(int) / NFOLDS
    
    del X_train, X_valid, y_train, y_valid
    gc.collect()

Fold:  1
[LightGBM] [Info] Number of positive: 5057, number of negative: 15782
[LightGBM] [Debug] Dataset::GetMultiBinFromSparseFeatures: sparse rate 0.909881
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.290318
[LightGBM] [Debug] init for col-wise cost 0.000770 seconds, init for row-wise cost 0.001311 seconds
[LightGBM] [Debug] col-wise cost 0.000215 seconds, row-wise cost 0.000103 seconds
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Debug] Using Sparse Multi-Val Bin
[LightGBM] [Info] Total Bins 702
[LightGBM] [Info] Number of data points in the train set: 20839, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.242670 -> initscore=-1.138097
[LightGBM] [Info] Start training from score -1.138097
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 10
[1]	training's binary_logloss: 0.51029	valid_1's binary_logloss: 0.507521
[LightGBM]

In [21]:
sample_submission['prediction'] = y_preds

for ix, row in sample_submission.iterrows():
    if row['prediction'] > 0.5:
        sample_submission.loc[ix, 'prediction'] = 1
    else:
        sample_submission.loc[ix, 'prediction'] = 0

sample_submission = sample_submission.astype({"prediction": int})
sample_submission.to_csv('workportion+WhiteMale+AgeGroup.csv', index=False)

In [28]:
!kaggle competitions submit -c kakr-4th-competition -f ./submission/workportion+WhiteMale+AgeGroup.csv -m "workportion+WhiteMale+AgeGroup"

100%|██████████████████████████████████████| 43.4k/43.4k [00:07<00:00, 6.30kB/s]
Successfully submitted to [T-Academy X KaKr] 성인 인구조사 소득 예측 대회 

In [31]:
train.shape[0] + test.shape[0]

32561

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import font_manager, rc


feature_importance = lgbm_clf.feature_importances_

# plot
df_fi = pd.DataFrame({'columns':X.columns, 'importances':feature_importance})
df_fi = df_fi[df_fi['importances'] > 0] # importance가 0이상인 것만 
df_fi = df_fi.sort_values(by=['importances'], ascending=False)

fig = plt.figure(figsize=(20,8))
ax = sns.barplot(df_fi['columns'], df_fi['importances'])
ax.set_xticklabels(df_fi['columns'], rotation=80, fontsize=13)
plt.tight_layout()
plt.show()

### 제출기록 데이터프레임

In [None]:
table = pd.DataFrame

In [None]:
time = str(datetime.now()).split('.')[0]
table = table.append(pd.DataFrame({'Time' : ['{}'.format(time)]
                      ,'Precision' : [precision]
                      ,'Recall' : [recall]
                      ,'F1' : [f1]}))
table = table.reset_index(drop = True)
table

# Hyperparameter Tuning

## GridSearchCV

In [None]:
from sklearn.model_selection import GridSearchCV

params = {'boosting':['gbdt'],
          'max_depth':[32, 64, 128],
          'min_child_samples':[60, 100],
          'subsample':[0.8, 1],
          'learning_rate':[0.05, 0.1],
          'metric':['binary'],
          
          }


# 하이퍼 파라미터 테스트의 수행속도를 향상 시키기 위해 cv를 지정하지 않습니다. 
gridcv = GridSearchCV(lgbm_clf, param_grid=params, verbose=3, n_jobs=-1, cv=5)

In [None]:
gridcv.fit(X_train, y_train, early_stopping_rounds=200, eval_metric="f1",
           eval_set=[(X_train, y_train), (X_valid, y_valid)])

In [None]:
lgbm_clf_best = gridcv.best_estimator_
print(gridcv.best_estimator_)

In [None]:
lgbm_clf_best.fit(X, y)

### GridSearchCV - Best Parameter
- 라벨 인코딩
- workportion 추가
- LGBMClassifier(boosting='gbdt', max_depth=32, metric='binary',
               min_child_samples=60, n_estimators=500, objective='binary',
               random_state=91, subsample=0.8, verbose=0)

In [None]:
##
pred_proba = lgbm_clf_best.predict_proba(X_valid)
pred  = lgbm_clf_best.predict_proba(X_valid)
print('pred_proba()결과 Shape : {0}'.format(pred_proba.shape))
print('pred_proba array에서 앞 3개만 샘플로 추출 \n:', pred_proba[:3])

# 예측 확률 array 와 예측 결과값 array 를 concatenate 하여 예측 확률과 결과값을 한눈에 확인
pred_proba_result = np.concatenate([pred_proba , pred.reshape(-1,2)],axis=1)
print('두개의 class 중에서 더 큰 확률을 클래스 값으로 예측 \n',pred_proba_result[:3])


In [None]:
pred_proba = lgbm_clf_best.predict_proba(test)
pred  = lgbm_clf_best.predict_proba(test)
print('pred_proba()결과 Shape : {0}'.format(pred_proba.shape))
print('pred_proba array에서 앞 3개만 샘플로 추출 \n:', pred_proba[:3])

# 예측 확률 array 와 예측 결과값 array 를 concatenate 하여 예측 확률과 결과값을 한눈에 확인
pred_proba_result = np.concatenate([pred_proba , pred.reshape(-1,2)],axis=1)
print('두개의 class 중에서 더 큰 확률을 클래스 값으로 예측 \n',pred_proba_result[:3])



In [None]:
from sklearn.preprocessing import Binarizer

#Binarizer의 threshold 설정값. 분류 결정 임곗값임.  
custom_threshold = 0.515#Find_threshold(lgbm_clf_best, X, y)[0]

# predict_proba( ) 반환값의 두번째 컬럼 , 즉 Positive 클래스 컬럼 하나만 추출하여 Binarizer를 적용
pred_proba_1 = pred_proba[:,1].reshape(-1,1)

binarizer = Binarizer(threshold=custom_threshold).fit(pred_proba_1) 
custom_predict = binarizer.transform(pred_proba_1)

#get_clf_eval(y_valid, custom_predict)

In [None]:
custom_predict.sum()

In [None]:
from sklearn.metrics import f1_score
from sklearn.preprocessing import Binarizer

# predict_proba( ) 반환값의 두번째 컬럼 , 즉 Positive 클래스 컬럼 하나만 추출하여 Binarizer를 적용
pred_proba_1 = pred_proba[:,1].reshape(-1,1)

binarizer = Binarizer(threshold=custom_threshold).fit(pred_proba_1) 
custom_predict = binarizer.transform(pred_proba_1)
sample_submission['prediction'] = custom_predict
sample_submission['prediction'] = sample_submission['prediction'].astype('int')

In [None]:
sample_submission['prediction'].sum()

In [None]:
sample_submission.to_csv('./submission/nation_lgbm.csv', index = False)

!kaggle competitions submit -c kakr-4th-competition -f ./submission/insert_question_thresh0.5_.csv -m "?채우고(모델링), 원핫인코딩, threshold 0.5"

## RandomizedSearchCV

In [None]:
from sklearn.model_selection import RandomizedSearchCV
random_grid = {'n_estimators': [500],
               'max_depth': [16, 32, 64, 128, 256, 512],
               'learning_rate':np.arange(0.001,0.02,0.001),
               'max_features': ['auto', 'sqrt'],
               'bootstrap': [True, False]}

# {'bootstrap': [True, False],
#  'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
#  'max_features': ['auto', 'sqrt'],
#  'min_samples_leaf': [1, 2, 4],
#  'min_samples_split': [2, 5, 10],
#  'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}

In [None]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune

lg_cl = LGBMClassifier(objective='binary', verbose=400, random_state=91)

# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores

lg_random = RandomizedSearchCV(estimator = lg_cl, param_distributions = random_grid, n_iter = 50, cv = 5, verbose=2, n_jobs = -1)

# Fit the random search model

lg_random.fit(X, y)
print(lg_random.best_estimator_)

In [None]:
lgb_random_best = lg_random.best_estimator_

In [None]:
lgb_random_best.fit(X, y)

In [None]:
## 1 ##

pred_proba = lgb_random_best.predict_proba(X_valid)
pred  = lgb_random_best.predict_proba(X_valid)
print('pred_proba()결과 Shape : {0}'.format(pred_proba.shape))
print('pred_proba array에서 앞 3개만 샘플로 추출 \n:', pred_proba[:3])

# 예측 확률 array 와 예측 결과값 array 를 concatenate 하여 예측 확률과 결과값을 한눈에 확인
pred_proba_result = np.concatenate([pred_proba , pred.reshape(-1,2)],axis=1)
print('두개의 class 중에서 더 큰 확률을 클래스 값으로 예측 \n',pred_proba_result[:3])



In [None]:
## 2

from sklearn.preprocessing import Binarizer

#Binarizer의 threshold 설정값. 분류 결정 임곗값임.  
custom_threshold = 0.5

# predict_proba( ) 반환값의 두번째 컬럼 , 즉 Positive 클래스 컬럼 하나만 추출하여 Binarizer를 적용
pred_proba_1 = pred_proba[:,1].reshape(-1,1)

binarizer = Binarizer(threshold=custom_threshold).fit(pred_proba_1) 
custom_predict = binarizer.transform(pred_proba_1)

get_clf_eval(y_valid, custom_predict)

In [None]:
from sklearn.metrics import precision_recall_curve

# 레이블 값이 1일때의 예측 확률을 추출 
pred_proba_class1 = lgb_random_best.predict_proba(X_valid)[:, 1] ###

In [None]:
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
%matplotlib inline
    
precision_recall_curve_plot(y_valid, lgb_random_best.predict_proba(X_valid)[:, 1]) ##
Find_threshold(lgb_random_best, X_valid, y_valid)

In [None]:
# Binarizer의 베스트 threshold 설정값 지정 

custom_threshold = Find_threshold(lgb_random_best, X_valid, y_valid)[0]
pred_proba_1 = pred_proba[:,1].reshape(-1,1)
binarizer = Binarizer(threshold=custom_threshold).fit(pred_proba_1) 
custom_predict = binarizer.transform(pred_proba_1)

get_clf_eval(y_valid , custom_predict)

## BaysianOptimization

In [1]:
bayes_params = {
    'num_leaves': (24, 45),
    'colsample_bytree':(0.5, 1), 
    'subsample': (0.5, 1),
    'max_depth': (4, 12),
    'reg_alpha': (0, 0.5),
    'reg_lambda': (0, 0.5), 
    'min_split_gain': (0.001, 0.1),
    'min_child_weight':(5, 50)
}

In [2]:
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score

def lgb_roc_eval(num_leaves, colsample_bytree, subsample, max_depth, reg_alpha, reg_lambda, min_split_gain, min_child_weight):
    
    params = {
        "n_estimator":200,
        "learning_rate":0.02,
        'num_leaves': int(round(num_leaves)),
        'colsample_bytree': colsample_bytree, 
        'subsample': subsample,
        'max_depth': int(round(max_depth)),
        'reg_alpha': reg_alpha,
        'reg_lambda': reg_lambda, 
        'min_split_gain': min_split_gain,
        'min_child_weight': min_child_weight,
        'verbosity': -1
    }
    print("params:", params)
    lgb_model = LGBMClassifier(**params)
    lgb_model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], early_stopping_rounds=30, eval_metric="auc", verbose=100 )
    best_iter = lgb_model.best_iteration_
    print('best_iter:', best_iter)
    valid_proba = lgb_model.predict_proba(X_valid, num_iteration=best_iter)[:, 1]
    roc_preds = roc_auc_score(y_valid, valid_proba)
    print('roc_auc:', roc_preds)
    return roc_preds

    

In [3]:
from bayes_opt import BayesianOptimization

BO_lgb = BayesianOptimization(lgb_roc_eval, bayes_params, random_state=0)


In [5]:
BO_lgb.maximize(init_points=5, n_iter=10)

|   iter    |  target   | colsam... | max_depth | min_ch... | min_sp... | num_le... | reg_alpha | reg_la... | subsample |
-------------------------------------------------------------------------------------------------------------------------
params: {'n_estimator': 200, 'learning_rate': 0.02, 'num_leaves': 33, 'colsample_bytree': 0.7744067519636624, 'subsample': 0.9458865003910399, 'max_depth': 10, 'reg_alpha': 0.32294705653332806, 'reg_lambda': 0.21879360563134626, 'min_split_gain': 0.05494343511669279, 'min_child_weight': 32.12435192322397, 'verbosity': -1}


NameError: name 'X_train' is not defined

In [None]:
BO_lgb.max

In [None]:
max_params = BO_lgb.max['params']

max_params['num_leaves'] = int(round(max_params['num_leaves']))
max_params['max_depth'] = int(round(max_params['max_depth']))

lgbm_clf_bayes_best = LGBMClassifier(n_estimators=1000, learning_rate=0.02, **max_params)

evals = [(X_valid, y_valid)]
lgbm_clf_bayes_best.fit(X_train, y_train, early_stopping_rounds=100, eval_metric="auc", eval_set=evals,
                verbose=True)

lgbm_roc_score = roc_auc_score(y_valid, lgbm_clf_bayes_best.predict_proba(X_valid)[:,1],average='macro')
print('ROC AUC: {0:.4f}'.format(lgbm_roc_score))

In [None]:
lgbm_clf_bayes_best

In [None]:
lgbm_clf_bayes_best.fit(X, y)

In [None]:
## 1 ##

pred_proba = lgbm_clf_bayes_best.predict_proba(X_valid) ##
pred  = lgbm_clf_best.predict_proba(X_valid) ##
print('pred_proba()결과 Shape : {0}'.format(pred_proba.shape))
print('pred_proba array에서 앞 3개만 샘플로 추출 \n:', pred_proba[:3])

# 예측 확률 array 와 예측 결과값 array 를 concatenate 하여 예측 확률과 결과값을 한눈에 확인
pred_proba_result = np.concatenate([pred_proba , pred.reshape(-1,2)],axis=1)
print('두개의 class 중에서 더 큰 확률을 클래스 값으로 예측 \n',pred_proba_result[:3])


In [None]:
## 2

from sklearn.preprocessing import Binarizer

#Binarizer의 threshold 설정값. 분류 결정 임곗값임.  
custom_threshold = 0.5

# predict_proba( ) 반환값의 두번째 컬럼 , 즉 Positive 클래스 컬럼 하나만 추출하여 Binarizer를 적용
pred_proba_1 = pred_proba[:,1].reshape(-1,1)

binarizer = Binarizer(threshold=custom_threshold).fit(pred_proba_1) 
custom_predict = binarizer.transform(pred_proba_1)

get_clf_eval(y_valid, custom_predict)

In [None]:
from sklearn.metrics import precision_recall_curve

# 레이블 값이 1일때의 예측 확률을 추출 
pred_proba_class1 = lgbm_clf_bayes_best.predict_proba(X_valid)[:, 1] ###


In [None]:
precision_recall_curve_plot(y_valid, lgbm_clf_best.predict_proba(X_valid)[:, 1] ) ####
Find_threshold(lgbm_clf_bayes_best, X_valid, y_valid)

In [None]:
# Binarizer의 베스트 threshold 설정값 지정 

custom_threshold = Find_threshold(lgbm_clf_bayes_best, X_valid, y_valid)[0]
pred_proba_1 = pred_proba[:,1].reshape(-1,1)
binarizer = Binarizer(threshold=custom_threshold).fit(pred_proba_1) 
custom_predict = binarizer.transform(pred_proba_1)

get_clf_eval(y_valid , custom_predict)

# 제출

In [None]:
sample_submission.to_csv('./submission/유인덱스.csv', index = False)

kaggle competitions submit -c kakr-4th-competition -f submission.csv -m "Message"