In [8]:
import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

In [9]:
train = pd.read_csv('C:/Users/tmdwh/Desktop/SEUNGJO/dacon/competition_data/train.csv')
test = pd.read_csv('C:/Users/tmdwh/Desktop/SEUNGJO/dacon/competition_data/test.csv')
submission = pd.read_csv('C:/Users/tmdwh/Desktop/SEUNGJO/dacon/competition_data/submission.csv', index_col = 0)

In [10]:
train.fillna(0, inplace = True)
test.fillna(0, inplace = True)

In [11]:
drop_list = ['introelapse', 'testelapse', 'surveyelapse', 'index']

train.drop(drop_list, axis = 1, inplace = True)
test.drop(drop_list, axis = 1, inplace = True)

In [12]:
# fillna(0) -> 0으로 채운 값들을 평균 값으로 처리
processing_feature = ['gender', 'married', 'education', 'voted', 'urban', 'orientation', 'religion', 'engnat', 'ASD']

for pro in processing_feature:
    train.loc[train[pro] == 0, pro] = train[pro].mean()
    test.loc[test[pro] == 0, pro] = train[pro].mean()

In [13]:
#age 이상치(80살 이상) 0으로 처리
train.loc[train['age'] > 80, 'age'] = 0
test.loc[test['age'] > 80, 'age'] = 0

In [14]:
#age의 이상치는 train데이터의 평균값으로 처리
train.loc[train['age'] == 0, 'age'] = train['age'].mean()
test.loc[test['age'] == 0, 'age'] = train['age'].mean()

#familysize 6 이상인 값 최빈값으로 교체, 
train.loc[train['familysize'] > 6, 'familysize'] = train['familysize'].mode()[0]
test.loc[test['familysize'] > 6, 'familysize'] = train['familysize'].mode()[0]

In [15]:
#훈련 데이터에 존재하는 상위 6개의 나라
best6_country = train['country'].value_counts().keys()[:6]

#train데이터 상위 6개 국가 제외 train데이터의 최빈값으로 처리
for i in range(len(train['country'])):
    if train['country'][i] not in best6_country or train['country'][i]==0:
        train['country'][i] = train['country'].mode()[0]

#test데이터 상위 6개 국가 제외 train데이터의 최빈값으로 처리
for i in range(len(test['country'])):
    if test['country'][i] not in best6_country or test['country'][i]==0:
        test['country'][i] = train['country'].mode()[0]

In [16]:
#country 특성 원-핫인코딩 수행 (country 특성 세부 분할)
train = pd.get_dummies(train)
test = pd.get_dummies(test)

In [18]:
train.columns


Index(['Q1', 'Q2', 'Q3', 'Q4', 'Q5', 'Q6', 'Q7', 'Q8', 'Q9', 'Q10', 'Q11',
       'Q12', 'Q13', 'Q14', 'Q15', 'Q16', 'Q17', 'Q18', 'Q19', 'Q20', 'Q21',
       'Q22', 'Q23', 'Q24', 'Q25', 'Q26', 'TIPI1', 'TIPI2', 'TIPI3', 'TIPI4',
       'TIPI5', 'TIPI6', 'TIPI7', 'TIPI8', 'TIPI9', 'TIPI10', 'VCL1', 'VCL2',
       'VCL3', 'VCL4', 'VCL5', 'VCL6', 'VCL7', 'VCL8', 'VCL9', 'VCL10',
       'VCL11', 'VCL12', 'VCL13', 'VCL14', 'VCL15', 'VCL16', 'education',
       'urban', 'gender', 'engnat', 'age', 'hand', 'religion', 'orientation',
       'voted', 'married', 'familysize', 'ASD', 'nerdiness', 'country_AUS',
       'country_CAN', 'country_DEU', 'country_GBR', 'country_PHL',
       'country_USA'],
      dtype='object')

In [19]:
train.head()

Unnamed: 0,Q1,Q2,Q3,Q4,Q5,Q6,Q7,Q8,Q9,Q10,...,married,familysize,ASD,nerdiness,country_AUS,country_CAN,country_DEU,country_GBR,country_PHL,country_USA
0,1.0,5.0,5.0,5.0,1.0,4.0,5.0,5.0,1.0,3.0,...,1.0,4.0,2.0,1,0,0,0,0,0,1
1,4.0,4.0,4.0,4.0,4.0,5.0,4.0,4.0,3.0,3.0,...,2.0,4.0,2.0,1,0,0,0,0,0,1
2,4.0,5.0,5.0,4.0,3.0,5.0,5.0,5.0,4.0,4.0,...,3.0,4.0,2.0,1,0,0,0,0,0,1
3,4.0,4.0,4.0,2.0,4.0,3.0,3.0,5.0,3.0,4.0,...,1.0,2.0,2.0,1,0,0,0,0,0,1
4,4.0,4.0,4.0,4.0,3.0,3.0,4.0,2.0,3.0,4.0,...,1.0,1.0,2.0,0,0,0,0,0,0,1


In [20]:
!pip install --quiet optuna

In [21]:
import numpy as np
import pandas as pd
import optuna
from lightgbm import LGBMClassifier
from sklearn.model_selection import StratifiedKFold
from optuna.samplers import TPESampler
from optuna.pruners import SuccessiveHalvingPruner

In [22]:
features = ['Q1', 'Q2', 'Q3', 'Q4', 'Q5', 'Q6', 'Q7', 'Q8', 'Q9', 'Q10', 'Q11',
       'Q12', 'Q13', 'Q14', 'Q15', 'Q16', 'Q17', 'Q18', 'Q19', 'Q20', 'Q21',
       'Q22', 'Q23', 'Q24', 'Q25', 'Q26', 'TIPI1', 'TIPI2', 'TIPI3', 'TIPI4',
       'TIPI5', 'TIPI6', 'TIPI7', 'TIPI8', 'TIPI9', 'TIPI10', 'VCL1', 'VCL2',
       'VCL3', 'VCL4', 'VCL5', 'VCL6', 'VCL7', 'VCL8', 'VCL9', 'VCL10',
       'VCL11', 'VCL12', 'VCL13', 'VCL14', 'VCL15', 'VCL16', 'education',
       'urban', 'gender', 'engnat', 'age', 'hand', 'religion', 'orientation',
       'voted', 'married', 'familysize', 'ASD', 'country_AUS',
       'country_CAN', 'country_DEU', 'country_GBR', 'country_PHL',
       'country_USA']

target = ['nerdiness']

In [23]:
df_trains = []
df_valids = []

skf = StratifiedKFold(n_splits=10, random_state=2022, shuffle=True)
for train_index, valid_index in skf.split(train[features], train[target]):
    df_train = train.loc[train_index]
    df_valid = train.loc[valid_index]
    df_trains.append(df_train)
    df_valids.append(df_valid)

    x_train = df_train[features]
    y_train = df_train[target]

    x_test = df_valid[features]
    y_test = df_valid[target]

In [24]:
def accuracy(true, pred):
    return np.mean(true==pred)

def objective(trial):
    params = {
        'num_leaves': trial.suggest_int('num_leaves', 300, 824, step=1, log=True), 
        'max_depth': trial.suggest_int('max_depth', 10, 20, step=1, log=False), 
        'learning_rate': trial.suggest_float('learning_rate', 0.0001, 0.1, log=True), 
        'n_estimators': trial.suggest_int('n_estimators', 1500, 3000, step=1, log=True), 
        "metric": "multi_auc",
        'class_weight': trial.suggest_categorical('class_weight', ['balanced', None]),
        'min_child_samples': trial.suggest_int('min_child_samples', 10, 50, step=1, log=False), 
        'subsample': trial.suggest_uniform('subsample', 0.7, 1.0), 
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.7, 1.0),
        'reg_alpha': trial.suggest_uniform('reg_alpha', 0.0, 1.0),
        'reg_lambda': trial.suggest_uniform('reg_lambda', 0.0, 1.0),
        'random_state': 2022
    }
    
    score = []
    for df_train, df_valid in zip(df_trains, df_valids):
        clf = LGBMClassifier(**params)
        clf.fit(x_train, y_train)
        
        pred = clf.predict_proba(df_valid[features])[:, 1]
        true = df_valid[target].values
        score.append(roc_auc_score(true, pred))
    score = np.mean(score)
    return score

# Hyperparameter Tuning
study = optuna.create_study(direction='maximize', sampler=TPESampler(seed=2022), pruner=SuccessiveHalvingPruner())
study.optimize(objective, n_trials=10)

[32m[I 2023-01-12 14:19:50,515][0m Trial 0 finished with value: 0.8652460636910808 and parameters: {'num_leaves': 302, 'max_depth': 10, 'learning_rate': 0.00021885227715682474, 'n_estimators': 1552, 'class_weight': 'balanced', 'min_child_samples': 28, 'subsample': 0.8460964204647191, 'colsample_bytree': 0.9692971679329809, 'reg_alpha': 0.6474520707432663, 'reg_lambda': 0.8969631227909967}. Best is trial 0 with value: 0.8652460636910808.[0m
[32m[I 2023-01-12 14:31:46,048][0m Trial 1 finished with value: 0.9639329641348688 and parameters: {'num_leaves': 622, 'max_depth': 18, 'learning_rate': 0.0008032016432591671, 'n_estimators': 2924, 'class_weight': 'balanced', 'min_child_samples': 24, 'subsample': 0.7983793371201554, 'colsample_bytree': 0.7413546193275696, 'reg_alpha': 0.41210821523896957, 'reg_lambda': 0.3136067660110393}. Best is trial 1 with value: 0.9639329641348688.[0m


KeyboardInterrupt: 

In [None]:
optuna.visualization.plot_optimization_history(study)


In [None]:
optuna.visualization.plot_parallel_coordinate(study)


In [None]:
# 하이퍼파라미터 중요도
optuna.visualization.plot_param_importances(study)

In [None]:
clfs = []
for df_train in df_trains:
    clf = LGBMClassifier(**study.best_params)
    clf.fit(df_train[features], df_train[target])
    pred_clf = clf.predict_proba(x_test)[:, 1]
    print(roc_auc_score(y_test, pred_clf))
    clfs.append(clf)

In [None]:
clfs

In [None]:
pred = [clif.predict_proba(test[features]) for clf in clfs]

In [None]:
real_pred = []

for i in range(10):
    for j in range(len(pred[i])):
        real_pred.append(pred[i][j][1])

In [None]:
real_pred = []

for i in range(1):
    for j in range(len(pred[i])):
        real_pred.append(pred[i][j][1])

for i in range(2,10):
    for j in range(len(pred[i])):
        real_pred[j] += pred[i][j][1]
        if i==9:
            real_pred[j]= real_pred[j]/10

real_pred[:10]


In [None]:
x_train = train.drop('nerdiness', axis = 1)
y_train = train['nerdiness']

In [None]:
#extree model

extraTree_model = ExtraTreesClassifier(n_jobs=-1, random_state = 2022, n_estimators=3000)
extraTree_model.fit(x_train, y_train)
extraTree_pred = extraTree_model.predict_proba(test)[:, 1]

In [None]:
real2_pred = list(extraTree_pred)


In [None]:
result_pred = []

for i in range(len(real2_pred)):
    result_pred.append((real2_pred[i] * 0.6) + (real_pred[i] * 0.4))

result_pred

In [None]:
submission['nerdiness'] = result_pred


In [None]:
submission

In [None]:
submission.to_csv('/gdrive/My Drive/LGBM(optuna, 10FOLD)_EXTRATR.csv')