In [None]:
# Mount Google Drive into the Colab filesystem. The data directory used by the
# rest of this notebook lives under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install catboost
!pip install optuna



In [None]:
import pandas as pd
from glob import glob
from tqdm import tqdm
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, mean_squared_error
import optuna
from catboost import CatBoostClassifier
from sklearn.model_selection import KFold
import warnings

# Silence every warning category for the rest of the notebook.
warnings.filterwarnings(action='ignore')

SEED = 42

# Root folder of the data on the mounted drive.
# NOTE: this name shadows the builtin dir(); it is kept because other cells
# build their paths from it.
dir = '/content/drive/MyDrive/recommend/KNOW_data/'

# One DataFrame per training CSV, in sorted filename order.
train_paths = sorted(glob(dir + 'train_new/*.csv'))
know_train = list(map(pd.read_csv, train_paths))

In [None]:
# train = pd.concat([know_train[0][['knowcode']], know_train[1][['knowcode']]])
# train = pd.concat([train, know_train[2][['knowcode']]])
# train = pd.concat([train, know_train[3][['knowcode']]])

In [None]:
# knowcode_list = sorted(train['knowcode'])
# knowcode_list = set(knowcode_list)
# knowcode_list = list(knowcode_list)
# knowcode_list = sorted(knowcode_list)

# 전처리 

데이터의 빈 셀에 None값이 아닌 ' '처럼 공백 문자열이 들어가 있기 때문에 isnull()(isna()) 등의 함수로 결측치를 찾아낼 수 없습니다. 

공백(' ')으로 들어 있는 셀은 문자열 '0'으로 대체하였습니다.

In [None]:
# Empty cells are stored as the literal string ' ', so they are not detected
# as missing values; replace them with the string '0' in every column.
# A frame-level replace is used instead of `df[col].replace(..., inplace=True)`:
# that form mutates a column selection, which is deprecated and does nothing
# under pandas copy-on-write. Building new frames also keeps this cell
# safe to re-run.
know_train = [df.replace(' ', '0') for df in know_train]

## 라벨 인코딩

숫자로 변환할 수 있는 컬럼은 라벨 인코딩을 사용하지 않았습니다.

string이나 object컬럼은 라벨인코더를 이용해 변환하였으며 추후 test셋에 사용해야하기 때문에 년도별, 컬럼별로 dictionary를 이용해 저장하였습니다

In [None]:
from sklearn.preprocessing import LabelEncoder

years = ['2017', '2018', '2019', '2020']

# year -> {column name -> fitted LabelEncoder}. Kept so the test sets can be
# encoded with exactly the same category -> integer mapping.
year_encoder = {}

# NOTE: this cell is not idempotent. After one run every column is already
# integer, so a second run fits no encoders and overwrites year_encoder with
# empty dicts. Reload know_train before re-running it.
for year, df in zip(years, know_train):
    print(year)
    encoders = {}

    for col in df.columns:
        if col == 'ID':
            continue

        try:
            # Columns whose every value converts to int are used as-is.
            df[col] = df[col].map(int)
        except (ValueError, TypeError, OverflowError):
            # Only conversion failures fall back to label encoding; a bare
            # `except:` would also swallow KeyboardInterrupt and real bugs.
            encoder = LabelEncoder()
            df[col] = encoder.fit_transform(df[col].map(str))
            encoders[col] = encoder

    year_encoder[year] = encoders

2017
2018
2019
2020


In [None]:
# One DataFrame per test CSV, in sorted filename order.
test_paths = sorted(glob(dir + 'test_new/*.csv'))
know_test = list(map(pd.read_csv, test_paths))

In [None]:
# Replace blank (' ') cells with the string '0' in every test column.
# A frame-level replace is used instead of `df[col].replace(..., inplace=True)`:
# that form mutates a column selection, which is deprecated and does nothing
# under pandas copy-on-write. Building new frames also keeps this cell
# safe to re-run.
know_test = [df.replace(' ', '0') for df in know_test]

In [None]:
years = ['2017', '2018', '2019', '2020']

# Encode each test set with the encoders fitted on the same year's train set.
# Whether a column is label-encoded is decided by what happened on the TRAIN
# side (an encoder exists for it), not by re-trying int() on the test column.
# Otherwise a column that was label-encoded in train but happens to be
# all-numeric in test would keep raw integers that do not match the train codes.
for year, df in zip(years, know_test):
    print(year)
    fitted = year_encoder[year]

    for col in df.columns:
        if col == 'ID':
            # Skipped for consistency with the train-side encoding loop.
            continue

        if col in fitted:
            category_map = {category: idx for idx, category in enumerate(fitted[col].classes_)}
            # Categories never seen during training are mapped to -1.
            df[col] = df[col].map(str).map(lambda x: category_map.get(x, -1))
        else:
            try:
                df[col] = df[col].map(int)
            except (ValueError, TypeError, OverflowError) as err:
                # Train had this column as plain integers, so no encoder exists.
                # Fail with an explicit message instead of an opaque KeyError.
                raise ValueError(
                    f"Column {col!r} of the {year} test set is not integer-convertible, "
                    f"but no label encoder was fitted for it on the train set"
                ) from err

# X, y 구분 및 모델 학습

이번 대회에서 맞춰야 할 값은 knowcode입니다.

ID와 knowcode를 제외한 나머지 feature를 X, knowcode를 정답 y로 두어 모델을 학습하였습니다.

베이스라인에서는 의사결정나무와 랜덤포레스트를 선정하였지만, 이 노트북에서는 CatBoost(CatBoostClassifier)를 연도별로 학습합니다.

In [None]:
# Per-year training inputs: X is every column except the first (ID) and the
# last one; y is the last column (the knowcode target).
train_data = {
    year: {'X': df.iloc[:, 1:-1], 'y': df.iloc[:, -1]}
    for year, df in zip(years, know_train)
}

In [None]:
# Per-year test inputs: every column except the first one
# (presumably ID, as in the train frames — verify against the test CSVs).
test_data = {
    year: {'X': df.iloc[:, 1:]}
    for year, df in zip(years, know_test)
}

In [None]:
# Names of the 2017 feature columns that take more than two distinct values.
features_2017 = train_data['2017']['X']
has_many_levels = features_2017.nunique() > 2
cat_features = features_2017.columns[has_many_levels].tolist()

In [None]:
# OPTUNA_OPTIMIZATION = True

# def objective(trial):
#     cat_features = train_data['2017']['X'].iloc[:, :].columns[train_data['2017']['X'].iloc[:, :].nunique() > 2].tolist()
#     X_train, X_test, y_train, y_test = train_test_split(train_data['2017']['X'].iloc[:, :], train_data['2017']['y'], test_size=0.2, shuffle=True, random_state=1)
    
#     params = {
#         'iterations':trial.suggest_int("iterations", 1000, 20000),
#         'objective': trial.suggest_categorical('objective', ['MultiClass']),
#         'bootstrap_type': trial.suggest_categorical('bootstrap_type', ['Bayesian', 'Bernoulli', 'MVS']),
#         'od_wait':trial.suggest_int('od_wait', 500, 2000),
#         'learning_rate' : trial.suggest_uniform('learning_rate',0.02,1),
#         'reg_lambda': trial.suggest_uniform('reg_lambda',1e-5,100),
#         'random_strength': trial.suggest_uniform('random_strength',10,50),
#         'depth': trial.suggest_int('depth',1,15),
#         'min_data_in_leaf': trial.suggest_int('min_data_in_leaf',1,30),
#         'leaf_estimation_iterations': trial.suggest_int('leaf_estimation_iterations',1,15),
#         'verbose': False,
#         'task_type' : 'GPU',
#         'devices' : '0',
#         'cat_features':cat_features,
#         "one_hot_max_size":trial.suggest_int('one_hot_max_size',1,15),
#         "eval_metric":"TotalF1",
#     }
    
#     if params['bootstrap_type'] == 'Bayesian':
#         params['bagging_temperature'] = trial.suggest_float('bagging_temperature', 0, 10)
#     elif params['bootstrap_type'] == 'Bernoulli':
#         params['subsample'] = trial.suggest_float('subsample', 0.1, 1)
    
#     model = CatBoostClassifier(**params)
#     model.fit(
#         X_train, y_train,
#         eval_set=[(X_test,y_test)],
#         early_stopping_rounds=100,
#         use_best_model=True
#     )
    
#     # validation prediction
#     pred = model.predict(X_test)
#     score = f1_score(y_test, pred)
    
#     return score

In [None]:
# study = optuna.create_study(
#     direction='maximize',
#     study_name='CatbClf'
# )

# study.optimize(
#     objective, 
#     n_trials=100
# )

In [None]:
# cat_models = {}

# for year in tqdm(years):
#     cat_feature = train_data[year]['X'].iloc[:, :].columns[train_data[year]['X'].iloc[:, :].nunique() > 2].tolist()
#     model = CatBoostClassifier(random_state=42, cat_features=cat_feature)
#     # model = CatBoostClassifier(random_state=42)
#     model.fit(train_data[year]['X'].iloc[:, :], train_data[year]['y'], verbose=100,early_stopping_rounds=10)
#     cat_models[year] = model

In [None]:
# Train one CatBoostClassifier per survey year, keyed by year.
cat_models = {}

for year in tqdm(years):
    features = train_data[year]['X']
    target = train_data[year]['y']

    # Seeded from the notebook-wide SEED constant rather than a repeated literal.
    model = CatBoostClassifier(random_state=SEED)
    # NOTE(review): no eval_set is passed, so early_stopping_rounds presumably
    # has no validation metric to monitor — confirm against the CatBoost docs
    # and add a validation split if early stopping is actually wanted.
    model.fit(features, target, verbose=100, early_stopping_rounds=10)
    cat_models[year] = model

In [None]:
# Predict every year's test set with that year's model and collect the
# per-row predictions in one flat list, in `years` order.
cat_predicts = []

for year in tqdm(years):
    year_pred = cat_models[year].predict(test_data[year]['X'])
    cat_predicts += list(year_pred)

In [None]:
submission = pd.read_csv(dir + 'sample_submission.csv') # Load the sample submission file

In [None]:
# Each entry of cat_predicts is a one-element array holding the predicted
# knowcode, so take its single value. The column is built in one pass instead
# of assigning `submission['knowcode'][i] = ...` row by row: that is chained
# assignment, which is slow and has no effect under pandas copy-on-write.
submission['knowcode'] = [np.ravel(pred)[0] for pred in cat_predicts]
submission.to_csv(dir + 'submission_catboost.csv', index=False)

In [None]:
# Inspect the first value of the first prediction entry (debugging aid).
cat_predicts[0][0]

In [None]:
# Display the submission frame that was written above.
submission

In [None]:
# kf = KFold(n_splits=5,random_state=42,shuffle=True)
# rmse = []
# preds = np.zeros(test_data['2017']['X'].shape[0])
# n = 0
# for 
# for trn_idx, test_idx in kf.split(train_data['2017']['X'],train_data['2017']['y']):
#     X_tr,X_val=train_data['2017']['X'].iloc[trn_idx],train_data['2017']['X'].iloc[test_idx]
#     y_tr,y_val=train_data['2017']['y'].iloc[trn_idx],train_data['2017']['y'].iloc[test_idx]
#     model = CatBoostClassifier(random_state=42)
#     model.fit(X_tr,y_tr,eval_set=[(X_val,y_val)],early_stopping_rounds=100,verbose=False)
#     preds += model.predict_proba(test_data['2017']['X'])/kf.n_splits
#     rmse.append(mean_squared_error(y_val, model.predict(X_val), squared=False))
#     print(n+1, rmse[n])
#     n += 1

# 제출

In [None]:
# catboost_predicts = []
# for year in tqdm(years):
#     catboost_predicts.append(locals()['preds_{}'.format(year)])

In [None]:
# submission = pd.read_csv(dir + 'sample_submission.csv') # sample submission 불러오기

In [None]:
# submission['knowcode'] = catboost_predicts

# submission.to_csv(dir + 'submission_catboost.csv', index=False)