<a href="https://colab.research.google.com/github/dsjoh/git/blob/main/GPU.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Library

In [None]:
import os
import pandas as pd
import numpy as np

import random as python_random 

import matplotlib.pyplot as plt
%matplotlib inline


# install #
!pip install --upgrade scikit-learn
!pip install catboost
!pip install category_encoders

# Version

In [3]:
import sys
import sklearn
import numpy
import pandas
import catboost

# Parameter

In [4]:
# ---- Edit the paths here (set a direct path for each file as needed) ----

# Notebook-wide settings, collected in one place.
PAR = dict(
    # Seed used to fix the random number generators.
    SEED=42,
    # NOTE(review): presumably the number of CV folds and the early-stopping
    # rounds for training -- neither is read in the visible cells; confirm.
    FOLD=6,
    EARLY_STOP=1000,
    # Path to the zipped dataset.
    FILEPATH='/content/drive/MyDrive/LG Comp/LG_data.zip',
    # Direct path to train.csv, e.g. './data/train.csv'
    TRAIN_DIRECTPATH='',
    # Direct path to test.csv, e.g. './data/test.csv'
    TEST_DIRECTPATH='',
    # Direct path to sample_submission.csv, e.g. './data/sample_submission.csv'
    SUB_DIRECTPATH='',
    # Directory where outputs are saved, e.g. './data/'
    OUTPUT_PATH='./',
)

# SEED

In [5]:
# Fix the random seeds: the configured seed is applied to NumPy's global
# generator and to Python's built-in `random` module.
seed_num = PAR['SEED']
for _seed_fn in (np.random.seed, python_random.seed):
    _seed_fn(seed_num)

# Dataset

In [None]:
# Load train / test / sample_submission from one of three locations,
# chosen from the settings in PAR.
_direct_paths = (PAR['TRAIN_DIRECTPATH'], PAR['TEST_DIRECTPATH'], PAR['SUB_DIRECTPATH'])

# Case 1: a direct CSV path was given for all three files.
if all(_direct_paths):
  train_path, test_path, sub_path = _direct_paths

# Case 2 (Colab): only the ZIP path is set and it lives under /content.
# The leading '/' is stripped before splitting: '/content/...'.split('/')[0]
# is '' and never equals 'content', which made this branch unreachable for
# absolute paths. The CSVs are read from /content; the archive itself is not
# extracted by this cell.
elif PAR['FILEPATH'].lstrip('/').split('/')[0] == 'content':
  train_path = '/content/train.csv'
  test_path = '/content/test.csv'
  sub_path = '/content/sample_submission.csv'

# Case 3: otherwise read from the current working directory.
else:
  train_path = './train.csv'
  test_path = './test.csv'
  sub_path = './sample_submission.csv'

train = pd.read_csv(train_path)
test = pd.read_csv(test_path)
sub = pd.read_csv(sub_path)

# Preprocessing

In [None]:
import category_encoders as ce


def _add_time_features(frame):
    """Return a copy of `frame` with TIMESTAMP parsed and calendar columns added.

    The copy gets `month`, `day`, `hour` and `minute` columns derived from
    TIMESTAMP; the input frame is left untouched.
    """
    out = frame.copy()
    out['TIMESTAMP'] = pd.to_datetime(out['TIMESTAMP'])
    out['month'] = out['TIMESTAMP'].dt.month
    out['day'] = out['TIMESTAMP'].dt.day
    out['hour'] = out['TIMESTAMP'].dt.hour
    out['minute'] = out['TIMESTAMP'].dt.minute
    return out


# TIMESTAMP preprocessing (same steps for train and test)
train2 = _add_time_features(train)
test2 = _add_time_features(test)

# Label
label_df = train['Y_Class']

# Categorical encoding: each categorical column gets its own CatBoostEncoder,
# fitted on the training rows only.
train_cat = train[['PRODUCT_CODE', 'LINE']].copy()
test_cat = test[['PRODUCT_CODE', 'LINE']].copy()

# Training rows are transformed WITH the labels: category_encoders documents
# that supervised encoders should receive y on training data (and no y on test
# data) so that a row's own target does not leak into its encoding.
ce1 = ce.CatBoostEncoder()
ce1.fit(train_cat['PRODUCT_CODE'], label_df)
train_cat['PRODUCT_CODE'] = ce1.transform(train_cat['PRODUCT_CODE'], label_df)
test_cat['PRODUCT_CODE'] = ce1.transform(test_cat['PRODUCT_CODE'])

ce2 = ce.CatBoostEncoder()
ce2.fit(train_cat['LINE'], label_df)
train_cat['LINE'] = ce2.transform(train_cat['LINE'], label_df)
test_cat['LINE'] = ce2.transform(test_cat['LINE'])

# Drop columns that are not used as model inputs: identifiers, the raw
# timestamp, the targets (train only) and the raw categorical columns, which
# are replaced by their encoded versions below.
train2 = train2.drop(columns=['PRODUCT_ID', 'TIMESTAMP', 'Y_Class', 'Y_Quality', 'LINE', 'PRODUCT_CODE'])
test2 = test2.drop(columns=['PRODUCT_ID', 'TIMESTAMP', 'LINE', 'PRODUCT_CODE'])

# Append the encoded categorical columns at the end of the feature frames.
train2 = pd.concat([train2, train_cat], axis=1)
test2 = pd.concat([test2, test_cat], axis=1)

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import *
from catboost import CatBoostClassifier
from catboost import CatBoostClassifier
from sklearn.model_selection import cross_val_score, cross_validate
from tqdm import tqdm
from sklearn.feature_selection import RFECV

# Stratified, shuffled CV used inside the feature elimination. The shuffle seed
# comes from PAR['SEED'] instead of a second hard-coded copy of the value.
# NOTE(review): PAR['FOLD'] is 6 but 5 splits are used here -- confirm which is intended.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=PAR['SEED'])

# Recursive feature elimination with cross-validation: a GPU CatBoost classifier
# is refit while features are removed one at a time (step=1), and each feature
# count is scored by macro F1.
cbc = CatBoostClassifier(verbose=1000, eval_metric='TotalF1:average=Macro', task_type='GPU')
rfecv = RFECV(estimator=cbc, step=1, cv=skf, scoring='f1_macro', verbose=5)
rfecv.fit(train2, label_df)

# Number of features RFECV selected
display('Number of features:', rfecv.n_features_)

# Names of the selected features. A bare `col_list` in the middle of a cell
# shows nothing (only a cell's last expression is echoed), so display it explicitly.
col_list = list(train2.columns[rfecv.support_])
display(col_list)

# Mean cross-validated score plotted against the number of features
mean_scores = rfecv.cv_results_['mean_test_score']
plt.title("RFECV ")
plt.xlabel("Number Of Features")
plt.ylabel("f1-macro")
plt.plot(range(1, len(mean_scores) + 1), mean_scores)
plt.show()

# Save the RFECV results under the configured output directory
# (PAR['OUTPUT_PATH'], './' by default) instead of a hard-coded './'.
# Both files are written without header or index, one value per line.

# Names of the selected features
df1 = pd.DataFrame(col_list)
df1.columns = ['col_list']
df1.to_csv(os.path.join(PAR['OUTPUT_PATH'], "cat_list.csv"), index=False, header=False)

# RFECV rank of every feature, one per column of train2 in column order
df2 = pd.DataFrame(rfecv.ranking_)
df2.columns = ['col_ranking']
df2.to_csv(os.path.join(PAR['OUTPUT_PATH'], "col_ranking.csv"), index=False, header=False)