## import & version

In [2]:
!pip install catboost

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting catboost
  Downloading catboost-1.1.1-cp38-none-manylinux1_x86_64.whl (76.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.6/76.6 MB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.1.1


In [4]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings(action='ignore')
import catboost
from catboost import *
import sklearn
from sklearn.preprocessing import LabelEncoder
import sys

In [5]:
# Report the library / interpreter versions this notebook is running with.
version_report = [
    (' pandas version   : ', pd.__version__),
    ('numpy version    : ', np.__version__),
    ('catboost version : ', catboost.__version__),
    ('sklearn version  : ', sklearn.__version__),
    ('python version   : ', sys.version),
]
# Rows are joined with ' \n ' to keep the original one-space indentation of each line.
print(' \n '.join(label + ' ' + str(value) for label, value in version_report))

 pandas version   :  1.3.5 
 numpy version    :  1.22.4 
 catboost version :  1.1.1 
 sklearn version  :  1.2.1 
 python version   :  3.8.10 (default, Nov 14 2022, 12:59:47) 
[GCC 9.4.0]


## Feature Engineering
#### 함수 정의

In [6]:
def del_columns(train, test):
    '''
    Drop uninformative columns, using the training frame as the reference.

    Two passes are made:
      1. columns whose every training value is missing are removed;
      2. among the columns reported by ``train.describe()``, those holding
         exactly one distinct non-missing training value are removed, except
         'Y_Class', 'Y_Quality', 'LINE' and 'PRODUCT_CODE', which are never
         considered for removal.

    Whatever is dropped from ``train`` is dropped from ``test`` as well, so
    ``test`` must contain those columns.

    Returns the reduced ``(train, test)`` pair; the input frames are left untouched.
    '''
    # Pass 1: columns that are NaN in every training row.
    row_count = len(train)
    missing_by_col = {name: train[name].isnull().sum() for name in train.columns}
    all_missing = [
        name for name, n_missing in missing_by_col.items()
        if n_missing != 0 and n_missing == row_count
    ]
    train = train.drop(columns=all_missing)
    test = test.drop(columns=all_missing)

    # Pass 2: single-valued columns among those describe() reports.
    always_keep = ('Y_Class', 'Y_Quality', 'LINE', 'PRODUCT_CODE')
    single_valued = [
        name for name in train.describe().columns
        if name not in always_keep and train[name].nunique() == 1
    ]
    train = train.drop(columns=single_valued)
    test = test.drop(columns=single_valued)

    return train, test
   
def make_train_test_dataset(train,test):
    '''
    Split the raw frames into model inputs and targets.

    Returns a 4-tuple:
      train_x : ``train`` without the identifier columns and without both targets
      test_x  : ``test`` without the identifier columns
      train_y : the 'Y_Quality' column of ``train`` (Series)
      train_w : the 'Y_Class' column of ``train`` as a one-column DataFrame
    '''
    identifier_cols = ['PRODUCT_ID','TIMESTAMP','PRODUCT_CODE']
    target_cols = ['Y_Class','Y_Quality']

    features_train = train.drop(columns=identifier_cols + target_cols)
    features_test = test.drop(columns=identifier_cols)
    return features_train, features_test, train['Y_Quality'], train[['Y_Class']]

def fillna(train,test,value):
    '''
    Replace every missing entry of both frames with ``value``.

    Returns the filled ``(train, test)`` pair as new frames; the inputs are
    not modified.
    '''
    return train.fillna(value), test.fillna(value)

def labelencoder(train,test,col_list):
    '''
    Label-encode the columns named in ``col_list``.

    For each column a fresh LabelEncoder is fitted on the training values.
    Values that occur only in ``test`` are appended to the encoder's classes
    before the test column is transformed, so they do not raise.

    Both frames are modified in place (the encoded columns overwrite the
    originals) and are also returned as ``(train, test)``.
    '''
    for column in col_list:
        encoder = LabelEncoder().fit(train[column])
        train[column] = encoder.transform(train[column])

        # Extend the known classes with labels that only appear in the test data.
        for candidate in np.unique(test[column]):
            if candidate in encoder.classes_:
                continue
            encoder.classes_ = np.append(encoder.classes_, candidate)
        test[column] = encoder.transform(test[column])
    return train,test

def pred_target(arr, lower=-0.64421190232267, upper=-0.6256814053066195):
    '''
    Bucket continuous predictions into the class labels 0, 1 and 2, in place.

        value <  lower            -> 0
        lower <= value <= upper   -> 1
        anything else             -> 2   (NaN lands here: it fails both comparisons)

    Parameters
    ----------
    arr : list or 1-D numpy array of numbers
        Overwritten element by element with the class labels.
    lower, upper : float
        Cut points on the same scale as the predictions. The defaults are the
        constants this notebook originally hard-coded; in this notebook the
        predictions come from a regressor trained on log(Y_Quality).

    Returns
    -------
    The same ``arr`` object, now holding the labels.

    Note: with the default (negative) cut points the function is not
    idempotent - feeding its own output (0/1/2) back in maps everything to 2.
    '''
    for idx in range(len(arr)):
        score = arr[idx]
        if score < lower:
            arr[idx] = 0
        elif score <= upper:
            arr[idx] = 1
        else:
            arr[idx] = 2
    return arr

def make_dataset(train_path, test_path):
    '''
    Load the train/test CSVs and build one modelling dataset per product code.

    Steps:
      * 'Y_Quality' is replaced by its natural logarithm;
      * uninformative columns are dropped on the full frames, then again on
        each product subset ('A_31', 'T_31', 'O_31');
      * each subset is split into features/target, 'LINE' is label-encoded,
        and remaining missing feature values are filled with -1.

    The feature shapes of every subset are printed.

    Returns a flat 12-tuple, four items per product in the order A_31, T_31,
    O_31: (train_x, test_x, train_y, test_frame), where ``test_frame`` is the
    cleaned test subset that still carries its identifier columns.
    '''
    train = pd.read_csv(train_path)
    test = pd.read_csv(test_path)

    # y quality scaling
    train['Y_Quality'] = train['Y_Quality'].map(np.log)

    train, test = del_columns(train,test)

    prepared = {}
    for code in ('A_31', 'T_31', 'O_31'):
        sub_train = train[train['PRODUCT_CODE']==code]
        sub_test = test[test['PRODUCT_CODE']==code]

        # Columns can be empty/constant within one product even if not globally.
        sub_train, sub_test = del_columns(sub_train, sub_test)

        x_train, x_test, y_train, _ = make_train_test_dataset(sub_train, sub_test)
        x_train, x_test = labelencoder(x_train, x_test, ['LINE'])
        x_train, x_test = fillna(x_train, x_test, -1)

        prepared[code] = (x_train, x_test, y_train, sub_test)

    part_a, part_t, part_o = prepared['A_31'], prepared['T_31'], prepared['O_31']
    print(" train_a_shape : ",part_a[0].shape,part_a[1].shape,'\n',
          "train_t_shape : ",part_t[0].shape,part_t[1].shape,'\n',
          "train_o_shape : ",part_o[0].shape,part_o[1].shape)

    return (*part_a, *part_t, *part_o)

In [3]:
# Mount Google Drive; the CSV paths used by the cells below live under /content/drive.
# NOTE(review): this cell carries execution count 3 although it sits after In [6];
# on a fresh runtime it has to run before any cell that reads from Drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


#### 실행

In [7]:
# Locations of the training and test CSVs on the mounted Drive.
train_input = '/content/drive/MyDrive/LG_Aimers2/open (7)/train.csv'
test_input = '/content/drive/MyDrive/LG_Aimers2/open (7)/test.csv'

In [8]:
# Build the per-product datasets: features, log-quality target and the cleaned
# test frame for each of A_31, T_31 and O_31 (in that order).
(
    train_A_31_x, test_A_31_x, train_A_31_y, test_A_31,
    train_T_31_x, test_T_31_x, train_T_31_y, test_T_31,
    train_O_31_x, test_O_31_x, train_O_31_y, test_O_31,
) = make_dataset(train_input, test_input)

 train_a_shape :  (249, 1866) (67, 1866) 
 train_t_shape :  (343, 550) (239, 550) 
 train_o_shape :  (6, 499) (4, 499)


## 학습 및 예측

In [9]:
# One regressor configuration, refitted separately for every product below.
model = CatBoostRegressor(
    random_state=1234,
    verbose=500,
    iterations=1500,
    learning_rate=0.033,
)

In [10]:
# Fit on the A_31 rows (target: log of Y_Quality) and predict the A_31 test rows.
model.fit(train_A_31_x, train_A_31_y)
pred_a = model.predict(test_A_31_x)

0:	learn: 0.0183320	total: 464ms	remaining: 11m 35s
500:	learn: 0.0012455	total: 1m 7s	remaining: 2m 14s
1000:	learn: 0.0001219	total: 2m 12s	remaining: 1m 5s
1499:	learn: 0.0000126	total: 3m 6s	remaining: 0us


In [11]:
# Convert the continuous A_31 predictions to class labels 0/1/2 (pred_target works in place).
# NOTE: do not re-run this cell on its own - the labels 0/1/2 all lie above the
# (negative) cut points, so a second pass would turn every entry into class 2.
pred_a = pred_target(pred_a)

In [12]:
# Refit the same model object on the T_31 rows and predict the T_31 test rows.
model.fit(train_T_31_x, train_T_31_y)
pred_t = model.predict(test_T_31_x)

0:	learn: 0.0089112	total: 91.8ms	remaining: 2m 17s
500:	learn: 0.0009494	total: 20.7s	remaining: 41.4s
1000:	learn: 0.0001266	total: 42.7s	remaining: 21.3s
1499:	learn: 0.0000183	total: 1m 2s	remaining: 0us


In [13]:
# Convert the continuous T_31 predictions to class labels 0/1/2 (in place).
# NOTE: not idempotent - re-running this cell alone maps every label to 2.
pred_t = pred_target(pred_t)

In [14]:
# Refit on the O_31 rows and predict the O_31 test rows.
# The shape printout above shows only 6 training rows for this product.
model.fit(train_O_31_x, train_O_31_y)
pred_o = model.predict(test_O_31_x)

0:	learn: 0.0059697	total: 5.41ms	remaining: 8.11s
500:	learn: 0.0000906	total: 1.66s	remaining: 3.31s
1000:	learn: 0.0000014	total: 3.22s	remaining: 1.6s
1499:	learn: 0.0000000	total: 5.85s	remaining: 0us


In [15]:
# Convert the continuous O_31 predictions to class labels 0/1/2 (in place).
# NOTE: not idempotent - re-running this cell alone maps every label to 2.
pred_o = pred_target(pred_o)

In [17]:
# Attach the predicted labels to the matching per-product test frame.
for frame, labels in ((test_A_31, pred_a), (test_T_31, pred_t), (test_O_31, pred_o)):
    frame['Y_Class'] = labels

# Join each product's predictions onto the PRODUCT_ID column of the sample submission.
sample_path = '/content/drive/MyDrive/LG_Aimers2/open (7)/sample_submission.csv'
submita, submitt, submito = [
    pd.merge(
        pd.read_csv(sample_path)[['PRODUCT_ID']],
        part[['PRODUCT_ID','Y_Class']],
        on='PRODUCT_ID',
    )
    for part in (test_A_31, test_T_31, test_O_31)
]

# Stack the three products, order by PRODUCT_ID, store integer labels and save.
submit_final = (
    pd.concat([submita,submitt,submito])
    .sort_values(by='PRODUCT_ID')
    .astype({'Y_Class': 'int'})
)
submit_final.to_csv('final_test.csv',index=False)

# After: post-processing the baseline predictions

In [18]:
# Reload the raw CSVs; the cells below rebuild the features from scratch and,
# unlike make_dataset above, do not log-transform Y_Quality.
train = pd.read_csv('/content/drive/MyDrive/LG_Aimers2/open (7)/train.csv')
test = pd.read_csv('/content/drive/MyDrive/LG_Aimers2/open (7)/test.csv')

In [None]:
def seed_everything(seed):
    """Seed Python's ``random`` module and NumPy's global RNG, and export PYTHONHASHSEED.

    ``os`` and ``random`` are imported here because the notebook's import cell
    does not provide them; without these imports the function raised NameError.

    Parameters
    ----------
    seed : int
        Value used for every seed.

    Note: setting PYTHONHASHSEED at this point only affects Python processes
    started afterwards, not the hashing of the already-running interpreter.
    """
    import os
    import random

    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
seed_everything(37) # fix the seed

In [19]:
# Tally the missing values of every column in the full training frame.
col_list = train.columns
nan_list = []    # [column, missing count] for columns with at least one NaN
nan_cnt = []     # the missing counts alone
nan_col = []     # the column names alone
full_list = []   # columns without any NaN
for col in col_list:
    n_missing = train[col].isnull().sum()
    if n_missing == 0:
        full_list.append(col)
        continue
    nan_list.append([col, n_missing])
    nan_cnt.append(n_missing)
    nan_col.append(col)

# Remove columns that are missing in every training row. The row count is read
# from the frame itself; it used to be the hard-coded literal 598, which is only
# right for one particular train.csv.
n_rows = len(train)
del_col = [name for name, n_missing in nan_list if n_missing == n_rows]
train = train.drop(columns=del_col)
test = test.drop(columns=del_col)



# Split both frames by product code; each product is cleaned and modelled separately.
train_codes = train['PRODUCT_CODE']
test_codes = test['PRODUCT_CODE']

trainA_31 = train[train_codes.eq('A_31')]
train_T_31 = train[train_codes.eq('T_31')]
train_O_31 = train[train_codes.eq('O_31')]

testA_31 = test[test_codes.eq('A_31')]
test_T_31 = test[test_codes.eq('T_31')]
test_O_31 = test[test_codes.eq('O_31')]

# ---- A_31: drop columns that carry no information for this product ----
col_list = train.columns

# Missing-value count of every column within the A_31 training rows.
nan_per_colA_31 = [(name, trainA_31[name].isnull().sum()) for name in col_list]
full_listA_31 = [name for name, n_nan in nan_per_colA_31 if n_nan == 0]
nan_listA_31 = [[name, n_nan] for name, n_nan in nan_per_colA_31 if n_nan != 0]
nan_cntA_31 = [n_nan for _, n_nan in nan_listA_31]
nan_colA_31 = [name for name, _ in nan_listA_31]

# Remove columns that are missing in every A_31 training row.
del_col = [name for name, n_nan in nan_listA_31 if n_nan == len(trainA_31)]
trainA_31 = trainA_31.drop(columns=del_col)
testA_31 = testA_31.drop(columns=del_col)

# Remove columns holding a single distinct value. The first six columns are
# skipped (presumably the identifier / target columns - verify against the CSV).
col_list = trainA_31.columns
del_col = [name for name in col_list[6:] if trainA_31[name].nunique() == 1]
trainA_31 = trainA_31.drop(columns=del_col)
testA_31 = testA_31.drop(columns=del_col)



# ---- O_31: drop columns that carry no information for this product ----
col_list = train.columns

# Missing-value count of every column within the O_31 training rows.
nan_per_colO = [(name, train_O_31[name].isnull().sum()) for name in col_list]
full_listO = [name for name, n_nan in nan_per_colO if n_nan == 0]
nan_listO = [[name, n_nan] for name, n_nan in nan_per_colO if n_nan != 0]
nan_cntO = [n_nan for _, n_nan in nan_listO]
nan_colO = [name for name, _ in nan_listO]

# Remove columns that are missing in every O_31 training row.
del_col = [name for name, n_nan in nan_listO if n_nan == len(train_O_31)]
train_O_31 = train_O_31.drop(columns=del_col)
test_O_31 = test_O_31.drop(columns=del_col)

# Remove columns holding a single distinct value. The first six columns are
# skipped (presumably the identifier / target columns - verify against the CSV).
col_list = train_O_31.columns
del_col = [name for name in col_list[6:] if train_O_31[name].nunique() == 1]
train_O_31 = train_O_31.drop(columns=del_col)
test_O_31 = test_O_31.drop(columns=del_col)


# ---- T_31: drop columns that carry no information for this product ----
col_list = train.columns

# Missing-value count of every column within the T_31 training rows.
nan_per_colT = [(name, train_T_31[name].isnull().sum()) for name in col_list]
full_listT = [name for name, n_nan in nan_per_colT if n_nan == 0]
nan_listT = [[name, n_nan] for name, n_nan in nan_per_colT if n_nan != 0]
nan_cntT = [n_nan for _, n_nan in nan_listT]
nan_colT = [name for name, _ in nan_listT]

# Remove columns that are missing in every T_31 training row.
del_col = [name for name, n_nan in nan_listT if n_nan == len(train_T_31)]
train_T_31 = train_T_31.drop(columns=del_col)
test_T_31 = test_T_31.drop(columns=del_col)

# Remove columns holding a single distinct value. The first six columns are
# skipped (presumably the identifier / target columns - verify against the CSV).
col_list = train_T_31.columns
del_col = [name for name in col_list[6:] if train_T_31[name].nunique() == 1]
train_T_31 = train_T_31.drop(columns=del_col)
test_T_31 = test_T_31.drop(columns=del_col)


# Columns that identify a row rather than describe it, and the two target columns.
id_cols = ['PRODUCT_ID','TIMESTAMP','PRODUCT_CODE']
target_cols = ['Y_Class','Y_Quality']

# classification targets
trainA_31_y_c = trainA_31['Y_Class']
train_T_31_y_c = train_T_31['Y_Class']
train_O_31_y_c = train_O_31['Y_Class']

# regression targets (raw Y_Quality; no log transform in this section)
trainA_31_y_r = trainA_31['Y_Quality']
train_T_31_y_r = train_T_31['Y_Quality']
train_O_31_y_r = train_O_31['Y_Quality']

# Feature matrices: drop identifiers (and targets for train), then fill NaN with -1.
trainA_31_x = trainA_31.drop(columns=id_cols + target_cols).fillna(-1)
testA_31_x = testA_31.drop(columns=id_cols).fillna(-1)
train_T_31_x = train_T_31.drop(columns=id_cols + target_cols).fillna(-1)
test_T_31_x = test_T_31.drop(columns=id_cols).fillna(-1)
train_O_31_x = train_O_31.drop(columns=id_cols + target_cols).fillna(-1)
test_O_31_x = test_O_31.drop(columns=id_cols).fillna(-1)

# qualitative to quantitative
# The labelencoder() helper defined earlier in this notebook performs the same
# fit / extend-with-unseen-test-labels / transform steps, encoding in place.
qual_col = ['LINE']
trainA_31_x, testA_31_x = labelencoder(trainA_31_x, testA_31_x, qual_col)
train_T_31_x, test_T_31_x = labelencoder(train_T_31_x, test_T_31_x, qual_col)
train_O_31_x, test_O_31_x = labelencoder(train_O_31_x, test_O_31_x, qual_col)


In [20]:
model = CatBoostClassifier(random_state=1234,verbose=500,iterations=1500,learning_rate=0.033)

# (train features, train labels, test features, test frame that receives the probabilities)
classifier_inputs = [
    (trainA_31_x, trainA_31_y_c, testA_31_x, testA_31),
    (train_T_31_x, train_T_31_y_c, test_T_31_x, test_T_31),
    (train_O_31_x, train_O_31_y_c, test_O_31_x, test_O_31),
]

class_probas = []
for x_fit, y_fit, x_eval, out_frame in classifier_inputs:
    model.fit(x_fit, y_fit)
    product_proba = model.predict_proba(x_eval)
    class_probas.append(product_proba)

    # predict_proba has one column per class the model was trained on, so the
    # columns are looked up through model.classes_ rather than by fixed position.
    # The earlier version assumed O_31 contained only classes 1 and 2 and that the
    # other products contained all three; any other class mix would either mislabel
    # the columns or raise IndexError.
    column_of = {int(float(label)): pos for pos, label in enumerate(model.classes_)}
    for class_id in (0, 1, 2):
        if class_id in column_of:
            out_frame[f'class{class_id}'] = product_proba[:, column_of[class_id]]
        else:
            # Class never seen while training on this product: probability 0.
            out_frame[f'class{class_id}'] = 0
pred_a, pred_t, pred_o = class_probas


# Join each product's probabilities onto the PRODUCT_ID column of the sample submission.
submita = pd.read_csv('/content/drive/MyDrive/LG_Aimers2/open (7)/sample_submission.csv')
submitt = pd.read_csv('/content/drive/MyDrive/LG_Aimers2/open (7)/sample_submission.csv')
submito = pd.read_csv('/content/drive/MyDrive/LG_Aimers2/open (7)/sample_submission.csv')

submita = pd.merge(submita[['PRODUCT_ID']],testA_31[['PRODUCT_ID','class0','class1','class2']],on='PRODUCT_ID')
submitt = pd.merge(submitt[['PRODUCT_ID']],test_T_31[['PRODUCT_ID','class0','class1','class2']],on='PRODUCT_ID')
submito = pd.merge(submito[['PRODUCT_ID']],test_O_31[['PRODUCT_ID','class0','class1','class2']],on='PRODUCT_ID')

# One row per test product: PRODUCT_ID plus class0/class1/class2 probabilities.
proba = pd.concat([submita,submitt,submito]).sort_values(by='PRODUCT_ID')

0:	learn: 1.0882409	total: 319ms	remaining: 7m 57s
500:	learn: 0.1579504	total: 2m 20s	remaining: 4m 40s
1000:	learn: 0.0575422	total: 4m 30s	remaining: 2m 15s
1499:	learn: 0.0313583	total: 6m 35s	remaining: 0us
0:	learn: 1.0742571	total: 175ms	remaining: 4m 21s
500:	learn: 0.1306600	total: 49.3s	remaining: 1m 38s
1000:	learn: 0.0516016	total: 1m 36s	remaining: 48.3s
1499:	learn: 0.0292629	total: 2m 25s	remaining: 0us
0:	learn: 0.6684011	total: 3.71ms	remaining: 5.56s
500:	learn: 0.0152105	total: 2.36s	remaining: 4.71s
1000:	learn: 0.0083918	total: 4.01s	remaining: 2s
1499:	learn: 0.0057760	total: 5.64s	remaining: 0us


In [21]:
# Regress directly on the class label (Y_Class) to get a continuous class score
# for every test product.
model = CatBoostRegressor(random_state=1234,verbose=500,iterations=1500,learning_rate=0.033)

class_scores = []
for x_fit, y_fit, x_eval in (
    (trainA_31_x, trainA_31_y_c, testA_31_x),
    (train_T_31_x, train_T_31_y_c, test_T_31_x),
    (train_O_31_x, train_O_31_y_c, test_O_31_x),
):
    model.fit(x_fit, y_fit)
    class_scores.append(model.predict(x_eval))
pred_a_r_c, pred_t_r_c, pred_o_r_c = class_scores

# Store the scores in the 'Y_Class' column of the per-product test frames.
testA_31['Y_Class'] = pred_a_r_c
test_T_31['Y_Class'] = pred_t_r_c
test_O_31['Y_Class'] = pred_o_r_c

# Join each product's scores onto the PRODUCT_ID column of the sample submission.
sample_path = '/content/drive/MyDrive/LG_Aimers2/open (7)/sample_submission.csv'
submita, submitt, submito = [
    pd.merge(
        pd.read_csv(sample_path)[['PRODUCT_ID']],
        part[['PRODUCT_ID','Y_Class']],
        on='PRODUCT_ID',
    )
    for part in (testA_31, test_T_31, test_O_31)
]

reg = pd.concat([submita,submitt,submito]).sort_values(by='PRODUCT_ID')

0:	learn: 0.7183692	total: 135ms	remaining: 3m 21s
500:	learn: 0.0618939	total: 55.1s	remaining: 1m 49s
1000:	learn: 0.0060866	total: 1m 50s	remaining: 55s
1499:	learn: 0.0005781	total: 2m 45s	remaining: 0us
0:	learn: 0.4093482	total: 47.3ms	remaining: 1m 10s
500:	learn: 0.0796737	total: 19.7s	remaining: 39.4s
1000:	learn: 0.0156947	total: 44s	remaining: 22s
1499:	learn: 0.0023566	total: 1m 3s	remaining: 0us
0:	learn: 0.4671273	total: 4.76ms	remaining: 7.13s
500:	learn: 0.0034926	total: 1.51s	remaining: 3.01s
1000:	learn: 0.0000555	total: 4.61s	remaining: 2.3s
1499:	learn: 0.0000009	total: 7.29s	remaining: 0us


In [23]:
# Per product: class probabilities (class0..class2) plus the regressed class score
# (column 'Y_Class' coming from `reg`).
processing = pd.merge(proba, reg, on='PRODUCT_ID')

# Baseline labels saved earlier by this notebook. They were written with the
# relative path 'final_test.csv', so they are read back the same way instead of
# assuming the working directory is /content.
before = pd.read_csv('final_test.csv')

# After this merge 'Y_Class_x' is the baseline label and 'Y_Class_y' the regressed score.
use_post = pd.merge(before, processing, on='PRODUCT_ID')

# Override a non-zero baseline label with 0 when the classifier gives class 0 a
# probability above 0.45 and the regressed class score is at most 0.75.
# .loc is used because the previous chained form, use_post['Y_Class_x'][mask] = 0,
# assigns through an intermediate object and is not guaranteed to modify use_post.
override_to_zero = (
    (use_post['Y_Class_x'] != 0)
    & (use_post['class0'] > 0.45)
    & (use_post['Y_Class_y'] <= 0.75)
)
use_post.loc[override_to_zero, 'Y_Class_x'] = 0

submit = pd.read_csv('/content/drive/MyDrive/LG_Aimers2/open (7)/sample_submission.csv')
# Align on PRODUCT_ID rather than on row position, so the result is correct even if
# the sample file is not ordered like use_post (assumes PRODUCT_ID is unique).
submit['Y_Class'] = submit['PRODUCT_ID'].map(use_post.set_index('PRODUCT_ID')['Y_Class_x'])
submit.to_csv('재현확인.csv',index=False)