In [22]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings(action='ignore')
import catboost
from catboost import *
import sklearn
from sklearn.preprocessing import LabelEncoder
import sys

from itertools import product
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
import copy

In [23]:
train_df = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
submit = pd.read_csv('sample_submission.csv')

In [24]:
def del_columns(train, test):
    col_list = train.columns
    nan_list = []
    nan_cnt = []
    nan_col = []
    full_list = []
    for col in col_list:
        if train[col].isnull().sum() == 0 :
            full_list.append(col)
            continue
        nan_list.append([col, train[col].isnull().sum()])
        nan_cnt.append(train[col].isnull().sum())
        nan_col.append(col)

    del_col = []
    for col in nan_list :
        if col[1] == len(train) :
            del_col.append(col[0])
    train = train.drop(columns=del_col)
    test = test.drop(columns=del_col)

    del_col = []
    col_list = train.describe().columns
    for col in col_list :
        if col == 'Y_Class':
            continue
        if col == 'Y_Quality':
            continue
        if col == 'LINE':
            continue
        if col == 'PRODUCT_CODE':
            continue
        if train[col].nunique()==1 :
            del_col.append(col)
    train = train.drop(columns=del_col)
    test = test.drop(columns=del_col)
    
    return train,test

def make_train_test_dataset(train,test):
    
    '''
    트레인데이터, 학습데이터 셋 만들기
    '''
    
    train_x = train.drop(columns=['PRODUCT_ID','PRODUCT_CODE','Y_Class','Y_Quality'])
    test_x = test.drop(columns=['PRODUCT_ID','PRODUCT_CODE'])
    train_y = train['Y_Quality']
    train_w = train[['Y_Class']]
    return train_x, test_x, train_y, train_w


def fillna(train,test,value):
    train = train.fillna(value)
    test = test.fillna(value)
    return train,test

def labelencoder(train,test,col_list):
    
    qual_col = col_list
    for i in qual_col:
        le = LabelEncoder()
        le = le.fit(train[i])
        train[i] = le.transform(train[i])

        for label in np.unique(test[i]): 
            if label not in le.classes_: 
                le.classes_ = np.append(le.classes_, label)
        test[i] = le.transform(test[i]) 
    return train,test

In [25]:
train, test = fillna(train_df,test,-1)
train['PRODUCT_CODE'] = train['PRODUCT_CODE'].astype('category')
train['LINE'] = train['LINE'].astype('category')
test['PRODUCT_CODE'] = test['PRODUCT_CODE'].astype('category')
test['LINE'] = test['LINE'].astype('category')

In [26]:
rows_with_all_minus_one_tr = []

# 2880번째 열부터 마지막 열까지 각 행에 대해 반복
for index, row in train.iloc[:, 2880:].iterrows():
    # 행의 모든 값이 -1인지 확인
    if all(value == -1 for value in row):
        rows_with_all_minus_one_tr.append(index)

# 결과를 저장할 빈 데이터프레임 생성
result_data_tr = train.iloc[rows_with_all_minus_one_tr]

rows_with_all_minus_one_te = []

# 2880번째 열부터 마지막 열까지 각 행에 대해 반복
for index, row in test.iloc[:, 2880:].iterrows():
    # 행의 모든 값이 -1인지 확인
    if all(value == -1 for value in row):
        rows_with_all_minus_one_te.append(index)

# 결과를 저장할 빈 데이터프레임 생성
result_data_te = test.iloc[rows_with_all_minus_one_te]

train_bf = train.iloc[rows_with_all_minus_one_tr]
test_bf = test.iloc[rows_with_all_minus_one_te]
submit_bf = submit.iloc[rows_with_all_minus_one_te]

train_af= train.drop(result_data_tr.index)
test_af= test.drop(result_data_te.index)
submit_af= submit.drop(result_data_te.index)

train_bf = train_bf.reset_index(drop=True)
test_bf = test_bf.reset_index(drop=True)

train_af = train_af.reset_index(drop=True)
test_af = test_af.reset_index(drop=True)

In [27]:
def get_best_threshold_spliter(y_pred, y_true_cls, model=None, training=True):
    if training:
        search_space = [[2, 4, 6], [1, 3, 5, 7, 9]]
        best_score = -np.inf
        output_pred = []
        model = None
        for depth, min_samples in product(*search_space): 
            model = DecisionTreeClassifier(
                criterion="gini", max_features=1.0,
                max_depth=depth, min_samples_leaf=min_samples, random_state=42
            )
            model.fit(y_pred, y_true_cls)
            y_pred_cls = model.predict(y_pred)
            score = metrics.f1_score(y_true_cls, y_pred_cls, average="macro")
            if best_score < score:
                best_score = score
                print(f"Best score : {best_score}")
                output_pred = y_pred_cls.copy()
                model = copy.deepcopy(model)
        return model, output_pred
    else:
        output_pred = model.predict(y_pred)
        return output_pred

In [28]:
train_bf, test_bf = del_columns(train_bf,test_bf)
train_af, test_af = del_columns(train_af,test_af)

In [29]:
train_x_bf, test_x_bf, train_y_bf, train_w_bf = make_train_test_dataset(train_bf,test_bf)
train_x_af, test_x_af, train_y_af, train_w_af = make_train_test_dataset(train_af,test_af)

In [30]:
train_x_bf = train_x_bf.T.drop_duplicates().T
train_x_af = train_x_af.T.drop_duplicates().T

test_x_bf = test_x_bf[train_x_bf.columns]
test_x_af = test_x_af[train_x_af.columns]

In [31]:
train_x_bf.shape, train_x_af.shape, test_x_bf.shape, test_x_af.shape

((765, 1719), (367, 290), (167, 2798), (368, 524))

#Train before

In [11]:
bf_stats = pd.DataFrame({"qual": train_y_bf, "cls": train_w_bf["Y_Class"]}).groupby("cls")["qual"].describe()
af_stats = pd.DataFrame({"qual": train_y_af, "cls": train_w_af["Y_Class"]}).groupby("cls")["qual"].describe()

In [12]:
clf = catboost.CatBoostRegressor(
    learning_rate=0.05,
    iterations=1000,
    depth=6,
    l2_leaf_reg=5,
    border_count=254,
    cat_features=['LINE'],
    random_seed=313,
    verbose=False,
)

In [13]:
clf.fit(train_x_bf, train_y_bf)
train_pred_bf = clf.predict(train_x_bf)
pred_bf = clf.predict(test_x_bf)

In [14]:
reg_prob = np.abs(np.stack([
    train_pred_bf - bf_stats.loc[0, "mean"],
    train_pred_bf - bf_stats.loc[1, "mean"],
    train_pred_bf - bf_stats.loc[2, "mean"],
], axis=1))
model_threshold, train_pred_bf = get_best_threshold_spliter(reg_prob, train_w_bf["Y_Class"].values, training=True)

reg_prob = np.abs(np.stack([
    pred_bf - bf_stats.loc[0, "mean"],
    pred_bf - bf_stats.loc[1, "mean"],
    pred_bf - bf_stats.loc[2, "mean"],
], axis=1))
pred_bf = get_best_threshold_spliter(reg_prob, y_true_cls=None, model=model_threshold, training=False)

Best score : 0.9606249652387597
Best score : 0.9805198086421779
Best score : 0.9929167675360938


#Train after

In [15]:
#clf = catboost.CatBoostClassifier(verbose=0,cat_features=['LINE'])
clf.fit(train_x_af, train_y_af)
train_pred_af = clf.predict(train_x_af)
pred_af = clf.predict(test_x_af)

In [16]:
reg_prob = np.abs(np.stack([
    train_pred_af - af_stats.loc[0, "mean"],
    train_pred_af - af_stats.loc[1, "mean"],
    train_pred_af - af_stats.loc[2, "mean"],
], axis=1))
model_threshold, train_pred_af = get_best_threshold_spliter(reg_prob, train_w_af["Y_Class"].values, training=True)

reg_prob = np.abs(np.stack([
    pred_af - af_stats.loc[0, "mean"],
    pred_af - af_stats.loc[1, "mean"],
    pred_af - af_stats.loc[2, "mean"],
], axis=1))
pred_af = get_best_threshold_spliter(reg_prob, y_true_cls=None, model=model_threshold, training=False)

Best score : 0.9923042220629993
Best score : 1.0


Submission

In [17]:
submit_bf['Y_Class'] = pred_bf
submit_af['Y_Class'] = pred_af

In [18]:
submit_fin = pd.concat([submit_bf, submit_af],axis=0)
submit_fin = submit_fin.sort_values('PRODUCT_ID')
submit_fin.to_csv('reg_1_changeseed_313_dt.csv', index=False)

In [19]:
submit_fin['Y_Class'].value_counts()

1    415
2     72
0     48
Name: Y_Class, dtype: int64

In [20]:
submit_fin

Unnamed: 0,PRODUCT_ID,Y_Class
0,TEST_000,1
1,TEST_001,1
2,TEST_002,1
3,TEST_003,1
4,TEST_004,1
...,...,...
530,TEST_530,1
531,TEST_531,1
532,TEST_532,1
533,TEST_533,1


In [37]:
import joblib

joblib.dump(clf, '0.741.pkl')

['0.741.pkl']