# 제품 이상여부 판별 프로젝트


## 1. 데이터 불러오기


### 필수 라이브러리


In [54]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import os
from pprint import pprint

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from tqdm import tqdm

### 데이터 읽어오기


In [55]:
RANDOM_STATE = 110

train_data = pd.read_csv("../../data/train_data_0817.csv")
test_data = pd.read_csv("../../data/test_data_0817.csv")

In [56]:
train_data.info()
print('---')
# test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40506 entries, 0 to 40505
Data columns (total 38 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   CURE SPEED Collect Result_Dam                   40506 non-null  int64  
 1   DISCHARGED SPEED OF RESIN Collect Result_Dam    40506 non-null  int64  
 2   Head Clean Position Z Collect Result_Dam        40506 non-null  float64
 3   Head Purge Position Z Collect Result_Dam        40506 non-null  float64
 4   Head Zero Position Y Collect Result_Dam         40506 non-null  float64
 5   Stage2_Circle_Distance_Speed_Dam                40506 non-null  int64  
 6   WorkMode Collect Result                         40506 non-null  float64
 7   Chamber Temp. Collect Result_AutoClave          40506 non-null  int64  
 8   DISCHARGED SPEED OF RESIN Collect Result_Fill1  40506 non-null  float64
 9   Head Purge Position Z Collect Result_Fi

In [57]:
# dam, fill1, fill2 공통 변수
var_dam_fill = [
    'Equipment_same_num',
    'PalletID_Collect_Result_encoded',
    'Production_Qty_Collect_Result',
    'WorkMode Collect Result'
]

In [58]:
# 전체 공통 변수
### correlation 확인을 위한 변수 리스트
var_all_corr = [
    'model_receip_encoded',
    'workorder_receip_encoded'
]

### train
var_all_train = [
    'target',
    'model_receip_encoded',
    'workorder_receip_encoded'
]

### test
var_all_test = [
    'Set ID',
    'target',
    'model_receip_encoded',
    'workorder_receip_encoded'
]

In [59]:
# '_Dam'을 포함하는 변수 선택
dam_variables = [var for var in train_data.columns if '_Dam' in var]

# train
final_columns_train = var_dam_fill + var_all_train + dam_variables
train_data_dam = train_data[final_columns_train]

# test 
final_columns_test = var_dam_fill + var_all_test + dam_variables
test_data_dam = test_data[final_columns_test]

In [60]:
# '_Fill1'을 포함하는 변수 선택
fill1_variables = [var for var in train_data.columns if '_Fill1' in var]

# train
final_columns_train = var_dam_fill + var_all_train + fill1_variables
train_data_fill1 = train_data[final_columns_train]

# test 
final_columns_test = var_dam_fill + var_all_test + fill1_variables
test_data_fill1 = test_data[final_columns_test]

In [61]:
# '_Fill2'을 포함하는 변수 선택
fill2_variables = [var for var in train_data.columns if '_Fill2' in var]

# train
final_columns_train = var_dam_fill + var_all_train + fill2_variables
train_data_fill2 = train_data[final_columns_train]

# test 
final_columns_test = var_dam_fill + var_all_test + fill2_variables
test_data_fill2 = test_data[final_columns_test]

In [62]:
# '_AutoClave'을 포함하는 변수 선택
autoclave_variables = [var for var in train_data.columns if '_AutoClave' in var]

# train
final_columns_train = var_all_train + autoclave_variables
train_data_autoclave = train_data[final_columns_train]

# test 
final_columns_test = var_all_test + autoclave_variables
test_data_autoclave = test_data[final_columns_test]

## 3. 모델 학습

### 모델 정의

In [63]:
import os
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
from matplotlib import pyplot as plt

import torch
import torch.nn as nn
from pytorch_tabnet.tab_model import TabNetClassifier

In [64]:
import seaborn as sns
from pprint import pprint

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [65]:
import random

np.random.seed(RANDOM_STATE)
random.seed(RANDOM_STATE)
os.environ['PYTHONHASHSEED'] = str(RANDOM_STATE)

In [66]:
train_data_dam = train_data_dam.dropna(axis=1)
test_data_dam = test_data_dam.dropna(axis=1)

In [67]:
train_data_autoclave = train_data_autoclave.dropna(axis=1)
test_data_autoclave = test_data_autoclave.dropna(axis=1)

In [68]:
train_data_fill1 = train_data_fill1.dropna(axis=1)
test_data_fill1 = test_data_fill1.dropna(axis=1)

In [69]:
train_data_fill2 = train_data_fill2.dropna(axis=1)
test_data_fill2 = test_data_fill2.dropna(axis=1)

In [70]:
X_train = train_data.drop(columns=['target'])
y_train = train_data['target']

In [71]:
X_train_dam = train_data_dam.drop(columns=['target'])

In [72]:
X_train_autoclave = train_data_autoclave.drop(columns=['target'])

In [73]:
X_train_fill1 = train_data_fill1.drop(columns=['target'])

In [74]:
X_train_fill2 = train_data_fill2.drop(columns=['target'])

In [75]:
nunique = X_train_dam.nunique()
types = X_train_dam.dtypes

categorical_columns_dam = []
categorical_dims_dam =  {}
for col in X_train_dam.columns:
    if types[col] == 'object' or nunique[col] < 2:
        print(col, X_train_dam[col].nunique())
        l_enc = LabelEncoder()
        X_train_dam[col] = X_train_dam[col].fillna("VV_likely")
        X_train_dam[col] = l_enc.fit_transform(X_train_dam[col].values)
        categorical_columns_dam.append(col)
        categorical_dims_dam[col] = len(l_enc.classes_)
    else:
        X_train_dam.fillna(train_data.loc[:, col].mean(), inplace=True)

In [76]:
nunique = X_train_autoclave.nunique()
types = X_train_autoclave.dtypes

categorical_columns_autoclave = []
categorical_dims_autoclave =  {}
for col in X_train_autoclave.columns:
    if types[col] == 'object' or nunique[col] < 2:
        print(col, X_train_autoclave[col].nunique())
        l_enc = LabelEncoder()
        X_train_autoclave[col] = X_train_autoclave[col].fillna("VV_likely")
        X_train_autoclave[col] = l_enc.fit_transform(X_train_autoclave[col].values)
        categorical_columns_autoclave.append(col)
        categorical_dims_autoclave[col] = len(l_enc.classes_)
    else:
        X_train_autoclave.fillna(train_data.loc[:, col].mean(), inplace=True)

In [77]:
nunique = X_train_fill1.nunique()
types = X_train_fill1.dtypes

categorical_columns_fill1 = []
categorical_dims_fill1 =  {}
for col in X_train_fill1.columns:
    if types[col] == 'object' or nunique[col] < 2:
        print(col, X_train_fill1[col].nunique())
        l_enc = LabelEncoder()
        X_train_fill1[col] = X_train_fill1[col].fillna("VV_likely")
        X_train_fill1[col] = l_enc.fit_transform(X_train_fill1[col].values)
        categorical_columns_fill1.append(col)
        categorical_dims_fill1[col] = len(l_enc.classes_)
    else:
        X_train_fill1.fillna(train_data.loc[:, col].mean(), inplace=True)

In [78]:
nunique = X_train_fill2.nunique()
types = X_train_fill2.dtypes

categorical_columns_fill2 = []
categorical_dims_fill2 =  {}
for col in X_train_fill2.columns:
    if types[col] == 'object' or nunique[col] < 2:
        print(col, X_train_fill2[col].nunique())
        l_enc = LabelEncoder()
        X_train_fill2[col] = X_train_fill2[col].fillna("VV_likely")
        X_train_fill2[col] = l_enc.fit_transform(X_train_fill2[col].values)
        categorical_columns_fill2.append(col)
        categorical_dims_fill2[col] = len(l_enc.classes_)
    else:
        X_train_fill2.fillna(train_data.loc[:, col].mean(), inplace=True)

In [79]:
# features = [ col for col in X_train.columns] 
# cat_idxs = [ i for i, f in enumerate(features) if f in categorical_columns]
# cat_dims = [ categorical_dims[f] for i, f in enumerate(features) if f in categorical_columns]

In [80]:
# cat_idxs

In [81]:
from sklearn.model_selection import train_test_split

x_train_dam, x_valid_dam, y_train_dam, y_valid_dam = train_test_split(X_train_dam, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state=34)

In [82]:
x_train_autoclave, x_valid_autoclave, y_train_autoclave, y_valid_autoclave = train_test_split(X_train_autoclave, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state=34)

In [83]:
x_train_fill1, x_valid_fill1, y_train_fill1, y_valid_fill1 = train_test_split(X_train_fill1, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state=34)

In [84]:
x_train_fill2, x_valid_fill2, y_train_fill2, y_valid_fill2 = train_test_split(X_train_fill2, y_train, test_size=0.2, shuffle=True, stratify=y_train, random_state=34)

In [85]:
features = [ col for col in X_train_dam.columns] 
cat_idxs_dam = [ i for i, f in enumerate(features) if f in categorical_columns_dam]
cat_dims_dam = [ categorical_dims_dam[f] for i, f in enumerate(features) if f in categorical_columns_dam]

In [86]:
features = [ col for col in X_train_autoclave.columns] 
cat_idxs_autoclave = [ i for i, f in enumerate(features) if f in categorical_columns_autoclave]
cat_dims_autoclave = [ categorical_dims_autoclave[f] for i, f in enumerate(features) if f in categorical_columns_autoclave]

In [87]:
features = [ col for col in X_train_fill1.columns] 
cat_idxs_fill1 = [ i for i, f in enumerate(features) if f in categorical_columns_fill1]
cat_dims_fill1 = [ categorical_dims_fill1[f] for i, f in enumerate(features) if f in categorical_columns_fill1]

In [88]:
features = [ col for col in X_train_fill2.columns] 
cat_idxs_fill2 = [ i for i, f in enumerate(features) if f in categorical_columns_fill2]
cat_dims_fill2 = [ categorical_dims_fill2[f] for i, f in enumerate(features) if f in categorical_columns_fill2]

In [89]:
x_train_dam_np = x_train_dam.values
y_train_dam_np = y_train_dam.values
x_valid_dam_np = x_valid_dam.values
y_valid_dam_np = y_valid_dam.values

In [90]:
x_train_autoclave_np = x_train_autoclave.values
y_train_autoclave_np = y_train_autoclave.values
x_valid_autoclave_np = x_valid_autoclave.values
y_valid_autoclave_np = y_valid_autoclave.values

In [91]:
x_train_fill1_np = x_train_fill1.values
y_train_fill1_np = y_train_fill1.values
x_valid_fill1_np = x_valid_fill1.values
y_valid_fill1_np = y_valid_fill1.values

In [92]:
x_train_fill2_np = x_train_fill2.values
y_train_fill2_np = y_train_fill2.values
x_valid_fill2_np = x_valid_fill2.values
y_valid_fill2_np = y_valid_fill2.values

## 모델 학습

In [93]:
clf_dam = TabNetClassifier(cat_idxs=cat_idxs_dam,
                       cat_dims=cat_dims_dam,
                       cat_emb_dim=1000,
                       optimizer_fn=torch.optim.Adam,
                       optimizer_params=dict(lr=1e-2),
                       scheduler_params={"step_size":50,
                                         "gamma":0.9},
                       scheduler_fn=torch.optim.lr_scheduler.StepLR,
                       mask_type='sparsemax' # "sparsemax", entmax
                      )



In [94]:
clf_autoclave = TabNetClassifier(cat_idxs=cat_idxs_autoclave,
                       cat_dims=cat_dims_autoclave,
                       cat_emb_dim=1000,
                       optimizer_fn=torch.optim.Adam,
                       optimizer_params=dict(lr=1e-2),
                       scheduler_params={"step_size":50,
                                         "gamma":0.9},
                       scheduler_fn=torch.optim.lr_scheduler.StepLR,
                       mask_type='sparsemax' # "sparsemax", entmax
                      )

In [95]:
clf_fill1 = TabNetClassifier(cat_idxs=cat_idxs_fill1,
                       cat_dims=cat_dims_fill1,
                       cat_emb_dim=1000,
                       optimizer_fn=torch.optim.Adam,
                       optimizer_params=dict(lr=1e-2),
                       scheduler_params={"step_size":50,
                                         "gamma":0.9},
                       scheduler_fn=torch.optim.lr_scheduler.StepLR,
                       mask_type='sparsemax' # "sparsemax", entmax
                      )

In [96]:
clf_fill2 = TabNetClassifier(cat_idxs=cat_idxs_fill2,
                       cat_dims=cat_dims_fill2,
                       cat_emb_dim=1000,
                       optimizer_fn=torch.optim.Adam,
                       optimizer_params=dict(lr=1e-2),
                       scheduler_params={"step_size":50,
                                         "gamma":0.9},
                       scheduler_fn=torch.optim.lr_scheduler.StepLR,
                       mask_type='sparsemax' # "sparsemax", entmax
                      )

In [97]:
max_epochs = 20

clf_dam.fit(
    X_train=x_train_dam_np, y_train=y_train_dam_np,
    eval_set=[(x_train_dam_np, y_train_dam_np), (x_valid_dam_np, y_valid_dam_np)],
    eval_name=['train', 'valid'],
    eval_metric=['auc'],
    max_epochs=max_epochs , patience=20,
    batch_size=1024, virtual_batch_size=128,
    num_workers=0,
    weights=1,
    drop_last=False,
)

epoch 0  | loss: 0.6661  | train_auc: 0.44545 | valid_auc: 0.44618 |  0:00:02s
epoch 1  | loss: 0.61291 | train_auc: 0.52764 | valid_auc: 0.5086  |  0:00:04s
epoch 2  | loss: 0.59982 | train_auc: 0.55736 | valid_auc: 0.54902 |  0:00:06s
epoch 3  | loss: 0.59439 | train_auc: 0.57663 | valid_auc: 0.57354 |  0:00:09s
epoch 4  | loss: 0.59269 | train_auc: 0.59641 | valid_auc: 0.5988  |  0:00:11s
epoch 5  | loss: 0.59073 | train_auc: 0.58723 | valid_auc: 0.59156 |  0:00:13s
epoch 6  | loss: 0.58638 | train_auc: 0.60853 | valid_auc: 0.63196 |  0:00:15s
epoch 7  | loss: 0.59224 | train_auc: 0.6144  | valid_auc: 0.6298  |  0:00:17s
epoch 8  | loss: 0.58811 | train_auc: 0.6278  | valid_auc: 0.63105 |  0:00:19s
epoch 9  | loss: 0.59046 | train_auc: 0.66651 | valid_auc: 0.67573 |  0:00:21s
epoch 10 | loss: 0.59067 | train_auc: 0.71334 | valid_auc: 0.72293 |  0:00:23s
epoch 11 | loss: 0.5893  | train_auc: 0.71597 | valid_auc: 0.73203 |  0:00:26s
epoch 12 | loss: 0.58978 | train_auc: 0.72678 | vali



In [98]:
max_epochs = 15

clf_autoclave.fit(
    X_train=x_train_autoclave_np, y_train=y_train_autoclave_np,
    eval_set=[(x_train_autoclave_np, y_train_autoclave_np), (x_valid_autoclave_np, y_valid_autoclave_np)],
    eval_name=['train', 'valid'],
    eval_metric=['auc'],
    max_epochs=max_epochs , patience=20,
    batch_size=1024, virtual_batch_size=128,
    num_workers=0,
    weights=1,
    drop_last=False,
)

epoch 0  | loss: 0.66857 | train_auc: 0.50101 | valid_auc: 0.46036 |  0:00:02s
epoch 1  | loss: 0.61162 | train_auc: 0.51894 | valid_auc: 0.5026  |  0:00:04s
epoch 2  | loss: 0.60437 | train_auc: 0.50784 | valid_auc: 0.50606 |  0:00:06s
epoch 3  | loss: 0.59948 | train_auc: 0.57685 | valid_auc: 0.53429 |  0:00:08s
epoch 4  | loss: 0.5973  | train_auc: 0.60286 | valid_auc: 0.62226 |  0:00:10s
epoch 5  | loss: 0.59733 | train_auc: 0.67166 | valid_auc: 0.67439 |  0:00:12s
epoch 6  | loss: 0.59266 | train_auc: 0.71787 | valid_auc: 0.72391 |  0:00:15s
epoch 7  | loss: 0.59556 | train_auc: 0.72638 | valid_auc: 0.73567 |  0:00:17s
epoch 8  | loss: 0.59436 | train_auc: 0.73389 | valid_auc: 0.73694 |  0:00:19s
epoch 9  | loss: 0.59236 | train_auc: 0.73434 | valid_auc: 0.73663 |  0:00:21s
epoch 10 | loss: 0.59334 | train_auc: 0.73442 | valid_auc: 0.73793 |  0:00:23s
epoch 11 | loss: 0.59444 | train_auc: 0.7372  | valid_auc: 0.74246 |  0:00:25s
epoch 12 | loss: 0.59296 | train_auc: 0.73599 | vali



In [99]:
max_epochs = 20

clf_fill1.fit(
    X_train=x_train_fill1_np, y_train=y_train_fill1_np,
    eval_set=[(x_train_fill1_np, y_train_fill1_np), (x_valid_fill1_np, y_valid_fill1_np)],
    eval_name=['train', 'valid'],
    eval_metric=['auc'],
    max_epochs=max_epochs , patience=20,
    batch_size=1024, virtual_batch_size=128,
    num_workers=0,
    weights=1,
    drop_last=False,
)

epoch 0  | loss: 0.69929 | train_auc: 0.52629 | valid_auc: 0.52712 |  0:00:02s
epoch 1  | loss: 0.62176 | train_auc: 0.53628 | valid_auc: 0.52638 |  0:00:04s
epoch 2  | loss: 0.60622 | train_auc: 0.61683 | valid_auc: 0.62179 |  0:00:06s
epoch 3  | loss: 0.59429 | train_auc: 0.55022 | valid_auc: 0.58248 |  0:00:09s
epoch 4  | loss: 0.59326 | train_auc: 0.62854 | valid_auc: 0.64247 |  0:00:11s
epoch 5  | loss: 0.5913  | train_auc: 0.72186 | valid_auc: 0.73298 |  0:00:13s
epoch 6  | loss: 0.58848 | train_auc: 0.7276  | valid_auc: 0.73351 |  0:00:15s
epoch 7  | loss: 0.59112 | train_auc: 0.73128 | valid_auc: 0.73644 |  0:00:17s
epoch 8  | loss: 0.5878  | train_auc: 0.7347  | valid_auc: 0.74222 |  0:00:20s
epoch 9  | loss: 0.58593 | train_auc: 0.73985 | valid_auc: 0.74975 |  0:00:22s
epoch 10 | loss: 0.58779 | train_auc: 0.74161 | valid_auc: 0.75091 |  0:00:24s
epoch 11 | loss: 0.58755 | train_auc: 0.74265 | valid_auc: 0.75296 |  0:00:26s
epoch 12 | loss: 0.58866 | train_auc: 0.74386 | vali



In [100]:
max_epochs = 20

clf_fill2.fit(
    X_train=x_train_fill2_np, y_train=y_train_fill2_np,
    eval_set=[(x_train_fill2_np, y_train_fill2_np), (x_valid_fill2_np, y_valid_fill2_np)],
    eval_name=['train', 'valid'],
    eval_metric=['auc'],
    max_epochs=max_epochs , patience=20,
    batch_size=1024, virtual_batch_size=128,
    num_workers=0,
    weights=1,
    drop_last=False,
)

epoch 0  | loss: 0.66426 | train_auc: 0.72722 | valid_auc: 0.73231 |  0:00:02s
epoch 1  | loss: 0.61215 | train_auc: 0.44229 | valid_auc: 0.4526  |  0:00:04s
epoch 2  | loss: 0.6016  | train_auc: 0.6941  | valid_auc: 0.70763 |  0:00:06s
epoch 3  | loss: 0.59578 | train_auc: 0.63696 | valid_auc: 0.6522  |  0:00:08s
epoch 4  | loss: 0.59719 | train_auc: 0.64035 | valid_auc: 0.66394 |  0:00:11s
epoch 5  | loss: 0.59375 | train_auc: 0.64922 | valid_auc: 0.65686 |  0:00:13s
epoch 6  | loss: 0.58712 | train_auc: 0.72113 | valid_auc: 0.73681 |  0:00:15s
epoch 7  | loss: 0.59084 | train_auc: 0.73053 | valid_auc: 0.74679 |  0:00:17s
epoch 8  | loss: 0.59153 | train_auc: 0.72063 | valid_auc: 0.73304 |  0:00:20s
epoch 9  | loss: 0.58804 | train_auc: 0.71474 | valid_auc: 0.73137 |  0:00:22s
epoch 10 | loss: 0.58857 | train_auc: 0.72453 | valid_auc: 0.73631 |  0:00:24s
epoch 11 | loss: 0.5885  | train_auc: 0.73506 | valid_auc: 0.74575 |  0:00:26s
epoch 12 | loss: 0.58882 | train_auc: 0.73715 | vali



In [106]:
test_data_dam = test_data_dam.drop(columns = ['Set ID'])
test_data_autoclave = test_data_autoclave.drop(columns = ['Set ID'])
test_data_fill1 = test_data_fill1.drop(columns = ['Set ID'])
test_data_fill2 = test_data_fill2.drop(columns = ['Set ID'])

In [107]:
nunique = test_data_dam.nunique()
types = test_data_dam.dtypes

for col in test_data_dam.columns:
    if types[col] == 'object' or nunique[col] < 2:
        print(col, test_data_dam[col].nunique())
        l_enc = LabelEncoder()
        test_data_dam[col] = l_enc.fit_transform(test_data_dam[col].values)
    else:
        test_data_dam.fillna(train_data.loc[:, col].mean(), inplace=True)

In [108]:
nunique = test_data_autoclave.nunique()
types = test_data_autoclave.dtypes

for col in test_data_autoclave.columns:
    if types[col] == 'object' or nunique[col] < 2:
        print(col, test_data_autoclave[col].nunique())
        l_enc = LabelEncoder()
        test_data_autoclave[col] = l_enc.fit_transform(test_data_autoclave[col].values)
    else:
        test_data_autoclave.fillna(train_data.loc[:, col].mean(), inplace=True)

In [109]:
nunique = test_data_fill1.nunique()
types = test_data_fill1.dtypes

for col in test_data_fill1.columns:
    if types[col] == 'object' or nunique[col] < 2:
        print(col, test_data_fill1[col].nunique())
        l_enc = LabelEncoder()
        test_data_fill1[col] = l_enc.fit_transform(test_data_fill1[col].values)
    else:
        test_data_fill1.fillna(train_data.loc[:, col].mean(), inplace=True)

In [110]:
nunique = test_data_fill2.nunique()
types = test_data_fill2.dtypes

for col in test_data_fill2.columns:
    if types[col] == 'object' or nunique[col] < 2:
        print(col, test_data_fill2[col].nunique())
        l_enc = LabelEncoder()
        test_data_fill2[col] = l_enc.fit_transform(test_data_fill2[col].values)
    else:
        test_data_fill2.fillna(train_data.loc[:, col].mean(), inplace=True)

In [111]:
x_test_dam_np = test_data_dam.values

In [112]:
x_test_autoclave_np = test_data_autoclave.values

In [113]:
x_test_fill1_np = test_data_fill1.values

In [114]:
x_test_fill2_np = test_data_fill2.values

In [116]:
# x_test_dam_np의 shape 확인
print(f"x_test_dam_np shape: {x_test_dam_np.shape}")

# 모델이 학습된 데이터의 shape 확인 (예시로 x_train_dam_np 사용)
print(f"x_train_dam_np shape: {x_train_dam_np.shape}")

x_test_dam_np shape: (17361, 19)
x_train_dam_np shape: (32404, 20)


In [115]:
preds_dam = clf_dam.predict_proba(x_test_dam_np)

RuntimeError: running_mean should contain 19 elements not 20

In [377]:
preds_autoclave = clf_autoclave.predict_proba(x_test_autoclave_np)

In [378]:
preds_fill1 = clf_fill1.predict_proba(x_test_fill1_np)

In [379]:
preds_fill2 = clf_fill2.predict_proba(x_test_fill2_np)

In [425]:
n=0
j=0
for i in preds_dam:
    if i[0] > i[1]:
        n = n+1
    else:
        j = j+1

In [426]:
len(preds_dam)

17361

In [427]:
print(n)
print(j)

6447
10914


In [383]:
n=0
j=0
for i in preds_autoclave:
    if i[0] > i[1]:
        n = n+1
    else:
        j = j+1

In [384]:
print(n)
print(j)

6543
10818


In [385]:
n=0
j=0
for i in preds_fill1:
    if i[0] > i[1]:
        n = n+1
    else:
        j = j+1

In [386]:
print(n)
print(j)

6329
11032


In [387]:
n=0
j=0
for i in preds_fill2:
    if i[0] > i[1]:
        n = n+1
    else:
        j = j+1

In [388]:
print(n)
print(j)

6072
11289


In [465]:
df_target_dam = np.where(preds_dam[:,1] >= 0.5, 1, 0)

In [466]:
df_target_total = pd.DataFrame(df_target_dam, columns=['target_dam'])

In [467]:
df_target_total.value_counts()

target_dam
1             10914
0              6447
Name: count, dtype: int64

In [468]:
df_target_autoclave = np.where(preds_autoclave[:,1] >= 0.5, 1, 0)

In [469]:
df_target_fill1 = np.where(preds_fill1[:,1] >= 0.5, 1, 0)

In [470]:
df_target_fill2 = np.where(preds_fill2[:,1] >= 0.5, 1, 0)

In [471]:
df_target_autoclave = pd.DataFrame(df_target_autoclave, columns=['target_autoclave'])

In [473]:
df_target_fill1 = pd.DataFrame(df_target_fill1, columns=['target_fill1'])

In [472]:
df_target_fill2 = pd.DataFrame(df_target_fill2, columns=['target_fill2'])

In [475]:
df_target_all = pd.concat([df_target_total, df_target_autoclave, df_target_fill1, df_target_fill2], axis=1)

In [478]:
for col in df_target_all.columns :
    cnt = df_target_all[col].value_counts()
    print(f"{cnt}\n")

target_dam
1    10914
0     6447
Name: count, dtype: int64

target_autoclave
1    10818
0     6543
Name: count, dtype: int64

target_fill1
1    11032
0     6329
Name: count, dtype: int64

target_fill2
1    11289
0     6072
Name: count, dtype: int64



In [496]:
df_target_all['sum'] = df_target_all[['target_dam', 'target_autoclave', 'target_fill1', 'target_fill2']].sum(axis=1)

# 1의 개수가 2개 이상이면 "Normal", 그렇지 않으면 "AbNormal"
df_target_all['final'] = df_target_all['sum'].apply(lambda x: 'Normal' if x >= 1 else 'AbNormal')

# 중간 계산에 사용한 'sum_ones' 열 제거 (선택 사항)
df_target_all.drop(columns=['sum'], inplace=True)

In [497]:
df_target_all['final'].value_counts()

final
Normal      13937
AbNormal     3424
Name: count, dtype: int64

In [498]:
submission = pd.read_csv('submission.csv')

In [500]:
submission['target'] = df_target_all['final']

In [501]:
submission['target'].value_counts()

target
Normal      13937
AbNormal     3424
Name: count, dtype: int64

In [502]:
submission.to_csv("submission.csv", index=False)

In [None]:
#일단 0.173874점 기록했음... 좀 더 올릴수는 있을거라고 생각함돠

In [505]:
# from sklearn.metrics import accuracy_score

# accuracy = accuracy_score(y_valid_dam_np, preds_dam[:,1])

In [506]:
# from sklearn.metrics import f1_score

# f1_score = f1_score(y_valid_dam_np, preds_dam[:,1], average='micro')

In [507]:
# accuracy

In [508]:
# f1_score

In [509]:
# correct_predictions = np.sum(y_valid_dam_np ==  df_target)

In [510]:
# incorrect_predictions = np.sum(y_valid_dam_np != df_target)

In [511]:
# correct_predictions

In [512]:
# incorrect_predictions

In [437]:
# true_positives = np.sum((y_valid_dam_np == 1) & (df_target == 1))
# true_negatives = np.sum((y_valid_dam_np == 0) & (df_target == 0))

In [438]:
# true_positives

In [439]:
# true_negatives

In [440]:
# y_valid_dam.value_counts()

In [441]:
# false_positives = np.sum((y_valid_dam_np == 1) & (df_target == 0))

# # y_pred가 0이고 y_true가 1인 경우의 개수 (False Negatives)
# false_negatives = np.sum((y_valid_dam_np == 0) & (df_target == 1))

In [442]:
# false_positives

In [443]:
# false_negatives