In [1]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as ms
import xgboost as xgb
import bisect

from scipy import stats
from tqdm import tqdm
from scipy.stats import skew, kurtosis
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split, KFold
from catboost import CatBoostRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.cluster import DBSCAN

In [2]:
train = pd.read_csv(r'C:\Users\dlwks\OneDrive\바탕 화면\VSCode\HD_CNC\train.csv').drop(['X1'], axis=1)
test = pd.read_csv(r'C:\Users\dlwks\OneDrive\바탕 화면\VSCode\HD_CNC\test.csv').drop(['Id', 'X1'], axis=1)

display(train.head())
display(test.head())

Unnamed: 0,X2,X3,X4,X5,X6,X7,X8,Y1,Y2
0,56.3,67.5,22.5,AH32,4,97,PL973,467,2241
1,115.0,67.0,13.0,AH32,2,27,PL271,1058,1163
2,59.2,44.6,18.0,AH32-TM,1,14,PL141,270,1094
3,52.1,97.4,15.0,A,17,24,PL242,391,2604
4,42.9,58.7,14.5,A,4,23,PL233,232,1564


Unnamed: 0,X2,X3,X4,X5,X6,X7,X8
0,80.6,104.8,11.0,A,29,23,PL234
1,141.8,34.2,17.5,A,2,91,PL911
2,77.2,103.3,17.0,A,10,97,PL973
3,95.5,131.0,18.0,A,2,23,PL234
4,91.7,36.1,19.0,AH32,1,27,PL271


| 항목 | 설명     |
|-----|----------|
| X1  | 작업번호 |
| X2  | 마킹길이 |
| X3  | 절단길이 |
| X4  | 철판두께 |
| X5  | 철판재질 |
| X6  | 절단갯수 |
| X7  | 작업장   |
| X8  | 작업자   |
| Y1  | 마킹시간 |
| Y2  | 절단시간 |

In [None]:
train.info()

In [None]:
train.describe()

In [None]:
train['X4'].unique()

# 특징 만들기


In [None]:
# train['X8'] = train['X8'].str.extract(r'(PL|PS)')
# test['X8'] = test['X8'].str.extract(r'(PL|PS)')

In [None]:
# train['X8_encoded'] = (train['X8'] != 'PL').astype(int)
# test['X8_encoded'] = (test['X8'] != 'PL').astype(int)

# train = pd.get_dummies(train, columns=['X8_encoded'], prefix=['X8'])
# test = pd.get_dummies(test, columns=['X8_encoded'], prefix=['X8'])

# train[['X8_0', 'X8_1']] = train[['X8_0', 'X8_1']].astype(int)
# test[['X8_0', 'X8_1']] = test[['X8_0', 'X8_1']].astype(int)

In [3]:
mmtime = train.groupby('X5')['Y1'].mean()
mctime = train.groupby('X5')['Y2'].mean()

wmtime = train.groupby('X7')['Y1'].mean()
wctime = train.groupby('X7')['Y2'].mean()

manmtime = train.groupby('X8')['Y1'].mean()
manctime = train.groupby('X8')['Y2'].mean()

train['X5_mmtime'] = train['X5'].map(mmtime)
train['X5_mctime'] = train['X5'].map(mctime)

train['X7_wmtime'] = train['X7'].map(wmtime)
train['X7_wctime'] = train['X7'].map(wctime)

train['X8_manmtime'] = train['X8'].map(manmtime)
train['X8_manctime'] = train['X8'].map(manctime)

test['X5_mmtime'] = test['X5'].map(mmtime)
test['X5_mctime'] = test['X5'].map(mctime)

test['X7_wmtime'] = test['X7'].map(wmtime)
test['X7_wctime'] = test['X7'].map(wctime)

test['X8_manmtime'] = test['X8'].map(manmtime)
test['X8_manctime'] = test['X8'].map(manctime)

In [4]:
import re

def convert_x5(value):
    # -TM 이 포함되어 있으면 제거하고 숫자로 변환, 그렇지 않으면 그냥 숫자로 변환
    if '-TM' in value:
        value = value.replace('-TM', '')
    match = re.search(r'\d+', value)  # 정규표현식을 사용하여 숫자 추출
    if match:
        return int(match.group()) * 9.8
    else:
        return 235  # 숫자가 없는 경우 235로 설정

# X5 열에 변환 함수 적용
train['X5_numeric'] = train['X5'].apply(convert_x5)
test['X5_numeric'] = test['X5'].apply(convert_x5)

train.drop(columns = ['X5'], inplace = True)
test.drop(columns = ['X5'], inplace = True)

In [None]:
# def estimate_rotation_speed(X2, X3, X4, X5_numeric, X6):
#     rotation_speed = X2 + (X3 / X4) * (X5_numeric / 10) * (X6 / 2)
#     return rotation_speed

# train['Estimated_Rotation_Speed'] = estimate_rotation_speed(train['X2'], train['X3'], train['X4'], train['X5_numeric'], train['X6'])
# test['Estimated_Rotation_Speed'] = estimate_rotation_speed(test['X2'], test['X3'], test['X4'], test['X5_numeric'], test['X6'])

In [5]:
train['X2/X3'] = train['X2'] / train['X3']
test['X2/X3'] = test['X2'] / test['X3']

In [6]:
train['SPT'] = train['X5_numeric'] / train['X4']
test['SPT'] = test['X5_numeric'] / test['X4']

In [7]:
train['SPTCC'] = train['SPT'] * train['X6']
test['SPTCC'] = test['SPT'] * test['X6']

In [8]:
# X2_idx = train[(train['X2'] > 440)].index # 440
# X3_idx = train[(train['X3'] > 550)].index # 550
# X4_idx = train[(train['X4'] > 36)].index
# X6_idx = train[(train['X6'] > 250)].index # 250
# Y1_idx = train[(train['Y1'] == 0) | (train['Y1'] > 70000)].index
# Y2_idx = train[(train['Y2'] == 0) | (train['Y2'] > 70000)].index

# train.loc[X2_idx, 'X2'] = train.loc[(train ['X2'] >= 500), 'X2'].mean()
# train.loc[X3_idx, 'X3'] = train.loc[(train ['X3'] >= 600), 'X3'].mean()
# train.loc[X4_idx, 'X4'] = train.loc[(train ['X4'] >= 36), 'X4'].mean()
# train.loc[X6_idx, 'X6'] = train.loc[(train ['X6'] >= 265), 'X6'].mean()
# train.loc[Y1_idx, 'Y1'] = train.loc[(train['Y1'] != 0) & (train['Y1'] <= 70000), 'Y1'].mean()
# train.loc[Y2_idx, 'Y2'] = train.loc[(train['Y2'] != 0) & (train['Y2'] <= 70000), 'Y2'].mean()

def remove_outliers_with_k(column, k=1.5):
    Q1 = column.quantile(0.25)
    Q3 = column.quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - (k * IQR)
    upper_bound = Q3 + (k * IQR)
    return (column >= lower_bound) & (column <= upper_bound)

train = train.reset_index(drop=True)

In [10]:
mttrain = train[['X2','X3','X8','Y1']]

mtdrop = mttrain[mttrain['Y1'] == 0].index
mttrain.drop(mtdrop, inplace = True)

mttest = test[['X2','X3','X8']]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mttrain.drop(mtdrop, inplace = True)


In [11]:
scaler = MinMaxScaler()

mtscale = mttrain.copy()
mtscale[['X2','X3','Y1']] = scaler.fit_transform(mttrain[['X2','X3','Y1']])

In [12]:
X2 = mtscale[['X2','Y1']].values
dbscan = DBSCAN(eps=0.01, min_samples=10).fit(X2)
mtscale['clusterX2'] = dbscan.fit_predict(mtscale[['X2','Y1']])
labels = dbscan.labels_
print(pd.Series(labels).value_counts())

In [None]:
idx_X2 = mtscale[mtscale['clusterX2'] == -1].index

In [None]:
X3 = mtscale[['X3','Y1']].values
dbscan = DBSCAN(eps=0.01, min_samples=10).fit(X3)
mtscale['clusterX3'] = dbscan.fit_predict(mtscale[['X3','Y1']])
labels = dbscan.labels_
print(pd.Series(labels).value_counts())

In [None]:
idx_X3 = mtscale[mtscale['clusterX3'] == -1].index

In [None]:
idx = idx_X2.append(idx_X3)
idx = set(idx)
mttrain.drop(idx , inplace=True)

In [None]:
cttrain = train[['X3','X4', 'X5_mctime','X8_manctime','Y2']]

ctdrop = cttrain[cttrain['Y2'] == 0].index
cttrain.drop(mtdrop, inplace = True)

cttest = test[['X3','X4', 'X5_mctime','X8_manctime']]

In [None]:
scaler = MinMaxScaler()

ctscale = cttrain.copy()
ctscale[['X3','Y2']] = scaler.fit_transform(cttrain[['X3','Y2']])

In [None]:
X= ctscale[['X3','Y2']].values
dbscan = DBSCAN(eps=0.01, min_samples=15).fit(X)
ctscale['cluster'] = dbscan.fit_predict(ctscale[['X3','Y2']])
labels = dbscan.labels_
print(pd.Series(labels).value_counts())

In [None]:
idx = ctscale[ctscale['cluster'] == -1].index
cttrain.drop(idx , inplace=True)

In [None]:
# scaler = MinMaxScaler()

# columns_to_scale = ['X2', 'X3', 'X4', 'X6', 'X2/X3', 'SPT', 'SPTCC']

# train[columns_to_scale] = scaler.fit_transform(train[columns_to_scale])
# test[columns_to_scale] = scaler.transform(test[columns_to_scale])

In [None]:
# columns_to_dbscan = ['X2', 'X3', 'X4', 'X6', 'X2/X3', 'X5_numeric', 'Estimated_Rotation_Speed']

# dbscan = DBSCAN(eps=5, min_samples=5)
# train['cluster'] = dbscan.fit_predict(train[columns_to_dbscan])

# # 클러스터가 -1인 것은 이상치로 간주하고 제거
# train = train[train['cluster'] != -1].drop(columns=['cluster'])

# train

In [None]:
# train

In [None]:
# exclude_columns = ['cluster']

# num_columns = len(filtered_train.columns) - len(exclude_columns)
# num_rows = (num_columns - 1) // 4 + 1 

# fig, axes = plt.subplots(nrows=num_rows, ncols=4, figsize=(16, num_rows * 4))
# axes = axes.flatten()

# index = 0
# for i, column in enumerate(filtered_train.columns):
#     if column not in exclude_columns:
#         sns.histplot(data=filtered_train, x=column, kde=True, ax=axes[index])
#         axes[index].set_title(f'Histogram of {column}')
#         axes[index].set_xlabel('Values')
#         axes[index].set_ylabel('Frequency')
#         index += 1

# for i in range(index, len(axes)):
#     fig.delaxes(axes[i])

# plt.tight_layout()  
# plt.show()

In [None]:
train['Y1'] = np.log1p(train['Y1'])
train['Y2'] = np.log1p(train['Y2'])

In [None]:
train['Y1'].hist(bins = 100)

In [None]:
train['Y2'].hist(bins = 100)

In [None]:
train = train[(train['Y1'] > 3) & (train['Y1'] < 9)]
train['Y1'].hist(bins=100)

In [None]:
train = train[(train['Y2'] > 4) & (train['Y1'] < 10)]
train['Y2'].hist(bins=100)

## CatBoost

In [None]:
cat_y1 = CatBoostRegressor(loss_function='MAE',
                           cat_features=['X8'],  # Add categorical features index if any
                           verbose=50
                           )

X_train_reduced = train.drop(columns=['Y1', 'Y2']).reset_index(drop=True)
y_train = train['Y1'].reset_index(drop=True)

kf = KFold(n_splits=5, shuffle=True, random_state=42)

ensemble_predictions = []
scores = []

y_train = y_train.values

for train_idx, val_idx in tqdm(kf.split(X_train_reduced), total=5, desc="Processing folds"):
    X_t, X_val = X_train_reduced.iloc[train_idx], X_train_reduced.iloc[val_idx]
    y_t, y_val = y_train[train_idx], y_train[val_idx]

    cat_y1.fit(X_t, y_t, eval_set=(X_val, y_val))

    val_pred = cat_y1.predict(X_val)

    scores.append(mean_absolute_error(y_val, val_pred))

    test = test[X_train_reduced.columns]

    cat_pred = cat_y1.predict(test)
    cat_pred = np.where(cat_pred < 0, 0, cat_pred)

    ensemble_predictions.append(cat_pred)

final_predictions_y1 = np.mean(ensemble_predictions, axis=0)

print("Validation : MAE scores for each fold:", scores)
print("Validation : MAE:", np.mean(scores))

In [None]:
cat_y2 = CatBoostRegressor(loss_function='MAE',
                           cat_features=['X8'],  # Add categorical features index if any
                           verbose=50
                           )

X_train_reduced = train.drop(columns=['Y1', 'Y2']).reset_index(drop=True)
y_train = train['Y2'].reset_index(drop=True)

kf = KFold(n_splits=5, shuffle=True, random_state=42)

ensemble_predictions = []
scores = []

y_train = y_train.values

for train_idx, val_idx in tqdm(kf.split(X_train_reduced), total=5, desc="Processing folds"):
    X_t, X_val = X_train_reduced.iloc[train_idx], X_train_reduced.iloc[val_idx]
    y_t, y_val = y_train[train_idx], y_train[val_idx]

    cat_y2.fit(X_t, y_t, eval_set=(X_val, y_val))

    val_pred = cat_y2.predict(X_val)

    scores.append(mean_absolute_error(y_val, val_pred))

    cat_pred = cat_y2.predict(test)
    cat_pred = np.where(cat_pred < 0, 0, cat_pred)

    ensemble_predictions.append(cat_pred)

final_predictions_y2 = np.mean(ensemble_predictions, axis=0)

print("Validation : MAE scores for each fold:", scores)
print("Validation : MAE:", np.mean(scores))

## MLJAR

In [None]:
!pip install mljar-supervised

In [None]:
categorical_features = ['X8']
encoders = {}

for feature in tqdm(categorical_features, desc="Encoding features"):
    le = LabelEncoder()
    train[feature] = le.fit_transform(train[feature].astype(str))
    le_classes_set = set(le.classes_)
    test[feature] = test[feature].map(lambda s: '-1' if s not in le_classes_set else s)
    le_classes = le.classes_.tolist()
    bisect.insort_left(le_classes, '-1')
    le.classes_ = np.array(le_classes)
    test[feature] = le.transform(test[feature].astype(str))
    encoders[feature] = le

In [None]:
from supervised.automl import AutoML

train_x1 = train.drop(['Y1', 'Y2'], axis=1)
train_x2 = train.drop(['Y1', 'Y2'], axis=1)

train_y1 = train['Y1']
train_y2 = train['Y2']


Cross_validation = {
    "validation_type": "kfold",
    "k_folds": 10,
    "shuffle": True,
    "random_seed": 112
}

In [None]:
automl_y1 = AutoML(mode="Compete", algorithms = ['Decision Tree', 'LightGBM', 'Xgboost', 'CatBoost'],
                n_jobs = -1, eval_metric='mae', validation_strategy=Cross_validation, ml_task = "regression",
                   total_time_limit = 1800 * 2 * 5)
automl_y1.fit(train_x1, train_y1)

In [None]:
automl_y2 = AutoML(mode="Compete", algorithms = ['Decision Tree', 'LightGBM', 'Xgboost', 'CatBoost'],
                n_jobs = -1, eval_metric='mae', validation_strategy=Cross_validation, ml_task = "regression",
                   total_time_limit=1800 * 2 * 5)
automl_y2.fit(train_x2, train_y2)

In [None]:
pred_y1 = automl_y1.predict_all(test)
pred_y2 = automl_y2.predict_all(test)

In [None]:
final_predictions = (np.expm1(pred_y1)) + (np.expm1(pred_y2))

## 제출

In [None]:
submit = pd.read_csv(r'C:\Users\dlwks\OneDrive\바탕 화면\VSCode\HD_CNC\sample_submission.csv')

submit.head()

In [None]:
final_predictions = np.expm1(final_predictions_y1) + np.expm1(final_predictions_y2)


submit.iloc[:, 1] = final_predictions

submit

In [None]:
submit.to_csv('C:\\Users\\dlwks\\OneDrive\\바탕 화면\\VSCode\\HD_CNC\\1125-4.csv', index=False)