## Env

In [None]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as ms
import xgboost as xgb
import bisect

from scipy import stats
from tqdm import tqdm
from scipy.stats import skew, kurtosis
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split, KFold
from sklearn.ensemble import RandomForestRegressor

| 항목 | 설명     |
|-----|----------|
| X1  | 작업번호 |
| X2  | 마킹길이 |
| X3  | 절단길이 |
| X4  | 철판두께 |
| X5  | 철판재질 |
| X6  | 절단갯수 |
| X7  | 작업장   |
| X8  | 작업자   |
| Y1  | 마킹시간 |
| Y2  | 절단시간 |


In [None]:
# Global matplotlib settings: font family for all text, and render the minus
# sign on axes with the ASCII hyphen instead of the Unicode minus glyph.
matplotlib.rcParams.update({
    'font.family': 'Malgun Gothic',
    'axes.unicode_minus': False,
})

In [None]:
# Load the train / test splits from the local project folder and preview
# the first rows of each.
train = pd.read_csv(r'C:\Users\dlwks\OneDrive\바탕 화면\VSCode\HD_CNC\train.csv')
test = pd.read_csv(r'C:\Users\dlwks\OneDrive\바탕 화면\VSCode\HD_CNC\test.csv')

display(train.head(), test.head())

## Preprocessing & EDA

In [None]:
# Label-encode the categorical columns.
#
# Labels that occur only in `test` are collapsed into one sentinel class.
# The sentinel is added to the encoder's classes BEFORE either frame is
# transformed, so train and test share exactly the same label -> code mapping.
# (Encoding train first and inserting the sentinel afterwards shifts every
# test code that sorts after the sentinel.)
categorical_features = ['X5', 'X8']
encoders = {}
unknown_label = '-1'

for feature in tqdm(categorical_features, desc="Encoding features"):
    le = LabelEncoder()

    # Cast both frames to str up front so membership checks compare like with like.
    train_values = train[feature].astype(str)
    test_values = test[feature].astype(str)

    le.fit(train_values)
    known_labels = set(le.classes_)

    if unknown_label not in known_labels:
        le_classes = le.classes_.tolist()
        # Keep the class list sorted, as LabelEncoder.fit produces it.
        bisect.insort_left(le_classes, unknown_label)
        le.classes_ = np.array(le_classes, dtype=object)

    # Replace labels never seen in train with the sentinel, then encode.
    test_values = test_values.where(test_values.isin(known_labels), unknown_label)

    train[feature] = le.transform(train_values)
    test[feature] = le.transform(test_values)
    encoders[feature] = le

In [None]:
# DataFrame.info() prints its summary and returns None, so it is called
# directly; wrapping it in display() would only add a stray "None" to the output.
train.info()
test.info()

In [None]:
# Preview the first rows of the encoded training frame.
train.head()

In [None]:
display(train)

# Drop the identifier column from the test frame. `errors='ignore'` keeps the
# cell safe to re-run: a second execution no longer raises KeyError once 'Id'
# is already gone.
test = test.drop(columns=['Id'], errors='ignore')
display(test)

In [None]:
# Summary statistics for both frames, rendered one after the other.
display(train.describe(), test.describe())

In [None]:
# Visualise the missing-value pattern of the training frame with missingno.
ms.matrix(train)
plt.show()

In [None]:
# Annotated correlation heatmap of the training frame on a fixed [-1, 1] colour scale.
corr_matrix = train.corr()
plt.figure(figsize=(10, 10))
sns.heatmap(corr_matrix, vmin=-1, vmax=1, annot=True)

In [None]:
# Disabled: full pairwise scatter matrix of the training frame.
# sns.pairplot(train)

In [None]:
features = train.columns
numeric_features = train.select_dtypes('number').columns
categorical_features = train.select_dtypes('object').columns

# One figure per numeric column: a box plot stacked on a histogram, sharing the x axis.
for col in numeric_features:
    # Create the figure through plt.subplots only. A separate plt.figure() call
    # would leave an empty extra figure per column, and its figsize would never
    # reach the figure that is actually drawn on.
    fig, (ax_box, ax_hist) = plt.subplots(2, sharex=True, figsize=(15, 5))
    # Pass the values as `x` explicitly so both plots lie along the shared x axis.
    sns.boxplot(x=train[col], ax=ax_box, linewidth=0.5)
    sns.histplot(x=train[col], ax=ax_hist, bins=10, kde=True)
    fig.tight_layout()
    plt.show()

In [None]:
cols_numeric = train.describe().columns

print(f'numeric cols : {len(cols_numeric)}')
print(f'{cols_numeric}')

# Size the grid from the number of columns instead of hard-coding 4x3, so
# frames with more than 12 numeric columns no longer raise IndexError.
n_grid_cols = 3
n_grid_rows = max(1, int(np.ceil(len(cols_numeric) / n_grid_cols)))
fig, ax = plt.subplots(n_grid_rows, n_grid_cols, figsize=(15, 5), squeeze=False)

for i, panel in enumerate(ax.flat):
    if i < len(cols_numeric):
        sns.histplot(x=cols_numeric[i], data=train, bins=75, ax=panel)
        panel.set_title(cols_numeric[i])
    else:
        # Hide grid cells that have no column to show.
        panel.set_visible(False)

plt.suptitle('Numerical Variance distribtion\n\n')
plt.tight_layout()
plt.show()

In [None]:
# Rows are treated as outliers when ANY of these thresholds is exceeded.
is_outlier = (
    train['Y1'].ge(2500)
    | train['Y2'].ge(20000)
    | train['X2'].ge(500)
    | train['X4'].gt(40)
    | train['X6'].ge(265)
)
drop_indices = train[is_outlier].index

train = train.drop(drop_indices)

In [None]:
# Keep only rows whose two targets both fall in the half-open range (150, 67000].
y1_in_range = train['Y1'].gt(150) & train['Y1'].le(67000)
y2_in_range = train['Y2'].gt(150) & train['Y2'].le(67000)
train = train[y1_in_range & y2_in_range]

In [None]:
# Mean of every column per X7 group, drawn as a 3x3 grid of bar charts
# (one panel per column, filled row by row).
data = train.groupby('X7').mean().reset_index()

fig, axes = plt.subplots(3, 3, figsize=(15, 5))

panel_columns = ['X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X8', 'Y1', 'Y2']
for panel, column in zip(axes.flat, panel_columns):
    sns.barplot(x='X7', y=column, data=data, ax=panel)
    panel.set_title(column)

plt.tight_layout()
plt.show()

In [None]:
# Per-category mean of each target (Y1, Y2), computed on the training frame
# for the grouping columns X5, X7 and X8.
# NOTE(review): the means use every training row, so the K-fold validation
# further down sees features derived from its own validation targets — confirm
# this leakage is acceptable.
mmtime, mctime = (train.groupby('X5')[target].mean() for target in ('Y1', 'Y2'))
wmtime, wctime = (train.groupby('X7')[target].mean() for target in ('Y1', 'Y2'))
manmtime, manctime = (train.groupby('X8')[target].mean() for target in ('Y1', 'Y2'))

# new feature name -> (key column, lookup Series); insertion order fixes the column order
mean_lookups = {
    'X5_mmtime': ('X5', mmtime),
    'X5_mctime': ('X5', mctime),
    'X7_wmtime': ('X7', wmtime),
    'X7_wctime': ('X7', wctime),
    'X8_manmtime': ('X8', manmtime),
    'X8_manctime': ('X8', manctime),
}

# Categories missing from the lookup map to NaN.
for new_column, (key_column, lookup) in mean_lookups.items():
    train[new_column] = train[key_column].map(lookup)

for new_column, (key_column, lookup) in mean_lookups.items():
    test[new_column] = test[key_column].map(lookup)

In [None]:
# Derived ratio features, added identically to both frames:
#   Velo_* = length / X4, Feed_* = Velo_* * X6
for frame in (train, test):
    frame['Velo_cutting'] = frame['X3'] / frame['X4']
    frame['Velo_marking'] = frame['X2'] / frame['X4']
    frame['Feed_cutting'] = frame['Velo_cutting'] * frame['X6']
    frame['Feed_marking'] = frame['Velo_marking'] * frame['X6']

In [None]:
# X2 against X3, coloured by X6 treated as a categorical value (legend suppressed).
sns.scatterplot(
    data=train,
    x='X2',
    y='X3',
    hue=train['X6'].astype('category'),
    legend=None,
)

In [None]:
# Replace the raw X6 value with an ordinal bucket (0-5) in both frames.
bins = [0, 15, 40, 70, 95, 110, float('inf')]
labels = [0, 1, 2, 3, 4, 5]

for frame in (train, test):
    frame['X6_category'] = pd.cut(frame['X6'], bins=bins, labels=labels, include_lowest=True)
    frame.drop(columns=['X6'], inplace=True)

# Same X2/X3 scatter as before, now coloured by the bucketed X6.
sns.scatterplot(x=train['X2'], y=train['X3'], hue=train['X6_category'].astype('category'))

In [None]:
# Three side-by-side scatter plots relating the length features to the targets.
plt.figure(figsize=(10, 5))

# (x column, y column, hue column) for each panel, left to right
scatter_panels = [
    ('X3', 'Y2', 'X2'),
    ('X3', 'Y1', 'X2'),
    ('X2', 'Y1', 'X3'),
]
for position, (x_col, y_col, hue_col) in enumerate(scatter_panels, start=1):
    plt.subplot(1, 3, position)
    sns.scatterplot(x=x_col, y=y_col, hue=hue_col, data=train)

plt.tight_layout()
plt.show()

In [None]:
# Disabled experiment: drop rows lying more than 3 standard deviations from
# the column mean, applied in turn to X2, X3 and X4.
# def remove_outliers(df, col_name):
#     uppper_bound = df[col_name].mean() + 3 * df[col_name].std()
#     lower_bound = df[col_name].mean() - 3 * df[col_name].std()

#     print(f"평균: {df[col_name].mean()}, 표준편차: {df[col_name].std()}")

#     return df[(df[col_name] < uppper_bound) & (df[col_name] > lower_bound)]

# columns_to_process = ['X2', 'X3', 'X4']

# for col in columns_to_process:
#     train = remove_outliers(train, col)

In [None]:
# Re-draw the three length-vs-target scatter panels on the current training frame.
plt.figure(figsize=(10, 5))

panel_specs = (
    dict(x='X3', y='Y2', hue='X2'),
    dict(x='X3', y='Y1', hue='X2'),
    dict(x='X2', y='Y1', hue='X3'),
)
for slot, spec in enumerate(panel_specs):
    plt.subplot(1, 3, slot + 1)
    sns.scatterplot(data=train, **spec)

plt.tight_layout()
plt.show()

In [None]:
# Compare three views of the Y1 distribution: raw, log1p and Box-Cox.
plt.figure(figsize=(12, 6))

raw_axis = plt.subplot(1, 3, 1)
train['Y1'].hist(bins=100, ax=raw_axis)
raw_axis.set_title('Original Data')

log_axis = plt.subplot(1, 3, 2)
np.log1p(train['Y1']).hist(bins=100, ax=log_axis)
log_axis.set_title('Log Transform Data')

boxcox_axis = plt.subplot(1, 3, 3)
# The target is shifted by 1 before Box-Cox; the fitted lambda is discarded.
transformed_data, _ = stats.boxcox(train['Y1'] + 1)
boxcox_axis.hist(transformed_data, bins=100)
boxcox_axis.set_title('Box-Cox Transform Data')

plt.tight_layout()
plt.show()

In [None]:
# Same three-panel comparison for Y2: raw, log1p and Box-Cox histograms.
y2_values = train['Y2']
# Box-Cox is applied to Y2 + 1; only the transformed values are kept.
transformed_data, _ = stats.boxcox(y2_values + 1)

plt.figure(figsize=(12, 6))

plt.subplot(1, 3, 1)
y2_values.hist(bins=100)
plt.title('Original Data')

plt.subplot(1, 3, 2)
np.log1p(y2_values).hist(bins=100)
plt.title('Log Transform Data')

plt.subplot(1, 3, 3)
plt.hist(transformed_data, bins=100)
plt.title('Box-Cox Transform Data')

plt.tight_layout()
plt.show()

In [None]:
# Model input features. `.copy()` makes `train_data1` an independent frame,
# so the in-place column assignment done later by the scaler does not write
# into a slice of `train` (SettingWithCopyWarning / ambiguous write-through).
train_data1 = train[[
    'X1', 'X2', 'X3', 'X4', 'X5', 'X6_category', 'X7', 'X8',
    'X5_mmtime', 'X5_mctime', 'X7_wmtime', 'X7_wctime',
    'X8_manmtime', 'X8_manctime',
    'Velo_cutting', 'Velo_marking', 'Feed_cutting', 'Feed_marking',
]].copy()

# Both targets are modelled on the log1p scale.
train_y1 = np.log1p(train['Y1'])
train_y2 = np.log1p(train['Y2'])

In [None]:
# Correlation heatmap again, now including the engineered feature columns.
feature_corr = train.corr()
plt.figure(figsize=(10, 10))
sns.heatmap(feature_corr, annot=True, vmin=-1, vmax=1)

In [None]:
def train_and_evaluate(model, model_name, X_train, y_train):
    """Fit ``model`` and plot its feature importances as a horizontal bar chart.

    Parameters
    ----------
    model : object
        Estimator exposing ``fit(X, y)`` and, once fitted, ``feature_importances_``.
    model_name : str
        Label used in the progress message and in the plot title.
    X_train : DataFrame
        Training features; ``columns`` supplies the bar labels.
    y_train : array-like
        Training target(s), passed straight to ``model.fit``.

    Returns
    -------
    tuple
        ``(model, feature_importances)`` — the fitted model and its importance array.
    """
    print(f'Model Tune for {model_name}.')
    model.fit(X_train, y_train)

    importances = model.feature_importances_
    # Ascending order, so the most important feature ends up at the top of the chart.
    order = np.argsort(importances)
    bar_positions = range(X_train.shape[1])

    plt.figure(figsize=(10, len(X_train.columns)))
    plt.title(f"Feature Importances ({model_name})")
    plt.barh(bar_positions, importances[order], align='center')
    plt.yticks(bar_positions, X_train.columns[order])
    plt.xlabel('Importance')
    plt.show()

    return model, importances

# Fit one default XGBRegressor on both raw targets at once, using every
# column except the bucketed X6 and the targets, to inspect feature importances.
Y_train = train[['Y1', 'Y2']]
X_train = train.drop(columns=['X6_category', 'Y1', 'Y2'])

# Model Tune for XGB
xgb_model, xgb_feature_importances = train_and_evaluate(
    xgb.XGBRegressor(),
    'XGB',
    X_train,
    Y_train,
)

In [None]:
# Disabled experiment: drop features whose XGB importance is below a threshold
# from both the training matrix and the test frame.
# threshold = 0.05
# low_importance_features = X_train.columns[xgb_feature_importances < threshold]

# train_data1 = train_data1.drop(columns = low_importance_features)
# test = test.drop(columns = low_importance_features)

In [None]:
# Standardise every model feature; the scaler is fitted on the training
# matrix only and then applied unchanged to the test frame.
numeric_cols = [
    'X1', 'X2', 'X3', 'X4', 'X5', 'X6_category', 'X7', 'X8',
    'X5_mmtime', 'X5_mctime', 'X7_wmtime', 'X7_wctime',
    'X8_manmtime', 'X8_manctime',
    'Velo_cutting', 'Velo_marking', 'Feed_cutting', 'Feed_marking',
]

# Removed: 'X1', 'X2', 'X7', 'X5_mmtime', 'X5_mctime', 'X8_manmtime', 'X8_manctime'
# NOTE(review): translated from the original note; all of these are still in the list above — confirm intent.

scaler = StandardScaler()
scaler.fit(train_data1[numeric_cols])

train_data1[numeric_cols] = scaler.transform(train_data1[numeric_cols])
test[numeric_cols] = scaler.transform(test[numeric_cols])

In [None]:
# 5-fold cross-validated XGBoost regressor for the log1p-transformed Y1 target.
# The model is refitted on every fold, each fit also predicts on `test`, and
# the per-fold test predictions are averaged.
xgb_y1 = xgb.XGBRegressor(eval_metric='mae',
                          enable_categorical=True
                          )

X_train_reduced = train_data1.reset_index(drop=True)
y_train = train_y1.reset_index(drop=True).values

# Restrict and reorder the test columns to match the training matrix. This is
# loop-invariant, so it is done once here rather than on every fold.
test = test[train_data1.columns]

kf = KFold(n_splits=5, shuffle=True, random_state=42)

ensemble_predictions = []
scores = []

for train_idx, val_idx in tqdm(kf.split(X_train_reduced), total=kf.get_n_splits(), desc="Processing folds"):
    X_t, X_val = X_train_reduced.iloc[train_idx], X_train_reduced.iloc[val_idx]
    y_t, y_val = y_train[train_idx], y_train[val_idx]

    xgb_y1.fit(X_t, y_t, eval_set=[(X_val, y_val)], verbose=50)

    val_pred = xgb_y1.predict(X_val)

    # The targets are log1p(Y1), so this MAE is measured on the log scale.
    scores.append(mean_absolute_error(y_val, val_pred))

    xgb_pred = xgb_y1.predict(test)
    # Clamp negative log-scale predictions to 0.
    xgb_pred = np.where(xgb_pred < 0, 0, xgb_pred)

    ensemble_predictions.append(xgb_pred)

final_predictions_y1 = np.mean(ensemble_predictions, axis=0)

print("Validation : MAE scores for each fold:", scores)
print("Validation : MAE:", np.mean(scores))

In [None]:
# 5-fold cross-validated XGBoost regressor for the log1p-transformed Y2 target.
# The model is refitted on every fold, each fit also predicts on the test
# features, and the per-fold test predictions are averaged.
xgb_y2 = xgb.XGBRegressor(eval_metric='mae',
                          enable_categorical=True
                          )

X_train_reduced = train_data1.reset_index(drop=True)
y_train = train_y2.reset_index(drop=True).values

# Select the test columns in training order here, so this cell does not rely
# on another cell having already reordered `test`.
test_features = test[train_data1.columns]

kf = KFold(n_splits=5, shuffle=True, random_state=42)

ensemble_predictions = []
scores = []

for train_idx, val_idx in tqdm(kf.split(X_train_reduced), total=kf.get_n_splits(), desc="Processing folds"):
    X_t, X_val = X_train_reduced.iloc[train_idx], X_train_reduced.iloc[val_idx]
    y_t, y_val = y_train[train_idx], y_train[val_idx]

    xgb_y2.fit(X_t, y_t, eval_set=[(X_val, y_val)], verbose=50)

    val_pred = xgb_y2.predict(X_val)

    # The targets are log1p(Y2), so this MAE is measured on the log scale.
    scores.append(mean_absolute_error(y_val, val_pred))

    xgb_pred = xgb_y2.predict(test_features)
    # Clamp negative log-scale predictions to 0.
    xgb_pred = np.where(xgb_pred < 0, 0, xgb_pred)

    ensemble_predictions.append(xgb_pred)

final_predictions_y2 = np.mean(ensemble_predictions, axis=0)

print("Validation : MAE scores for each fold:", scores)
print("Validation : MAE:", np.mean(scores))

In [None]:
# Load the sample submission file; its second column is overwritten with the
# predictions further down.
submit = pd.read_csv(r'C:\Users\dlwks\OneDrive\바탕 화면\VSCode\HD_CNC\sample_submission.csv')

submit.head()

In [None]:
# Both models were trained on log1p(target); np.expm1 is the exact inverse and
# is more accurate than exp(x) - 1 for values close to zero. The submitted
# value is the sum of the two back-transformed predictions (Y1 + Y2).
final_predictions = np.expm1(final_predictions_y1) + np.expm1(final_predictions_y2)


# Write the predictions into the second column of the submission template.
submit.iloc[:, 1] = final_predictions

submit.to_csv('C:\\Users\\dlwks\\OneDrive\\바탕 화면\\VSCode\\HD_CNC\\1006_8.csv', index=False)

In [None]:
# Display the final submission frame.
submit