## Env

In [None]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

from scipy import stats
from sklearn.preprocessing import StandardScaler 
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, KFold
from sklearn.ensemble import RandomForestRegressor

In [None]:
matplotlib.rcParams['font.family'] = 'Malgun Gothic'
matplotlib.rcParams['axes.unicode_minus'] = False

In [None]:
train = pd.read_csv(r'C:\Users\dlwks\OneDrive\바탕 화면\VSCode\HD_CNC\train.csv')
test = pd.read_csv(r'C:\Users\dlwks\OneDrive\바탕 화면\VSCode\HD_CNC\test.csv')

display(train.head())
display(test.head())

## Preprocessing

In [None]:
train = train.rename(columns = {
    'X1' : '작업번호',
    'X2' : '마킹길이',
    'X3' : '절단길이',
    'X4' : '철판두께',
    'X5' : '철판재질',
    'X6' : '절단갯수',
    'X7' : '작업장',
    'X8' : '작업자',
    'Y1' : '마킹시간',
    'Y2' : '절단시간'
})

test = test.rename(columns = {
    'X1' : '작업번호',
    'X2' : '마킹길이',
    'X3' : '절단길이',
    'X4' : '철판두께',
    'X5' : '철판재질',
    'X6' : '절단갯수',
    'X7' : '작업장',
    'X8' : '작업자'
})

In [None]:
train['철판재질'] = pd.factorize(train['철판재질'])[0]
train['작업자'] = pd.factorize(train['작업자'])[0]

test['철판재질'] = pd.factorize(test['철판재질'])[0]
test['작업자'] = pd.factorize(test['작업자'])[0]

In [None]:
display(train.info())
display(test.info())

In [None]:
display(train)

test.drop(columns = ['Id'], inplace = True)
display(test)

In [None]:
display(train.describe())
display(test.describe())

## EDA

In [None]:
sns.heatmap(train.corr(), vmax = 1, vmin = -1, annot = True)

In [None]:
# sns.pairplot(train)

In [None]:
features = train.columns
numeric_features = train.select_dtypes('number').columns
categorical_features = train.select_dtypes('object').columns
for i in numeric_features:
  plt.figure(figsize = (20, 20))
  plt.tight_layout()
  f, (ax_box, ax_hist) = plt.subplots(2, sharex = True)
  sns.boxplot(train[i], ax = ax_box, linewidth = 0.5)
  sns.histplot(train[i], ax = ax_hist, bins = 10, kde = True)

In [None]:
display(sns.displot(train['마킹길이']))
display(sns.displot(train['절단길이']))
display(sns.displot(train['절단갯수']))
display(sns.displot(train['작업장']))
display(sns.displot(train['절단시간']))

In [None]:
train['절단길이'] = np.log1p(train['절단길이'])
display(train['절단길이'].skew())

train['마킹길이'] = pd.Series(stats.boxcox(train['마킹길이'])[0])
display(train['마킹길이'].skew())

In [None]:
plt.hist(train['절단길이'])

In [None]:
plt.hist(train['마킹길이'])

In [None]:
f = sns.scatterplot(x = '절단길이', y = '절단시간', hue = '마킹길이', data = train)

In [None]:
f = sns.scatterplot(x = '절단길이', y = '마킹시간', hue = '마킹길이', data = train)

In [None]:
def remove_outliers(df, col_name):
    uppper_bound = df[col_name].mean() + 5 * df[col_name].std()
    lower_bound = df[col_name].mean() - 5 * df[col_name].std()

    return df[(df[col_name] < uppper_bound) & (df[col_name] > lower_bound)]

columns_to_process = ['마킹길이', '절단길이', '철판두께', '철판재질', '절단갯수', '작업장', '작업자', '마킹시간', '절단시간']

for col in columns_to_process:
    train = remove_outliers(train, col)

In [None]:
f = sns.scatterplot(x = '절단길이', y = '절단시간', hue = '마킹길이', data = train)

In [None]:
f = sns.scatterplot(x = '절단길이', y = '마킹시간', hue = '마킹길이', data = train)

In [None]:
common_columns = ['작업번호', '마킹길이', '절단길이', '철판두께', '철판재질', '절단갯수', '작업장', '작업자']

train_common = train[common_columns]
test_common = test[common_columns]

In [None]:
scaler = StandardScaler()

train_scaled = scaler.fit_transform(train_common)
test_scaled = scaler.transform(test_common)

In [33]:
# X = train_scaled[['마킹길이', '절단길이', '철판두께', '철판재질', '절단갯수', '작업장', '작업자', '마킹시간', '절단시간']]
# y = train_scaled[['작업자', '마킹시간', '절단시간']]

X = train_scaled[:, [1, 2, 3, 4, 5, 6, 7]]
y = train_scaled[:, 7]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [34]:
k = 5

kf = KFold(n_splits = k, shuffle = True, random_state = 42)

In [35]:
model = RandomForestRegressor()

In [36]:
mse_scores = []

for train_idx, val_idx, in kf.split(X_train):
    X_train_fold, X_val_fold = X_train[train_idx], X_train[val_idx]
    y_train_fold, y_val_fold = y_train[train_idx], y_train[val_idx]

    model.fit(X_train_fold, y_train_fold)

    y_pred = model.predict(X_val_fold)

    mse = mean_squared_error(y_val_fold, y_pred)
    mse_scores.append(mse)

print(f'평균 MSE : {np.mean(mse_scores)}')

평균 MSE : 2.404492649017335e-28


In [37]:
model.fit(X_train, y_train)

In [38]:
pred = model.predict(X_test)

In [39]:
mse = mean_squared_error(y_test, pred)
print(f'MSE : {mse}')

MSE : 3.932516313351932e-28
