In [None]:
# --- Imports: data libraries first, then plotting libraries ---
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Display every column of a DataFrame instead of truncating wide frames.
pd.set_option("display.max_columns", None)

# Figure text uses the "Malgun Gothic" font family (presumably so Korean
# labels render correctly — confirm the font is installed on the target
# machine). Disabling "axes.unicode_minus" makes matplotlib draw negative
# tick labels with an ASCII hyphen instead of the Unicode minus glyph.
plt.rcParams.update(
    {
        "font.family": "Malgun Gothic",
        "axes.unicode_minus": False,
    }
)

# Load the raw dataset; the path is relative to the notebook's working directory.
df = pd.read_csv("./data/dataset.csv")

# EDA


In [None]:
# Annotated correlation heatmap over every column except the first and the
# last one (both are excluded by the positional slice 1:-1).
plt.figure(figsize=(15, 20))
corr_matrix = df.iloc[:, 1:-1].corr()
sns.heatmap(data=corr_matrix, annot=True)

In [None]:
def remove_outliers(df, factor=1.5):
    """Drop rows that contain an IQR outlier in any float column.

    For every float-typed column the fences ``Q1 - factor * IQR`` and
    ``Q3 + factor * IQR`` are computed from the *input* frame, and a row is
    kept only if all of its float values lie inside their column's fences
    (bounds inclusive).

    All fences are derived from the same, unfiltered data, so the result does
    not depend on the order of the columns. (Filtering column by column and
    recomputing quartiles on the already-trimmed frame would make later
    columns' fences depend on which rows earlier columns removed.)

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. It is not modified.
    factor : float, default 1.5
        Multiplier applied to the IQR when building the fences.

    Returns
    -------
    pandas.DataFrame
        A new frame with the surviving rows, original index labels and all
        original columns. Non-float columns are never used for filtering.
        Rows with a missing value in any float column are dropped, because
        comparisons against NaN evaluate to False.
    """
    float_cols = df.select_dtypes(include="float").columns
    if len(float_cols) == 0:
        # Nothing to filter on; still hand back a copy so callers never
        # receive the original object.
        return df.copy()

    values = df[float_cols]
    q1 = values.quantile(0.25)  # first quartile of each float column
    q3 = values.quantile(0.75)  # third quartile of each float column
    iqr = q3 - q1
    lower_bound = q1 - factor * iqr
    upper_bound = q3 + factor * iqr

    # Compare each column with its own fences (the Series of bounds is
    # aligned on column labels), then keep rows that pass in every column.
    within_bounds = values.ge(lower_bound, axis="columns") & values.le(
        upper_bound, axis="columns"
    )
    keep_rows = within_bounds.all(axis=1)
    return df.loc[keep_rows].copy()


# Outlier-filtered frame built from the last five columns of df.
df_clean = df.iloc[:, -5:].pipe(remove_outliers)

In [None]:
# Pairwise feature plots on a 10% random sample of the cleaned data.
# The fixed seed makes the sampled rows, and therefore the figure, identical
# on every Restart & Run All.
tmp = df_clean.sample(frac=0.1, random_state=42)

# Every column except the last one is plotted against every other.
# NOTE(review): presumably the last column is "passorfail" (used as hue
# below) — confirm against the dataset's column order.
x_vars = tmp.columns[:-1]
y_vars = tmp.columns[:-1]
n_rows = len(x_vars)
n_cols = len(y_vars)

# One 5x5-inch panel per (x, y) pair; squeeze=False keeps `axes` a 2-D array
# even when only a single variable is plotted.
fig, axes = plt.subplots(
    n_rows, n_cols, figsize=(5 * n_cols, 5 * n_rows), squeeze=False
)

# Row i fixes the x variable, column j fixes the y variable.
for i, x in enumerate(x_vars):
    for j, y in enumerate(y_vars):
        ax = axes[i, j]

        if x == y:
            # Diagonal panel: a variable plotted against itself is a straight
            # line and its bivariate KDE is singular (perfect covariance), so
            # show the per-class distribution of the variable instead.
            sns.histplot(
                data=tmp,
                x=x,
                hue="passorfail",
                palette="deep",
                ax=ax,
            )
            ax.set_title(f"{x}")
            continue

        # Scatter plot coloured by the "passorfail" column
        sns.scatterplot(
            data=tmp,
            x=x,
            y=y,
            hue="passorfail",
            ax=ax,
            palette="deep",
            alpha=0.6,
            edgecolor="w",
            s=50,
        )
        # Overlay density contours of all sampled points (not split by class).
        # Adjust levels and linewidths as needed.
        sns.kdeplot(
            data=tmp,
            x=x,
            y=y,
            ax=ax,
            color="red",
            levels=5,
            linewidths=1.5,
            alpha=0.7,
        )

        ax.set_title(f"{x} vs {y}")

# Adjust layout to prevent overlap
plt.tight_layout()
plt.show()