In [None]:
# !pip install statsmodels xgboost lightgbm


In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant
from scipy.stats import skew, kurtosis
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score,mean_absolute_error
import xgboost as xgb
import lightgbm as lgb
import numpy as np
import warnings
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.linear_model import Lasso
# Suppress every Python warning for the remainder of the session.
warnings.filterwarnings('ignore')
# Use the 'Malgun Gothic' font for plot text (presumably so the Korean labels render;
# assumes the font is installed — TODO confirm).
plt.rc('font', family='Malgun Gothic')
# Do not use the Unicode minus glyph on axes.
plt.rcParams['axes.unicode_minus'] = False

# Load the dataset; x holds every column except the last, y holds the last column.
ad_df = pd.read_csv('광고.csv')
x= ad_df.iloc[:,:-1]
y=ad_df.iloc[:, -1]
# Print the frame summary and the number of missing values per column.
ad_df.info()
print('\n결측치\n',ad_df.isna().sum())
# Last expression of the cell: displayed as the cell output.
ad_df.columns


In [None]:
import pandas as pd

# Name of the target column.
target = 'Product_Sold'

# Feature columns whose correlation with the target is reported.
features = ['TV', 'Billboards', 'Google_Ads', 'Social_Media',
            'Influencer_Marketing', 'Affiliate_Marketing']

# Print the correlation between each feature and the target.
# The per-feature value is stored in `corr_value` rather than `corr`: a later cell
# binds `corr` to the feature correlation matrix and calls `corr.where(...)` on it,
# so reusing that name here would replace the DataFrame with a float whenever this
# cell is re-run.
for col in features:
    corr_value = ad_df[col].corr(ad_df[target])
    print(f"{col} ↔ {target} 상관계수: {corr_value:.4f}")


In [None]:
import matplotlib.pyplot as plt

# Correlation of every feature with the target, in the order of `features`.
correlations = [ad_df[col].corr(ad_df[target]) for col in features]

# One bar per feature, with a dashed reference line at zero.
plt.figure(figsize=(10, 5))
bars = plt.bar(features, correlations, color='skyblue')
plt.axhline(0, color='gray', linestyle='--')
plt.title(f'Features vs {target} 상관계수', fontsize=15)
plt.ylabel('상관계수')
plt.xticks(rotation=15)

# Write each bar's value at the top of the bar.
for bar in bars:
    yval = bar.get_height()
    plt.text(bar.get_x() + bar.get_width()/2, yval, f'{yval:.2f}', 
             ha='center', va='bottom', fontsize=11)

plt.tight_layout()
plt.show()


In [None]:

# Split the frame into the input features (X) and the target (y).
# NOTE(review): uppercase X is not referenced again in the visible notebook; the
# train/test split further down uses the lowercase x defined when the data is loaded.
X = ad_df.drop('Product_Sold', axis=1)
y = ad_df['Product_Sold']



In [None]:
# Work on a copy so the original x is left unchanged, and attach the target values
# as an extra "target" column.
x_for_plot = x.copy()
x_for_plot["target"] = y.values

# Draw the pairplot from the copy: one regression scatter per feature against "target".
sns.pairplot(x_for_plot, x_vars=x.columns, y_vars="target", height=4, aspect=0.8, kind='reg')
plt.suptitle("Feature vs Target Scatter Plots (Original Features)", y=1.02)
plt.show()


In [None]:

# Compute the correlation matrix of the features (every column except the last)
# and show it as an annotated heatmap.
feature_df = ad_df.iloc[:,:-1]

# `corr` is reused by the next cell to extract highly correlated pairs.
corr = feature_df.corr()
plt.figure(figsize=(10, 6))
sns.heatmap(corr, annot=True, fmt=".2f", cmap="coolwarm")
plt.title("Feature Correlation Heatmap")
plt.show()




In [None]:
# Keep only the entries whose absolute correlation is strictly greater than 0.8.
# The `< 1.0` condition removes the self-correlations on the diagonal (it would also
# remove any pair whose |r| is exactly 1). Rows and columns left entirely NaN are dropped.
high_corr = corr.where((corr.abs() > 0.8) & (corr.abs() < 1.0)).dropna(how='all').dropna(axis=1, how='all')
print(high_corr)





In [None]:
# # feature , target 설정
# features = ad_df.columns[:-1]
# target = ad_df.columns[-1]

# plt.figure(figsize=(15, 10))
# for i, feature in enumerate(features):
#     plt.subplot(2, 3, i + 1)  # 2행 3열 
#     sns.scatterplot(data=ad_df, x=feature, y=target)
#     plt.title(f"{feature} vs {target}")

# plt.tight_layout()
# plt.show()





In [None]:
# Names of every column except the last one; printed for inspection.
x_num_col_name = ad_df.columns[:-1]
print(x_num_col_name)


In [None]:
# Take the names of every column except the last one (the target).
# (No data is loaded here; ad_df comes from the loading cell.)
x_num_col_name = ad_df.columns[:-1]


# Print the skewness and kurtosis of each of those columns, using scipy.stats
# `skew` and `kurtosis` with their default settings.
for col in x_num_col_name:
    s = skew(ad_df[col])
    k = kurtosis(ad_df[col])
    print(f"[{col}] - 왜도: {s:.2f}, 첨도: {k:.2f}")





In [None]:
import scipy.stats as stats
import matplotlib.pyplot as plt

# One normal Q-Q plot per feature column, each in its own figure.
for col in x_num_col_name:
    plt.figure()
    stats.probplot(ad_df[col], dist="norm", plot=plt)
    plt.title(f'QQ Plot of {col}')
    plt.show()

In [None]:
from sklearn.model_selection import learning_curve

def display_learning_curves(x, y, model, figname, cv=5, save_fig=True, shuffle=True,
                            random_state=42):
    """Plot the R² learning curve of ``model`` and return the curve as a DataFrame.

    Parameters
    ----------
    x, y
        Features and target handed to ``learning_curve``.
    model
        Estimator to evaluate.
    figname
        File name the figure is written to when ``save_fig`` is true.
    cv
        Cross-validation setting forwarded to ``learning_curve``.
    save_fig
        Whether to save the figure to ``figname``.
    shuffle
        Forwarded to ``learning_curve``.
    random_state
        Seed forwarded to ``learning_curve``. Without it, ``shuffle=True`` makes the
        curve differ from run to run; the default 42 is the seed the notebook already
        uses for its train/test split. Pass ``None`` for unseeded shuffling.

    Returns
    -------
    pandas.DataFrame
        Columns 'Training Size', 'Train R²', 'Validation R²' and 'Gap', where the two
        R² columns are the per-size means over the folds and 'Gap' is train minus
        validation.
    """
    # The scoring metric is fixed to R².
    scoring = 'r2'

    # Compute the learning curve; error_score='raise' makes a failing fit raise
    # instead of being recorded as a score.
    train_sizes, train_scores, val_scores = learning_curve(
        estimator=model,
        X=x,
        y=y,
        cv=cv,
        scoring=scoring,
        error_score='raise',
        shuffle=shuffle,
        random_state=random_state
    )

    # Average the fold scores for each training size.
    train_mean = train_scores.mean(axis=1)
    val_mean   = val_scores.mean(axis=1)

    # Draw both curves.
    plt.figure()
    plt.plot(train_sizes, train_mean, label='Train R²')
    plt.plot(train_sizes, val_mean, label='Validation R²')
    plt.xlabel('Training Size')
    plt.ylabel('R² Score')
    plt.title('Learning Curve')
    # Leave a small margin below the lowest point and just above a perfect score.
    plt.ylim(bottom=min(train_mean.min(), val_mean.min()) - 0.05, top=1.01)
    plt.legend()
    plt.grid(True)

    # Save before showing, so the written file contains the drawn figure.
    if save_fig:
        plt.savefig(figname)
        print(f"📁 Saved: {figname}")

    # Show the figure like the other plotting helpers in this notebook do, so that
    # figures do not pile up when this function is called in a loop.
    plt.show()

    # Build and return the summary table.
    result_df = pd.DataFrame({
        'Training Size': train_sizes,
        'Train R²': train_mean,
        'Validation R²': val_mean,
        'Gap': train_mean - val_mean
    })

    return result_df


#### default parameter & 회귀식


In [None]:

# ① Mapping from feature column name to the short name used in printed equations.
abbr = {
    'TV': 'tv',
    'Billboards': 'bb',
    'Google_Ads': 'ga',
    'Social_Media': 'sm',
    'Influencer_Marketing': 'im',
    'Affiliate_Marketing': 'am'
}

# ② 회귀식 생성 함수 (약어 적용)
def get_abbreviated_formula(title, model, abbr_dict):
    """Render a fitted linear model as a one-line equation string.

    The result has the form ``"<title> = <intercept> + <name> * <coef> + ..."``.
    The intercept is rounded to 4 decimals and every coefficient is formatted with
    4 decimals. Feature names are taken from ``model.feature_names_in_`` and replaced
    by their entry in ``abbr_dict`` when one exists; otherwise the full name is kept.
    """
    head = f"{title} = {round(model.intercept_, 4)} + "
    terms = (
        f"{abbr_dict.get(column, column)} * {weight:.4f}"
        for column, weight in zip(model.feature_names_in_, model.coef_)
    )
    return head + " + ".join(terms)

# ③ Hold out 20% of the rows as a test set, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# ④ Baseline models with default hyperparameters, keyed by display name.
models = {
    "Linear Regression": LinearRegression(),
    "Ridge": Ridge(),
    "Lasso": Lasso(),
    "Random Forest": RandomForestRegressor(random_state=42),
    "XGBoost": xgb.XGBRegressor(random_state=42),
    "LightGBM": lgb.LGBMRegressor(random_state=42)
}

# Test-set metrics per model name: {'MAE': ..., 'RMSE': ..., 'R2': ...}.
results = {}


# ⑤ Fit every model, score it on the test set and draw its learning curve.
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)

    results[name] = {'MAE': mae, 'RMSE': rmse, 'R2': r2}

    # Print the fitted equation for the linear models only.
    if name in ["Linear Regression", "Ridge", "Lasso"]:
        print(f"\n📌 {name} 회귀식:")
        # Label the equation with the name of the target series instead of the
        # previously hard-coded "price", which is not what y contains.
        print(get_abbreviated_formula(y_train.name, model, abbr))

    # Learning curve for every model, computed on the training split.
    display_learning_curves(X_train, y_train, model, figname=f"{name}-def-lc.png")



# ⑥ Collect the metrics in a table (one row per model) and print it sorted by RMSE.
results_df = pd.DataFrame(results).T
print("\n 모델 성능 비교 (RMSE 기준 정렬):")
print(results_df.sort_values(by="RMSE"))


In [None]:
import matplotlib.pyplot as plt
import numpy as np

def visualize_split_model_metrics_bf(results):
    """Show grouped MAE / RMSE / R2 bar charts, one figure per model family.

    ``results`` maps a model name to a dict with the keys 'MAE', 'RMSE' and 'R2'.
    Two figures are shown: one for the three linear models and one for the three
    tree-based models. All three metrics are drawn on the same y-axis.
    """
    bar_width = 0.25

    def annotate(bar_container):
        # Write each bar's height at the top of the bar.
        for rect in bar_container:
            top = rect.get_height()
            plt.text(rect.get_x() + rect.get_width() / 2, top,
                     f'{top:.4f}', ha='center', va='bottom', fontsize=12)

    def plot_metrics(models, title):
        # Gather the values before drawing anything.
        heights = {
            key: [results[m][key] for m in models]
            for key in ('MAE', 'RMSE', 'R2')
        }
        centers = np.arange(len(models))

        plt.figure(figsize=(10, 5))

        containers = [
            plt.bar(centers - bar_width, heights['MAE'], width=bar_width,
                    label='MAE', color='skyblue'),
            plt.bar(centers, heights['RMSE'], width=bar_width,
                    label='RMSE', color='salmon'),
            plt.bar(centers + bar_width, heights['R2'], width=bar_width,
                    label='R2 Score', color='mediumseagreen'),
        ]
        for container in containers:
            annotate(container)

        plt.xticks(centers, models, rotation=0, fontsize=15)
        plt.ylabel("값")
        plt.title(title)
        plt.legend()
        plt.tight_layout()

        plt.show()

    # Linear models first, then the tree-based ones.
    groups = (
        (['Linear Regression', 'Ridge', 'Lasso'], "선형 모델 평가 지표 비교"),
        (['Random Forest', 'XGBoost', 'LightGBM'], "비선형 모델 평가 지표 비교"),
    )
    for group, heading in groups:
        plot_metrics(group, heading)



In [None]:
# 산점도 시각화 함수 정의
def get_scatter_bf(model_name, y_test, y_pred):
    """Scatter predicted against actual values for a default-hyperparameter model.

    A red y = x reference line spanning the range of ``y_test`` is drawn on top of
    the scatter, and the figure is shown immediately.
    """
    plt.figure(figsize=(6, 6))
    plt.scatter(y_test, y_pred, color='purple', alpha=0.5, label='예측값')
    plt.xlabel('실제값')
    plt.ylabel('예측값')
    plt.title(f'예측값 vs 실제값 ({model_name}):Default Hyperparameter')

    # Reference line y = x between the smallest and largest actual value.
    lowest, highest = min(y_test), max(y_test)
    diagonal = np.linspace(lowest, highest, 100)
    plt.plot(diagonal, diagonal, color='red', linewidth=2,
             label=f'{model_name} 모델 추세선')

    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.show()





In [None]:
import numpy as np
import matplotlib.pyplot as plt

def visualize_split_model_metrics_bf3(results):
    """Show pre-tuning MAE / RMSE / R2 bars per model family on twin y-axes.

    ``results`` maps a model name to a dict with the keys 'MAE', 'RMSE' and 'R2'.
    MAE and RMSE use the left axis, R2 the right axis. One figure is shown for the
    linear models and one for the tree-based models.
    """
    def annotate(axis, bar_container):
        # Write each bar's height at the top of the bar, on the given axis.
        for rect in bar_container:
            top = rect.get_height()
            axis.text(rect.get_x() + rect.get_width()/2, top, f'{top:.3f}',
                      ha='center', va='bottom', fontsize=10)

    def plot_metrics(models, title):
        # Gather the values before drawing anything.
        mae_heights = [results[m]['MAE'] for m in models]
        rmse_heights = [results[m]['RMSE'] for m in models]
        r2_heights = [results[m]['R2'] for m in models]

        bar_width = 1.0
        gap = 1.0          # empty space between two model groups
        bars_per_model = 3  # MAE, RMSE, R2

        # Left edge position of every model group; the three bars sit side by side.
        group_start = np.arange(len(models)) * (bars_per_model + gap)

        fig, left_axis = plt.subplots(figsize=(12, 6))

        # MAE and RMSE on the left axis.
        mae_bars = left_axis.bar(group_start, mae_heights, width=bar_width,
                                 label='MAE', color='skyblue')
        rmse_bars = left_axis.bar(group_start + bar_width, rmse_heights, width=bar_width,
                                  label='RMSE', color='salmon')
        left_axis.set_ylabel('MAE, RMSE 값', fontsize=14)

        # R2 on the right axis.
        right_axis = left_axis.twinx()
        r2_bars = right_axis.bar(group_start + 2 * bar_width, r2_heights, width=bar_width,
                                 label='R2', color='mediumseagreen')
        right_axis.set_ylabel('R2 값', fontsize=14)

        # Model names are centred under the middle (RMSE) bar of each group.
        left_axis.set_xticks(group_start + bar_width)
        left_axis.set_xticklabels(models, fontsize=13)
        left_axis.set_title(title, fontsize=16)
        left_axis.grid(axis='y', linestyle='--', alpha=0.6)

        # One legend per axis.
        left_axis.legend(loc='upper left', fontsize=11)
        right_axis.legend(loc='upper right', fontsize=11)

        for axis, container in ((left_axis, mae_bars),
                                (left_axis, rmse_bars),
                                (right_axis, r2_bars)):
            annotate(axis, container)

        plt.tight_layout()
        plt.show()

    plot_metrics(['Linear Regression', 'Ridge', 'Lasso'],
                 "튜닝 전 선형 모델 평가 지표 (MAE, RMSE, R2) 비교")
    plot_metrics(['Random Forest', 'XGBoost', 'LightGBM'],
                 "튜닝 전 비선형 모델 평가 지표 (MAE, RMSE, R2) 비교")


In [None]:


# Draw the pre-tuning metric charts from the baseline `results` dict.
visualize_split_model_metrics_bf3(results)

In [None]:
def plot_single_model_metrics(model_name, metrics):
    """Show MAE, RMSE and R2 of a single model as three bars on twin y-axes.

    ``metrics`` is a dict with the keys 'MAE', 'RMSE' and 'R2'. MAE and RMSE are
    drawn on the left axis, R2 on the right axis.
    """
    error_values = [metrics['MAE'], metrics['RMSE']]
    r2_value = metrics['R2']
    tick_labels = ['MAE', 'RMSE', 'R2']

    def annotate(axis, bar_container):
        # Write each bar's height at the top of the bar, on the given axis.
        for rect in bar_container:
            top = rect.get_height()
            axis.text(rect.get_x() + rect.get_width()/2, top, f'{top:.3f}',
                      ha='center', va='bottom', fontsize=10)

    error_positions = np.arange(len(error_values))  # [0, 1]

    fig, left_axis = plt.subplots(figsize=(6, 4))

    # MAE and RMSE on the left axis.
    error_bars = left_axis.bar(error_positions, error_values, width=0.4,
                               color=['skyblue', 'salmon'], label='MAE & RMSE')
    left_axis.set_ylabel('MAE, RMSE 값', fontsize=12)

    # R2 on the right axis, at the third position.
    right_axis = left_axis.twinx()
    r2_bar = right_axis.bar(2, r2_value, width=0.4, color='mediumseagreen', label='R2')
    right_axis.set_ylabel('R2 값', fontsize=12)

    # Tick positions and labels, title and grid.
    left_axis.set_xticks([0, 1, 2])
    left_axis.set_xticklabels(tick_labels, fontsize=12)
    left_axis.set_title(f"{model_name} 모델 성능 지표", fontsize=14)
    left_axis.grid(axis='y', linestyle='--', alpha=0.6)

    annotate(left_axis, error_bars)
    annotate(right_axis, r2_bar)

    # One legend per axis.
    left_axis.legend(loc='upper left', fontsize=10)
    right_axis.legend(loc='upper right', fontsize=10)

    plt.tight_layout()
    plt.show()


In [None]:
def visualize_individual_model_metrics(results):
    """Draw one metrics chart per entry of ``results`` (model name -> metrics dict)."""
    for entry in results.items():
        # Each entry is a (model_name, metrics) pair.
        plot_single_model_metrics(*entry)



In [None]:
# One metrics chart per baseline model.
visualize_individual_model_metrics(results)

In [None]:
# Predicted-vs-actual scatter plot for every baseline model.
# `models` must still be the dict of fitted estimators from the training cell.
for name, model in models.items():
    y_pred = model.predict(X_test)
    get_scatter_bf(name, y_test, y_pred)

In [None]:
import matplotlib.pyplot as plt

# Sort the baseline metrics by RMSE (ascending).
results_sorted = results_df.sort_values(by="RMSE")

# Figure that will hold the table.
fig, ax = plt.subplots(figsize=(8, 4))  # adjust to resize the table

# Hide the x and y axes.
ax.axis('off')

# Add the table, with values rounded to 4 decimals.
table = ax.table(cellText=np.round(results_sorted.values,4),
                 colLabels=results_sorted.columns,
                 rowLabels=results_sorted.index,
                 cellLoc='center',
                 rowLoc='center',
                 loc='center')

table.scale(1, 1.5)  # cell size
table.auto_set_font_size(False)
table.set_fontsize(11)

# Cell styling: bold header row and col == -1 cells (presumably the row-label
# column), dark header background, alternating body-row backgrounds.
for (row, col), cell in table.get_celld().items():
    if row == 0 or col == -1:
        cell.set_text_props(weight='bold')
    if row == 0:
        cell.set_facecolor('#40466e')  # header background
        cell.set_text_props(color='w')  # header text colour
    elif row % 2 == 0:
        cell.set_facecolor('#f1f1f2')  # even rows
    else:
        cell.set_facecolor('#ffffff')  # odd rows

plt.title("Hyperparmeter 튜닝 전 모델별 평가 지표 (RMSE 기준 정렬)", fontsize=14)
plt.tight_layout()
plt.show()


In [None]:
# score = 1 - (0.2 * (MAE / IQR) + 0.2 * (RMSE / IQR) + 0.6 * (1 - R²))
def calculate_custom_score(mae, rmse, r2, iqr):
    """Combine MAE, RMSE and R² into a single score where higher is better.

    MAE and RMSE are divided by ``iqr`` and weighted 0.2 each; the R² shortfall
    ``1 - r2`` is weighted 0.6. The weighted sum is subtracted from 1.

    Parameters:
    - mae: mean absolute error
    - rmse: root mean squared error
    - r2: coefficient of determination
    - iqr: value the two error metrics are divided by (must be non-zero)

    Returns:
    - the score; 1 corresponds to zero error and r2 == 1. The result is not
      clamped, so large errors or a low r2 produce values below 0.
    """
    penalty = 0.2 * (mae / iqr) + 0.2 * (rmse / iqr) + 0.6 * (1 - r2)
    return 1 - penalty

# Interquartile range of the test target, used to normalise MAE and RMSE.
q75, q25 = np.percentile(y_test, [75, 25])
iqr = q75 - q25 if q75 != q25 else 1e-5  # avoid dividing by zero

# Custom score of every baseline model.
scores = {}
for model_name, metrics in results.items():
    score = calculate_custom_score(metrics["MAE"], metrics["RMSE"], metrics["R2"], iqr)
    scores[model_name] = score

# Bar chart of the scores.
# The labels are kept in `model_names`, not `models`: `models` is the dict of fitted
# baseline estimators that other cells iterate with `models.items()`, and rebinding
# it to a list here made those cells fail when re-run after this one.
model_names = list(scores.keys())
score_values = list(scores.values())

plt.figure(figsize=(10, 6))
bars = plt.bar(model_names, score_values, color='cornflowerblue')
plt.title("모델별 Customized Score", fontsize=14,pad=20)
plt.ylabel("Score", fontsize=12)
plt.ylim(0, 1)

# Write each score just above its bar.
for bar in bars:
    yval = bar.get_height()
    plt.text(bar.get_x() + bar.get_width()/2, yval + 0.01, f'{yval:.4f}',
             ha='center', va='bottom', fontsize=10)

plt.xticks(rotation=0, fontsize=15)
plt.tight_layout()
plt.show()

In [None]:
# Custom scorer that returns the negated RMSE, so that a larger score means a smaller
# error. The grid-search cells negate `best_score_` again to print a positive RMSE.
rmse_scorer = make_scorer(lambda y_true, y_pred: -np.sqrt(mean_squared_error(y_true, y_pred)))

In [None]:
# Ridge search space: 5 alphas x 2 intercept settings x 5 solvers.
param_grid_ridge = {
    'alpha': [0.01, 0.1, 1.0, 10.0, 100.0],
    'fit_intercept': [True, False],
    'solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg']
}

# 5-fold grid search on the training split, scored with rmse_scorer.
grid_ridge = GridSearchCV(Ridge(), param_grid_ridge, scoring=rmse_scorer, cv=5)
grid_ridge.fit(X_train, y_train)

# best_score_ is the negated RMSE, so flip the sign for display.
print(" Ridge Best Params:", grid_ridge.best_params_)
print(" Best RMSE:", -grid_ridge.best_score_)


In [None]:
# Lasso search space: 2 alphas x 2 intercept settings x 3 iteration limits.
param_grid_lasso = {
    'alpha': [0.1, 1.0],
    'fit_intercept': [True, False],
    'max_iter': [1000, 5000, 10000]
}

# 5-fold grid search on the training split, scored with rmse_scorer.
grid_lasso = GridSearchCV(Lasso(), param_grid_lasso, scoring=rmse_scorer, cv=5)
grid_lasso.fit(X_train, y_train)

# best_score_ is the negated RMSE, so flip the sign for display.
print(" Lasso Best Params:", grid_lasso.best_params_)
print(" Best RMSE:", -grid_lasso.best_score_)

In [None]:


# Random forest search space: 7 x 3 x 2 x 2 = 84 combinations, each fitted on 5 folds.
param_grid_rf = {
    'n_estimators': [50, 100, 200,300,800,1000,1100],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

# 5-fold grid search on the training split, scored with rmse_scorer.
grid_rf = GridSearchCV(RandomForestRegressor(random_state=42), param_grid_rf, scoring=rmse_scorer, cv=5)
grid_rf.fit(X_train, y_train)

# best_score_ is the negated RMSE, so flip the sign for display.
print(" Random Forest Best Params:", grid_rf.best_params_)
print(" Best RMSE:", -grid_rf.best_score_)


In [None]:
# XGBoost search space: 5 x 3 x 3 x 2 x 2 = 180 combinations, each fitted on 5 folds.
param_grid_xgb = {
    'n_estimators': [50, 100,200,300,500],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.3],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]

    
    
}

# 5-fold grid search on the training split, scored with rmse_scorer.
grid_xgb = GridSearchCV(
    xgb.XGBRegressor(objective='reg:squarederror', random_state=42),
    param_grid_xgb, scoring=rmse_scorer, cv=5
)
grid_xgb.fit(X_train, y_train)

# best_score_ is the negated RMSE, so flip the sign for display.
print(" XGBoost Best Params:", grid_xgb.best_params_)
print(" Best RMSE:", -grid_xgb.best_score_)





In [None]:
# LightGBM search space: 4 x 4 x 2 x 4 x 3 = 384 combinations, each fitted on 5 folds.
param_grid_lgb = {
    'n_estimators': [50, 100,500,1000],
    'num_leaves': [15,31, 40,70],
    'learning_rate': [0.01, 0.1],
    'max_depth': [-1, 5, 10,15],
    'min_child_samples': [5, 10, 20]
}

# 5-fold grid search on the training split, scored with rmse_scorer.
grid_lgb = GridSearchCV(
    lgb.LGBMRegressor(random_state=42),
    param_grid_lgb, scoring=rmse_scorer, cv=5
)
grid_lgb.fit(X_train, y_train)

# best_score_ is the negated RMSE, so flip the sign for display.
print(" LightGBM Best Params:", grid_lgb.best_params_)
print(" Best RMSE:", -grid_lgb.best_score_)




In [None]:
from lightgbm import LGBMRegressor

# Models with hand-set hyperparameters, keyed by display name.
# NOTE(review): several values below (e.g. Lasso alpha=0.001 / max_iter=500, XGBoost
# n_estimators=1000, LightGBM n_estimators=200 / learning_rate=0.05) are not part of
# the grids searched in the cells above — confirm where they come from.
hmodels = {

    "Ridge": Ridge(alpha=100.0,
                   fit_intercept=False,solver='lsqr'),
    "Lasso": Lasso(alpha=0.001,
                   fit_intercept=False,
                   max_iter=500),
    "RandomForest": RandomForestRegressor(max_depth=10,
                                          min_samples_leaf=1,
                                          min_samples_split=2,
                                          n_estimators=1100,
                                          random_state=42),
    "XGBoost":xgb.XGBRegressor(objective='reg:squarederror',
                               learning_rate=0.1,max_depth=3,
                               n_estimators=1000,
                               # XGBoost's row-subsampling parameter is spelled
                               # `subsample` (as in the search grid above); the former
                               # `sub_sample` is not a recognised parameter, so the
                               # 0.8 setting did not take effect as intended.
                               subsample=0.8,
                               random_state=42),
    "LightGBM": LGBMRegressor(
        n_estimators=200,
        learning_rate=0.05,
        min_split_gain=0.0,
        min_child_samples=1,
        min_child_weight=1e-6,
        max_depth=5,
        num_leaves=15,
        random_state=42
    )
}


In [None]:
# Metrics of the tuned models, filled in by get_mse_r2score_all.
results_dict = {}
def get_mse_r2score_all(model, y_test, y_pred,model_name):
    """Print a model's coefficients (if any) and test metrics, and record the metrics.

    The metrics are stored in the module-level ``results_dict`` under ``model_name``
    as ``{'MAE': ..., 'RMSE': ..., 'R2': ...}`` and also returned as the tuple
    ``(mae, rmse, r2)``.
    """
    # Only models exposing `coef_` get their coefficients and intercept printed.
    if not hasattr(model, 'coef_'):
        print("이 모델은 계수(coef_)와 절편(intercept_)을 제공하지 않습니다.")
    else:
        print("계수 (coef_):\n", model.coef_)
        print("절편 (intercept):", model.intercept_)

    metrics = {
        'MAE': mean_absolute_error(y_test, y_pred),
        'RMSE': np.sqrt(mean_squared_error(y_test, y_pred)),
        'R2': r2_score(y_test, y_pred),
    }

    print("평균절대오차 (MAE):", metrics['MAE'])
    print("평균제곱근오차 (RMSE):", metrics['RMSE'])
    print("결정계수 (R² Score):", metrics['R2'])

    results_dict[model_name] = metrics
    return metrics['MAE'], metrics['RMSE'], metrics['R2']


In [None]:
# Mapping from feature column name to the short name used in printed equations.
# (Same contents as the `abbr` dict defined earlier in the notebook.)
abbr = {
    'TV': 'tv',
    'Billboards': 'bb',
    'Google_Ads': 'ga',
    'Social_Media': 'sm',
    'Influencer_Marketing': 'im',
    'Affiliate_Marketing': 'am'
}

# 회귀식 생성 함수
def get_abbreviated_formula(title, model, abbr_dict):
    """Render a fitted linear model as a one-line equation string.

    Produces ``"<title> = <intercept> + <name> * <coef> + ..."`` with the intercept
    rounded to 4 decimals and each coefficient formatted with 4 decimals. Names come
    from ``model.feature_names_in_`` and are shortened through ``abbr_dict`` when an
    entry exists. This re-definition shadows the one made earlier in the notebook.
    """
    pieces = []
    pieces.append(f"{title} = {round(model.intercept_, 4)} + ")
    pieces.append(" + ".join(
        f"{abbr_dict.get(column, column)} * {weight:.4f}"
        for column, weight in zip(model.feature_names_in_, model.coef_)
    ))
    return "".join(pieces)


# Metrics of the tuned models, keyed by model name (reset on every run of this cell).
results_dict = {}

# Fit every tuned model, print its test metrics (and equation, for linear models)
# and draw its learning curve.
for name, model in hmodels.items():
    print(f"\n▶ {name} 결과:")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Test-set metrics.
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)

    # Print the metrics.
    print("MAE:", mae)
    print("RMSE:", rmse)
    print("R²:", r2)

    # Record them.
    results_dict[name] = {'MAE': mae, 'RMSE': rmse, 'R2': r2}

    # Print the fitted equation for the linear models only.
    if name in ["LinearRegression", "Ridge", "Lasso"]:
        print(f"\n{name} 회귀식:")
        # Label the equation with the name of the target series instead of the
        # previously hard-coded "price", which is not what y contains.
        print(get_abbreviated_formula(y_train.name, model, abbr))

    # Learning curve for every model (not only the linear ones).
    display_learning_curves(X_train, y_train, model, figname=f"{name}_opt_lc.png")


In [None]:
# Refit every tuned model and print its coefficients and metrics.
# get_mse_r2score_all also writes the metrics into results_dict under the same names,
# overwriting the entries stored by the previous cell.
for name, model in hmodels.items():
    print(f"\n▶ {name} 결과:")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    get_mse_r2score_all(model, y_test, y_pred,name)


In [None]:
import matplotlib.pyplot as plt

# Build the table from the tuned-model metrics and sort it by RMSE (ascending).
# This previously sorted `results_df`, which holds the metrics of the default-
# hyperparameter models, so the "after tuning" table repeated the pre-tuning numbers.
results_sorted = pd.DataFrame(results_dict).T.sort_values(by="RMSE")

# Figure that will hold the table.
fig, ax = plt.subplots(figsize=(8,4))  # overall figure size

# Hide the axes.
ax.axis('off')

# Create the table.
table = ax.table(
    cellText=np.round(results_sorted.values,4),  # rounded to 4 decimals
    colLabels=results_sorted.columns,
    rowLabels=results_sorted.index,
    cellLoc='center',
    rowLoc='center',
    loc='center'
)

# Cell size and font.
table.scale(1, 1.5)
table.auto_set_font_size(False)
table.set_fontsize(11)

# Cell styling: bold header row and col == -1 cells (presumably the row-label
# column), dark header background, alternating body-row backgrounds.
for (row, col), cell in table.get_celld().items():
    if row == 0 or col == -1:
        cell.set_text_props(weight='bold')
    if row == 0:
        cell.set_facecolor('#40466e')  # header background
        cell.set_text_props(color='w')  # header text colour
    elif row % 2 == 0:
        cell.set_facecolor('#f1f1f2')  # even rows
    else:
        cell.set_facecolor('#ffffff')  # odd rows

plt.title("Hyperparmeter 튜닝 후 모델별 평가 지표 (RMSE 기준 정렬)", fontsize=14, pad=20)
plt.tight_layout()
plt.show()


In [None]:
# score = 1 - (0.2 * (MAE / IQR) + 0.2 * (RMSE / IQR) + 0.6 * (1 - R²))
def calculate_custom_score(mae, rmse, r2, iqr):
    """Fold MAE, RMSE and R² into one higher-is-better score.

    Parameters:
    - mae: mean absolute error, divided by ``iqr`` and weighted 0.2
    - rmse: root mean squared error, divided by ``iqr`` and weighted 0.2
    - r2: coefficient of determination; its shortfall ``1 - r2`` is weighted 0.6
    - iqr: non-zero value used to scale the two error metrics

    Returns:
    - 1 minus the weighted sum. The value is not clamped: it equals 1 for zero
      error with r2 == 1 and drops below 0 for large errors or a low r2.
    """
    weighted_terms = (
        0.2 * (mae / iqr),
        0.2 * (rmse / iqr),
        0.6 * (1 - r2),
    )
    return 1 - (weighted_terms[0] + weighted_terms[1] + weighted_terms[2])

# Interquartile range of the test target, used to normalise MAE and RMSE.
q75, q25 = np.percentile(y_test, [75, 25])
iqr = q75 - q25 if q75 != q25 else 1e-5  # avoid dividing by zero

# Custom score of every tuned model (metrics taken from results_dict).
scores = {}
for model_name, metrics in results_dict.items():
    score = calculate_custom_score(metrics["MAE"], metrics["RMSE"], metrics["R2"], iqr)
    scores[model_name] = score
    
# Bar chart of the scores.
model_names = list(scores.keys())
score_values = list(scores.values())

plt.figure(figsize=(10, 6))
bars = plt.bar(model_names, score_values, color='cornflowerblue')
plt.title("모델별 Customized Score", fontsize=14,pad=20)
plt.ylabel("Score", fontsize=12)
plt.ylim(0, 1)

# Write each score just above its bar.
for bar in bars:
    yval = bar.get_height()
    plt.text(bar.get_x() + bar.get_width()/2, yval + 0.01, f'{yval:.4f}',
             ha='center', va='bottom', fontsize=10)

plt.xticks(rotation=0, fontsize=15)
plt.tight_layout()
plt.show()

In [None]:
def get_scatter(model, y_test, y_pred):
    """Scatter predicted against actual values for a tuned model.

    ``model`` is only used as text in the title and legend. A y = x reference line
    spanning the range of ``y_test`` is drawn on top of the scatter, and the figure
    is shown immediately.
    """
    plt.figure(figsize=(6, 6))
    plt.scatter(y_test, y_pred, color='purple', alpha=0.5, label='예측값')
    plt.xlabel('실제값')
    plt.ylabel('예측값')
    plt.title(f'예측값 vs 실제값 ({model} 모델): Optimized Hyperparameter')

    # Reference line y = x between the smallest and largest actual value.
    lowest, highest = min(y_test), max(y_test)
    diagonal = np.linspace(lowest, highest, 100)
    plt.plot(diagonal, diagonal, linewidth=2, label=f'{model} 모델 추세선')

    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.show()

In [None]:
# Predicted-vs-actual scatter plot for every tuned model (already fitted above).
for name, model in hmodels.items():
    y_pred = model.predict(X_test)
    get_scatter(name, y_test, y_pred)


In [None]:
import matplotlib.pyplot as plt
import numpy as np

def visualize_split_model_metrics_bf4(results_dict):
    """Show post-tuning MAE / RMSE / R2 bars per model family on twin y-axes.

    ``results_dict`` maps a model name to a dict with the keys 'MAE', 'RMSE' and
    'R2'. MAE and RMSE use the left axis, R2 the right axis. One figure is shown for
    the linear models and one for the tree-based models.

    Only models that are actually present in ``results_dict`` are plotted, and a
    family with no recorded model is skipped. The tuned-model dict in this notebook
    has no 'LinearRegression' entry, so indexing the fixed name list unconditionally
    raised KeyError.
    """
    # Candidate names per family, narrowed to the ones that have metrics.
    linear_models = [m for m in ('LinearRegression', 'Ridge', 'Lasso')
                     if m in results_dict]
    nonlinear_models = [m for m in ('RandomForest', 'XGBoost', 'LightGBM')
                        if m in results_dict]

    def add_labels(ax, bars):
        # Write each bar's height at the top of the bar, on the given axis.
        for bar in bars:
            height = bar.get_height()
            ax.text(bar.get_x() + bar.get_width()/2, height, f'{height:.3f}',
                    ha='center', va='bottom', fontsize=10)

    def plot_metrics(model_names, title):
        mae_vals = [results_dict[m]['MAE'] for m in model_names]
        rmse_vals = [results_dict[m]['RMSE'] for m in model_names]
        r2_vals = [results_dict[m]['R2'] for m in model_names]

        width = 1.0
        spacing = 1.0  # gap between two model groups
        n_metrics = 3  # MAE, RMSE, R2

        # Three adjacent bars per model: MAE, RMSE, R2.
        x = np.arange(len(model_names)) * (n_metrics + spacing)
        x_mae = x
        x_rmse = x + width
        x_r2 = x + 2 * width

        fig, ax1 = plt.subplots(figsize=(12, 6))

        # MAE and RMSE on the left axis.
        bars_mae = ax1.bar(x_mae, mae_vals, width=width, label='MAE', color='skyblue')
        bars_rmse = ax1.bar(x_rmse, rmse_vals, width=width, label='RMSE', color='salmon')
        ax1.set_ylabel('MAE, RMSE 값', fontsize=14)

        # R2 on the right axis.
        ax2 = ax1.twinx()
        bars_r2 = ax2.bar(x_r2, r2_vals, width=width, label='R2', color='mediumseagreen')
        ax2.set_ylabel('R2 값', fontsize=14)

        # Model names are centred under the middle (RMSE) bar of each group.
        xticks = x + width
        ax1.set_xticks(xticks)
        ax1.set_xticklabels(model_names, fontsize=13)
        ax1.set_title(title, fontsize=16)
        ax1.grid(axis='y', linestyle='--', alpha=0.6)

        # One legend per axis.
        ax1.legend(loc='upper left', fontsize=11)
        ax2.legend(loc='upper right', fontsize=11)

        add_labels(ax1, bars_mae)
        add_labels(ax1, bars_rmse)
        add_labels(ax2, bars_r2)

        plt.tight_layout()
        plt.show()

    if linear_models:
        plot_metrics(linear_models, "튜닝 후 선형 모델 평가 지표 (MAE, RMSE, R2) 비교")
    if nonlinear_models:
        plot_metrics(nonlinear_models, "튜닝 후 비선형 모델 평가 지표 (MAE, RMSE, R2) 비교")
    

# Draw the post-tuning metric charts from the tuned-model metrics.
visualize_split_model_metrics_bf4(results_dict)


In [None]:
import matplotlib.pyplot as plt
import numpy as np

def visualize_split_model_metrics_per_model(results_dict):
    """Show one three-bar chart (MAE, RMSE, R2) per model in ``results_dict``.

    Each value of ``results_dict`` is a dict with the keys 'MAE', 'RMSE' and 'R2'.
    MAE and RMSE are drawn on the left axis, which runs from 0 to 1.2 times the
    larger of the two; R2 is drawn on the right axis, fixed to the range 0..1.
    Only bars with a positive height get a value label.
    """
    def annotate(axis, bar_container):
        # Label a bar with its height, skipping bars that are not above zero.
        for rect in bar_container:
            top = rect.get_height()
            if top > 0:
                axis.text(rect.get_x() + rect.get_width()/2, top, f'{top:.3f}',
                          ha='center', va='bottom', fontsize=10)

    for model_name in results_dict:
        metrics = results_dict[model_name]
        mae = metrics['MAE']
        rmse = metrics['RMSE']
        r2 = metrics['R2']

        # Bar positions: 0 = MAE, 1 = RMSE, 2 = R2.
        positions = np.array([0, 1, 2])

        fig, left_axis = plt.subplots(figsize=(6, 5))

        # Left axis: MAE and RMSE.
        error_bars = left_axis.bar(positions[:2], [mae, rmse],
                                   color=['skyblue', 'salmon'], width=0.4)
        left_axis.set_ylabel('MAE / RMSE', fontsize=13)
        left_axis.set_ylim(0, max(mae, rmse) * 1.2)

        # Right axis: R2 only.
        right_axis = left_axis.twinx()
        r2_bar = right_axis.bar(positions[2], r2, color='mediumseagreen', width=0.4)
        right_axis.set_ylabel('R²', fontsize=13)
        right_axis.set_ylim(0, 1.0)

        # Ticks and title.
        left_axis.set_xticks(positions)
        left_axis.set_xticklabels(['MAE', 'RMSE', 'R2'], fontsize=12)
        left_axis.set_title(f"{model_name} 성능 지표", fontsize=15)

        annotate(left_axis, error_bars)
        annotate(right_axis, r2_bar)

        plt.tight_layout()
        plt.show()


In [None]:
# One metrics chart per tuned model.
visualize_split_model_metrics_per_model(results_dict)
