In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.cluster import KMeans
from sklearn.ensemble import GradientBoostingRegressor

from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
import seaborn as sns

# Load the training and test tables from the working directory.
df = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# The date column is not used by any later step.
df.drop(columns=["date"], inplace=True)

# Normalise the text columns: lowercase and trim surrounding whitespace.
# NOTE: astype(str) turns missing values into the literal string 'nan';
# for staff_experience these are converted back to NaN further down.
df['staff_experience'] = df['staff_experience'].astype(str).str.lower().str.strip()
df['waste_category'] = df['waste_category'].astype(str).str.lower().str.strip()

# Outlier handling: temperatures outside the open interval (0, 36) are
# replaced by the most frequent in-range temperature.
mode_temp = df['temperature_C'][(df['temperature_C'] > 0) & (df['temperature_C'] < 36)].mode()[0]
df.loc[(df['temperature_C'] <= 0) | (df['temperature_C'] >= 36), 'temperature_C'] = mode_temp

# Keep only rows with fewer than 500 meals served (rows with a missing
# meals_served are dropped too, since NaN < 500 is False).
# reset_index(drop=True) returns a fresh frame with a contiguous 0..n-1
# index: later steps assign positionally-built results back to df, and a
# gapped index would make pandas align those values onto the wrong rows.
# It also avoids mutating a filtered slice of the original frame.
df = df[df['meals_served'] < 500].reset_index(drop=True)

# Fill missing numeric values.
df['temperature_C'] = df['temperature_C'].fillna(mode_temp)
df['past_waste_kg'] = df['past_waste_kg'].fillna(df['past_waste_kg'].mean())

# Convert textual missing-value markers in staff_experience back to NaN.
# The column was lowercased above, so the lowercase spellings are the ones
# that can actually occur; the original mixed-case entries are kept for
# backward compatibility.
df['staff_experience'] = df['staff_experience'].replace(['nan', 'none', 'None', 'NaN'], np.nan)
def fill_staff_experience(row):
    """Return the experience level for one row, imputing it when missing.

    A non-null ``row['staff_experience']`` is returned unchanged. Otherwise
    the level is derived from ``row['kitchen_staff']`` and
    ``row['past_waste_kg']``:

    * fewer than 12 staff and more than 20 kg past waste -> ``'beginner'``
    * more than 17 staff and less than 15 kg past waste  -> ``'expert'``
    * anything else                                       -> ``'intermediate'``
    """
    current_level = row['staff_experience']
    if not pd.isnull(current_level):
        return current_level

    headcount = row['kitchen_staff']
    prior_waste = row['past_waste_kg']

    if headcount < 12 and prior_waste > 20:
        return 'beginner'
    if headcount > 17 and prior_waste < 15:
        return 'expert'
    return 'intermediate'

# Impute missing experience levels row by row.
df['staff_experience'] = df.apply(fill_staff_experience, axis=1)

# Manual ordinal encoding of experience: beginner < intermediate < expert.
# Values not present in the mapping become NaN.
experience_order = dict(beginner=0, intermediate=1, expert=2)
df['staff_experience'] = df['staff_experience'].map(experience_order)

# Label-encode waste_category, keeping each fitted encoder for later use.
categorical_cols = ["waste_category"]
label_encoders = {col: LabelEncoder().fit(df[col]) for col in categorical_cols}
for col, le in label_encoders.items():
    df[col] = le.transform(df[col])

# Feature groups, one per factor, used for scoring and clustering.
env_features = [
    'temperature_C',
    'humidity_percent',
]
ops_features = [
    'meals_served',
    'special_event',
    'day_of_week',
]
human_features = [
    'kitchen_staff',
    'staff_experience',
]

# Normalisation

# Per-factor normalisation and factor score computation
def normalize_group(df, features):
    """Return one score per row: the mean of the min-max scaled features.

    Each column in ``features`` is scaled with a freshly fitted
    ``MinMaxScaler`` and the scaled columns are averaged row-wise.

    Parameters:
        df: DataFrame containing the columns named in ``features``.
        features: list of column names that form one factor group.

    Returns:
        A Series with the same index as ``df``.
    """
    scaler = MinMaxScaler()
    normed = scaler.fit_transform(df[features])
    # fit_transform returns a bare ndarray; re-attach df's index so that
    # assigning the result to a column of df matches rows correctly. With
    # the default RangeIndex, a df whose index has gaps (e.g. after row
    # filtering) would receive values on the wrong rows and NaN elsewhere.
    return pd.DataFrame(normed, columns=features, index=df.index).mean(axis=1)


target = 'food_waste_kg'
predictors = env_features + ops_features + human_features

raw_score_cols = ['env_score', 'ops_score', 'human_score']
norm_score_cols = ['env_score_norm', 'ops_score_norm', 'human_score_norm']

# One raw score per factor group.
for score_col, group_features in zip(raw_score_cols, (env_features, ops_features, human_features)):
    df[score_col] = normalize_group(df, group_features)

# Rescale the three factor scores with a single min-max scaler.
score_df = df[raw_score_cols]
score_scaler = MinMaxScaler()
df[norm_score_cols] = score_scaler.fit_transform(score_df)

# Drop rows missing any score or the target, then take an independent copy.
df = df.dropna(subset=raw_score_cols + norm_score_cols + [target])
df = df.copy()

cluster_input = df[norm_score_cols]

# KMeans clustering on the normalised factor scores
kmeans = KMeans(n_clusters=3, random_state=42)
df['cluster'] = kmeans.fit_predict(cluster_input)

cluster_mean = df.groupby('cluster')[norm_score_cols].mean()
print(cluster_mean)

# Per-cluster regression (Gradient Boosting)
# groupby yields the clusters in ascending label order.
for cid, cluster_data in df.groupby('cluster'):
    X = cluster_data[predictors]
    y = cluster_data[target]

    # Hold out 20% of the cluster's rows for evaluation.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    model = GradientBoostingRegressor(
        n_estimators=100,
        learning_rate=0.1,
        max_depth=3,
        random_state=42,
    )
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    mse = mean_squared_error(y_test, pred)
    r2 = r2_score(y_test, pred)

    # Feature importances, most important first.
    feature_importance = pd.DataFrame({
        'feature': X.columns,
        'importance': model.feature_importances_
    })
    feature_importance = feature_importance.sort_values(by='importance', ascending=False)

    print(f"\nCluster {cid} → Test MSE: {mse:.2f}, n={len(X)}")
    print(f"Cluster {cid} → R²: {r2:.3f}")
    print(f"  - Feature Importances:\n{feature_importance.to_string(index=False)}")
    


         env_score_norm  ops_score_norm  human_score_norm
cluster                                                  
0              0.384877        0.398300          0.693982
1              0.713422        0.370555          0.485533
2              0.390668        0.324873          0.244258

 Cluster 0 → Coefficients:
Cluster 0 → Test MSE: 98.89, n=292
Cluster 0 → R^2: 0.192
  - Coefficients:
         feature  coefficient
   temperature_C     0.006267
humidity_percent     0.057107
    meals_served     0.053342
   special_event    10.165611
     day_of_week    -0.175978
   kitchen_staff     0.197498
staff_experience    -2.741289

 Cluster 1 → Coefficients:
Cluster 1 → Test MSE: 95.58, n=316
Cluster 1 → R^2: 0.099
  - Coefficients:
         feature  coefficient
   temperature_C     0.169593
humidity_percent     0.099928
    meals_served     0.053907
   special_event    14.132702
     day_of_week     0.211367
   kitchen_staff     0.252261
staff_experience    -1.443513

 Cluster 2 → Coeffici

In [None]:
# Colour set (one colour per cluster label)
colors = ['red', 'blue', 'green']

# One scatter figure per factor score, in order:
# 1. environment, 2. operations, 3. human.
score_panels = [
    ('env_score_norm', "Env Score vs Food Waste"),
    ('ops_score_norm', "Ops Score vs Food Waste"),
    ('human_score_norm', "Human Score vs Food Waste"),
]

for score_col, panel_title in score_panels:
    plt.figure(figsize=(6, 4))
    # Draw each cluster separately so it gets its own colour and legend entry.
    for cluster_id in sorted(df['cluster'].unique()):
        cluster_data = df[df['cluster'] == cluster_id]
        plt.scatter(
            cluster_data[score_col],
            cluster_data['food_waste_kg'],
            color=colors[cluster_id],
            alpha=0.6,
            label=f"Cluster {cluster_id}",
        )
    plt.title(panel_title)
    plt.xlabel(score_col)
    plt.ylabel("Food Waste (kg)")
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.show()