In [20]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
batch_regression_csv_only.py
- out_by_recipe/by_recipe_code_*.csv 대상으로 별점 회귀 평가
- .ipynb 등 비CSV는 무시
- 텍스트(TF-IDF) + 수치 + 범주 + 시간 파생 결합
- RMSE = sqrt(MSE)로 직접 계산(버전 호환)
"""

import glob
import os
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge
from sklearn.impute import SimpleImputer

def make_time_features(s):
    """Derive calendar features from a Series of epoch-second timestamps.

    Values are parsed with ``unit="s"``; anything that cannot be parsed
    becomes NaT and therefore yields missing values in every output column.

    Returns a DataFrame sharing the index of ``s`` with the columns
    ``year``, ``month``, ``dayofweek`` and ``hour``.
    """
    stamps = pd.to_datetime(s, unit="s", errors="coerce")
    parts = ("year", "month", "dayofweek", "hour")
    return pd.DataFrame({part: getattr(stamps.dt, part) for part in parts})

def evaluate_file(csv_path, target="stars", *, min_rows=20, test_size=0.2, random_state=42):
    """Train and score a Ridge regressor for ``target`` on one review CSV.

    Features are built from whichever of these columns the file contains
    (the target column itself is never used as a feature):

    * ``text``: TF-IDF over 1- to 3-grams.
    * ``thumbs_up``, ``thumbs_down``, ``reply_count``, ``best_score``,
      ``user_reputation``: median-imputed and standardized.
    * ``user_id``, ``user_name``, ``recipe_name``: most-frequent-imputed and
      one-hot encoded.
    * ``created_at``: year/month/dayofweek/hour from ``make_time_features``.

    The model is fit on a random train split and scored on the held-out rows.

    Args:
        csv_path: Path of the CSV file (``str``, ``bytes`` or ``os.PathLike``).
        target: Name of the column to predict.
        min_rows: Minimum number of rows with a non-missing target required
            to attempt a fit.
        test_size: Forwarded to ``train_test_split``.
        random_state: Seed for the split and for ``Ridge``.

    Returns:
        A dict with ``file`` (base name), ``n_rows`` and ``ok``. On success it
        also holds ``rmse``, ``mae`` and ``r2``; on failure it holds ``reason``.
        ``n_rows`` is ``None`` when the file was never loaded.

    Raises:
        TypeError: If ``csv_path`` is not a str, bytes or path-like object.
    """
    # os.fsdecode accepts str, bytes and PathLike, so pathlib.Path inputs are
    # no longer turned into a failure by the str-only ``.lower()`` call.
    path = os.fsdecode(csv_path)
    name = os.path.basename(path)
    n_rows = None

    def failure(reason):
        # Reads the enclosing n_rows at call time, so late failures still
        # report how many rows had been loaded.
        return {"file": name, "n_rows": n_rows, "ok": False, "reason": reason}

    try:
        # Guard against non-CSV inputs (e.g. stray .ipynb files).
        if not path.lower().endswith(".csv"):
            return failure("not a CSV")

        df = pd.read_csv(path)
        n_rows = len(df)

        # Coerce known numeric columns (and the target); junk becomes NaN.
        numeric_like = ["thumbs_up", "thumbs_down", "reply_count", "stars", "best_score",
                        "user_reputation", "recipe_number", "recipe_code", target]
        for col in dict.fromkeys(numeric_like):
            if col in df.columns:
                df[col] = pd.to_numeric(df[col], errors="coerce")

        if target not in df.columns:
            return failure(f"missing target {target}")

        # Rows without a target value cannot be used for training or scoring.
        df = df.dropna(subset=[target]).copy()
        n_rows = len(df)
        if n_rows < min_rows:
            return failure("too few rows after dropna")

        y = df[target].astype(float)

        # Feature columns; excluding the target prevents leakage when a
        # non-default target is also one of the candidate features.
        usable = set(df.columns) - {target}
        text_col = "text" if "text" in usable else None
        num_cols = [c for c in ["thumbs_up", "thumbs_down", "reply_count", "best_score", "user_reputation"]
                    if c in usable]
        cat_cols = [c for c in ["user_id", "user_name", "recipe_name"] if c in usable]
        time_src = "created_at" if "created_at" in usable else None

        if text_col:
            # TfidfVectorizer needs str documents; a column parsed as numbers
            # would otherwise fail inside the vectorizer.
            df[text_col] = df[text_col].fillna("").astype(str)

        X_train, X_test, y_train, y_test = train_test_split(
            df, y, test_size=test_size, random_state=random_state
        )

        # If no training timestamp parses, every derived column is all-missing:
        # the imputer then drops them and the scaler has nothing to fit. Skip
        # the time branch instead of failing the whole file.
        if time_src is not None and make_time_features(X_train[time_src]).isna().all().all():
            time_src = None

        def time_transformer_df(X):
            if time_src is None or time_src not in X:
                return pd.DataFrame(index=X.index)
            return make_time_features(X[time_src])

        transformers = []
        if text_col:
            transformers.append((
                "text",
                TfidfVectorizer(max_features=40000, ngram_range=(1, 3), min_df=3, lowercase=True),
                text_col,
            ))
        if num_cols:
            transformers.append((
                "num",
                Pipeline(steps=[
                    ("imp", SimpleImputer(strategy="median")),
                    ("scaler", StandardScaler()),
                ]),
                num_cols,
            ))
        if cat_cols:
            transformers.append((
                "cat",
                Pipeline(steps=[
                    ("imp", SimpleImputer(strategy="most_frequent")),
                    ("onehot", OneHotEncoder(handle_unknown="ignore")),
                ]),
                cat_cols,
            ))
        if time_src:
            transformers.append((
                "time",
                Pipeline(steps=[
                    ("gen", FunctionTransformer(time_transformer_df, validate=False)),
                    ("imp", SimpleImputer(strategy="most_frequent")),
                    ("scaler", StandardScaler()),
                ]),
                [time_src],
            ))

        if not transformers:
            return failure("no usable feature columns")

        preprocess = ColumnTransformer(
            transformers=transformers,
            remainder="drop",
            sparse_threshold=0.3,
        )
        model = Ridge(alpha=1.0, random_state=random_state)
        pipe = Pipeline(steps=[
            ("prep", preprocess),
            ("reg", model),
        ])

        pipe.fit(X_train, y_train)

        pred = pipe.predict(X_test)
        # RMSE is derived from MSE by hand rather than via squared=False, for
        # compatibility across scikit-learn versions.
        mse = mean_squared_error(y_test, pred)
        rmse = mse ** 0.5
        mae = mean_absolute_error(y_test, pred)
        r2 = r2_score(y_test, pred)

        return {
            "file": name,
            "n_rows": n_rows,
            "ok": True,
            "rmse": rmse,
            "mae": mae,
            "r2": r2,
        }

    except Exception as exc:
        # Batch boundary: one bad file must not abort the run, so the error is
        # reported in the result row instead of propagating.
        return failure(str(exc))

def main(input_dir="out_by_recipe", pattern="by_recipe_code_*.csv", output_path="results_summary.csv"):
    """Evaluate every matching per-recipe CSV and write a summary table.

    Args:
        input_dir: Directory that is searched for input files.
        pattern: Glob pattern, relative to ``input_dir``, selecting the CSVs.
        output_path: Destination of the summary CSV (written as UTF-8 with BOM).

    Each result dict returned by ``evaluate_file`` is printed as it is
    produced. The summary always has the same columns in the same order, so
    it stays readable even when no file matched or every file failed.
    """
    search = os.path.join(input_dir, pattern)
    paths = sorted(glob.glob(search))
    if not paths:
        print(f"No files matched {search}")

    results = []
    for p in paths:
        res = evaluate_file(p, target="stars")
        results.append(res)
        print(res)

    # Fix the column order explicitly: inferring it from the dicts makes the
    # layout depend on whether the first file succeeded, and an empty result
    # list would produce a CSV without any header.
    columns = ["file", "n_rows", "ok", "rmse", "mae", "r2", "reason"]
    summary = pd.DataFrame(results, columns=columns)
    summary.to_csv(output_path, index=False, encoding="utf-8-sig")
    print(f"Saved {output_path}")


if __name__ == "__main__":
    main()


{'file': 'by_recipe_code_100276.csv', 'n_rows': 164, 'ok': True, 'rmse': 0.8400779023087291, 'mae': 0.6724192243970601, 'r2': -0.0916774580124382}
{'file': 'by_recipe_code_10248.csv', 'n_rows': 130, 'ok': True, 'rmse': 1.0203422486572338, 'mae': 0.9007075926601461, 'r2': 0.296920625603444}
{'file': 'by_recipe_code_10252.csv', 'n_rows': 254, 'ok': True, 'rmse': 1.1466020775405217, 'mae': 0.8927893508959533, 'r2': 0.6867419256781928}
{'file': 'by_recipe_code_1063.csv', 'n_rows': 158, 'ok': True, 'rmse': 1.7256058511138277, 'mae': 0.9602892623979357, 'r2': -0.14158769250461845}
{'file': 'by_recipe_code_1081.csv', 'n_rows': 148, 'ok': True, 'rmse': 1.3687470441730798, 'mae': 0.9698925999798031, 'r2': 0.27820135965783854}
{'file': 'by_recipe_code_11330.csv', 'n_rows': 112, 'ok': True, 'rmse': 1.8204245068983849, 'mae': 1.368338042805612, 'r2': -0.02160670677872112}
{'file': 'by_recipe_code_1152.csv', 'n_rows': 157, 'ok': True, 'rmse': 1.5709193389222813, 'mae': 1.0756179315709509, 'r2': -0.

In [21]:
import glob, os
import pandas as pd
import numpy as np

# 1) Load every per-recipe CSV into a single frame
paths = sorted(glob.glob(os.path.join("out_by_recipe", "by_recipe_code_*.csv")))
if not paths:
    # Fail with a clear message instead of pd.concat's "No objects to concatenate".
    raise FileNotFoundError("no files matched " + os.path.join("out_by_recipe", "by_recipe_code_*.csv"))
usecols = ["recipe_code","recipe_name","user_id","user_name","created_at",
           "reply_count","thumbs_up","thumbs_down","stars","best_score","text"]
dfs = []
for p in paths:
    # A callable usecols keeps the wanted columns that exist in this file
    # without a second read_csv call just to inspect the header.
    df = pd.read_csv(p, usecols=lambda c: c in usecols)
    dfs.append(df)
all_df = pd.concat(dfs, ignore_index=True)

# 2) Types / missing values
for c in ["reply_count","thumbs_up","thumbs_down","stars","best_score"]:
    if c in all_df.columns:
        all_df[c] = pd.to_numeric(all_df[c], errors="coerce")
all_df = all_df.dropna(subset=["stars"])  # keep only rows that have a target value

# 3) EDA example (per-recipe aggregates), largest recipes first
agg_recipe = all_df.groupby("recipe_code").agg(
    n=("stars","size"),
    mean_star=("stars","mean"),
    up_sum=("thumbs_up","sum"),
    down_sum=("thumbs_down","sum")
).reset_index().sort_values("n", ascending=False)
agg_recipe.to_csv("agg_recipe.csv", index=False, encoding="utf-8-sig")

# 4) Scatter plot with a linear fit over a sample of all data
import matplotlib.pyplot as plt
pairs = all_df[["thumbs_up","stars"]].dropna()
# Size the sample from the rows that survived dropna. Sizing it from
# len(all_df) asks for more rows than exist whenever thumbs_up has missing
# values and fewer than 20000 complete rows remain, which raises ValueError.
sample = pairs.sample(min(20000, len(pairs)), random_state=42)
x, y = sample["thumbs_up"].to_numpy(), sample["stars"].to_numpy()
m, b = np.polyfit(x, y, 1)  # slope and intercept of the least-squares line
xx = np.linspace(x.min(), x.max(), 200)
yy = m*xx + b
plt.figure(figsize=(6,4))
plt.scatter(x, y, s=5, alpha=0.2)
plt.plot(xx, yy, color="red")
plt.xlabel("thumbs_up")
plt.ylabel("stars")
plt.title("All-data linear fit: thumbs_up vs stars")
plt.tight_layout()
plt.savefig("plot_all_thumbs_vs_stars.png", dpi=150)
plt.close()

# 5) 전 데이터 회귀 모델(텍스트 + 메타) 학습 후 성능 저장
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

def make_time_features(s):
    """Turn a Series of epoch-second timestamps into calendar columns.

    Unparseable entries are coerced to NaT, so they show up as missing
    values in the result. The returned DataFrame keeps the index of ``s``
    and has the columns ``year``, ``month``, ``dayofweek`` and ``hour``.
    """
    moments = pd.to_datetime(s, unit="s", errors="coerce")
    fields = {}
    for part in ("year", "month", "dayofweek", "hour"):
        fields[part] = getattr(moments.dt, part)
    return pd.DataFrame(fields)

# Column roles; each list keeps only the columns actually present in all_df.
target = "stars"
text_col = "text" if "text" in all_df.columns else None
num_cols = [c for c in ["reply_count","thumbs_up","thumbs_down","best_score"] if c in all_df.columns]
cat_cols = [c for c in ["user_id","user_name","recipe_name","recipe_code"] if c in all_df.columns]
time_src = "created_at" if "created_at" in all_df.columns else None

X = all_df.copy()
y = all_df[target].astype(float)
if text_col:
    # TfidfVectorizer needs str documents; coerce in case the column was
    # parsed as a non-string dtype.
    X[text_col] = X[text_col].fillna("").astype(str)

def time_transformer_df(X):
    """Return calendar features for the time column, or an empty frame if it is absent."""
    if time_src is None or time_src not in X:
        return pd.DataFrame(index=X.index)
    return make_time_features(X[time_src])

# The split does not depend on the preprocessing, so do it first and decide
# from the training rows whether the time branch is usable.
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=42)

# If no training timestamp parses, every derived column is all-missing: the
# imputer then drops them and the scaler has nothing to fit. Skip the branch.
use_time = time_src is not None and not make_time_features(X_tr[time_src]).isna().all().all()

transformers = []
if text_col:
    transformers.append(("text", TfidfVectorizer(max_features=40000, ngram_range=(1,3), min_df=3, lowercase=True), text_col))
if num_cols:
    transformers.append(("num", Pipeline([("imp", SimpleImputer(strategy="median")), ("scaler", StandardScaler())]), num_cols))
if cat_cols:
    transformers.append(("cat", Pipeline([("imp", SimpleImputer(strategy="most_frequent")),
                                          ("onehot", OneHotEncoder(handle_unknown="ignore"))]), cat_cols))
if use_time:
    transformers.append(("time", Pipeline([("gen", FunctionTransformer(time_transformer_df, validate=False)),
                                          ("imp", SimpleImputer(strategy="most_frequent")), ("scaler", StandardScaler())]), [time_src]))

# Columns not claimed by a transformer (including the target) are dropped.
prep = ColumnTransformer(transformers=transformers, remainder="drop", sparse_threshold=0.3)
model = Ridge(alpha=1.0, random_state=42)
pipe = Pipeline([("prep", prep), ("reg", model)])

pipe.fit(X_tr, y_tr)
pred = pipe.predict(X_te)
mse = mean_squared_error(y_te, pred)
rmse = mse**0.5  # computed by hand instead of relying on squared=False
mae = mean_absolute_error(y_te, pred)
r2 = r2_score(y_te, pred)
pd.DataFrame([{"rmse": rmse, "mae": mae, "r2": r2, "n_train": len(X_tr), "n_test": len(X_te)}]).to_csv("overall_model_perf.csv", index=False, encoding="utf-8-sig")

# 6) 레시피코드 그룹 성능(그룹-아웃 검증 유사)
# 간단 버전: 각 그룹별 훈련셋 평균 타깃으로 베이스라인 비교 등 필요 시 확장
