In [None]:
# -*- coding: utf-8 -*-
import os
from collections import deque

import numpy as np
import pandas as pd
import psycopg2

# =============================================================
# 1. DB 설정 및 데이터 로딩 (이전과 동일)
# =============================================================
# Keyword arguments handed to psycopg2.connect() by get_db_connection().
# Every value can be overridden through the corresponding standard libpq
# environment variable so that credentials do not have to be committed to
# source control; the literals are the previously hard-coded values and are
# kept as fallbacks so existing local setups keep working unchanged.
DB_CONFIG = {
    "dbname": os.environ.get("PGDATABASE", "laions_db"),
    "user": os.environ.get("PGUSER", "postgres"),
    "password": os.environ.get("PGPASSWORD", "1111"),
    "host": os.environ.get("PGHOST", "localhost"),
    "port": os.environ.get("PGPORT", "5432"),
    "client_encoding": os.environ.get("PGCLIENTENCODING", "UTF8"),
}

def get_db_connection():
    """Open a new database connection from the module-level ``DB_CONFIG``.

    Returns:
        The connection object produced by ``psycopg2.connect``; the caller
        is responsible for closing it.
    """
    connection = psycopg2.connect(**DB_CONFIG)
    return connection

def load_all_years_data():
    """Load every game from ``kbo_cleaned_games`` and derive the home result.

    The query is executed on a plain DB-API cursor instead of
    ``pd.read_sql``: pandas only officially supports SQLAlchemy connectables
    (or sqlite3) there and emits a ``UserWarning`` for a raw psycopg2
    connection.

    Returns:
        DataFrame ordered by ``game_date`` with the columns ``game_id``,
        ``game_date`` (converted with ``pd.to_datetime``), ``home_team``,
        ``away_team``, ``home_score``, ``away_score`` and ``result``.
        ``result`` is from the home team's point of view: 1 when the home
        score is higher, 0 when the away score is higher, 0.5 otherwise.

    Raises:
        Whatever the database driver raises when connecting or querying;
        the connection is always closed before the error propagates.
    """
    sql_query = """
            SELECT game_id, game_date, home_team, away_team, home_score, away_score
            FROM kbo_cleaned_games ORDER BY game_date;
        """
    conn = None
    try:
        conn = get_db_connection()
        cursor = conn.cursor()
        try:
            cursor.execute(sql_query)
            # First item of each DB-API description entry is the column name.
            column_names = [column[0] for column in cursor.description]
            rows = cursor.fetchall()
        finally:
            cursor.close()
    finally:
        if conn is not None:
            conn.close()

    # coerce_float=True matches pd.read_sql's default handling of
    # non-float numeric values (e.g. decimal.Decimal).
    df = pd.DataFrame.from_records(rows, columns=column_names, coerce_float=True)
    df['game_date'] = pd.to_datetime(df['game_date'])

    # Anything that is neither a home win nor an away win (ties, or rows
    # whose scores do not compare) falls through to the 0.5 default.
    home_won = df['home_score'] > df['away_score']
    away_won = df['away_score'] > df['home_score']
    df['result'] = np.select([home_won, away_won], [1.0, 0.0], default=0.5)

    print(f"✅ 모든 연도 데이터 로드 완료: 총 {len(df)} 경기")
    return df

# =============================================================
# 2. 피처 엔지니어링 함수들 (피타고리안 포함)
# =============================================================

def calculate_elo(df, initial_elo=1500, k_factor=20):
    """Append each team's pre-game Elo rating to every game row.

    Rows are processed in the order they appear in ``df``, so the caller
    should pass games sorted chronologically. Ratings are never reset, i.e.
    they carry over across the whole frame.

    Args:
        df: DataFrame with ``home_team``, ``away_team`` and ``result``
            columns, where ``result`` is the home team's outcome
            (1 win, 0 loss, 0.5 tie).
        initial_elo: Rating assigned to every team before its first game.
        k_factor: Multiplier applied to (actual - expected) on each update.

    Returns:
        A new DataFrame (index reset to 0..n-1) containing all columns of
        ``df`` plus ``home_elo_before`` and ``away_elo_before``. Both
        columns are present even when ``df`` has no rows.
    """
    teams = pd.unique(df[['home_team', 'away_team']].values.ravel())
    elo_ratings = {team: initial_elo for team in teams}
    elo_history = []

    # Iterating the three needed columns in lockstep avoids the per-row
    # Series construction of DataFrame.iterrows().
    for home, away, result in zip(df['home_team'], df['away_team'], df['result']):
        R_home, R_away = elo_ratings[home], elo_ratings[away]
        # Record the ratings *before* the game so the feature cannot leak
        # the outcome of the game it describes.
        elo_history.append({'home_elo_before': R_home, 'away_elo_before': R_away})

        E_home = 1 / (1 + 10 ** ((R_away - R_home) / 400))
        elo_ratings[home] = R_home + k_factor * (result - E_home)
        # The away side's actual and expected scores are the complements
        # of the home side's.
        elo_ratings[away] = R_away + k_factor * ((1 - result) - (1 - E_home))

    print("✅ Elo Rating 계산 완료.")
    # Explicit column names keep the output schema stable for empty input.
    elo_columns = pd.DataFrame(elo_history, columns=['home_elo_before', 'away_elo_before'])
    return pd.concat([df.reset_index(drop=True), elo_columns], axis=1)

def calculate_advanced_features(df, n_games=10):
    """Append rest-day and recent-form features to every game row.

    Rows are processed in the order they appear in ``df``; all features for
    a row are computed from earlier rows only.

    Args:
        df: DataFrame with ``home_team``, ``away_team``, ``game_date``
            (datetime-like values supporting subtraction) and ``result``
            (home team's outcome: 1 win, 0 loss, 0.5 tie).
        n_games: Size of the sliding window used for the form average.

    Returns:
        A new DataFrame (index reset to 0..n-1) containing all columns of
        ``df`` plus:

        * ``home_rest`` / ``away_rest``: whole days since the team's
          previous game, or 10 when the team has no previous game.
        * ``home_form`` / ``away_form``: mean result of the team over its
          last ``n_games`` games, or 0 when it has no previous game.

        The four columns are present even when ``df`` has no rows.
    """
    feature_names = ['home_rest', 'away_rest', 'home_form', 'away_form']
    teams = pd.unique(df[['home_team', 'away_team']].values.ravel())
    last_played_date = {team: pd.NaT for team in teams}
    # deque(maxlen=...) silently drops the oldest result once the window
    # is full.
    recent_results = {team: deque(maxlen=n_games) for team in teams}
    advanced_features = []

    def rest_days(team, game_date):
        previous = last_played_date[team]
        return (game_date - previous).days if pd.notna(previous) else 10

    def recent_form(team):
        history = recent_results[team]
        # NOTE(review): a team without history gets 0 (same as an all-loss
        # streak); kept as-is to preserve the existing feature values.
        return sum(history) / len(history) if history else 0

    columns = zip(df['home_team'], df['away_team'], df['game_date'], df['result'])
    for home, away, game_date, result in columns:
        advanced_features.append({
            'home_rest': rest_days(home, game_date),
            'away_rest': rest_days(away, game_date),
            'home_form': recent_form(home),
            'away_form': recent_form(away),
        })
        # Update state only after the features were recorded so the current
        # game never influences its own features.
        last_played_date[home] = last_played_date[away] = game_date
        recent_results[home].append(result)
        recent_results[away].append(1 - result)

    print(f"✅ 휴식일 및 최근 {n_games}경기 성적 계산 완료.")
    # Explicit column names keep the output schema stable for empty input.
    feature_frame = pd.DataFrame(advanced_features, columns=feature_names)
    return pd.concat([df.reset_index(drop=True), feature_frame], axis=1)

def calculate_pythagorean_win_pct(df, exponent=2):
    """Append each team's pre-game Pythagorean expected win percentage.

    The expectation is ``RS**exponent / (RS**exponent + RA**exponent)``
    where RS / RA are the runs the team scored / allowed in all earlier
    rows of ``df``. The totals are cumulative over the whole frame (they
    are never reset), and rows are processed in the order given.

    Args:
        df: DataFrame with ``home_team``, ``away_team``, ``home_score`` and
            ``away_score`` columns.
        exponent: Exponent of the Pythagorean formula. Defaults to 2, the
            value that was previously hard-coded.

    Returns:
        A new DataFrame (index reset to 0..n-1) containing all columns of
        ``df`` plus ``home_pythagorean`` and ``away_pythagorean``. A team
        that has neither scored nor allowed a run yet gets 0.5. Both
        columns are present even when ``df`` has no rows.
    """
    teams = pd.unique(df[['home_team', 'away_team']].values.ravel())
    runs_scored = {team: 0 for team in teams}
    runs_allowed = {team: 0 for team in teams}
    pythagorean_features = []

    def expected_win_pct(team):
        scored, allowed = runs_scored[team], runs_allowed[team]
        # With both totals at zero the formula would divide by zero, so a
        # neutral 0.5 is used instead.
        if scored > 0 or allowed > 0:
            scored_pow = scored ** exponent
            return scored_pow / (scored_pow + allowed ** exponent)
        return 0.5

    columns = zip(df['home_team'], df['away_team'], df['home_score'], df['away_score'])
    for home, away, home_score, away_score in columns:
        pythagorean_features.append({
            'home_pythagorean': expected_win_pct(home),
            'away_pythagorean': expected_win_pct(away),
        })
        # Totals are updated after recording so the current game's score
        # does not leak into its own feature.
        runs_scored[home] += home_score
        runs_allowed[home] += away_score
        runs_scored[away] += away_score
        runs_allowed[away] += home_score

    print("✅ 피타고리안 기대 승률 계산 완료.")
    # Explicit column names keep the output schema stable for empty input.
    feature_frame = pd.DataFrame(
        pythagorean_features, columns=['home_pythagorean', 'away_pythagorean']
    )
    return pd.concat([df.reset_index(drop=True), feature_frame], axis=1)

# --- ⚠️ `create_samsung_dataset` 함수 수정 ---
def create_samsung_dataset(df, team_name="삼성"):
    """Re-express the game table from one team's point of view.

    Only games in which ``team_name`` is the home or away team are kept.
    Home/away feature pairs are mapped to ``samsung_*`` (the selected team)
    and ``opponent_*`` columns; the output column names are fixed and do
    not change with ``team_name``.

    Args:
        df: DataFrame containing ``game_id``, ``game_date``, ``home_team``,
            ``away_team``, ``result`` and the ``home_*`` / ``away_*``
            feature columns ``elo_before`` (as ``home_elo_before`` /
            ``away_elo_before``), ``rest``, ``form`` and ``pythagorean``.
        team_name: Team whose perspective is used.

    Returns:
        DataFrame with the columns ``game_id``, ``game_date``,
        ``samsung_win`` (int, 1 = the team won), ``samsung_elo``,
        ``opponent_elo``, ``rest_diff`` (team rest minus opponent rest),
        ``samsung_form``, ``opponent_form``, ``samsung_pythagorean`` and
        ``opponent_pythagorean``. Games whose outcome is 0.5 (ties) are
        dropped.
    """
    involves_team = (df['home_team'] == team_name) | (df['away_team'] == team_name)
    team_games = df[involves_team].copy()
    is_home_game = team_games['home_team'] == team_name

    def from_team_view(home_col, away_col):
        # Take the home column when the team played at home, else the away one.
        return np.where(is_home_game, team_games[home_col], team_games[away_col])

    team_games['samsung_elo'] = from_team_view('home_elo_before', 'away_elo_before')
    team_games['opponent_elo'] = from_team_view('away_elo_before', 'home_elo_before')
    team_games['samsung_form'] = from_team_view('home_form', 'away_form')
    team_games['opponent_form'] = from_team_view('away_form', 'home_form')
    team_games['samsung_pythagorean'] = from_team_view('home_pythagorean', 'away_pythagorean')
    team_games['opponent_pythagorean'] = from_team_view('away_pythagorean', 'home_pythagorean')

    # Rest days are only exposed as a single difference feature.
    samsung_rest = from_team_view('home_rest', 'away_rest')
    opponent_rest = from_team_view('away_rest', 'home_rest')
    team_games['rest_diff'] = samsung_rest - opponent_rest

    # ``result`` is the home team's outcome, so flip it for away games.
    team_games['samsung_win'] = np.where(
        is_home_game, team_games['result'], 1 - team_games['result']
    )
    # Explicit copy: assigning into the boolean-filtered frame directly is
    # the chained-assignment pattern that triggers SettingWithCopyWarning.
    team_games = team_games[team_games['samsung_win'] != 0.5].copy()
    team_games['samsung_win'] = team_games['samsung_win'].astype(int)

    final_cols = [
        'game_id',
        'game_date', 'samsung_win', 'samsung_elo', 'opponent_elo',
        'rest_diff',
        'samsung_form', 'opponent_form',
        'samsung_pythagorean', 'opponent_pythagorean',
    ]
    print(f"✅ {team_name} 라이온즈 맞춤형 데이터셋 생성 완료. ('game_id' 최종 포함)")
    return team_games[final_cols]

# =============================================================
# 3. 메인 파이프라인 실행
# =============================================================
def run_full_pipeline_and_split(test_year=2025):
    """Run the whole feature pipeline and split it into train/test sets.

    Steps: load all games, add Elo ratings, rest/form features and
    Pythagorean expectation, rebuild the table from the "삼성" team's point
    of view, then split on the calendar year of ``game_date``. Both splits
    are also written to ``samsung_train_dataset.csv`` and
    ``samsung_test_dataset.csv`` in the current working directory.

    Args:
        test_year: Games played in this year form the test set; games from
            earlier years form the training set. Games from later years
            end up in neither. Defaults to 2025, the previously hard-coded
            value.

    Returns:
        ``(train_df, test_df)``. When no games could be loaded, a pair of
        empty DataFrames is returned (and no CSV is written) so that
        callers can always unpack the result.
    """
    all_games_df = load_all_years_data()
    if all_games_df.empty:
        # Previously a bare ``return`` (None) which broke tuple unpacking
        # at the call site.
        return pd.DataFrame(), pd.DataFrame()

    elo_df = calculate_elo(all_games_df)
    advanced_df = calculate_advanced_features(elo_df)
    pythagorean_df = calculate_pythagorean_win_pct(advanced_df)
    samsung_df_all_years = create_samsung_dataset(pythagorean_df, team_name="삼성")

    game_year = samsung_df_all_years['game_date'].dt.year
    train_df = samsung_df_all_years[game_year < test_year]
    test_df = samsung_df_all_years[game_year == test_year]

    print("\n--- 📊 데이터 스플릿 완료 ---")
    print(f"훈련 데이터: {len(train_df)} 경기, 테스트 데이터: {len(test_df)} 경기")

    # utf-8-sig writes a BOM at the start of each file.
    train_df.to_csv("samsung_train_dataset.csv", index=False, encoding='utf-8-sig')
    test_df.to_csv("samsung_test_dataset.csv", index=False, encoding='utf-8-sig')
    print("✅ 훈련 및 테스트 데이터셋이 CSV 파일로 저장되었습니다.")

    return train_df, test_df

# --- 최종 실행 ---
if __name__ == "__main__":
    # The pipeline can exit early without producing datasets, in which case
    # it may hand back None; fall back to a (None, None) pair so the
    # unpacking below never raises TypeError.
    pipeline_output = run_full_pipeline_and_split()
    train_dataset, test_dataset = (
        pipeline_output if pipeline_output is not None else (None, None)
    )

  df = pd.read_sql(SQL_QUERY, conn)


✅ 모든 연도 데이터 로드 완료: 총 2880 경기
✅ Elo Rating 계산 완료.
✅ 휴식일 및 최근 10경기 성적 계산 완료.
✅ 피타고리안 기대 승률 계산 완료.
✅ 삼성 라이온즈 맞춤형 데이터셋 생성 완료. ('game_id' 최종 포함)

--- 📊 데이터 스플릿 완료 ---
훈련 데이터: 427 경기, 테스트 데이터: 142 경기
✅ 훈련 및 테스트 데이터셋이 CSV 파일로 저장되었습니다.
