# ĐỒ ÁN: DỰ ĐOÁN KẾT QUẢ TENNIS ATP & HỆ THỐNG HYBRID ELO

---

## 1. Cài đặt thư viện và Cấu hình

In [None]:
import pandas as pd
import numpy as np
import glob
import os
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, log_loss, roc_auc_score, roc_curve, auc, confusion_matrix, ConfusionMatrixDisplay

# Display configuration
sns.set_theme(style="whitegrid")  # apply seaborn's "whitegrid" theme to subsequent plots
pd.set_option('display.max_columns', None)  # None lifts the cap on displayed columns
import warnings
# NOTE(review): a blanket 'ignore' filter hides every warning category, which
# presumably includes library convergence/deprecation warnings -- confirm this
# is intended beyond keeping the notebook output tidy.
warnings.filterwarnings('ignore')

## 2. Hệ thống Hybrid Elo (Tự xây dựng)
Đây là thuật toán tính điểm sức mạnh động dựa trên cả phong độ tổng quát và mặt sân.

In [None]:
class TennisEloModel:
    """Hybrid Elo tracker mixing an overall rating with a per-surface rating.

    Each player has one overall rating and one rating per surface. The rating
    used for predictions is ``(1 - alpha) * overall + alpha * surface``.
    Players or surfaces that have not been seen yet start at ``start_elo``.
    """

    def __init__(self, k_factor=20, surface_weight=0.5):
        """Create an empty model.

        Args:
            k_factor: Scale of the rating change applied after each match.
            surface_weight: Weight (alpha) of the surface rating in the blend.
        """
        self.k = k_factor
        self.alpha = surface_weight
        self.overall_elo = {}
        self.surface_elo = {}
        self.start_elo = 1500

    def get_elo(self, pid, surface):
        """Return ``(blended, overall, surface)`` ratings for a player.

        An empty per-surface table is registered for a player seen for the
        first time; no rating value is stored by this call.
        """
        overall_rating = self.overall_elo.get(pid, self.start_elo)
        per_surface = self.surface_elo.setdefault(pid, {})
        surface_rating = per_surface.get(surface, self.start_elo)
        blended = (1 - self.alpha) * overall_rating + self.alpha * surface_rating
        return blended, overall_rating, surface_rating

    def update(self, wid, lid, surface):
        """Apply the result of one match won by ``wid`` over ``lid``.

        The same rating change is added to the winner's overall and surface
        ratings and subtracted from the loser's.
        """
        winner_blend, winner_overall, winner_surface = self.get_elo(wid, surface)
        loser_blend, loser_overall, loser_surface = self.get_elo(lid, surface)

        # Expected score of the winner from the blended rating gap.
        expected_win = 1 / (1 + 10 ** ((loser_blend - winner_blend) / 400))
        gain = self.k * (1 - expected_win)

        self.overall_elo[wid] = winner_overall + gain
        self.overall_elo[lid] = loser_overall - gain

        self.surface_elo.setdefault(wid, {})[surface] = winner_surface + gain
        self.surface_elo.setdefault(lid, {})[surface] = loser_surface - gain

    def fit_transform(self, df):
        """Walk the matches in row order, recording pre-match ratings.

        Reads the ``p1_id``, ``p2_id``, ``surface`` and ``target`` columns.
        ``target == 1`` means player 1 won; any other value counts as a win
        for player 2. The columns ``p1_elo``, ``p2_elo`` and ``elo_diff`` are
        written onto ``df`` itself, which is also returned.
        """
        pre_match = []
        for _, match in df.iterrows():
            first, second = match['p1_id'], match['p2_id']
            court = match['surface']
            outcome = match['target']

            # Ratings are captured before the update so they never include
            # the result of the match they describe.
            rating_first = self.get_elo(first, court)[0]
            rating_second = self.get_elo(second, court)[0]
            pre_match.append((rating_first, rating_second))

            winner, loser = (first, second) if outcome == 1 else (second, first)
            self.update(winner, loser, court)

        df['p1_elo'] = [pair[0] for pair in pre_match]
        df['p2_elo'] = [pair[1] for pair in pre_match]
        df['elo_diff'] = df['p1_elo'] - df['p2_elo']
        return df

## 3. Tiền xử lý dữ liệu (Data Pipeline)
Bao gồm: Load data, Rolling Stats, Restructuring (Chống Leakage) và Clean.

In [None]:
def process_data(data_path='data'):
    """Build the modelling table from raw ATP match CSV files.

    Steps: load every ``atp_matches_*.csv`` under ``data_path``, order the
    matches by ``tourney_date`` and ``match_num``, compute pre-match rolling
    features, randomly assign winner/loser to the ``p1``/``p2`` slots, add
    Elo features via ``TennisEloModel``, then drop incomplete rows and derive
    the ``*_diff`` features.

    Args:
        data_path: Directory searched for ``atp_matches_*.csv`` files.

    Returns:
        A DataFrame with the ``p1_*``/``p2_*`` columns, ``target`` (1 when
        player 1 is the match winner, 0 otherwise), ``surface``, the Elo
        columns and the ``*_diff`` features; or ``None`` when no file matches.

    Note:
        ``np.random.seed(42)`` seeds NumPy's *global* generator, so the
        winner/loser assignment is reproducible and the global RNG state is
        altered for whatever code runs afterwards.
    """
    # 1. Load Data
    # glob gives no ordering guarantee; sorting the paths keeps the read order
    # (and hence the relative order of rows that tie on the sort keys below)
    # independent of the filesystem.
    files = sorted(glob.glob(os.path.join(data_path, 'atp_matches_*.csv')))
    if not files:
        return None
    df = pd.concat([pd.read_csv(f) for f in files], ignore_index=True)
    df['tourney_date'] = pd.to_datetime(df['tourney_date'], format='%Y%m%d')
    df = df.sort_values(['tourney_date', 'match_num']).reset_index(drop=True)

    # 2. Rolling Stats (Form)
    # last_5: player id -> results (1 win / 0 loss) of up to 5 latest matches.
    # surface_stats: player id -> surface -> [wins, matches played].
    last_5, surface_stats = {}, {}
    w_forms, l_forms, w_surfs, l_surfs = [], [], [], []

    # Only three columns are needed, so iterate over them directly instead of
    # building a full row Series per match with iterrows().
    for wid, lid, surf in zip(df['winner_id'], df['loser_id'], df['surface']):
        # Features are recorded BEFORE the current result is added, so a row
        # never contains information about its own outcome.
        w_forms.append(np.mean(last_5.get(wid, [0])))
        l_forms.append(np.mean(last_5.get(lid, [0])))

        ws = surface_stats.get(wid, {}).get(surf, [0, 0])
        w_surfs.append(ws[0] / ws[1] if ws[1] > 0 else 0)
        ls = surface_stats.get(lid, {}).get(surf, [0, 0])
        l_surfs.append(ls[0] / ls[1] if ls[1] > 0 else 0)

        # Update the histories with the current result.
        last_5.setdefault(wid, []).append(1)
        if len(last_5[wid]) > 5:
            last_5[wid].pop(0)
        last_5.setdefault(lid, []).append(0)
        if len(last_5[lid]) > 5:
            last_5[lid].pop(0)

        w_rec = surface_stats.setdefault(wid, {}).setdefault(surf, [0, 0])
        w_rec[0] += 1
        w_rec[1] += 1
        l_rec = surface_stats.setdefault(lid, {}).setdefault(surf, [0, 0])
        l_rec[1] += 1

    df['winner_form'] = w_forms
    df['loser_form'] = l_forms
    df['winner_surf'] = w_surfs
    df['loser_surf'] = l_surfs

    # 3. Restructure (Anti-Leakage)
    # The raw files always list the winner first; swapping the two players in
    # roughly half of the rows keeps the slot from encoding the label.
    np.random.seed(42)
    swap = np.random.rand(len(df)) < 0.5
    new_df = pd.DataFrame({'tourney_date': df['tourney_date']})

    feats = ['id', 'rank', 'age', 'ht', 'form', 'surf', 'seed']
    for f in feats:
        w_col, l_col = f'winner_{f}', f'loser_{f}'
        # Features whose winner column is absent from the data are skipped.
        if w_col in df.columns:
            new_df[f'p1_{f}'] = np.where(swap, df[l_col], df[w_col])
            new_df[f'p2_{f}'] = np.where(swap, df[w_col], df[l_col])

    new_df['target'] = np.where(swap, 0, 1)
    new_df['surface'] = df['surface']

    # 4. Calculate Elo
    # Runs on every match, before incomplete rows are dropped, so ratings are
    # built from the full match history.
    elo_model = TennisEloModel()
    new_df = elo_model.fit_transform(new_df)

    # 5. Clean & Engineer
    # Age is included because age_diff is derived below; a missing age would
    # otherwise leave NaN in an engineered feature.
    new_df.dropna(
        subset=['p1_rank', 'p2_rank', 'p1_ht', 'p2_ht', 'p1_age', 'p2_age'],
        inplace=True,
    )
    new_df['rank_diff'] = new_df['p1_rank'] - new_df['p2_rank']
    new_df['age_diff'] = new_df['p1_age'] - new_df['p2_age']
    new_df['ht_diff'] = new_df['p1_ht'] - new_df['p2_ht']
    new_df['form_diff'] = new_df['p1_form'] - new_df['p2_form']
    new_df['surf_diff'] = new_df['p1_surf'] - new_df['p2_surf']

    return new_df

print("Đang xử lý dữ liệu...")
df_clean = process_data()
# process_data() returns None when no matching CSV file is found; stop here
# with an explicit error instead of an AttributeError on `.shape` below.
if df_clean is None:
    raise FileNotFoundError("Không tìm thấy file 'atp_matches_*.csv' trong thư mục 'data'.")
print(f"Dữ liệu sạch: {df_clean.shape}")

## 4. Huấn luyện Mô hình Ensemble
Sử dụng Soft Voting với 4 model: Logistic Regression, Random Forest, XGBoost và SVM.

In [None]:
# Prepare Data
# Positional 80/20 split: the first 80% of rows train the models and the
# remaining rows are held out. No shuffling is applied, so the split follows
# the row order of df_clean.
train_size = int(len(df_clean) * 0.8)
features = ['rank_diff', 'elo_diff', 'age_diff', 'ht_diff', 'form_diff', 'surf_diff']

X = df_clean[features]
y = df_clean['target']

X_train, X_test = X.iloc[:train_size], X.iloc[train_size:]
y_train, y_test = y.iloc[:train_size], y.iloc[train_size:]

# Scale
# The scaler is fitted on the training rows only and then applied to the
# hold-out rows, so no hold-out statistics reach the models.
scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)
X_test_s = scaler.transform(X_test)

# Define Models
# Explicit random_state values make the random forest and the SVC probability
# calibration reproducible regardless of global RNG state or cell run order.
models = [
    ('lr', LogisticRegression()),
    ('rf', RandomForestClassifier(n_estimators=200, max_depth=10, random_state=42)),
    ('xgb', XGBClassifier(n_estimators=100, max_depth=5)),
    ('svm', SVC(probability=True, random_state=42))
]

# Voting Ensemble
# Soft voting averages the class probabilities predicted by the estimators.
voting_clf = VotingClassifier(estimators=models, voting='soft')
voting_clf.fit(X_train_s, y_train)

# Evaluate
y_pred = voting_clf.predict(X_test_s)
y_prob = voting_clf.predict_proba(X_test_s)[:, 1]  # probability of class 1

acc = accuracy_score(y_test, y_pred)
loss = log_loss(y_test, y_prob)
auc_score = roc_auc_score(y_test, y_prob)

print("--- KẾT QUẢ SOFT VOTING ---")
print(f"Accuracy: {acc:.4f}")
print(f"Log Loss: {loss:.4f}")
print(f"ROC-AUC : {auc_score:.4f}")

## 5. Trực quan hóa Kết quả

In [None]:
# ROC curve of the soft-voting ensemble on the hold-out rows, drawn against
# the diagonal chance line.
fpr, tpr, _ = roc_curve(y_test, y_prob)
roc_fig, roc_ax = plt.subplots(figsize=(8, 6))
roc_ax.plot(
    fpr,
    tpr,
    label=f'Soft Voting (AUC={auc_score:.2f})',
    color='darkorange',
    lw=2,
)
roc_ax.plot([0, 1], [0, 1], 'k--')
roc_ax.set_title('ROC Curve')
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.legend()
plt.show()

# Feature importances taken from the fitted random-forest member of the
# voting ensemble; the other estimators are not represented in this chart.
rf_model = voting_clf.named_estimators_['rf']
imps = rf_model.feature_importances_
imp_fig, imp_ax = plt.subplots(figsize=(8, 5))
imp_ax.barh(features, imps, color='skyblue')
imp_ax.set_title('Feature Importance (Random Forest Component)')
imp_ax.set_xlabel('Importance')
plt.show()