# Analisis Prediksi Hasil Pertandingan Sepak Bola

Notebook ini menggunakan machine learning untuk memprediksi hasil pertandingan Premier League menggunakan tiga algoritma:
- Logistic Regression
- K-Nearest Neighbors (KNN)
- Naive Bayes

Dataset: English Premier League (1993-2023)

## Import Libraries

In [None]:
# Data manipulation and analysis
import pandas as pd
import numpy as np
from collections import defaultdict

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Machine Learning
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Display settings: default matplotlib style with the seaborn 'husl' palette
plt.style.use('default')
sns.set_palette('husl')

print('✅ Libraries berhasil diimport!')

## Load Dataset

In [None]:
# Load the dataset from GitHub (pd.read_csv fetches the URL, so this needs network access)
url = 'https://raw.githubusercontent.com/phincon/ai-ml-datasets/main/english_premier_league_dataset.csv'
df = pd.read_csv(url)

print(f'Dataset shape: {df.shape}')
print(f'Columns: {list(df.columns)}')

# Show the first rows as a sample (displayed as the cell output when run in a notebook)
df.head()

## Exploratory Data Analysis

In [None]:
# Dataset overview: size, missing values per column, target balance, goal statistics
print('=== INFORMASI DATASET ===')
print(f'Shape: {df.shape}')

missing_per_column = df.isnull().sum()
print('\nMissing values:')
print(missing_per_column)

result_distribution = df['Result'].value_counts()
print('\nDistribusi hasil pertandingan:')
print(result_distribution)

goal_summary = df[['HomeGoals', 'AwayGoals']].describe()
print('\nStatistik gol:')
print(goal_summary)

In [None]:
# Clean the data: dropna() with its default arguments removes every row that
# has at least one missing value in any column
df = df.dropna()
print(f'Dataset shape setelah cleaning: {df.shape}')

## Feature Engineering

Selanjutnya kita akan membuat fitur-fitur untuk machine learning berdasarkan statistik tim

In [None]:
# Per-team aggregate statistics
def calculate_team_statistics(df):
    """Aggregate goal and win statistics for every team appearing in ``df``.

    Each row is read through the ``HomeTeam``, ``AwayTeam``, ``HomeGoals``,
    ``AwayGoals`` and ``Result`` columns. A match counts as a win for the
    home side when ``Result == 'H'`` and for the away side when
    ``Result == 'A'``; every other outcome is recorded as 0 (not a win).

    Returns:
        dict mapping team name to a dict with the keys ``avg_goals_scored``,
        ``avg_goals_conceded``, ``win_rate``, ``home_win_rate`` and
        ``away_win_rate``. A rate with no underlying matches (for example a
        team that never played at home) is 0.
    """
    history = defaultdict(lambda: {
        'goals_scored': [], 'goals_conceded': [], 'results': [],
        'home_results': [], 'away_results': []
    })

    def mean_or_zero(values):
        # An empty list yields 0 instead of being passed to np.mean.
        return np.mean(values) if values else 0

    for _, match in df.iterrows():
        outcome = match['Result']
        home_goals = match['HomeGoals']
        away_goals = match['AwayGoals']

        # (team, goals for, goals against, winning result code, venue-specific list)
        perspectives = (
            (match['HomeTeam'], home_goals, away_goals, 'H', 'home_results'),
            (match['AwayTeam'], away_goals, home_goals, 'A', 'away_results'),
        )
        for team, scored, conceded, win_code, venue_key in perspectives:
            won = 1 if outcome == win_code else 0
            record = history[team]
            record['goals_scored'].append(scored)
            record['goals_conceded'].append(conceded)
            record['results'].append(won)
            record[venue_key].append(won)

    # Collapse the per-match lists into averages and win rates
    return {
        team: {
            'avg_goals_scored': mean_or_zero(record['goals_scored']),
            'avg_goals_conceded': mean_or_zero(record['goals_conceded']),
            'win_rate': mean_or_zero(record['results']),
            'home_win_rate': mean_or_zero(record['home_results']),
            'away_win_rate': mean_or_zero(record['away_results']),
        }
        for team, record in history.items()
    }

# Compute the per-team statistics over the whole cleaned dataset.
# NOTE(review): this runs before the train/test split further down, so the
# features of held-out matches are built from statistics that already include
# those matches' own results; the reported accuracy may be optimistic.
team_stats = calculate_team_statistics(df)
print(f'Jumlah tim: {len(team_stats)}')

In [None]:
# Per-match feature construction
def create_features(df, team_stats):
    """Turn every match in ``df`` into a 10-value feature row plus its label.

    ``team_stats`` maps a team name to a dict with the keys
    ``avg_goals_scored``, ``avg_goals_conceded``, ``win_rate``,
    ``home_win_rate`` and ``away_win_rate``. Teams missing from the mapping
    fall back to neutral placeholder values.

    Feature order per row: home scored, home conceded, home win rate, home
    win rate at home, away scored, away conceded, away win rate, away win
    rate away, home attack minus away defence, away attack minus home defence.

    Returns:
        ``(X, y)`` where ``X`` is ``np.array`` of the feature rows and ``y``
        is ``np.array`` of the ``Result`` column values, in row order.
    """
    fallback = {
        'avg_goals_scored': 1.0,
        'avg_goals_conceded': 1.0,
        'win_rate': 0.33,
        'home_win_rate': 0.33,
        'away_win_rate': 0.33
    }

    rows = []
    labels = []

    for _, match in df.iterrows():
        home = team_stats.get(match['HomeTeam'], fallback)
        away = team_stats.get(match['AwayTeam'], fallback)

        # Matchup features: each side's attack against the opponent's defence
        home_attack_edge = home['avg_goals_scored'] - away['avg_goals_conceded']
        away_attack_edge = away['avg_goals_scored'] - home['avg_goals_conceded']

        rows.append([
            home['avg_goals_scored'],
            home['avg_goals_conceded'],
            home['win_rate'],
            home['home_win_rate'],
            away['avg_goals_scored'],
            away['avg_goals_conceded'],
            away['win_rate'],
            away['away_win_rate'],
            home_attack_edge,
            away_attack_edge,
        ])
        labels.append(match['Result'])

    return np.array(rows), np.array(labels)

# Build the feature matrix X and the target array y, one row per match in df
X, y = create_features(df, team_stats)

print(f'Shape fitur: {X.shape}')
print(f'Distribusi target:')
print(pd.Series(y).value_counts())

## Model Training dan Evaluasi

Sekarang kita akan melatih tiga model machine learning: Logistic Regression, KNN, dan Naive Bayes

In [None]:
# Split the data: 80% train / 20% test, stratified on the target so both
# parts keep the same class proportions; random_state fixes the shuffle
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Standardize the features for Logistic Regression: the scaler is fitted on
# the training split only and then applied unchanged to the test split
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f'Training set: {X_train.shape}')
print(f'Test set: {X_test.shape}')

In [None]:
# Candidate classifiers, keyed by the display name used in every report below
models = {
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
    'K-Nearest Neighbors': KNeighborsClassifier(n_neighbors=5),
    'Naive Bayes': GaussianNB()
}

# Fit each model, score it on the hold-out set and with 5-fold cross-validation
results = {}

for model_name, model in models.items():
    print(f'\n--- {model_name} ---')

    # Only Logistic Regression is given the standardized matrices.
    # NOTE(review): KNN is distance-based but is trained on unscaled features
    # here; predict_match applies the same rule, so change both together.
    uses_scaled = model_name == 'Logistic Regression'
    X_train_model = X_train_scaled if uses_scaled else X_train
    X_test_model = X_test_scaled if uses_scaled else X_test

    model.fit(X_train_model, y_train)
    y_pred = model.predict(X_test_model)

    # Hold-out accuracy plus cross-validation on the training split
    accuracy = accuracy_score(y_test, y_pred)
    cv_scores = cross_val_score(model, X_train_model, y_train, cv=5)
    cv_mean = cv_scores.mean()
    cv_std = cv_scores.std()

    results[model_name] = {
        'accuracy': accuracy,
        'cv_mean': cv_mean,
        'cv_std': cv_std,
        'predictions': y_pred
    }

    print(f'Test Accuracy: {accuracy:.4f}')
    print(f'Cross-validation Score: {cv_mean:.4f} (+/- {cv_std * 2:.4f})')
    print('\nClassification Report:')
    print(classification_report(y_test, y_pred))

## Visualisasi Hasil

Mari kita visualisasikan performa model dan analisis data

In [None]:
# Build a 2x2 dashboard comparing the trained models
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
fig.suptitle('Soccer Result Prediction - Model Comparison', fontsize=16, fontweight='bold')

# Human-readable names for the result codes stored in the target
result_labels = {'H': 'Home Win', 'A': 'Away Win', 'D': 'Draw'}

# 1. Model accuracy comparison: hold-out accuracy next to the CV mean
model_names = list(results.keys())
accuracies = [results[name]['accuracy'] for name in model_names]
cv_means = [results[name]['cv_mean'] for name in model_names]

x_pos = np.arange(len(model_names))
width = 0.35

axes[0, 0].bar(x_pos - width/2, accuracies, width, label='Test Accuracy', alpha=0.8)
axes[0, 0].bar(x_pos + width/2, cv_means, width, label='CV Mean', alpha=0.8)
axes[0, 0].set_xlabel('Models')
axes[0, 0].set_ylabel('Accuracy')
axes[0, 0].set_title('Model Accuracy Comparison')
axes[0, 0].set_xticks(x_pos)
axes[0, 0].set_xticklabels(model_names, rotation=45)
axes[0, 0].legend()
axes[0, 0].grid(True, alpha=0.3)

# 2. Target distribution
# value_counts() orders its index by frequency, so the slice labels are
# derived from that index instead of assuming a fixed H/A/D order.
target_counts = pd.Series(y).value_counts()
pie_labels = [result_labels.get(code, str(code)) for code in target_counts.index]
axes[0, 1].pie(target_counts.values, labels=pie_labels,
               autopct='%1.1f%%', startangle=90)
axes[0, 1].set_title('Match Result Distribution')

# 3. Confusion matrix for the model with the highest test accuracy
best_model = max(results.keys(), key=lambda x: results[x]['accuracy'])
best_predictions = results[best_model]['predictions']
# Pass the class order explicitly and build the tick labels from the same
# list, so the axes cannot drift from the matrix rows/columns.
class_order = sorted(set(y_test) | set(best_predictions))
class_names = [result_labels.get(code, str(code)) for code in class_order]
cm = confusion_matrix(y_test, best_predictions, labels=class_order)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names,
            yticklabels=class_names,
            ax=axes[1, 0])
axes[1, 0].set_title(f'Confusion Matrix - {best_model}')
axes[1, 0].set_xlabel('Predicted')
axes[1, 0].set_ylabel('Actual')

# 4. Feature importance (Logistic Regression only): mean absolute coefficient
#    across the classes, one bar per feature
if 'Logistic Regression' in results:
    lr_model = models['Logistic Regression']
    if hasattr(lr_model, 'coef_'):
        # Same order as the feature vector built in create_features
        feature_names = [
            'Home_Avg_Goals_Scored', 'Home_Avg_Goals_Conceded', 'Home_Win_Rate', 'Home_Home_Win_Rate',
            'Away_Avg_Goals_Scored', 'Away_Avg_Goals_Conceded', 'Away_Win_Rate', 'Away_Away_Win_Rate',
            'Home_Attack_vs_Away_Defense', 'Away_Attack_vs_Home_Defense'
        ]
        feature_importance = np.mean(np.abs(lr_model.coef_), axis=0)
        feature_df = pd.DataFrame({
            'feature': feature_names,
            'importance': feature_importance
        }).sort_values('importance', ascending=True)

        axes[1, 1].barh(feature_df['feature'], feature_df['importance'])
        axes[1, 1].set_title('Feature Importance (Logistic Regression)')
        axes[1, 1].set_xlabel('Average Absolute Coefficient')

plt.tight_layout()
plt.show()

In [None]:
# Model performance summary table
separator = '=' * 50
print(separator)
print('MODEL COMPARISON SUMMARY')
print(separator)
for model_name, result in results.items():
    test_accuracy = result['accuracy']
    cv_score = result['cv_mean']
    print(f'{model_name:20} | Accuracy: {test_accuracy:.4f} | CV Score: {cv_score:.4f}')

# The best model is the one with the highest hold-out accuracy
best_model = max(results, key=lambda name: results[name]['accuracy'])
best_accuracy = results[best_model]['accuracy']
print(f'\nModel terbaik: {best_model} dengan accuracy: {best_accuracy:.4f}')

## Prediksi Pertandingan

Sekarang mari kita coba memprediksi hasil pertandingan tertentu

In [None]:
# Single-fixture prediction
def predict_match(home_team, away_team, models, team_stats, scaler):
    """Print every model's predicted outcome for one fixture.

    Args:
        home_team: Name of the home side.
        away_team: Name of the away side.
        models: Mapping of display name to a fitted classifier exposing
            ``predict`` and, optionally, ``predict_proba``.
        team_stats: Mapping of team name to its statistics dict (keys
            ``avg_goals_scored``, ``avg_goals_conceded``, ``win_rate``,
            ``home_win_rate``, ``away_win_rate``). Unknown teams fall back
            to neutral placeholder values.
        scaler: Fitted transformer applied to the features of the model
            named ``'Logistic Regression'`` only.

    Returns:
        None. The prediction and, when available, the class probabilities
        are printed.
    """
    print(f'\nPrediksi untuk {home_team} vs {away_team}')

    default_stats = {
        'avg_goals_scored': 1.0,
        'avg_goals_conceded': 1.0,
        'win_rate': 0.33,
        'home_win_rate': 0.33,
        'away_win_rate': 0.33
    }

    # Look up the team statistics
    home_stats = team_stats.get(home_team, default_stats)
    away_stats = team_stats.get(away_team, default_stats)

    # Single-row feature matrix, same feature order as create_features
    feature_vector = np.array([[
        home_stats['avg_goals_scored'],
        home_stats['avg_goals_conceded'],
        home_stats['win_rate'],
        home_stats['home_win_rate'],
        away_stats['avg_goals_scored'],
        away_stats['avg_goals_conceded'],
        away_stats['win_rate'],
        away_stats['away_win_rate'],
        home_stats['avg_goals_scored'] - away_stats['avg_goals_conceded'],
        away_stats['avg_goals_scored'] - home_stats['avg_goals_conceded'],
    ]])

    result_mapping = {'H': 'Home Win', 'A': 'Away Win', 'D': 'Draw'}

    for model_name, model in models.items():
        # Only Logistic Regression was trained on standardized features
        if model_name == 'Logistic Regression':
            model_input = scaler.transform(feature_vector)
        else:
            model_input = feature_vector

        pred = model.predict(model_input)[0]
        prob = model.predict_proba(model_input)[0] if hasattr(model, 'predict_proba') else None

        # Unknown codes are printed as-is instead of raising KeyError
        print(f'{model_name}: {result_mapping.get(pred, pred)}')
        if prob is not None:
            # predict_proba columns follow model.classes_; pair them explicitly
            # rather than assuming a fixed A/D/H order. A class the model never
            # saw during training is reported with probability 0.
            class_labels = getattr(model, 'classes_', ('A', 'D', 'H'))
            prob_dict = dict(zip(class_labels, prob))
            home_p = prob_dict.get('H', 0.0)
            draw_p = prob_dict.get('D', 0.0)
            away_p = prob_dict.get('A', 0.0)
            print(f'  Probabilitas - Home: {home_p:.3f}, Draw: {draw_p:.3f}, Away: {away_p:.3f}')

# List every team that appears as a home or an away side, alphabetically
home_names = set(df['HomeTeam'].unique())
away_names = set(df['AwayTeam'].unique())
teams = sorted(home_names | away_names)
print('Tim yang tersedia dalam dataset:')
print('=' * 50)
# Three left-aligned, 20-character columns per row; each row starts on a new line
for row_start in range(0, len(teams), 3):
    print()
    print(''.join(f'{team:<20}' for team in teams[row_start:row_start + 3]), end='')
print('\n')

In [None]:
# Example predictions
print('🔮 Contoh Prediksi Pertandingan')
print('=' * 40)

# A few classic fixtures as (home, away) pairs
example_matches = [
    ('Arsenal', 'Liverpool'),
    ('Man United', 'Chelsea'),
    ('Tottenham', 'Newcastle')
]

for home, away in example_matches:
    # Fixtures with a team that is not in the dataset are skipped silently
    if home not in teams or away not in teams:
        continue
    predict_match(home, away, models, team_stats, scaler)
    print('-' * 30)

## Kesimpulan

### Temuan Utama:

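1. **Logistic Regression** biasanya memberikan performa terbaik dengan akurasi sekitar 52% (angka pastinya bergantung pada data dan pembagian train/test; lihat ringkasan performa model di atas)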
2. **Prediksi sepak bola sangat menantang** - bahkan akurasi 50%+ sudah cukup baik!
3. **Home advantage** adalah faktor nyata dalam prediksi
4. **Statistik tim** seperti gol yang dicetak/kebobolan dan win rate adalah fitur penting

### Langkah Selanjutnya:

- Menambah fitur lebih banyak (performa terkini, head-to-head records)
- Mencoba ensemble methods
- Memasukkan data level pemain
- Menggunakan pendekatan deep learning

### Ingat:
Sepak bola memang tidak dapat diprediksi - itulah yang membuatnya indah! ⚽🌟

In [None]:
# Interactive prediction - edit the two team names below
home_team_input = 'Arsenal'  # home side of your choice
away_team_input = 'Liverpool'  # away side of your choice

# Both names must exist in the dataset's team list before predicting
if home_team_input not in teams or away_team_input not in teams:
    print('❌ Silakan masukkan nama tim yang valid dari dataset!')
else:
    predict_match(home_team_input, away_team_input, models, team_stats, scaler)