In [86]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Location of the input CSV (relative path, resolved against the working directory)
file_path = '../output/matches_with_players.csv'

# Columns removed before modelling; anything not listed here stays in `df`.
# Left in place (not dropped): 'Score', 'xScore', and the Home/Away variants of
# 'Avg Points', 'Avg Goals For', 'Avg Goals Against', 'Matches Played',
# 'Points/Match', 'Form Points', 'Form Goals For', 'Form Goals Against',
# 'Head-to-Head Points', 'Head-to-Head Goals For', 'Head-to-Head Goals Against',
# 'Avg Players Score' and 'Star Player Count'.
columns_to_drop = [
    'Game ID', 'Date', 'G Home', 'G Away',
    'Players Found %', 'Match Link', 'id',
    'Wk', 'Day', 'Time', 'League', 'Season',
    'Home Points', 'Away Points',
    'Home', 'Away',
    'xG Home', 'xG Away',
    'home_elo', 'away_elo',
    'xScoreElo',
    'B365H', 'B365D', 'B365A',
]

# Read the CSV and strip the excluded columns in one chained expression.
# 'Home' and 'Away' are dropped outright here rather than one-hot encoded.
df = pd.read_csv(file_path).drop(columns=columns_to_drop)


In [82]:
# Display the column labels currently held by `df`.
# NOTE(review): this cell's execution count (82) is lower than the load cell's (86),
# so the saved listing may predate the latest load — re-run top to bottom to confirm.
df.columns

Index(['Score', 'xScore', 'Home Avg Points', 'Away Avg Points',
       'Home Avg Goals For', 'Away Avg Goals For', 'Home Avg Goals Against',
       'Away Avg Goals Against', 'Home Matches Played', 'Away Matches Played',
       'Home Points/Match', 'Away Points/Match', 'Home Form Points',
       'Away Form Points', 'Home Form Goals For', 'Away Form Goals For',
       'Home Form Goals Against', 'Away Form Goals Against',
       'Home Head-to-Head Points', 'Away Head-to-Head Points',
       'Home Head-to-Head Goals For', 'Away Head-to-Head Goals For',
       'Home Head-to-Head Goals Against', 'Away Head-to-Head Goals Against',
       'Home Avg Players Score', 'Away Avg Players Score',
       'Home Star Player Count', 'Away Star Player Count'],
      dtype='object')

In [85]:
# 'Score' is the label; every other column still in `df` is used as a feature
y = df['Score']
X = df.drop(columns='Score')

# Positional hold-out with no shuffling: the first 80% of rows train the model,
# the remaining 20% are held back for testing.
# NOTE(review): this assumes the CSV row order is meaningful (e.g. chronological) — confirm.
train_size = int(0.8 * len(df))
X_train = X.iloc[:train_size]
X_test = X.iloc[train_size:]
y_train = y.iloc[:train_size]
y_test = y.iloc[train_size:]

# Default random forest with a fixed seed so repeated runs give the same model
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

# Predict on the held-out rows and report plain accuracy
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")


Accuracy: 0.51


In [84]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler

# Load the data
file_path = '../output/matches_with_players.csv'
df = pd.read_csv(file_path)

# Drop unnecessary columns. Every column not listed here is used as a feature
# (apart from 'Score', which is the target).
# NOTE(review): unlike the baseline cell earlier in this notebook, this list does
# NOT drop 'xG Home', 'xG Away', 'home_elo', 'away_elo', 'xScoreElo' or
# 'B365H'/'B365D'/'B365A', so the two reported accuracies are computed on
# different feature sets. If the xG columns describe the match being predicted
# they would leak the outcome into the features — confirm before comparing.
columns_to_drop = [
    'Game ID', 'Date', 'G Home', 'G Away', 'Players Found %',
    'Match Link', 'id', 'Wk', 'Day', 'Time', 'League', 'Season',
    'Home Points', 'Away Points', 'Home', 'Away',
]

df = df.drop(columns=columns_to_drop)

# 'Home' and 'Away' are dropped above; to use them as features instead, remove
# them from `columns_to_drop` and one-hot encode:
# df = pd.get_dummies(df, columns=['Home', 'Away'], drop_first=True)

# Separate features and target (X stays an unscaled DataFrame)
X = df.drop('Score', axis=1)
y = df['Score']

# Split the data by using the last 20% as test data.
# The split happens BEFORE scaling so that no test-set statistics reach the
# preprocessing step; .iloc makes the positional slicing explicit.
train_size = int(0.8 * len(df))
X_train, X_test = X.iloc[:train_size], X.iloc[train_size:]
y_train, y_test = y.iloc[:train_size], y.iloc[train_size:]

# Normalize features: learn mean/variance from the training rows only, then
# apply that same transform to the held-out rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Hyperparameter tuning with GridSearchCV
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

clf = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=5, n_jobs=-1, scoring='accuracy')

# Train the classifier (5-fold cross-validated search over `param_grid`, training rows only)
grid_search.fit(X_train, y_train)

# Get the best model (refit on the full training split by GridSearchCV's default refit=True)
best_clf = grid_search.best_estimator_

# Make predictions
y_pred = best_clf.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")


Accuracy: 0.58
