In [228]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
# import seaborn as sns

df = pd.read_csv("PremierLeagueMatches.csv")

df

Unnamed: 0,Matchday,Date,Time,Home Team,homeScore,homeXG,awayScore,awayXG,Away Team,Attendance,Referee,Stadium,Result,*Additional Stats
0,1,2022-08-05,20:00,Crystal Palace,0.0,1.2,2.0,1.0,Arsenal,25286,Anthony Taylor,Selhurst Park,A,https://fbref.com//en/matches/e62f6e78/Crystal...
1,1,2022-08-06,12:30,Fulham,2.0,1.2,2.0,1.2,Liverpool,22207,Andy Madley,Craven Cottage,D,https://fbref.com//en/matches/6713c1dc/Fulham-...
2,1,2022-08-06,15:00,Tottenham,4.0,1.5,1.0,0.5,Southampton,61732,Andre Marriner,Tottenham Hotspur Stadium,H,https://fbref.com//en/matches/09d8a999/Tottenh...
3,1,2022-08-06,15:00,Newcastle Utd,2.0,1.7,0.0,0.3,Nott'ham Forest,52245,Simon Hooper,St James' Park,H,https://fbref.com//en/matches/1ac96eb4/Newcast...
4,1,2022-08-06,15:00,Leeds United,2.0,0.8,1.0,1.3,Wolves,36347,Robert Jones,Elland Road,H,https://fbref.com//en/matches/82702941/Leeds-U...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1135,38,2025-05-25,16:00,Fulham,,,,,Manchester City,,,Craven Cottage,,
1136,38,2025-05-25,16:00,Nott'ham Forest,,,,,Chelsea,,,The City Ground,,
1137,38,2025-05-25,16:00,Manchester Utd,,,,,Aston Villa,,,Old Trafford,,
1138,38,2025-05-25,16:00,Wolves,,,,,Brentford,,,Molineux Stadium,,


### Test

### Fill null data

In [229]:
# df[df.isnull().any(axis=1)][['homeXG', 'awayXG']]
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy='mean')
df[['homeXG','awayXG']] = imputer.fit_transform(df[['homeXG','awayXG']])


### Prepare Data

In [230]:
# Cleaning
from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder()

df['Attendance'] = df['Attendance'].str.replace(',', '').astype(float)
df['HomeWin'] = df['Result'].apply(lambda x: 0 if x == 'H' else 1 )
# df['Referee'] = encoder.fit_transform(df['Referee'])

# Select features
features = ['homeXG', 'awayXG','Attendance']
target = ['HomeWin']

df_cleaned = df[features + target].dropna()


### Training Data

In [231]:
# Train-test split
X = df_cleaned[features]
y = df_cleaned[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Decision Tree Classifier
model = DecisionTreeClassifier(criterion='entropy',max_depth=4,splitter='best',random_state=42)
model.fit(X_train, y_train)

0,1,2
,criterion,'entropy'
,splitter,'best'
,max_depth,4
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,42
,max_leaf_nodes,
,min_impurity_decrease,0.0


### Evaluation

In [232]:
# # Predict
y_pred = model.predict(X_test)
print("Classification Report:")
print(classification_report(y_test, y_pred))


# Show 5 incorrect predictions
X_test_results = X_test.copy()
X_test_results["Actual"] = y_test
X_test_results["Predicted"] = y_pred
incorrect_preds = X_test_results[X_test_results["Actual"] != X_test_results["Predicted"]]
print("\n5 Incorrect Predictions:")
print(incorrect_preds.head(5))

Classification Report:
              precision    recall  f1-score   support

           0       0.66      0.59      0.63        96
           1       0.68      0.74      0.71       113

    accuracy                           0.67       209
   macro avg       0.67      0.67      0.67       209
weighted avg       0.67      0.67      0.67       209


5 Incorrect Predictions:
     homeXG  awayXG  Attendance  Actual  Predicted
459     0.4     0.5     60233.0       0          1
550     0.9     0.6     41651.0       1          0
584     1.2     1.2     17077.0       0          1
612     1.6     1.8     24271.0       0          1
794     0.9     0.4     60344.0       1          0


### Chart of Decision Tree

## Prediction

In [235]:
df2 = pd.read_csv('./xG7.csv')
home = "Manchester Utd"
away = "Sunderland"

# get xg score by team
get_xg = lambda team: (df2[df2['team'] == team]['xg']).values[0]

homeXG = get_xg(home)
awayXG = get_xg(away)

label = ['homeXG','awayXG','Attendance']
data = [homeXG, awayXG, 1_123]

unseen_df = pd.DataFrame([data], columns=label)
y_pred = model.predict(unseen_df)
y_pred

array([1])