In [589]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
# import seaborn as sns

# Load the match-level dataset. Later cells read its 'xg', 'xga',
# 'attendance', 'referee' and 'result' columns.
df = pd.read_csv("final_matches.csv")


### Test

### Fill null data

In [590]:
# # df[df.isnull().any(axis=1)][['homeXG', 'awayXG']]
from sklearn.impute import SimpleImputer

# Fill missing values in the numeric columns with each column's mean.
# NOTE(review): the imputer is fitted on the whole frame, i.e. before the
# train/test split performed further down, so test rows contribute to the
# means - confirm this is acceptable.
numeric_cols = ['xg', 'xga', 'attendance']
imputer = SimpleImputer(strategy='mean')
imputer.fit(df[numeric_cols])
df[numeric_cols] = imputer.transform(df[numeric_cols])


### Encode Referee

In [591]:
# Build a deterministic referee -> integer code mapping: referee names are
# sorted, and each name receives its position in that sorted list.
referee = sorted(df['referee'].unique())
dict_ref = {name: code for code, name in enumerate(referee)}

# Store the encoded value for every match. A single vectorised lookup replaces
# the previous per-referee loop, which built a boolean mask over the whole
# frame once for every referee.
df['ref_encode'] = df['referee'].map(dict_ref)

# Display the name/code pairs, ordered by referee name, as a sanity check.
df[['referee', 'ref_encode']].sort_values(by='referee')



# Convert dict_ref to a DataFrame with columns 'referee' and 'code'
# df_ref = pd.DataFrame(list(dict_ref.items()), columns=['referee', 'code'])
# df_ref.to_csv('referee_encoding.csv', index=False)

Unnamed: 0,referee,ref_encode
2502,Andre Marriner,0
753,Andre Marriner,0
2825,Andre Marriner,0
2461,Andre Marriner,0
3472,Andre Marriner,0
...,...,...
1623,Tony Harrington,35
3664,Tony Harrington,35
1499,Tony Harrington,35
3238,Tony Harrington,35


### Prepare Data

In [592]:
# Cleaning: numeric version of the match result.
def _encode_result(outcome):
    """Map a result label to a code: "L" -> 0, "W" -> 1, anything else -> 2."""
    if outcome == "L":
        return 0
    if outcome == "W":
        return 1
    return 2


df['result_encode'] = df['result'].apply(_encode_result)

# Select features
features = ['xg', 'xga', 'ref_encode']
# The target is the original string 'result' column; 'result_encode' above is
# not part of the target list.
target = ['result']


### Training Data

In [593]:

# Train-test split: 20% of the rows are held out, with a fixed seed.
X, y = df[features], df[target]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Train Decision Tree Classifier (entropy criterion, depth capped at 6).
model = DecisionTreeClassifier(
    criterion='entropy',
    splitter='best',
    max_depth=6,
    random_state=10,
)
# Kept as the last expression so the notebook displays the fitted estimator.
model.fit(X_train, y_train)


0,1,2
,criterion,'entropy'
,splitter,'best'
,max_depth,6
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,10
,max_leaf_nodes,
,min_impurity_decrease,0.0


### Evaluation

In [594]:
# Predict on the held-out rows and report per-class and overall metrics.
from sklearn.metrics import accuracy_score

y_pred = model.predict(X_test)

# Heading and report are printed on consecutive lines.
report = classification_report(y_test, y_pred)
print("Classification Report:", report, sep="\n")

accuracy = accuracy_score(y_test, y_pred)
print(accuracy)

Classification Report:
              precision    recall  f1-score   support

           D       0.52      0.07      0.12       182
           L       0.57      0.74      0.64       268
           W       0.60      0.76      0.67       310

    accuracy                           0.59       760
   macro avg       0.56      0.52      0.48       760
weighted avg       0.57      0.59      0.53       760

0.5855263157894737


### Chart of Decision Tree

## Prediction

In [None]:
df2 = pd.read_csv('./xG7.csv')
home = "Manchester Utd"
away = "Sunderland"


def get_xg(team):
    """Return the 'xg' value of the first row of ``df2`` whose 'team' equals ``team``.

    Parameters
    ----------
    team : str
        Team name, compared for exact equality against the 'team' column.

    Returns
    -------
    The 'xg' value of the first matching row.

    Raises
    ------
    IndexError
        If no row matches. The exception type is the same one the previous
        ``.values[0]`` lookup raised, but the message now names the team.
    """
    team_xg = df2.loc[df2['team'] == team, 'xg']
    if team_xg.empty:
        raise IndexError(f"team {team!r} not found in the 'team' column of ./xG7.csv")
    return team_xg.values[0]


homeXG = get_xg(home)
awayXG = get_xg(away)

# Column names and order must match the features the model was fitted on.
label = ['xg','xga','ref_encode']
# NOTE(review): the 'xga' slot is filled with the away team's 'xg' - confirm
# this is the intended proxy.
# NOTE(review): 31 is a hard-coded ref_encode value; presumably it should be
# looked up from the referee encoding (dict_ref) - verify which referee it is.
data = [homeXG, awayXG, 31]

# One-row frame for the unseen fixture, then the predicted class and the
# per-class probabilities labelled with the model's class names.
unseen_df = pd.DataFrame([data], columns=label)
y_pred = model.predict(unseen_df)
prob = model.predict_proba(unseen_df)
columns = model.classes_
probs_df = pd.DataFrame(prob, columns=columns)


### Export Model

In [597]:
import pickle

# Serialise the fitted model to disk. The with-block guarantees the file is
# closed even if pickle.dump raises; the manual open/close it replaces did not.
with open('decisiontree.pkl', 'wb') as f:
    pickle.dump(model, f)