In [189]:
import pandas as pd
import numpy as np
from flaml import AutoML
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import LabelEncoder

In [190]:
df = pd.read_csv('football.csv')
df

Unnamed: 0,matchId,playerId,playType,bodyPart,x,y,interveningOpponents,interveningTeammates,interferenceOnShooter,minute,second,outcome
0,m_91,p_103,open play,right foot,13.47,-11.22,1,0,medium,70,9,goal
1,m_17,p_16,open play,left foot,9.48,14.22,3,0,medium,55,4,saved by keeper
2,m_111,p_88,free kick,left foot,29.43,-1.25,6,2,low,86,31,saved by keeper
3,m_142,p_87,open play,right foot,26.93,1.00,4,1,medium,77,2,missed chance
4,m_117,p_9,open play,right foot,10.72,5.24,2,0,medium,76,46,goal
...,...,...,...,...,...,...,...,...,...,...,...,...
8920,m_57,p_115,open play,head,6.48,3.99,3,0,high,69,50,missed chance
8921,m_59,p_76,open play,right foot,21.45,-8.73,4,1,medium,15,53,blocked shot
8922,m_55,p_150,open play,left foot,11.97,3.24,3,0,medium,84,34,missed chance
8923,m_33,p_130,open play,right foot,6.48,-6.98,1,0,high,4,39,missed chance


In [191]:
df.head()

Unnamed: 0,matchId,playerId,playType,bodyPart,x,y,interveningOpponents,interveningTeammates,interferenceOnShooter,minute,second,outcome
0,m_91,p_103,open play,right foot,13.47,-11.22,1,0,medium,70,9,goal
1,m_17,p_16,open play,left foot,9.48,14.22,3,0,medium,55,4,saved by keeper
2,m_111,p_88,free kick,left foot,29.43,-1.25,6,2,low,86,31,saved by keeper
3,m_142,p_87,open play,right foot,26.93,1.0,4,1,medium,77,2,missed chance
4,m_117,p_9,open play,right foot,10.72,5.24,2,0,medium,76,46,goal


In [192]:
df.describe()

Unnamed: 0,x,y,interveningOpponents,interveningTeammates,minute,second
count,8925.0,8925.0,8925.0,8925.0,8925.0,8925.0
mean,14.564631,0.364408,2.531989,0.336471,48.541401,29.24381
std,7.841865,8.999063,1.386693,0.680175,26.397998,17.306565
min,0.0,-33.92,0.0,0.0,0.0,0.0
25%,8.48,-5.74,2.0,0.0,26.0,14.0
50%,12.47,0.5,2.0,0.0,49.0,29.0
75%,20.95,6.73,3.0,0.0,71.0,44.0
max,70.82,33.92,11.0,7.0,98.0,59.0


In [193]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8925 entries, 0 to 8924
Data columns (total 12 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   matchId                8925 non-null   object 
 1   playerId               8925 non-null   object 
 2   playType               8925 non-null   object 
 3   bodyPart               8925 non-null   object 
 4   x                      8925 non-null   float64
 5   y                      8925 non-null   float64
 6   interveningOpponents   8925 non-null   int64  
 7   interveningTeammates   8925 non-null   int64  
 8   interferenceOnShooter  8891 non-null   object 
 9   minute                 8925 non-null   int64  
 10  second                 8925 non-null   int64  
 11  outcome                8891 non-null   object 
dtypes: float64(2), int64(4), object(6)
memory usage: 836.8+ KB


handle missing values by filling them with the mode (most common value)

In [194]:
common_interference = df['interferenceOnShooter'].mode()[0]
common_outcome = df['outcome'].mode()[0]

df['interferenceOnShooter'].fillna(common_interference, inplace=True)
df['outcome'].fillna(common_outcome, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['interferenceOnShooter'].fillna(common_interference, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['outcome'].fillna(common_outcome, inplace=True)


In [195]:
for column in df.columns:
    print(f"{column}:\n {df[column].unique()}")
    print("-----------------------------------------------------------------------")

matchId:
 ['m_91' 'm_17' 'm_111' 'm_142' 'm_117' 'm_147' 'm_88' 'm_66' 'm_132'
 'm_99' 'm_101' 'm_158' 'm_11' 'm_204' 'm_108' 'm_169' 'm_7' 'm_27' 'm_1'
 'm_68' 'm_203' 'm_124' 'm_186' 'm_140' 'm_3' 'm_235' 'm_209' 'm_231'
 'm_81' 'm_146' 'm_110' 'm_59' 'm_61' 'm_134' 'm_92' 'm_114' 'm_62' 'm_9'
 'm_144' 'm_96' 'm_184' 'm_177' 'm_154' 'm_143' 'm_148' 'm_196' 'm_51'
 'm_202' 'm_216' 'm_187' 'm_240' 'm_10' 'm_33' 'm_86' 'm_26' 'm_93'
 'm_149' 'm_131' 'm_139' 'm_65' 'm_25' 'm_167' 'm_224' 'm_226' 'm_238'
 'm_179' 'm_217' 'm_56' 'm_121' 'm_191' 'm_29' 'm_198' 'm_107' 'm_155'
 'm_173' 'm_74' 'm_208' 'm_237' 'm_67' 'm_137' 'm_145' 'm_80' 'm_200'
 'm_102' 'm_133' 'm_205' 'm_57' 'm_156' 'm_30' 'm_161' 'm_54' 'm_98'
 'm_125' 'm_39' 'm_4' 'm_44' 'm_141' 'm_206' 'm_195' 'm_22' 'm_20' 'm_69'
 'm_160' 'm_122' 'm_201' 'm_112' 'm_94' 'm_236' 'm_165' 'm_233' 'm_182'
 'm_164' 'm_213' 'm_6' 'm_63' 'm_55' 'm_89' 'm_222' 'm_178' 'm_48' 'm_46'
 'm_35' 'm_136' 'm_28' 'm_168' 'm_32' 'm_18' 'm_95' 'm_36' 'm_4

In [196]:
unique_values = ['goal', 'saved by keeper', 'missed chance', 'blocked shot', 'hit the post']
for value in unique_values:
    if value == 'goal': 
        df['outcome'].replace(value, 1, inplace=True)
    elif value == 'own goal': #there is no 'own goal' in dataset
        df['outcome'].replace(value, 1, inplace=True)
    else:
        df['outcome'].replace(value, 0, inplace=True)

  df['outcome'].replace(value, 0, inplace=True)


In [197]:
df.rename(columns={"outcome": "scoredGoal"}, inplace=True)

In [198]:
x_train,x_test , y_train,y_test = train_test_split(df.drop('scoredGoal', axis=1), df.scoredGoal, random_state=110, stratify=df.scoredGoal)
model = AutoML()
model.fit(x_train, y_train, task='classification', time_budget=60, verbose=0)
y_pred = model.predict(x_test)
print(f'performance of model is {roc_auc_score(y_test, y_pred)}')



performance of model is 0.5884811416921508


In [199]:
df.drop('matchId', inplace=True, axis=1)
df.drop('playerId', inplace=True, axis=1)
df

Unnamed: 0,playType,bodyPart,x,y,interveningOpponents,interveningTeammates,interferenceOnShooter,minute,second,scoredGoal
0,open play,right foot,13.47,-11.22,1,0,medium,70,9,1
1,open play,left foot,9.48,14.22,3,0,medium,55,4,0
2,free kick,left foot,29.43,-1.25,6,2,low,86,31,0
3,open play,right foot,26.93,1.00,4,1,medium,77,2,0
4,open play,right foot,10.72,5.24,2,0,medium,76,46,1
...,...,...,...,...,...,...,...,...,...,...
8920,open play,head,6.48,3.99,3,0,high,69,50,0
8921,open play,right foot,21.45,-8.73,4,1,medium,15,53,0
8922,open play,left foot,11.97,3.24,3,0,medium,84,34,0
8923,open play,right foot,6.48,-6.98,1,0,high,4,39,0


In [200]:
df['distance'] = np.sqrt((df['x'] ** 2) + (df['y'] ** 2))

goal_width = 7.32  # Width of the goal in meters
#df['theta'] = np.arctan2(goal_width * df['x'], (df['x']**2 + df['y']**2 - (goal_width / 2)**2))

df['angle'] = np.degrees(np.arctan2(goal_width * df['x'], (df['x']**2 + df['y']**2 - (goal_width / 2)**2)))#(df['theta'])

df.drop('x', inplace=True, axis=1)
df.drop('y', inplace=True, axis=1)
df

Unnamed: 0,playType,bodyPart,interveningOpponents,interveningTeammates,interferenceOnShooter,minute,second,scoredGoal,distance,angle
0,open play,right foot,1,0,medium,70,9,1,17.530810,18.544088
1,open play,left foot,3,0,medium,55,4,0,17.090313,13.982592
2,free kick,left foot,6,2,low,86,31,0,29.456534,14.153255
3,open play,right foot,4,1,medium,77,2,0,26.948560,15.458384
4,open play,right foot,2,0,medium,76,46,1,11.932141,31.315918
...,...,...,...,...,...,...,...,...,...,...
8920,open play,head,3,0,high,69,50,0,7.609895,46.818116
8921,open play,right foot,4,1,medium,15,53,0,23.158484,16.713121
8922,open play,left foot,3,0,medium,84,34,0,12.400746,31.970470
8923,open play,right foot,1,0,high,4,39,0,9.524222,31.529506


In [201]:
label_encoders = {}
for column in ['playType', 'bodyPart', 'interferenceOnShooter', 'scoredGoal']:
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column])
    label_encoders[column] = le

features = ['playType', 'bodyPart', 'interveningOpponents', 
            'interveningTeammates', 'interferenceOnShooter', 'minute', 'second', 'distance', 'angle']

mi_scores = mutual_info_classif(df[features], df['scoredGoal'], discrete_features=False)
temp = df.drop(['scoredGoal'], axis = 1)
index = [col for col in list(temp.columns)]
mi_results = pd.DataFrame({
    #'Feature': features,
    'Importance': mi_scores
}, index = index)

mi_results_sorted = mi_results.sort_values(by='Importance', ascending=False)
mi_results_sorted

Unnamed: 0,Importance
angle,0.055186
distance,0.050243
interveningOpponents,0.038232
playType,0.020761
interferenceOnShooter,0.007941
interveningTeammates,0.006037
minute,0.005343
bodyPart,0.002734
second,0.000607


In [202]:
model = AutoML(task='classification', time_budget=60, verbose=0)
columns_to_train = mi_results_sorted[mi_results_sorted.Importance >= mi_results_sorted.Importance.quantile(.5)].index
x = df[columns_to_train]
y = df.scoredGoal
model.fit(x,y)
x_train,x_test , y_train,y_test = train_test_split(x,y, random_state=110, test_size=.3, stratify=y)
y_pred = model.predict(x_test)
print(f'performance of model is {roc_auc_score(y_test, y_pred)}')



performance of model is 0.597504116974522
