In [1]:
import pandas as pd
import numpy as np 
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV
import pickle

In [2]:
data=pd.read_csv('../fixtures.csv')
data_4=pd.read_csv('../standings.csv')
data_5=pd.read_csv('../team_salary.csv')
data_6=pd.read_csv('../team_stats.csv')

In [3]:
data['result']=np.where(data['HomeScore']==data['AwayScore'],0,np.where(data['HomeScore']>data['AwayScore'],1,2))# drow = 0 , homewin=1 , awaywin=2

In [4]:
data['Date'] = pd.to_datetime(data['Date'])

data['Time'] = pd.to_datetime(data['Time'])

data['day_of_week'] = data['Date'].dt.dayofweek

data['is_weekend']=np.where(data['day_of_week'].isin([5,6]),1,0)

  data['Time'] = pd.to_datetime(data['Time'])


In [5]:
data['Attendance']=data['Attendance'].fillna(data['Attendance'].mean())

In [6]:
label_encoder=LabelEncoder()
data['Venue_n']=label_encoder.fit_transform(data['Venue'])
data['Referee_n']=label_encoder.fit_transform(data['Referee'])
data_4['numeric_name']=label_encoder.fit_transform(data_4['team'])

with open('../models/label_encoder.pkl','wb')as f:
    pickle.dump(label_encoder,f)

    
teams=pd.DataFrame(data_4[['numeric_name','team']])
with open('../models/teams.pkl','wb')as f:
    pickle.dump(teams,f)

venue=pd.DataFrame(data[['Venue_n','Venue']])
with open ('../models/venue.pkl','wb')as f:
    pickle.dump(venue,f)

In [7]:
data_5['team'] = data_5['team'].str.strip().str.lower()

data_6['team'] = data_6['team'].str.strip().str.lower()

merged_d5_d6 = pd.merge(
    left=data_6[['team', 'players', 'age', 'possession', 'goals', 'assists','penalty_kicks', 'penalty_kick_attempts', 'yellows', 'reds',
       'expected_goals', 'expected_assists', 'progressive_carries','progressive_passes']],
    right=data_5[['team','weekly']],
    
    left_on=['team'],
    
    right_on=['team'],
    
    how='left'
)

In [8]:
data_4['team'] = data_4['team'].str.strip().str.lower()

merged_d5_d6['team'] = merged_d5_d6['team'].str.strip().str.lower()


merged_d4_d5_d6 = pd.merge(
    left=merged_d5_d6[['team', 'players', 'age', 'possession', 'goals', 'assists','penalty_kicks', 'penalty_kick_attempts', 'yellows', 'reds',
       'expected_goals', 'expected_assists', 'progressive_carries','progressive_passes', 'weekly']],
    right=data_4[['team','rank',  'win', 'loss', 'draw', 'goals', 'conceded', 'points','numeric_name']],
    
    left_on=['team'],
    
    right_on=['team'],
    
    how='left' 
)

In [9]:
with open('../models/statistics.pkl','wb')as f:
    pickle.dump(merged_d4_d5_d6,f)

In [10]:
data['Home'] = data['Home'].str.strip().str.lower()

data['Away'] = data['Away'].str.strip().str.lower()

team_mapping = data_4[['team', 'numeric_name']].copy()

data = data.merge(
    team_mapping.rename(columns={'team': 'Home', 'numeric_name': 'home_numeric_name'}),
    on='Home',
    how='left'
)
data = data.merge(
    team_mapping.rename(columns={'team': 'Away', 'numeric_name': 'away_numeric_name'}),
    on='Away',
    how='left'
)

In [11]:
base_data = data[[
    'Date', 'Time', 'day_of_week',
    'Home', 'Away',
    'home_numeric_name', 'away_numeric_name',
    'Venue_n', 'Attendance', 'Referee_n',
    'is_weekend', 'result'
]]

team_stats = merged_d4_d5_d6[[
    'numeric_name',
    'players', 'age', 'possession',
    'goals_x', 'assists', 'penalty_kicks', 'penalty_kick_attempts',
    'yellows', 'reds', 'expected_goals', 'expected_assists',
    'progressive_carries', 'progressive_passes', 'weekly',
    'rank', 'win', 'loss', 'draw', 'goals_y', 'conceded', 'points'
]].copy()

final_data = base_data.merge(
    team_stats,
    left_on='home_numeric_name',
    right_on='numeric_name',
    how='left',
    suffixes=('', '_home')  # إضافة _home للأعمدة المكررة
)

home_columns_rename = {
    'players': 'home_players',
    'age': 'home_age',
    'possession': 'home_possession',
    'goals_x': 'home_goals_scored',
    'assists': 'home_assists',
    'penalty_kicks': 'home_penalty_kicks',
    'penalty_kick_attempts': 'home_penalty_attempts',
    'yellows': 'home_yellows',
    'reds': 'home_reds',
    'expected_goals': 'home_xG',
    'expected_assists': 'home_xA',
    'progressive_carries': 'home_prog_carries',
    'progressive_passes': 'home_prog_passes',
    'weekly': 'home_weekly_salary',
    'rank': 'home_rank',
    'win': 'home_wins',
    'loss': 'home_losses',
    'draw': 'home_draws',
    'goals_y': 'home_goals_total',
    'conceded': 'home_conceded',
    'points': 'home_points'
}

final_data.rename(columns=home_columns_rename, inplace=True)

final_data.drop('numeric_name', axis=1, inplace=True)

final_data = final_data.merge(
    team_stats,
    left_on='away_numeric_name',
    right_on='numeric_name',
    how='left',
    suffixes=('', '_away')
)

# إعادة تسمية الأعمدة للفريق الضيف
away_columns_rename = {
    'players': 'away_players',
    'age': 'away_age',
    'possession': 'away_possession',
    'goals_x': 'away_goals_scored',
    'assists': 'away_assists',
    'penalty_kicks': 'away_penalty_kicks',
    'penalty_kick_attempts': 'away_penalty_attempts',
    'yellows': 'away_yellows',
    'reds': 'away_reds',
    'expected_goals': 'away_xG',
    'expected_assists': 'away_xA',
    'progressive_carries': 'away_prog_carries',
    'progressive_passes': 'away_prog_passes',
    'weekly': 'away_weekly_salary',
    'rank': 'away_rank',
    'win': 'away_wins',
    'loss': 'away_losses',
    'draw': 'away_draws',
    'goals_y': 'away_goals_total',
    'conceded': 'away_conceded',
    'points': 'away_points'
}

final_data.rename(columns=away_columns_rename, inplace=True)

final_data.drop('numeric_name', axis=1, inplace=True)


In [12]:
final_data=final_data[['home_numeric_name',
       'away_numeric_name', 'Venue_n', 'Attendance','home_age', 'home_possession',
       'home_goals_scored', 'home_assists', 'home_penalty_kicks',
       'home_penalty_attempts', 'home_yellows', 'home_reds', 'home_xG',
       'home_xA', 'home_prog_carries', 'home_prog_passes',
       'home_weekly_salary', 'home_rank', 'home_wins', 'home_losses',
       'home_draws', 'home_goals_total', 'home_conceded', 'home_points',
       'away_age', 'away_possession', 'away_goals_scored',
       'away_assists', 'away_penalty_kicks', 'away_penalty_attempts',
       'away_yellows', 'away_reds', 'away_xG', 'away_xA', 'away_prog_carries',
       'away_prog_passes', 'away_weekly_salary', 'away_rank', 'away_wins',
       'away_losses', 'away_draws', 'away_goals_total', 'away_conceded',
       'away_points',
       'result']]

In [13]:
with open ('../models/final_data.pkl','wb')as f:
    pickle.dump(final_data,f)

In [14]:
X=final_data.drop('result', axis=1)
y= final_data['result']

In [15]:
filters = {
    'Filter_0': X[[
        'home_numeric_name','away_numeric_name', 'Venue_n', 'Attendance',
        'home_age', 'home_possession','home_goals_scored', 
        'home_assists', 'home_yellows', 'home_reds', 'home_xG','home_xA', 
        'home_prog_carries', 'home_prog_passes','home_weekly_salary',
        'home_wins', 'home_losses','home_draws','home_conceded', 'home_points',
        'away_age', 'away_possession', 'away_goals_scored',
        'away_assists','away_yellows', 'away_reds', 'away_xG', 'away_xA', 
        'away_prog_carries','away_prog_passes', 'away_weekly_salary','away_wins',
        'away_losses', 'away_draws','away_conceded','away_points'
    ]],
    
    'Filter_1': X[[
        'home_numeric_name','away_numeric_name', 'Venue_n', 
        'Attendance','home_age', 'home_possession','home_goals_scored', 
        'home_yellows', 'home_reds', 'home_xG', 
        'home_prog_carries', 'home_prog_passes','home_weekly_salary',
        'home_wins', 'home_losses','home_draws','home_conceded', 'home_points',
        'away_age', 'away_possession', 'away_goals_scored',
        'away_yellows', 'away_reds', 'away_xG','away_prog_carries','away_prog_passes', 'away_weekly_salary','away_wins',
        'away_losses', 'away_draws','away_conceded','away_points'
    ]],
    
    'Filter_2': X[[
        'home_numeric_name','away_numeric_name','Attendance',
        'home_xG','home_wins', 'home_losses','home_draws','home_conceded', 
        'home_points','away_xG','away_wins','away_losses', 
        'away_draws','away_conceded','away_points'
    ]],
    
    'Filter_3': X[[
        'home_numeric_name','away_numeric_name', 'Venue_n', 'Attendance',
        'home_age','home_xG','home_wins','home_losses','home_draws','home_weekly_salary','home_goals_total', 
        'home_conceded', 'home_points','away_age',
        'away_xG','away_wins','away_losses','away_draws','away_weekly_salary','away_goals_total', 'away_conceded',
        'away_points'
    ]],
    
    'Filter_4': X[[
        'home_numeric_name','away_numeric_name',
        'home_age','home_xG','home_wins','home_losses','home_draws','home_weekly_salary','home_goals_total', 
        'home_conceded', 'home_points','away_age',
        'away_xG','away_wins','away_losses','away_draws','away_weekly_salary','away_goals_total', 'away_conceded',
        'away_points']],
    
    'Filter_5': X[[
        'home_numeric_name','away_numeric_name',
        'home_age','home_xG','home_prog_carries', 'home_prog_passes','home_wins','home_losses','home_draws','home_weekly_salary','home_goals_total', 
        'home_conceded', 'home_points','away_age',
        'away_xG','away_prog_carries','away_prog_passes','away_wins','away_losses','away_draws','away_weekly_salary','away_goals_total', 'away_conceded',
        'away_points'
    ]]
}

In [16]:
filters = {
    'Filter_0': X[[
        'home_numeric_name','away_numeric_name', 'Venue_n', 'Attendance',
        'home_age', 'home_possession','home_goals_scored', 
        'home_assists', 'home_yellows', 'home_reds', 'home_xG','home_xA', 
        'home_prog_carries', 'home_prog_passes','home_weekly_salary',
        'home_wins', 'home_losses','home_draws','home_conceded', 'home_points',
        'away_age', 'away_possession', 'away_goals_scored',
        'away_assists','away_yellows', 'away_reds', 'away_xG', 'away_xA', 
        'away_prog_carries','away_prog_passes', 'away_weekly_salary','away_wins',
        'away_losses', 'away_draws','away_conceded','away_points'
    ]],
    
    'Filter_1': X[[
        'home_numeric_name','away_numeric_name', 'Venue_n', 
        'Attendance','home_age', 'home_possession','home_goals_scored', 
        'home_yellows', 'home_reds', 'home_xG', 
        'home_prog_carries', 'home_prog_passes','home_weekly_salary',
        'home_wins', 'home_losses','home_draws','home_conceded', 'home_points',
        'away_age', 'away_possession', 'away_goals_scored',
        'away_yellows', 'away_reds', 'away_xG','away_prog_carries','away_prog_passes', 'away_weekly_salary','away_wins',
        'away_losses', 'away_draws','away_conceded','away_points'
    ]],
    
    'Filter_2': X[[
        'home_numeric_name','away_numeric_name','Attendance',
        'home_xG','home_wins', 'home_losses','home_draws','home_conceded', 
        'home_points','away_xG','away_wins','away_losses', 
        'away_draws','away_conceded','away_points'
    ]],
    
    'Filter_3': X[[
        'home_numeric_name','away_numeric_name', 'Venue_n', 'Attendance',
        'home_age','home_xG','home_wins','home_losses','home_draws','home_weekly_salary','home_goals_total', 
        'home_conceded', 'home_points','away_age',
        'away_xG','away_wins','away_losses','away_draws','away_weekly_salary','away_goals_total', 'away_conceded',
        'away_points'
    ]],
    
    'Filter_4': X[[
        'home_numeric_name','away_numeric_name',
        'home_age','home_xG','home_wins','home_losses','home_draws','home_weekly_salary','home_goals_total', 
        'home_conceded', 'home_points','away_age',
        'away_xG','away_wins','away_losses','away_draws','away_weekly_salary','away_goals_total', 'away_conceded',
        'away_points']],
    
    'Filter_5': X[[
        'home_numeric_name','away_numeric_name',
        'home_age','home_xG','home_prog_carries', 'home_prog_passes','home_wins','home_losses','home_draws','home_weekly_salary','home_goals_total', 
        'home_conceded', 'home_points','away_age',
        'away_xG','away_prog_carries','away_prog_passes','away_wins','away_losses','away_draws','away_weekly_salary','away_goals_total', 'away_conceded',
        'away_points'
    ]]
}

* choosing best parameters of Logistic Regression

In [17]:
filter_0= X[[
        'home_numeric_name','away_numeric_name', 'Venue_n', 'Attendance',
        'home_age', 'home_possession','home_goals_scored', 
        'home_assists', 'home_yellows', 'home_reds', 'home_xG','home_xA', 
        'home_prog_carries', 'home_prog_passes','home_weekly_salary',
        'home_wins', 'home_losses','home_draws','home_conceded', 'home_points',
        'away_age', 'away_possession', 'away_goals_scored',
        'away_assists','away_yellows', 'away_reds', 'away_xG', 'away_xA', 
        'away_prog_carries','away_prog_passes', 'away_weekly_salary','away_wins',
        'away_losses', 'away_draws','away_conceded','away_points'
    ]]

In [18]:
scaler=StandardScaler()

X_scaled=scaler.fit_transform(filter_0)

with open('../models/scaler.pkl', 'wb') as f:
    pickle.dump(scaler,f)
param_grid_lr = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100]
}

grid_lr = GridSearchCV(
    LogisticRegression(max_iter=1000, random_state=42),
    param_grid_lr,
    cv=5,
    scoring='accuracy'
)

grid_lr.fit(X_scaled, y)

print(f"Best C:        {grid_lr.best_params_}")
print(f"Best CV Score: {grid_lr.best_score_}")

Best C:        {'C': 0.001}
Best CV Score: 0.5578947368421053


* choosing best parameters of Decision Tree

In [19]:
param_grid_dt = {
    'max_depth': [2, 3, 4, 5, 7, 10],
    'min_samples_split': [2, 5, 10, 20],
    'criterion': ['gini', 'entropy']
}

grid_dt = GridSearchCV(
    DecisionTreeClassifier(random_state=42),
    param_grid_dt,
    cv=5,
    scoring='accuracy'
)

grid_dt.fit(filter_0, y)

print(f"Best Params:   {grid_dt.best_params_}")
print(f"Best CV Score: {grid_dt.best_score_}")


Best Params:   {'criterion': 'gini', 'max_depth': 3, 'min_samples_split': 2}
Best CV Score: 0.5236842105263158


* choosing best parameters of Random Forest

In [20]:
param_grid_rf = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7, 10],
    'min_samples_split': [5, 10, 20]
}

grid_rf = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid_rf,
    cv=5,
    scoring='accuracy'
)

grid_rf.fit(filter_0, y)

print(f"Best Params:   {grid_rf.best_params_}")
print(f"Best CV Score: {grid_rf.best_score_}")

Best Params:   {'max_depth': 3, 'min_samples_split': 10, 'n_estimators': 50}
Best CV Score: 0.5368421052631579


### best models are :
* 1-Logistic Regression + Filter_3
* 2-Logistic Regression + Filter_2
* Random Forest + Filter_4

In [21]:
model_1=LogisticRegression(C=0.001,max_iter=1000)
model_2=LogisticRegression(C=0.001,max_iter=1000)
model_3=RandomForestClassifier(max_depth=3,min_samples_split= 10,n_estimators=50, random_state=42)

In [22]:
model_1_data= X[[
        'home_numeric_name','away_numeric_name','Venue_n','Attendance','home_age',
        'home_xG','home_wins', 'home_losses','home_draws','home_weekly_salary','home_goals_total','home_conceded', 
        'home_points','away_age','away_xG','away_wins','away_losses', 
        'away_draws','away_weekly_salary','away_goals_total','away_conceded','away_points'
    ]]
model_2_data=X[[
        'home_numeric_name','away_numeric_name','Attendance',
        'home_xG','home_wins','home_losses','home_draws','home_conceded', 
        'home_points','away_xG','away_wins','away_losses', 
        'away_draws','away_conceded','away_points']]

model_3_data=X[[
        'home_numeric_name','away_numeric_name','home_age',
        'home_xG','home_wins', 'home_losses','home_draws','home_weekly_salary','home_goals_total','home_conceded', 
        'home_points',
        'away_age','away_xG','away_wins','away_losses', 
        'away_draws','away_weekly_salary','away_goals_total','away_conceded','away_points']]

In [23]:
X_train,X_test,y_train,y_test=train_test_split(model_1_data,y,test_size=0.2,random_state=42)
model_1.fit(X_train,y_train)
with open('../models/model_1_lr.pkl', 'wb')as f:
    pickle.dump(model_1,f)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [24]:
X_train,X_test,y_train,y_test=train_test_split(model_2_data,y,test_size=0.2,random_state=42)
model_2.fit(X_train,y_train)
with open('../models/model_2_lr.pkl', 'wb')as f:
    pickle.dump(model_2,f)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [25]:
X_train,X_test,y_train,y_test=train_test_split(model_3_data,y,test_size=0.2,random_state=42)
model_3.fit(X_train,y_train)
with open('../models/model_3_rf.pkl', 'wb')as f:
    pickle.dump(model_3,f)