# Premier League 2023-2024 prediction

## Importing libraries

In [None]:
import pandas as pd
import openpyxl

In [None]:
# Importing the dataset and creating a dataframe
df =  pd.read_excel('all-euro-data-2023-2024.xlsx', sheet_name='E0' )
df.head()

In [None]:
# convert the dataframe to csv file
df.to_csv('premier_league.csv')
df_new = pd.read_csv('premier_league.csv')
df_new.head()

In [None]:
# removing the first column
df_new.drop(df_new.columns[0], axis=1, inplace=True)
df_new.head()

In [None]:
# save the new dataframe to csv file 
df_new.to_csv('premier_league.csv')
df_new

## Data Preprocessing

In [None]:
# checking for missing values
df_new.isnull().sum()


In [None]:
# checking for duplicates
df_new.duplicated().sum()


In [None]:
# checking for outliers
import matplotlib.pyplot as plt
df_new.boxplot(figsize=(12,8))


## Performing Exploratory Data Analysis

In [None]:
# checking the total matches played
total_matches = df_new.shape[0]
print('Total matches played:', total_matches)

#checking the list of all teams involved
teams = df_new['HomeTeam'].unique()
print('Teams involved:', teams)

# checking the total number of goals scored
total_goals = df_new['FTHG'].sum() + df_new['FTAG'].sum()
print('Total goals scored so far:', total_goals)

# checking the total number of home wins for chelsea
chelsea_home_wins = df_new[(df_new['HomeTeam'] == 'Chelsea') & (df_new['FTR'] == 'H')].shape[0]
print('Total number of home wins for Chelsea:', chelsea_home_wins)

# checking the total number of home losses for manchester united
man_united_home_losses = df_new[(df_new['HomeTeam'] == 'Man United') & (df_new['FTR'] == 'A')].shape[0]
print('Total number of home losses for this shitty man utd team is: ', man_united_home_losses)

In [None]:
# get the current position and calculate points for chelsea and the rest of the teams
def calculate_points_goals(row):
    home_points, away_points = 0, 0
    home_goal_diff = row['FTHG'] - row['FTAG']
    away_goal_diff = row['FTAG'] - row['FTHG']

    if row['FTR'] == 'H':
        home_points = 3
    elif row['FTR'] == 'A':
        away_points = 3
    else:
        home_points = away_points = 1

    return pd.Series([home_points, away_points, home_goal_diff, away_goal_diff, row['FTHG'], row['FTAG']])

# Applying the function to the dataset
df_new[['HomePoints', 'AwayPoints', 'HomeGoalDiff', 'AwayGoalDiff', 'HomeGoals', 'AwayGoals']] = df_new.apply(calculate_points_goals, axis=1)

# Summarizing the data for each team
team_stats = pd.DataFrame(index=teams)

# Calculating total points, goal difference, and goals scored for each team
team_stats['Points'] = df_new.groupby('HomeTeam')['HomePoints'].sum() + df_new.groupby('AwayTeam')['AwayPoints'].sum()
team_stats['GoalDiff'] = df_new.groupby('HomeTeam')['HomeGoalDiff'].sum() + df_new.groupby('AwayTeam')['AwayGoalDiff'].sum()
team_stats['GoalsScored'] = df_new.groupby('HomeTeam')['HomeGoals'].sum() + df_new.groupby('AwayTeam')['AwayGoals'].sum()

# Sorting the teams based on Points, Goal Difference, and Goals Scored
sorted_teams = team_stats.sort_values(by=['Points', 'GoalDiff', 'GoalsScored'], ascending=[False, False, False])

# Finding Chelsea's position
chelsea_position_updated = sorted_teams.index.get_loc('Chelsea') + 1
print('Chelsea is currently in position:', chelsea_position_updated)
sorted_teams.head(), chelsea_position_updated


   

### Analysing Chelsea's performance

In [None]:
# team performance overview
Total_matches = df_new[(df_new['HomeTeam'] == 'Chelsea') | (df_new['AwayTeam'] == 'Chelsea')].shape[0]
Total_matches

# total number of goals scored by chelsea
Total_goals_scored = df_new[(df_new['HomeTeam']== 'Chelsea')]['FTHG'].sum() + df_new[(df_new['HomeTeam'] == 'Chelsea')]['FTAG'].sum()
Total_goals_scored

# total number of goals conceded by chelsea
Total_goals_conceded = df_new[(df_new['HomeTeam'] == 'Chelsea')]['FTAG'].sum() + df_new[(df_new['AwayTeam'] == 'Chelsea')]['FTHG'].sum()
Total_goals_conceded

# total number of home wins for chelsea
Total_home_wins = df_new[(df_new['HomeTeam'] == 'Chelsea') & (df_new['FTR'] == 'H')].shape[0]

# total number of home losses for chelsea
Total_home_losses = df_new[(df_new['HomeTeam'] == 'Chelsea') & (df_new['FTR'] == 'A')].shape[0]

# total number of away wins for chelsea
Total_away_wins = df_new[(df_new['AwayTeam'] == 'Chelsea') & (df_new['FTR'] == 'A')].shape[0]

# total number of away losses for chelsea
Total_away_losses = df_new[(df_new['AwayTeam'] == 'Chelsea') & (df_new['FTR'] == 'H')].shape[0]

# total number of home draws for chelsea
Total_home_draws = df_new[(df_new['HomeTeam'] == 'Chelsea') & (df_new['FTR'] == 'D')].shape[0]

# total number of away draws for chelsea
Total_away_draws = df_new[(df_new['AwayTeam'] == 'Chelsea') & (df_new['FTR'] == 'D')].shape[0]

print(f'Chelsea has played a total of {Total_matches} matches so far in the 2023-2024 season')
print(f'Chelsea has scored a total of {Total_goals_scored} goals so far in the 2023-2024 season')
print(f'Chelsea has conceded a total of {Total_goals_conceded} goals so far in the 2023-2024 season')
print(f'Chelsea has won a total of {Total_home_wins} home matches so far in the 2023-2024 season')
print(f'Chelsea has lost a total of {Total_home_losses} home matches so far in the 2023-2024 season')
print(f'Chelsea has won a total of {Total_away_wins} away matches so far in the 2023-2024 season')
print(f'Chelsea has lost a total of {Total_away_losses} away matches so far in the 2023-2024 season')
print(f'Chelsea has drawn a total of {Total_home_draws} home matches so far in the 2023-2024 season')
print(f'Chelsea has drawn a total of {Total_away_draws} away matches so far in the 2023-2024 season')



### Analysing the top 6 teams

In [None]:
# top 6 teams
top_6_teams = sorted_teams.head(6)
top_6_teams

### feature engineering

In [None]:
# selecting the features for the model
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Encoding Team Names
label_encoder = LabelEncoder()
df_new['HomeTeamEncoded'] = label_encoder.fit_transform(df_new['HomeTeam'])
df_new['AwayTeamEncoded'] = label_encoder.transform(df_new['AwayTeam'])

# Calculate Team's Average Goals Scored and Conceded per Match
# Replace 'FTHG' and 'FTAG' with your actual column names for goals
average_goals_scored_home = df_new.groupby('HomeTeam')['FTHG'].mean()
average_goals_scored_away = df_new.groupby('AwayTeam')['FTAG'].mean()

# Mapping these averages to the main DataFrame
df_new['HomeTeamAvgGoals'] = df_new['HomeTeam'].map(average_goals_scored_home)
df_new['AwayTeamAvgGoals'] = df_new['AwayTeam'].map(average_goals_scored_away)

# Viewing the modified DataFrame
print(df_new.head())


In [None]:
# Calculate Home Team Win Percentage
home_team_wins = df_new[df_new['FTR'] == 'H'].groupby('HomeTeam').size()
total_home_matches = df_new.groupby('HomeTeam').size()
home_team_win_percentage = home_team_wins / total_home_matches

# Calculate Away Team Win Percentage
away_team_wins = df_new[df_new['FTR'] == 'A'].groupby('AwayTeam').size()
total_away_matches = df_new.groupby('AwayTeam').size()
away_team_win_percentage = away_team_wins / total_away_matches

# Create mappings for win percentages
home_team_win_percentage_map = home_team_win_percentage.to_dict()
away_team_win_percentage_map = away_team_win_percentage.to_dict()

# Map the win percentages to the original DataFrame
df_new['HomeTeamWinPercentage'] = df_new['HomeTeam'].map(home_team_win_percentage_map).fillna(0)
df_new['AwayTeamWinPercentage'] = df_new['AwayTeam'].map(away_team_win_percentage_map).fillna(0)

# Check the first few rows to confirm the new features
print(df_new[['HomeTeam', 'HomeTeamWinPercentage', 'AwayTeam', 'AwayTeamWinPercentage']].head())


In [108]:
# Ensure the DataFrame is sorted by date
df_new.sort_values('Date', inplace=True)

# Initialize columns for points and form
df_new['HomeTeamPoints'] = 0
df_new['AwayTeamPoints'] = 0
df_new['HomeTeamRecentForm'] = 0
df_new['AwayTeamRecentForm'] = 0

# Calculate points for each team
for team in df_new['HomeTeam'].unique():
    # Calculate points for the home team
    home_points = df_new[df_new['HomeTeam'] == team]['FTR'].apply(lambda x: 3 if x == 'H' else 1 if x == 'D' else 0).cumsum()
    away_points = df_new[df_new['AwayTeam'] == team]['FTR'].apply(lambda x: 3 if x == 'A' else 1 if x == 'D' else 0).cumsum()

    # Assign the points to the team's home and away matches
    df_new.loc[df_new['HomeTeam'] == team, 'HomeTeamPoints'] = home_points
    df_new.loc[df_new['AwayTeam'] == team, 'AwayTeamPoints'] = away_points

# Calculate recent form for each team
for team in df_new['HomeTeam'].unique():
    # Get indices of the team's home and away games
    home_indices = df_new[df_new['HomeTeam'] == team].index
    away_indices = df_new[df_new['AwayTeam'] == team].index
    
    # Calculate the form for the last 5 games, excluding the current match
    df_new.loc[home_indices, 'HomeTeamRecentForm'] = df_new.loc[home_indices, 'HomeTeamPoints'].diff().fillna(0).rolling(window=6, min_periods=1).sum().shift(fill_value=0)
    df_new.loc[away_indices, 'AwayTeamRecentForm'] = df_new.loc[away_indices, 'AwayTeamPoints'].diff().fillna(0).rolling(window=6, min_periods=1).sum().shift(fill_value=0)

# Display the head of the dataframe to verify
df_new[['Date', 'HomeTeam', 'HomeTeamPoints', 'HomeTeamRecentForm', 'AwayTeam', 'AwayTeamPoints', 'AwayTeamRecentForm']].tail(10)



Unnamed: 0,Date,HomeTeam,HomeTeamPoints,HomeTeamRecentForm,AwayTeam,AwayTeamPoints,AwayTeamRecentForm
159,2023-12-10,Tottenham,15,9,Newcastle,5,5
160,2023-12-15,Nott'm Forest,9,6,Tottenham,18,11
161,2023-12-16,Chelsea,12,5,Sheffield United,1,1
162,2023-12-16,Man City,18,14,Crystal Palace,12,7
163,2023-12-16,Newcastle,24,18,Fulham,6,2
164,2023-12-16,Burnley,3,3,Everton,16,13
165,2023-12-17,Arsenal,23,16,Brighton,10,7
166,2023-12-17,Brentford,12,10,Aston Villa,14,8
167,2023-12-17,West Ham,14,8,Wolves,7,4
168,2023-12-17,Liverpool,22,18,Man United,13,12


In [109]:
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, make_scorer, f1_score
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

# Selecting features and target
X = df_new[['HomeTeamEncoded', 'AwayTeamEncoded', 'HomeTeamAvgGoals', 'AwayTeamAvgGoals', 'HomeTeamPoints', 'AwayTeamPoints', 'HomeTeamRecentForm', 'AwayTeamRecentForm',
    'HomeTeamWinPercentage', 'AwayTeamWinPercentage']]
y = df_new['FTR']

# Splitting the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Addressing Class Imbalance
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Scaling the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_smote)
X_test_scaled = scaler.transform(X_test)

# Hyperparameter Tuning
model = RandomForestClassifier(random_state=42)
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
    
}
grid_search = GridSearchCV(model, param_grid, cv=3, scoring='f1_macro')
grid_search.fit(X_train_scaled, y_train_smote)

# Best Model Evaluation
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test_scaled)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           A       0.43      0.77      0.56        13
           D       0.60      0.43      0.50        14
           H       0.89      0.67      0.76        24

    accuracy                           0.63        51
   macro avg       0.64      0.62      0.61        51
weighted avg       0.69      0.63      0.64        51


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier

# Define a list with models to evaluate
models = [
    ('Logistic Regression', LogisticRegression(random_state=42)),
    ('Decision Tree', DecisionTreeClassifier(random_state=42)),
    ('Support Vector Machine', SVC(random_state=42)),
    ('K-Nearest Neighbors', KNeighborsClassifier()),
    ('Random Forest', RandomForestClassifier(random_state=42)),
    ('Gradient Boosting', GradientBoostingClassifier(random_state=42))
]

# Record model performance
model_performance = {}

# Loop through models
for name, model in models:
    # Train the model
    model.fit(X_train_scaled, y_train_smote)
    
    # Make predictions
    y_pred = model.predict(X_test_scaled)
    
    # Record performance
    model_performance[name] = classification_report(y_test, y_pred, output_dict=True)

# Print the performance for each model
for name, performance in model_performance.items():
    print(f"Model: {name}")
    print(pd.DataFrame(performance).transpose())


In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel



# Assuming that the necessary preprocessing steps have been done and
# 'HomeTeamEncoded', 'AwayTeamEncoded', 'HomeTeamAvgGoals', 'AwayTeamAvgGoals' are the engineered features
X = df_new[['HomeTeamEncoded', 'AwayTeamEncoded', 'HomeTeamAvgGoals', 'AwayTeamAvgGoals']]
y = df_new['FTR']  # Target variable

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Feature selection using RandomForestClassifier
sel = SelectFromModel(RandomForestClassifier(n_estimators=100, random_state=42))
sel.fit(X_train, y_train)

# Get the features considered important by the model
selected_feat = X_train.columns[(sel.get_support())]
print('Selected features:', selected_feat)

# Now let's train a model using only the selected features
X_train_selected = sel.transform(X_train)
X_test_selected = sel.transform(X_test)

# Train the model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train_selected, y_train)

# Evaluate the model (using the selected features)
y_pred = model.predict(X_test_selected)
print(classification_report(y_test, y_pred))

