# Scottish Premier League 2023/24

In [38]:
import pandas as pd

# Earlier loader, kept for reference only: it read a sheet from an Excel
# workbook and has been superseded by the CSV load below.
#file_path = 'all-euro-data-2023-2024.xlsx'
#df = pd.read_excel(file_path, sheet_name='E0')

# Load the match data from the CSV file, which has been updated with the
# most recently played games.
df = pd.read_csv('SC0.csv')

# Display the last few rows of the dataset
df.tail()

Unnamed: 0,Div,Date,Time,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,...,AvgC<2.5,AHCh,B365CAHH,B365CAHA,PCAHH,PCAHA,MaxCAHH,MaxCAHA,AvgCAHH,AvgCAHA
104,SC0,23/12/2023,15:00,Celtic,Livingston,2,0,H,0,0,...,3.32,-2.75,1.95,1.9,1.93,1.93,2.03,1.95,1.93,1.88
105,SC0,23/12/2023,15:00,Hearts,St Mirren,2,0,H,1,0,...,1.7,-0.5,1.88,1.98,1.9,2.01,1.94,2.02,1.87,1.94
106,SC0,23/12/2023,15:00,Kilmarnock,St Johnstone,2,1,H,2,0,...,1.73,-0.75,1.83,2.03,1.88,2.03,1.9,2.08,1.82,1.99
107,SC0,24/12/2023,12:00,Motherwell,Rangers,0,2,A,0,2,...,2.34,1.5,2.03,1.83,2.07,1.85,2.11,1.89,2.03,1.8
108,SC0,26/12/2023,15:00,Dundee,Celtic,0,3,A,0,0,...,2.65,1.75,1.88,1.98,1.9,1.99,1.92,2.02,1.86,1.95


## Data Preprocessing

In [39]:
# Count the missing values in each column
df.isna().sum()



Div         0
Date        0
Time        0
HomeTeam    0
AwayTeam    0
           ..
PCAHA       0
MaxCAHH     0
MaxCAHA     0
AvgCAHH     0
AvgCAHA     0
Length: 106, dtype: int64

In [40]:
# Drop columns that are mostly empty: anything with more than 50% missing
# values is treated as irrelevant to the analysis.
irrelevant_columns = df.columns[df.isna().mean() > 0.5]
df_cleaned = df.drop(columns=irrelevant_columns)

# Convert 'Date' to datetime (the file stores dates as day/month/year)
df_cleaned['Date'] = pd.to_datetime(df_cleaned['Date'], format='%d/%m/%Y')

# Fill the remaining gaps: median for numeric/datetime columns, mode for
# everything else (categorical text).
for column in df_cleaned.columns:
    values = df_cleaned[column]
    if not values.isna().any():
        continue
    if pd.api.types.is_numeric_dtype(values) or pd.api.types.is_datetime64_any_dtype(values):
        fill_value = values.median()
    else:
        fill_value = values.mode()[0]
    # Assign the result back instead of calling fillna(inplace=True) on the
    # column selection: the chained in-place form acts on a temporary object
    # and does not update df_cleaned when pandas Copy-on-Write is active.
    df_cleaned[column] = values.fillna(fill_value)

# Recheck for missing values and report the count explicitly (a bare
# expression in the middle of a cell is never displayed).
remaining_missing_values = df_cleaned.isna().sum().sum()
print(f'Remaining missing values: {remaining_missing_values}')

# Basic information after cleaning
df_cleaned.info()

df_cleaned.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 109 entries, 0 to 108
Columns: 106 entries, Div to AvgCAHA
dtypes: datetime64[ns](1), float64(82), int64(16), object(7)
memory usage: 90.4+ KB


Unnamed: 0,Div,Date,Time,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,...,AvgC<2.5,AHCh,B365CAHH,B365CAHA,PCAHH,PCAHA,MaxCAHH,MaxCAHA,AvgCAHH,AvgCAHA
0,SC0,2023-08-05,12:30,Celtic,Ross County,4,2,H,3,0,...,4.18,-3.0,1.93,1.93,1.96,1.91,2.19,1.95,1.97,1.84
1,SC0,2023-08-05,15:00,Dundee,Motherwell,1,1,D,0,1,...,1.66,-0.25,1.98,1.88,2.04,1.88,2.07,1.93,1.97,1.85
2,SC0,2023-08-05,15:00,Livingston,Aberdeen,0,0,D,0,0,...,1.69,0.0,1.95,1.9,2.03,1.88,2.04,1.95,1.97,1.85
3,SC0,2023-08-05,15:00,St Johnstone,Hearts,0,2,A,0,0,...,1.73,0.25,2.03,1.83,2.03,1.88,2.11,1.88,2.01,1.8
4,SC0,2023-08-05,17:15,Kilmarnock,Rangers,1,0,H,0,0,...,2.21,1.25,2.03,1.83,2.04,1.87,2.09,1.9,1.97,1.84


In [41]:
# Feature Engineering

In [42]:
# recent form
# function to calculate recent form
def calculate_form_points(team, df_cleaned, num_matches=5, before_date=None):
    """Return the points `team` took from its most recent matches.

    A win is worth 3 points, a draw 1 and a defeat 0.

    Parameters
    ----------
    team
        Team identifier as it appears in the 'HomeTeam' / 'AwayTeam' columns.
    df_cleaned : pandas.DataFrame
        Match data with 'HomeTeam', 'AwayTeam', 'Date' and 'FTR' columns,
        where 'FTR' is 'H' (home win), 'A' (away win) or 'D' (draw).
    num_matches : int, default 5
        How many of the team's latest matches to count.
    before_date : optional
        When given, only matches dated strictly before this value are
        considered, so the form of a team going into a particular match can
        be computed without including that match or any later one. The
        default (None) uses every match in `df_cleaned`.

    Returns
    -------
    int
        Total points from the selected matches (0 if there are none).
    """
    # Matches in which the team took part, home or away
    involved = (df_cleaned['HomeTeam'] == team) | (df_cleaned['AwayTeam'] == team)
    team_matches = df_cleaned[involved]
    if before_date is not None:
        team_matches = team_matches[team_matches['Date'] < before_date]

    # Newest first, then keep the latest `num_matches`
    recent_matches = team_matches.sort_values(by='Date', ascending=False).head(num_matches)

    # A win is a home win while playing at home or an away win while playing away
    won = (
        ((recent_matches['HomeTeam'] == team) & (recent_matches['FTR'] == 'H'))
        | ((recent_matches['AwayTeam'] == team) & (recent_matches['FTR'] == 'A'))
    )
    drawn = recent_matches['FTR'] == 'D'
    return int(3 * won.sum() + drawn.sum())

# Recent form for each team going into each match.
# Only matches dated before the current row are passed to the helper, so the
# value reflects what was known before kick-off and never includes the result
# of the match itself or of later matches (which would leak the target into
# the feature). A team with no earlier match gets 0.
df_cleaned['HomeTeamRecentForm'] = df_cleaned.apply(
    lambda match: calculate_form_points(
        match['HomeTeam'], df_cleaned[df_cleaned['Date'] < match['Date']]
    ),
    axis=1,
)
df_cleaned['AwayTeamRecentForm'] = df_cleaned.apply(
    lambda match: calculate_form_points(
        match['AwayTeam'], df_cleaned[df_cleaned['Date'] < match['Date']]
    ),
    axis=1,
)

# Average goals per game, full time and half time.
# Each entry maps the new feature column to (team column to group by,
# goals column to average): home teams are averaged over their home matches,
# away teams over their away matches.
# NOTE(review): these means are taken over every row of df_cleaned, so each
# match's own goals are part of the value attached to it - confirm this is
# acceptable if the columns are used to predict that match's result.
average_goal_features = {
    'HomeTeamAvgGoals': ('HomeTeam', 'FTHG'),
    'AwayTeamAvgGoals': ('AwayTeam', 'FTAG'),
    'HomeTeamAvgGoalsHT': ('HomeTeam', 'HTHG'),
    'AwayTeamAvgGoalsHT': ('AwayTeam', 'HTAG'),
}
for feature_name, (team_column, goals_column) in average_goal_features.items():
    df_cleaned[feature_name] = df_cleaned.groupby(team_column)[goals_column].transform('mean')

In [43]:
# average goals conceded
def average_goals_conceded(df, team, home_or_away):
    """Return the mean number of goals `team` conceded at one venue.

    With ``home_or_away == 'home'`` the mean of 'FTAG' (the away side's goals)
    is taken over the rows where `team` is the 'HomeTeam'. Any other value is
    treated as away: the mean of 'FTHG' (the home side's goals) over the rows
    where `team` is the 'AwayTeam'. The result is NaN when no row matches.
    """
    at_home = home_or_away == 'home'
    team_column = 'HomeTeam' if at_home else 'AwayTeam'
    # The goals a team concedes are the ones scored by the opposite side
    opponent_goals_column = 'FTAG' if at_home else 'FTHG'
    return df.loc[df[team_column] == team, opponent_goals_column].mean()

# Average goals conceded per team, split by venue:
#   at home -> the away side's goals (FTAG) in the team's home matches
#   away    -> the home side's goals (FTHG) in the team's away matches
# groupby/transform computes each team's mean once and broadcasts it to that
# team's rows, instead of re-filtering the whole frame for every row, and
# matches how the other per-team averages in this notebook are built.
df_cleaned['HomeTeamAvgGoalsConceded'] = df_cleaned.groupby('HomeTeam')['FTAG'].transform('mean')
df_cleaned['AwayTeamAvgGoalsConceded'] = df_cleaned.groupby('AwayTeam')['FTHG'].transform('mean')


In [None]:
# Preview the engineered columns; a bare last expression uses the notebook's
# rich table display instead of the plain-text dump produced by print().
df_cleaned.head()

In [49]:
# get the current league table
def calculate_points_goals(row):
    """Return per-match points and goal figures for both sides.

    `row` must provide 'FTHG', 'FTAG' and 'FTR'. The result is a Series of six
    values, in order: home points, away points, home goal difference, away
    goal difference, home goals, away goals. An 'FTR' of 'H' gives the home
    side 3 points, 'A' gives the away side 3 points, and any other value is
    scored as a draw (1 point each).
    """
    goals_home = row['FTHG']
    goals_away = row['FTAG']

    # (home points, away points) for a decisive result; everything else is a draw
    decisive_results = {'H': (3, 0), 'A': (0, 3)}
    points_home, points_away = decisive_results.get(row['FTR'], (1, 1))

    return pd.Series([
        points_home,
        points_away,
        goals_home - goals_away,
        goals_away - goals_home,
        goals_home,
        goals_away,
    ])

# List of teams: everyone appearing in either column, so a team that has not
# yet played at home (or away) is still included in the table.
teams = pd.unique(df_cleaned[['HomeTeam', 'AwayTeam']].to_numpy().ravel())

# Applying the function to the dataset: per-match points, goal difference and
# goals for the home and away side.
df_cleaned[['HomePoints', 'AwayPoints', 'HomeGoalDiff', 'AwayGoalDiff', 'HomeGoals', 'AwayGoals']] = df_cleaned.apply(calculate_points_goals, axis=1)

# Summarizing the data for each team
team_stats = pd.DataFrame(index=teams)

# Total points, goal difference and goals scored = home total + away total.
stat_sources = {
    'Points': ('HomePoints', 'AwayPoints'),
    'GoalDiff': ('HomeGoalDiff', 'AwayGoalDiff'),
    'GoalsScored': ('HomeGoals', 'AwayGoals'),
}
for stat_name, (home_column, away_column) in stat_sources.items():
    home_total = df_cleaned.groupby('HomeTeam')[home_column].sum()
    away_total = df_cleaned.groupby('AwayTeam')[away_column].sum()
    # fill_value=0: a plain `+` aligns on team name and yields NaN for any
    # team missing from one side; treat the missing side as zero instead.
    team_stats[stat_name] = home_total.add(away_total, fill_value=0)

# Sorting the teams based on Points, Goal Difference, and Goals Scored
sorted_teams = team_stats.sort_values(by=['Points', 'GoalDiff', 'GoalsScored'], ascending=[False, False, False])

sorted_teams.head(12)

Unnamed: 0,Points,GoalDiff,GoalsScored
Celtic,48,34,48
Rangers,43,29,37
Hearts,29,4,19
Kilmarnock,27,2,20
St Mirren,26,-3,21
Hibernian,24,-2,24
Dundee,21,-6,20
Aberdeen,19,-7,19
St Johnstone,18,-14,13
Ross County,17,-9,15


In [50]:
# Encode team names as integers with a single shared encoder.
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
# Fit on the teams from BOTH columns: fitting on 'HomeTeam' alone makes
# transform() raise on any away team that has no home match in the data.
le.fit(pd.concat([df_cleaned['HomeTeam'], df_cleaned['AwayTeam']]))
# NOTE: this overwrites the team-name columns with their integer codes; use
# le.inverse_transform(...) to recover the names.
df_cleaned['HomeTeam'] = le.transform(df_cleaned['HomeTeam'])
df_cleaned['AwayTeam'] = le.transform(df_cleaned['AwayTeam'])


### Preparing Data for Modelling

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

# Model inputs: the team columns plus the engineered form and goal averages
features = [
    'HomeTeam',
    'AwayTeam',
    'HomeTeamRecentForm',
    'AwayTeamRecentForm',
    'HomeTeamAvgGoals',
    'AwayTeamAvgGoals',
    'HomeTeamAvgGoalsHT',
    'AwayTeamAvgGoalsHT',
    'HomeTeamAvgGoalsConceded',
    'AwayTeamAvgGoalsConceded',
]
# Column to predict: the match result code
target = 'FTR'

# Hold out 20% of the matches for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    df_cleaned[features],
    df_cleaned[target],
    test_size=0.2,
    random_state=42,
)