In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer

# Read the CSV file
file_path = "com.csv"
df = pd.read_csv(file_path, parse_dates=["Date"], dayfirst=True)

# Function to get the past 5 matches for a given team
def get_past_matches(team, date):
    past_matches = df[((df['HomeTeam'] == team) | (df['AwayTeam'] == team)) & (df['Date'] < date)]
    past_matches = past_matches.sort_values(by='Date', ascending=False).head(5)
    return past_matches

# Create features for the model
def create_features():
    data = []

    for index, row in df[df['Date'] >= '2008-01-01'].iterrows():
        date = row['Date']
        home_team = row['HomeTeam']
        away_team = row['AwayTeam']
        home_goals = row['FTHG']
        away_goals = row['FTAG']

        home_past_matches = get_past_matches(home_team, date)
        away_past_matches = get_past_matches(away_team, date)

        # Calculate mean values of numerical columns
        numerical_columns = ['FTHG', 'FTAG', 'HTHG', 'HTAG', 'HS', 'AS', 'HST', 'AST', 'HC', 'AC', 'HF', 'AF', 'HY', 'AY', 'HR', 'AR']
        home_stats = home_past_matches[numerical_columns].mean()
        away_stats = away_past_matches[numerical_columns].mean()
        
        if home_goals > away_goals:
            result = 1  # Home win
        elif home_goals < away_goals:
            result = 0  # Away win
        else:
            result = 2  # Draw
        
        data.append({
            'Date': date,
            'HomeWins': (home_past_matches['FTHG'] > home_past_matches['FTAG']).sum(),
            'HomeLosses': (home_past_matches['FTHG'] < home_past_matches['FTAG']).sum(),
            'HomeDraws': (home_past_matches['FTHG'] == home_past_matches['FTAG']).sum(),
            'AwayWins': (away_past_matches['FTHG'] > away_past_matches['FTAG']).sum(),
            'AwayLosses': (away_past_matches['FTHG'] < away_past_matches['FTAG']).sum(),
            'AwayDraws': (away_past_matches['FTHG'] == away_past_matches['FTAG']).sum(),
            **{f'Home{col}': home_stats[col] for col in numerical_columns},
            **{f'Away{col}': away_stats[col] for col in numerical_columns},
            'Result': result
        })

    return pd.DataFrame(data)

# Generate the features and labels
features_df = create_features()

# Impute missing values
imputer = SimpleImputer(strategy='mean')
X = features_df.drop(columns=['Date', 'Result'])
X = imputer.fit_transform(X)
y = features_df['Result']


In [20]:
print(features_df.head)
#print(df.head)

<bound method NDFrame.head of            Date  HomeWins  HomeLosses  HomeDraws  AwayWins  AwayLosses  \
0    2008-01-01         3           1          1         1           3   
1    2008-01-01         0           2          3         4           1   
2    2008-01-01         2           1          2         3           1   
3    2008-01-01         3           2          0         3           0   
4    2008-01-01         2           3          0         3           2   
...         ...       ...         ...        ...       ...         ...   
6466 2024-11-09         2           2          1         1           1   
6467 2024-11-10         2           1          2         2           2   
6468 2024-11-10         2           2          1         2           1   
6469 2024-11-10         4           1          0         2           1   
6470 2024-11-10         3           0          2         4           0   

      AwayDraws  HomeFTHG  HomeFTAG  HomeHTHG  ...  AwayAST  AwayHC  AwayAC  \
0 

In [9]:
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the SVM model with multi-class classification
model = svm.SVC(kernel='linear', decision_function_shape='ovr')
model.fit(X_train, y_train)



# Predict the results for the test set
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Model accuracy: {accuracy * 100:.2f}%")



Model accuracy: 47.49%


In [16]:
def predict_match_result(home_team, away_team, date):
    home_past_matches = get_past_matches(home_team, date)
    away_past_matches = get_past_matches(away_team, date)

    # Calculate mean values of numerical columns
    numerical_columns = ['FTHG', 'FTAG', 'HTHG', 'HTAG', 'HS', 'AS', 'HST', 'AST', 'HC', 'AC', 'HF', 'AF', 'HY', 'AY', 'HR', 'AR']
    home_stats = home_past_matches[numerical_columns].mean()
    away_stats = away_past_matches[numerical_columns].mean()

    features = pd.DataFrame([{
        'HomeWins': (home_past_matches['FTHG'] > home_past_matches['FTAG']).sum(),
        'HomeLosses': (home_past_matches['FTHG'] < home_past_matches['FTAG']).sum(),
        'HomeDraws': (home_past_matches['FTHG'] == home_past_matches['FTAG']).sum(),
        'AwayWins': (away_past_matches['FTHG'] > away_past_matches['FTAG']).sum(),
        'AwayLosses': (away_past_matches['FTHG'] < away_past_matches['FTAG']).sum(),
        'AwayDraws': (away_past_matches['FTHG'] == away_past_matches['FTAG']).sum(),
        **{f'Home{col}': home_stats[col] for col in numerical_columns},
        **{f'Away{col}': away_stats[col] for col in numerical_columns}
    }])

    # Handle NaN values by imputing with the mean
    features = imputer.transform(features)

    prediction = model.predict(features)
    if prediction == 1:
        return "asdsadasdn"
    elif prediction == 0:
        return "Away win"
    else:
        return "Draw"

# Example prediction
date = pd.to_datetime("15/12/2056", dayfirst=True)
home_team = "Southampton"
away_team = "kdkscd"
result = predict_match_result(home_team, away_team, date)
print(f"Predicted result: {result}")


Predicted result: asdsadasdn
