<a href="https://colab.research.google.com/github/diMaster228/predicting-model/blob/main/cursework.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import time
from io import StringIO

import matplotlib.pyplot as plt
import pandas as pd
import requests
import xgboost as xgb
from bs4 import BeautifulSoup
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, accuracy_score
from sklearn.preprocessing import LabelEncoder

Сбор данных, парсинг

In [None]:
# Seasons to scrape, newest first. The loop starts on the current standings
# page and follows the page's "previous season" link once per iteration, so
# `years` must stay in descending order to label the rows correctly.
years = list(range(2024, 2018, -1))
standings_url = "https://fbref.com/en/comps/9/Premier-League-Stats"
all_matches = []
request_delay = 1  # seconds to pause after each team, as in the original loop
start_time = time.time()
for year in years:
    data = requests.get(standings_url)
    # Fail loudly on an HTTP error instead of crashing later on a missing table.
    data.raise_for_status()
    soup = BeautifulSoup(data.text, "html.parser")
    standings_table = soup.select('table.stats_table')[0]

    # Anchors without an href yield None; drop them before the substring test.
    links = [l.get("href") for l in standings_table.find_all('a')]
    links = [l for l in links if l and '/squads/' in l]
    team_urls = [f"https://fbref.com{l}" for l in links]

    # Remember where the previous season lives for the next outer iteration.
    previous_season = soup.select("a.prev")[0].get("href")
    standings_url = f"https://fbref.com{previous_season}"

    for team_url in team_urls:
        # The last URL segment looks like "<Team-Name>-Stats"; turn it into "Team Name".
        team_name = team_url.split("/")[-1].replace("-Stats", "").replace("-", " ")
        data = requests.get(team_url)
        data.raise_for_status()
        # Wrap the HTML in StringIO: passing a literal HTML string to read_html is deprecated.
        matches = pd.read_html(StringIO(data.text), match="Scores & Fixtures")[0]
        team_soup = BeautifulSoup(data.text, "html.parser")
        links = [l.get("href") for l in team_soup.find_all('a')]
        links = [l for l in links if l and 'all_comps/shooting/' in l]
        if not links:
            # No shooting page linked for this team: skip it rather than hit an IndexError.
            time.sleep(request_delay)
            continue
        data = requests.get(f"https://fbref.com{links[0]}")
        data.raise_for_status()
        shooting = pd.read_html(StringIO(data.text), match="Shooting")[0]
        # The shooting table has a two-level header; keep only the inner level.
        shooting.columns = shooting.columns.droplevel()
        try:
            team_data = matches.merge(shooting[["Date", "Sh", "SoT", "Dist", "FK", "PK", "PKatt"]], on="Date")
        except (KeyError, ValueError):
            # Selecting a missing shooting column raises KeyError; a failed merge raises ValueError.
            time.sleep(request_delay)
            continue
        # Copy the filtered rows so the column assignments below do not write into a view.
        team_data = team_data[team_data["Comp"] == "Premier League"].copy()

        team_data["Season"] = year
        team_data["Team"] = team_name
        all_matches.append(team_data)
        time.sleep(request_delay)

end_time = time.time()
result = end_time - start_time



In [None]:
# Stack the per-team frames into one table and lower-case every column label.
match_df = pd.concat(all_matches).rename(columns=str.lower)

In [None]:
# Write the combined match table to "matches.csv" (the frame's index is written
# as the first column), then display the frame as the cell output.
match_df.to_csv("matches.csv")
match_df

Обучение и оценка моделей

In [None]:
# Load the match table from "matches.csv", using the file's first column as the index.
matches = pd.read_csv("matches.csv", index_col=0)

In [None]:
# Drop the `comp` and `notes` columns. Using drop(..., errors="ignore") instead
# of `del` keeps this cell re-runnable once the columns are already gone.
matches = matches.drop(columns=["comp", "notes"], errors="ignore")

matches["date"] = pd.to_datetime(matches["date"])

# Map each result letter (W, D, L) to its numeric class label (2, 1, 0).
result_mapping = {'W': 2, 'D': 1, 'L': 0}

# Use `map` rather than `replace`: replacing strings with ints relies on pandas'
# deprecated silent downcasting of the object column, while `map` yields a
# numeric column directly.
matches["target"] = matches["result"].map(result_mapping)

# Integer codes for the categorical columns.
matches["venue_code"] = matches["venue"].astype("category").cat.codes
matches["opp_code"] = matches["opponent"].astype("category").cat.codes
# Keep only the part of the kick-off time before the first colon.
matches["hour"] = matches["time"].str.replace(":.+", "", regex=True).astype("int")
matches["day_code"] = matches["date"].dt.dayofweek
# Cast the xg / xga columns to integers.
matches["xg_code"] = matches["xg"].astype("int")
matches["xga_code"] = matches["xga"].astype("int")

label_encoder = LabelEncoder()
matches["referee_code"] = label_encoder.fit_transform(matches["referee"])


In [None]:
def train_and_predict(model, X_train, y_train, X_test):
    """Fit ``model`` on the training data and return its predictions for ``X_test``.

    ``model`` only needs ``fit(X, y)`` and ``predict(X)`` methods; it is
    fitted in place.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    return predictions

def evaluate(y_test, y_pred):
    """Score predictions against the true labels.

    Returns a ``(accuracy, precision)`` tuple, where ``precision`` holds one
    value per class label, in the order 0, 1, 2.
    """
    per_class_precision = precision_score(y_test, y_pred, labels=[0, 1, 2], average=None)
    overall_accuracy = accuracy_score(y_test, y_pred)
    return overall_accuracy, per_class_precision

In [None]:
# Chronological split. The cutoff date itself belongs to the test period:
# with strict `<` and `>` on both sides, every match played exactly on the
# cutoff date was dropped from both sets.
split_date = '2023-08-11'
train = matches[matches["date"] < split_date]
test = matches[matches["date"] >= split_date]

predictors = ["venue_code", "opp_code", "hour", "day_code", "xg_code", "xga_code", "referee_code"]
X_train = train[predictors]
y_train = train["target"]
X_test = test[predictors]
y_test = test["target"]

# The three classifiers to compare, keyed by the name used in the printed reports.
models = {
    'RandomForest': RandomForestClassifier(n_estimators=50, min_samples_split=10, random_state=1),
    'LogisticRegression': LogisticRegression(max_iter=10000),
    'XGBoost': xgb.XGBClassifier(objective='multi:softmax', num_class=len(set(y_train)), eval_metric='merror')
}


In [None]:
# Per-model metrics, collected in the iteration order of `models`.
precisions = []
accuracies = []

for model_name, model in models.items():
    print(f"Model: {model_name}")
    y_pred = train_and_predict(model, X_train, y_train, X_test)

    # Actual vs. predicted labels side by side, then their confusion table.
    combined = pd.DataFrame({"actual": y_test, "predicted": y_pred})
    crosstab = pd.crosstab(combined["actual"], combined["predicted"])

    accuracy, precision = evaluate(y_test, y_pred)
    precisions.append(precision)
    accuracies.append(accuracy)

    # Each report section is followed by one blank line.
    print(crosstab, end="\n\n")
    print(f"Accuracy of {model_name} method:", accuracy)
    print(f"Precision of {model_name} method:", precision, end="\n\n")

In [None]:
# Group the matches by team and pull out the rows of one team ("Arsenal")
# as a sample group.
grouped_matches = matches.groupby("team")
group = grouped_matches.get_group("Arsenal")

In [None]:
def rolling_averages(group, cols, new_cols):
    """Add rolling means of the previous three rows to a date-sorted copy of ``group``.

    For every column in ``cols`` the mean over the three preceding rows (the
    current row is excluded) is stored under the matching name in
    ``new_cols``. Rows where any of the new columns is NaN are dropped. The
    input frame is not modified.
    """
    ordered = group.sort_values("date")
    # closed='left' leaves the current row out of its own window.
    ordered[new_cols] = ordered[cols].rolling(3, closed='left').mean()
    return ordered.dropna(subset=new_cols)


In [None]:
# Columns to average and the names of the derived rolling columns.
cols = ["gf", "ga", "sh", "sot", "dist", "fk", "pk", "pkatt"]
new_cols = [f"{c}_rolling" for c in cols]

# Compute the rolling features team by team and stack the results with a
# fresh 0..n-1 index. Iterating over the groups (instead of groupby.apply)
# keeps the "team" column in the output without relying on the deprecated
# behaviour of apply operating on the grouping column.
matches_rolling = pd.concat(
    [rolling_averages(team_matches, cols, new_cols) for _, team_matches in matches.groupby("team")],
    ignore_index=True,
)
matches_rolling

In [None]:
# Chronological split on the rolling-feature table. The cutoff date itself
# goes to the test set; with strict `<` and `>` on both sides, matches played
# exactly on that date were dropped from both sets.
train = matches_rolling[matches_rolling["date"] < '2023-08-11']
test = matches_rolling[matches_rolling["date"] >= '2023-08-11']

# Include the rolling-average columns as features. Before, only the original
# `predictors` were selected, so the rolling features were never used.
rolling_predictors = predictors + new_cols
X_train = train[rolling_predictors]
y_train = train["target"]
X_test = test[rolling_predictors]
y_test = test["target"]

In [None]:
# Per-model metrics for this run, collected in the iteration order of `models`.
precisions = []
accuracies = []

# The `for` statement must start at column 0; its stray leading indentation
# made this cell fail with an IndentationError.
for model_name, model in models.items():
    print(f"Model: {model_name}")
    y_pred = train_and_predict(model, X_train, y_train, X_test)
    # Actual vs. predicted labels side by side, then their confusion table.
    combined = pd.DataFrame(dict(actual=y_test, predicted=y_pred))
    crosstab = pd.crosstab(index=combined["actual"], columns=combined["predicted"])
    accuracy, precision = evaluate(y_test, y_pred)
    # Record the metrics so the lists reset above are actually filled.
    precisions.append(precision)
    accuracies.append(accuracy)
    print(crosstab)
    print()
    print(f"Accuracy of {model_name} method:", accuracy)
    print(f"Precision of {model_name} method:", precision)
    print()

In [None]:
# Attach date, team, opponent and result to the predictions by joining on the
# shared row index. `combined` here is whatever the last executed evaluation
# loop iteration left behind.
match_info = matches_rolling[["date", "team", "opponent", "result"]]
combined = combined.merge(match_info, left_index=True, right_index=True)
combined

In [None]:
class MissingDict(dict):
    """A dict whose lookups fall back to the key itself when the key is absent."""

    def __missing__(self, key):
        # Called by dict.__getitem__ for unknown keys; nothing is stored.
        return key

# Full team names (as found in the "team" column) mapped to the shorter
# spellings to be matched against the "opponent" column.
map_values = {
    "Brighton and Hove Albion": "Brighton",
    "Manchester United": "Manchester Utd",
    "Newcastle United": "Newcastle Utd",
    "Tottenham Hotspur": "Tottenham",
    "West Ham United": "West Ham",
    "Wolverhampton Wanderers": "Wolves",
    "Nottingham Forest": "Nott'ham Forest",
    "Sheffield United": "Sheffield Utd",
}
# Names that are not in the table map to themselves.
mapping = MissingDict(**map_values)

combined["new_team"] = combined["team"].map(mapping)

# Self-join: pair each row with the row whose opponent is this row's team on
# the same date.
merged = combined.merge(
    combined,
    left_on=["date", "new_team"],
    right_on=["date", "opponent"],
)
merged

In [None]:
# League points awarded per predicted class label (2 = win, 1 = draw, 0 = loss).
# Summing the raw labels scored a win as 2 points, which is not comparable
# with the real points column this table is later printed next to.
points_per_outcome = {2: 3, 1: 1, 0: 0}

# Take only the `_x` side of the self-merge. Each row of `combined` appears at
# most once on that side, so stacking both the `_x` and `_y` sides (as before)
# counted every team's prediction twice.
all_teams = merged[['new_team_x', 'predicted_x']].rename(columns={'new_team_x': 'team'})
all_teams = all_teams.assign(points=all_teams['predicted_x'].map(points_per_outcome))

# Total predicted points per team.
predicted_points = all_teams.groupby('team')['points'].sum().reset_index()

# Rename the columns for readability.
predicted_points.columns = ['Team', 'Predicted Points']

# Rank from most to fewest points with a fresh 0..n-1 index.
predicted_points = predicted_points.sort_values(by='Predicted Points', ascending=False)
predicted_points.index = range(predicted_points.shape[0])

winner_index = predicted_points['Predicted Points'].idxmax()
winner = predicted_points.loc[winner_index, 'Team']

print("Победитель лиги:", winner)
predicted_points

In [None]:
# Address of the standings page fetched in the next cell.
url = "https://fbref.com/en/comps/9/Premier-League-Stats"

In [None]:
# Send a GET request for the page and keep its HTML.
response = requests.get(url)
# Stop on an HTTP error instead of trying to parse an error page.
response.raise_for_status()
html_content = response.text

# Parse the HTML with BeautifulSoup.
soup = BeautifulSoup(html_content, "html.parser")

# Locate the first table with the "stats_table" class.
table = soup.find("table", class_="stats_table")
if table is None:
    raise ValueError("No table with class 'stats_table' found at " + url)

# Convert the table to a DataFrame. The HTML is wrapped in StringIO because
# passing a literal HTML string to read_html is deprecated.
df = pd.read_html(StringIO(str(table)))[0]

# Keep the squad name and its points, renamed to mark them as the real totals.
points_df = df[["Squad", "Pts"]]
points_df = points_df.rename(columns={'Pts': 'Actual_Pts'})
points_df

In [None]:
from tabulate import tabulate

# Render both frames as plain text without their index column.
table1 = points_df.to_string(index=False)
table2 = predicted_points.to_string(index=False)

# Print the two text tables in one output, each under its own header.
header_row = ['Actual Points', 'Predicted Points']
body_row = [table1, table2]
print(tabulate([header_row, body_row], headers='firstrow'))