<a href="https://colab.research.google.com/github/fopamesmin/movie-project/blob/main/laliga.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install with the %pip magic so the package lands in the running kernel's environment,
# and pin the version that the recorded run of this cell installed (0.4.6).
# NOTE(review): no cell visible in this notebook imports colorama — confirm it is still needed.
%pip install -q colorama==0.4.6

Collecting colorama
  Downloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Installing collected packages: colorama
Successfully installed colorama-0.4.6


In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# NOTE(review): the cells visible in this notebook write to "./DataSet" and never read
# from /content/drive — confirm the mount is still required.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import os
import pandas as pd
import requests as req
from bs4 import BeautifulSoup as BS
import logging
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

# --- Scraper configuration ---------------------------------------------------
# Standings page downloaded by fetch_data(); despite the variable name, the URL
# points at laliga.com.
sofifa_url = "https://www.laliga.com/en-ES/laliga-easports/standing"

# Per-team fields, in the order they are stored in each scraped record
# (a leading "season" field is added when the DataFrame is built).
COLUMNS = [
    "position", "team", "Points",
    "W", "D", "L",
    "GF", "GA", "GD",
]

# Season labels attached to the scraped records.
ID_seasonId = ["2020/2021", "2021/2022", "2022/2023"]

# Browser-like User-Agent header sent with every request.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/50.0.2661.102 Safari/537.36'
    ),
}

# Output directory for the CSV export; created up front if it does not exist.
dataset_url = "./DataSet"
os.makedirs(dataset_url, exist_ok=True)

# Show INFO-level (and above) log records.
logging.basicConfig(level=logging.INFO)

def fetch_data(season, timeout=30):
    """Scrape the standings page at ``sofifa_url`` and return one record per team.

    Args:
        season: Label stored as the first field of every returned record.
            NOTE: it is NOT sent with the request — ``sofifa_url`` carries no
            season information, so every call downloads the same page and only
            the label differs between calls.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        A list of records ``[season, position, team, points, w, d, l, gf, ga, gd]``.
        Every scraped field is the stripped text of the cell (a string). The
        list is empty when the request fails, the status is not OK, or no row
        could be parsed.

    NOTE(review): the index -> column mapping of the stat cells below is
    unverified. The aggregate win percentage recorded in this notebook (~61%)
    is implausible for W / (W + D + L), which suggests the cells may be
    shifted by one (e.g. index 1 being "matches played") — confirm against the
    live page markup.
    """
    stat_cell_class = "styled__Td-sc-e89col-10 feNufd"
    teams_list = []

    # Without a timeout a stalled connection would block the notebook forever;
    # network-level failures are reported the same way as a bad status code.
    try:
        response = req.get(sofifa_url, headers=headers, timeout=timeout)
    except req.RequestException as e:
        logging.error(f"Failed to retrieve data for season {season}: {e}")
        return teams_list

    if response.status_code != req.codes.ok:
        logging.error(f"Failed to retrieve data for season {season}")
        return teams_list

    soup = BS(response.content, "lxml")
    rows = soup.select("div.styled__StandingTabBody-sc-e89col-0.isRHqh")
    if not rows:
        # The selector relies on generated class names; an empty match most
        # likely means the markup no longer matches it.
        logging.warning(f"No standings rows matched for season {season}")

    for row in rows:
        try:
            position = row.find("div", class_="styled__Td-sc-e89col-10 fTFWtb").find("p").text.strip()
            team = row.find("div", class_="styled__ShieldContainer-sc-1opls7r-0 eIaTDi shield-desktop").find("p").text.strip()
            # Look the stat cells up once per row instead of once per column.
            stat_cells = row.find_all("div", class_=stat_cell_class)
            points, wins, draws, losses, gf, ga, gd = [
                stat_cells[i].find("p").text.strip() for i in range(7)
            ]
        except (AttributeError, IndexError) as e:
            # AttributeError: an expected element was missing (find() returned None).
            # IndexError: the row has fewer than seven stat cells.
            logging.error(f"Error parsing row: {e}")
            continue
        teams_list.append([season, position, team, points, wins, draws, losses, gf, ga, gd])

    return teams_list

# Gather the scraped records of every configured season into one flat list
all_teams_list = []
for season in ID_seasonId:
    all_teams_list += fetch_data(season)

# Build the combined table (season label first, then the scraped fields) and
# export it to CSV before any dtype conversion is applied.
frame_columns = ["season", *COLUMNS]
df = pd.DataFrame(all_teams_list, columns=frame_columns)
csv_path = os.path.join(dataset_url, "all_seasons.csv")
df.to_csv(csv_path, header=True, index=False)

# Data preprocessing and feature engineering
# The scraped values are text; cast each numeric column to int.
for numeric_col in ('W', 'D', 'L', 'GF', 'GA', 'GD', 'Points'):
    df[numeric_col] = df[numeric_col].astype(int)

# Matches played = wins + draws + losses
df['PLAYED'] = df['W'] + df['D'] + df['L']

# Target variables: share of played matches that were won / drawn, in percent
played = df['PLAYED']
df['Win_Percentage'] = df['W'].div(played).mul(100)
df['Draw_Percentage'] = df['D'].div(played).mul(100)

# Model training
# Features and the two regression targets as plain NumPy arrays.
feature_columns = ['Points', 'GF', 'GA', 'GD']
X = df[feature_columns].values
y_win = df['Win_Percentage'].values
y_draw = df['Draw_Percentage'].values

# Split ONCE, passing both targets together, so the feature rows and both
# target vectors are guaranteed to share the same train/test partition.
# (Previously two separate calls overwrote X_train/X_test and only lined up
# because they happened to use the same random_state.)
# NOTE(review): the rows of every season come from the same page request, so
# the frame may contain repeated copies of one table and the test rows may
# also appear in the training set — verify before trusting the metrics.
(
    X_train, X_test,
    y_train_win, y_test_win,
    y_train_draw, y_test_draw,
) = train_test_split(X, y_win, y_draw, test_size=0.2, random_state=42)

# One linear model per target; fit() returns the fitted estimator itself.
model_win = LinearRegression().fit(X_train, y_train_win)
model_draw = LinearRegression().fit(X_train, y_train_draw)

# Evaluation: mean absolute error of each model on the held-out rows
y_pred_win, y_pred_draw = model_win.predict(X_test), model_draw.predict(X_test)

mae_win, mae_draw = (
    mean_absolute_error(y_test_win, y_pred_win),
    mean_absolute_error(y_test_draw, y_pred_draw),
)

print("Mean Absolute Error for Win Percentage:", mae_win)
print("Mean Absolute Error for Draw Percentage:", mae_draw)

# Prediction for 2024 from a hand-written, hypothetical feature row.
# Replace it with real figures for the season being predicted.
# A linear model's output is unbounded, so the "percentage" it returns can
# fall outside 0-100 (the recorded run printed ~121 for the win percentage).
hypothetical_data = [[85, 80, 40, 40]]  # one sample: [Points, GF, GA, GD]

pred_win_percentage_2024, pred_draw_percentage_2024 = (
    model.predict(hypothetical_data) for model in (model_win, model_draw)
)

print("Predicted Win Percentage for 2024:", pred_win_percentage_2024[0])
print("Predicted Draw Percentage for 2024:", pred_draw_percentage_2024[0])

Mean Absolute Error for Win Percentage: 2.227214084870546
Mean Absolute Error for Draw Percentage: 3.7879158377246887
Predicted Win Percentage for 2024: 121.22469769957397
Predicted Draw Percentage for 2024: 22.479773668066237


In [4]:
import os
import pandas as pd
import requests as req
from bs4 import BeautifulSoup as BS
import logging

# --- Scraper configuration ---------------------------------------------------
# Standings page downloaded by fetch_data(); despite the variable name, the URL
# points at laliga.com.
sofifa_url = "https://www.laliga.com/en-ES/laliga-easports/standing"

# Per-team fields, in the order they are stored in each scraped record
# (a leading "season" field is added when the DataFrame is built).
COLUMNS = [
    "position", "team", "Points",
    "W", "D", "L",
    "GF", "GA", "GD",
]

# Season labels attached to the scraped records.
ID_seasonId = ["2019/2020", "2020/2021", "2021/2022", "2022/2023"]

# Browser-like User-Agent header sent with every request.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/50.0.2661.102 Safari/537.36'
    ),
}

# Output directory for exported data; created up front if it does not exist.
dataset_url = "./DataSet"
os.makedirs(dataset_url, exist_ok=True)

# Show INFO-level (and above) log records.
logging.basicConfig(level=logging.INFO)

def fetch_data(season, timeout=30):
    """Scrape the standings page at ``sofifa_url`` and return one record per team.

    Args:
        season: Label stored as the first field of every returned record.
            NOTE: it is NOT sent with the request — ``sofifa_url`` carries no
            season information, so every call downloads the same page and only
            the label differs between calls.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        A list of records ``[season, position, team, points, w, d, l, gf, ga, gd]``.
        Every scraped field is the stripped text of the cell (a string). The
        list is empty when the request fails, the status is not OK, or no row
        could be parsed.

    NOTE(review): the index -> column mapping of the stat cells below is
    unverified. The aggregate win percentage recorded in this notebook (~61%)
    is implausible for W / (W + D + L), which suggests the cells may be
    shifted by one (e.g. index 1 being "matches played") — confirm against the
    live page markup.
    """
    stat_cell_class = "styled__Td-sc-e89col-10 feNufd"
    teams_list = []

    # Without a timeout a stalled connection would block the notebook forever;
    # network-level failures are reported the same way as a bad status code.
    try:
        response = req.get(sofifa_url, headers=headers, timeout=timeout)
    except req.RequestException as e:
        logging.error(f"Failed to retrieve data for season {season}: {e}")
        return teams_list

    if response.status_code != req.codes.ok:
        logging.error(f"Failed to retrieve data for season {season}")
        return teams_list

    soup = BS(response.content, "lxml")
    rows = soup.select("div.styled__StandingTabBody-sc-e89col-0.isRHqh")
    if not rows:
        # The selector relies on generated class names; an empty match most
        # likely means the markup no longer matches it.
        logging.warning(f"No standings rows matched for season {season}")

    for row in rows:
        try:
            position = row.find("div", class_="styled__Td-sc-e89col-10 fTFWtb").find("p").text.strip()
            team = row.find("div", class_="styled__ShieldContainer-sc-1opls7r-0 eIaTDi shield-desktop").find("p").text.strip()
            # Look the stat cells up once per row instead of once per column.
            stat_cells = row.find_all("div", class_=stat_cell_class)
            points, wins, draws, losses, gf, ga, gd = [
                stat_cells[i].find("p").text.strip() for i in range(7)
            ]
        except (AttributeError, IndexError) as e:
            # AttributeError: an expected element was missing (find() returned None).
            # IndexError: the row has fewer than seven stat cells.
            logging.error(f"Error parsing row: {e}")
            continue
        teams_list.append([season, position, team, points, wins, draws, losses, gf, ga, gd])

    return teams_list

# Gather the scraped records of every configured season into one flat list
all_teams_list = []
for season in ID_seasonId:
    all_teams_list += fetch_data(season)

# Combined table: season label first, then the scraped fields
frame_columns = ["season", *COLUMNS]
df = pd.DataFrame(all_teams_list, columns=frame_columns)

# Data preprocessing
# The scraped win/draw/loss counts are text; cast them to int.
for result_col in ('W', 'D', 'L'):
    df[result_col] = df[result_col].astype(int)

# Matches played = wins + draws + losses
df['PLAYED'] = df['W'] + df['D'] + df['L']

# Column totals over all rows: matches played, wins and draws
total_matches_played, total_wins, total_draws = (
    df[column].sum() for column in ('PLAYED', 'W', 'D')
)

# Overall share of wins / draws among all played matches, in percent
total_win_percentage = (total_wins / total_matches_played) * 100
total_draw_percentage = (total_draws / total_matches_played) * 100

# The printed labels are French: "total percentage of matches won / drawn"
print(f"Pourcentage total de matchs gagnés: {total_win_percentage}")
print(f"Pourcentage total de matchs nuls: {total_draw_percentage}")

Pourcentage total de matchs gagnés: 61.003236245954696
Pourcentage total de matchs nuls: 22.006472491909385
