# Get League Table and Stats

In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
import numpy as np

In [3]:
# URL of the website
url = "https://fbref.com/en/comps/10/2022-2023/2022-2023-Championship-Stats"

# Send a GET request to the URL
response = requests.get(url)

# Check if the request was successful (status code 200)
    # Parse the HTML content of the page
soup = BeautifulSoup(response.text, "html.parser")

    # Find the table using its HTML class
table = soup.find("table", {"class": "stats_table"})

    # Extract table data into a list of lists
table_data = []
for row in table.find_all("tr"):
    row_data = [cell.text.strip() for cell in row.find_all(["th", "td"])]
    table_data.append(row_data)

    # Create a DataFrame from the list of lists
columns = table_data[0]  # Assuming the first row contains column headers
data = table_data[1:]
league_table_df = pd.DataFrame(data, columns=columns)

# Get Fixtures 

In [4]:
# URL of the website
url = "https://fbref.com/en/comps/10/2022-2023/schedule/2022-2023-Championship-Scores-and-Fixtures"

# Send a GET request to the URL
response = requests.get(url)

# Check if the request was successful (status code 200)
if response.status_code == 200:
    # Parse the HTML content of the page
    soup = BeautifulSoup(response.text, "html.parser")

    # Find the table containing fixtures
    table = soup.find("table", {"class": "stats_table"})

    # Extract data into a list of dictionaries
    fixtures_data = []
    for row in table.find_all("tr")[1:]:  # Skip the header row
        columns = row.find_all(["th", "td"])
        
         # Extracting specific columns based on the structure of the table
        fixture_type = columns[0].text.strip()
        matchday = columns[1].text.strip()
        weekday = columns[2].text.strip()
        date = columns[3].text.strip()
        time = columns[4].text.strip()
        home_team = columns[5].text.strip()
        home_xG = columns[6].text.strip()
        result = columns[7].text.strip()
        away_team = columns[8].text.strip()
        away_xG = columns[9].text.strip()
        attendance = columns[10].text.strip()
        stadium = columns[11].text.strip()
        referee = columns[12].text.strip()

        fixture_info = {
            "fixture_type": fixture_type,
            "matchday": matchday,
            "weekday": weekday,
            "date": date,
            "time": time,
            "home_team": home_team,
            "home_xG": home_xG,
            "result": result,
            "away_xG": away_xG,
            "away_team": away_team,
            "attendance": attendance,
            "stadium": stadium,
            "referee": referee,
            "competition": English Championship
        }

        fixtures_data.append(fixture_info)

    # Create a DataFrame from the list of dictionaries
    fixtures_table = pd.DataFrame(fixtures_data)

In [5]:
fixtures_table.head()
#need to fix from attendance on

Unnamed: 0,fixture_type,matchday,weekday,date,time,home_team,home_xG,result,away_xG,away_team,attendance,stadium,referee
0,Regular season,1,Fri,2022-07-29,20:00,Huddersfield,0.1,0–1,Burnley,1.1,20206,The John Smith's Stadium,James Linington
1,Regular season,1,Sat,2022-07-30,15:00,Hull City,1.5,2–1,Bristol City,1.0,16667,MKM Stadium,Dean Whitestone
2,Regular season,1,Sat,2022-07-30,15:00,Rotherham Utd,0.6,1–1,Swansea City,0.6,10454,AESSEAL New York Stadium,Matt Donohue
3,Regular season,1,Sat,2022-07-30,15:00,Millwall,0.8,2–0,Stoke City,0.2,15341,The Den,Keith Stroud
4,Regular season,1,Sat,2022-07-30,15:00,Luton Town,1.4,0–0,Birmingham City,0.7,9921,Kenilworth Road Stadium,Graham Scott


In [6]:
fixtures_table['is_game_complete'] = np.where(len(fixtures_table['result']) > 0, 1, 0)

# Getting Game IDs

In [7]:
# Parse the HTML content of the page
soup = BeautifulSoup(response.text, 'html.parser')

# Find all <td> elements with data-stat="score"
score_cells = soup.find_all('td', {'data-stat': 'score'})

# Extract game IDs from the <a> tags within the score_cells, the third element is the game ID
game_ids = [cell.find('a')['href'].split('/')[3] for cell in score_cells if cell.find('a')]
# Then strip the white space in the IDs
game_ids = [x.strip(' ') for x in game_ids]

In [8]:
game_ids = game_ids[0:4]
#just use three to test
#next

# Scraping Shot Data

In [9]:
all_data = []
for unique_id in game_ids:
    # URL of the page to scrape
    url = f'https://fbref.com/en/matches/{unique_id}/'

    # Get the game ID from the URL
    game_id = url.split("/")[-2]

    # Send a GET request to the URL
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "html.parser")

    # Find the Shots table on the page
    shots_table = soup.find("div", {"id": "switcher_shots"})

    # Create empty lists to store data and create the columns
    data = []
    headers = ['Game_ID', 'Minute', 'Player', 'Team', 'xG', 'PSxG', 'Result', 'Distance', 'Body Part', 'Notes', 'SCA 1 Player', 'Event 1', 'SCA 2 Player', 'Event 2']

    # Extract table rows
    rows = shots_table.find("tbody").find_all("tr")
    for row in rows:
        row_data = [game_id] + [cell.get_text().strip() for cell in row.find_all(["th", "td"])]
        data.append(row_data)

    # Create a DataFrame from the scraped data
    shots_df = pd.DataFrame(data, columns=headers)

    # Find the home and away teams and formations
    team_headers = soup.find_all("th", {"colspan": "2"})
    teams = [header.text.strip().split(" (")[0] for header in team_headers if "(" in header.text.strip()]
    formations = [header.text.strip().split(" (")[1][:-1] for header in team_headers if "(" in header.text.strip()]

    # Extracting the home and away teams and formations
    home_team = teams[0]
    away_team = teams[1]
    home_formation = formations[0]
    away_formation = formations[1]

    # Find the competition -- only using .find because there are multiple in the HTML and we only need one, then take only the text
    competition = soup.find("a", {"href": "/en/comps/10/2022-2023/2022-2023-Championship-Stats"}).text

    # Extract the match date and add to df, this one is a little tricky as it's hidden within the span tag
    span_venuetime = soup.find("span", class_="venuetime")
    match_date = span_venuetime.get("data-venue-date") if span_venuetime else None

    #Get location, 4th element in that list
    location = soup.find("div", {"class": "scorebox_meta"}).find_all("small")[3].text
    
    all_data.append(data)

In [17]:
all_data

[['46d9048f',
  '11',
  'Josh Cullen',
  'Burnley',
  '0.07',
  '0.02',
  'Saved',
  '21',
  'Left Foot',
  '',
  'Ian Maatsen',
  'Pass (Live)',
  'Josh Cullen',
  'Pass (Live)'],
 ['46d9048f',
  '14',
  'Ashley Barnes',
  'Burnley',
  '0.06',
  '0.34',
  'Saved',
  '14',
  'Right Foot',
  '',
  'Dara Costelloe',
  'Pass (Live)',
  'Josh Brownhill',
  'Pass (Live)'],
 ['46d9048f',
  '18',
  'Ian Maatsen',
  'Burnley',
  '0.05',
  '0.28',
  'Goal',
  '15',
  'Right Foot',
  'Deflected',
  'Josh Brownhill',
  'Pass (Live)',
  'Ian Maatsen',
  'Pass (Live)'],
 ['46d9048f',
  '20',
  'Connor Roberts',
  'Burnley',
  '0.21',
  '0.88',
  'Saved',
  '6',
  'Right Foot',
  'Volley',
  'Josh Cullen',
  'Pass (Live)',
  'Connor Roberts',
  'Pass (Live)'],
 ['46d9048f',
  '24',
  'Josh Brownhill',
  'Burnley',
  '0.03',
  '0.20',
  'Saved',
  '18',
  'Right Foot',
  'Volley',
  'Dara Costelloe',
  'Fouled',
  '',
  ''],
 ['46d9048f',
  '38',
  'Josh Koroma',
  'Huddersfield',
  '0.04',
  '',
  '

In [11]:
#Flatten list one level to load into df
all_data = sum(all_data, [])

In [12]:
# Create a DataFrame from the scraped data
shots_df = pd.DataFrame(all_data, columns=headers)

# Add the team, away team, formation, and opponent formation information to the DataFrame
shots_df['Home_Team'] = home_team
shots_df['Away_Team'] = away_team
shots_df['Home Formation'] = home_formation
shots_df['Away Formation'] = away_formation
shots_df['is_home_shot'] = np.where(shots_df['Team'] == home_team, 1, 0)
shots_df['is_away_shot'] = np.where(shots_df['Team'] ==  away_team, 1, 0)
shots_df['location'] = location
shots_df['match_date'] = match_date
shots_df['competition'] = 'English Championship' #Can hard-code this since it will be the same throughout the df

In [14]:
shots_df

Unnamed: 0,Game_ID,Minute,Player,Team,xG,PSxG,Result,Distance,Body Part,Notes,...,SCA 2 Player,Event 2,Home_Team,Away_Team,Home Formation,Away Formation,is_home_shot,is_away_shot,location,match_date
0,46d9048f,11,Josh Cullen,Burnley,0.07,0.02,Saved,21,Left Foot,,...,Josh Cullen,Pass (Live),Millwall,Stoke City,3-4-3,3-5-2,0,0,"The Den, London",2022-07-30
1,46d9048f,14,Ashley Barnes,Burnley,0.06,0.34,Saved,14,Right Foot,,...,Josh Brownhill,Pass (Live),Millwall,Stoke City,3-4-3,3-5-2,0,0,"The Den, London",2022-07-30
2,46d9048f,18,Ian Maatsen,Burnley,0.05,0.28,Goal,15,Right Foot,Deflected,...,Ian Maatsen,Pass (Live),Millwall,Stoke City,3-4-3,3-5-2,0,0,"The Den, London",2022-07-30
3,46d9048f,20,Connor Roberts,Burnley,0.21,0.88,Saved,6,Right Foot,Volley,...,Connor Roberts,Pass (Live),Millwall,Stoke City,3-4-3,3-5-2,0,0,"The Den, London",2022-07-30
4,46d9048f,24,Josh Brownhill,Burnley,0.03,0.20,Saved,18,Right Foot,Volley,...,,,Millwall,Stoke City,3-4-3,3-5-2,0,0,"The Den, London",2022-07-30
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81,8cdd19be,65,Charlie Cresswell,Millwall,0.08,0.04,Saved,9,Head,,...,,,Millwall,Stoke City,3-4-3,3-5-2,1,0,"The Den, London",2022-07-30
82,8cdd19be,65,Charlie Cresswell,Millwall,0.18,0.45,Goal,8,Right Foot,Volley,...,Charlie Cresswell,Shot,Millwall,Stoke City,3-4-3,3-5-2,1,0,"The Den, London",2022-07-30
83,8cdd19be,75,Jacob Brown,Stoke City,0.02,,Off Target,12,Right Foot,Volley,...,Lewis Baker,Pass (Dead),Millwall,Stoke City,3-4-3,3-5-2,0,1,"The Den, London",2022-07-30
84,8cdd19be,88,Benik Afobe,Millwall,0.09,,Blocked,13,Right Foot,,...,Zian Flemming,Pass (Live),Millwall,Stoke City,3-4-3,3-5-2,1,0,"The Den, London",2022-07-30


In [16]:
#home team, away team, and formations are all messed up -- need to fix those

# To Do

1. EDA
2. Start Modeling