# Get League Table and Stats

In [157]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
import numpy as np

In [158]:
# URL of the website
url = "https://fbref.com/en/comps/10/Championship-Stats"

# Send a GET request to the URL
response = requests.get(url)

# Check if the request was successful (status code 200)
    # Parse the HTML content of the page
soup = BeautifulSoup(response.text, "html.parser")

    # Find the table using its HTML class
table = soup.find("table", {"class": "stats_table"})

    # Extract table data into a list of lists
table_data = []
for row in table.find_all("tr"):
    row_data = [cell.text.strip() for cell in row.find_all(["th", "td"])]
    table_data.append(row_data)

    # Create a DataFrame from the list of lists
columns = table_data[0]  # Assuming the first row contains column headers
data = table_data[1:]
league_table_df = pd.DataFrame(data, columns=columns)

# Get Fixtures 

In [159]:
# URL of the website
url = "https://fbref.com/en/comps/10/schedule/Championship-Scores-and-Fixtures"

# Send a GET request to the URL
response = requests.get(url)

# Check if the request was successful (status code 200)
if response.status_code == 200:
    # Parse the HTML content of the page
    soup = BeautifulSoup(response.text, "html.parser")

    # Find the table containing fixtures
    table = soup.find("table", {"class": "stats_table"})

    # Extract data into a list of dictionaries
    fixtures_data = []
    for row in table.find_all("tr")[1:]:  # Skip the header row
        columns = row.find_all(["th", "td"])
        
         # Extracting specific columns based on the structure of the table
        fixture_type = columns[0].text.strip()
        weekday = columns[1].text.strip()
        date = columns[2].text.strip()
        time = columns[3].text.strip()
        home_team = columns[4].text.strip()
        home_xG = columns[5].text.strip()
        result = columns[6].text.strip()
        away_xG = columns[7].text.strip()
        away_team = columns[8].text.strip()
        attendance = columns[9].text.strip()
        stadium = columns[10].text.strip()
        referee = columns[11].text.strip()

        fixture_info = {
            "fixture_type": fixture_type,
            "weekday": weekday,
            "date": date,
            "time": time,
            "home_team": home_team,
            "home_xG": home_xG,
            "result": result,
            "away_xG": away_xG,
            "away_team": away_team,
            "attendance": attendance,
            "stadium": stadium,
            "referee": referee
        }

        fixtures_data.append(fixture_info)

    # Create a DataFrame from the list of dictionaries
    fixtures_table = pd.DataFrame(fixtures_data)

In [160]:
fixtures_table.head()
#need to fix from attendance on

Unnamed: 0,fixture_type,weekday,date,time,home_team,home_xG,result,away_xG,away_team,attendance,stadium,referee
0,1,Fri,2023-08-04,20:00,Sheffield Weds,0.5,1–2,1.4,Southampton,28558,Hillsborough Stadium,Robert Madley
1,1,Sat,2023-08-05,15:00,Bristol City,0.9,1–1,1.3,Preston,29359,Ashton Gate Stadium,David Webb
2,1,Sat,2023-08-05,15:00,Plymouth Argyle,2.4,3–1,2.0,Huddersfield,16446,Home Park,Matt Donohue
3,1,Sat,2023-08-05,15:00,Stoke City,2.4,4–1,0.9,Rotherham Utd,22601,Bet365 Stadium,John Busby
4,1,Sat,2023-08-05,15:00,Middlesbrough,0.8,0–1,1.2,Millwall,29359,Riverside Stadium,Gavin Ward


In [161]:
fixtures_table['is_game_complete'] = np.where(len(fixtures_table['result']) > 0, 1, 0)
#Checking to see if game is complete... if the result column is populated at all, it's been played

# Getting Game IDs

In [162]:
# URL of the website
url = "https://fbref.com/en/comps/10/schedule/Championship-Scores-and-Fixtures"

# Parse the HTML content of the page
soup = BeautifulSoup(response.text, 'html.parser')

# Find all <td> elements with data-stat="score"
score_cells = soup.find_all('td', {'data-stat': 'score'})

# Extract game IDs from the <a> tags within the score_cells, the third element is the game ID
game_ids = [cell.find('a')['href'].split('/')[3] for cell in score_cells if cell.find('a')]
# Then strip the white space in the IDs
game_ids = [x.strip(' ') for x in game_ids]

In [163]:
game_ids = game_ids[0:4]
#just use three to test

# Scraping Shot Data

In [164]:
all_data = []

for unique_id in game_ids:
    # URL of the page to scrape
    url = f'https://fbref.com/en/matches/{unique_id}/'

    # Get the game ID from the URL
    game_id = url.split("/")[-2]

    # Send a GET request to the URL
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "html.parser")

    # Find the Shots table on the page
    shots_table = soup.find("div", {"id": "switcher_shots"})

    # Create empty lists to store data and create the columns
    data = []
    headers = ['game_id', 'minute', 'player', 'team', 'xG', 'psxG', 'result', 'distance', 'body_part', 'notes', 'sca_1_player', 'event_1', 'sca_2_player', 'event_2']

    # Extract table rows
    rows = shots_table.find("tbody").find_all("tr")
    for row in rows:
        row_data = [game_id] + [cell.get_text().strip() for cell in row.find_all(["th", "td"])]
        data.append(row_data)

    # Find the home and away teams and formations
    team_headers = soup.find_all("th", {"colspan": "2"})
    teams = [header.text.strip().split(" (")[0] for header in team_headers if "(" in header.text.strip()]
    formations = [header.text.strip().split(" (")[1][:-1] for header in team_headers if "(" in header.text.strip()]

    # Extracting the home and away teams and formations
    home_team = teams[0]
    away_team = teams[1]
    home_formation = formations[0]
    away_formation = formations[1]

    # Find the location and match date
    location = soup.find("div", {"class": "scorebox_meta"}).find_all("small")[3].text
    match_date = soup.find("span", class_="venuetime").get("data-venue-date", None)

    # Append additional columns to headers
    headers.extend(['home_team', 'away_team', 'home_formation', 'away_formation', 'location', 'match_date'])

    # Append data to all_data with additional columns
    for row in data:
        row.extend([home_team, away_team, home_formation, away_formation, location, match_date])
        all_data.append(row)
        
# Create a DataFrame from the scraped data
shots_df = pd.DataFrame(all_data, columns=headers)

In [165]:
# Add 
shots_df['is_home_shot'] = np.where(shots_df['team'] == shots_df['home_team'], 1, 0)
shots_df['is_away_shot'] = np.where(shots_df['team'] ==  shots_df['away_team'], 1, 0)
shots_df['location'] = location
shots_df['match_date'] = match_date
shots_df['competition'] = 'English Championship'

In [166]:
# Define a function to calculate total score for each game
def calculate_total_score(group, shot):
    total_score = 0
    total_scores = []

    # Flag to indicate whether the current row is after a 'Goal'
    after_goal = False

    for idx, (result, is_shot) in enumerate(zip(group, shot)):
        # Check if the current row is after a 'Goal'
        if after_goal:
            # Increment the total score
            total_score += 1
            # Reset the flag
            after_goal = False

        # Check if the current result is 'Goal' and the corresponding shot is 1
        if result == 'Goal' and is_shot == 1:
            # Set the flag to True for the next iteration
            after_goal = True
        
        # Append the current total score
        total_scores.append(total_score)

    return total_scores

# Group by game_id and apply the function to calculate total score for home team
shots_df['home_score'] = shots_df.groupby('game_id')['result'].transform(lambda x: calculate_total_score(x, shots_df['is_home_shot']))

# Group by game_id and apply the function to calculate total score for away team
shots_df['away_score'] = shots_df.groupby('game_id')['result'].transform(lambda x: calculate_total_score(x, shots_df['is_away_shot']))


In [167]:
#Creating a nil nil column using numpy where and logical_and operators
shots_df['is_nil_nil'] = np.where(np.logical_and(shots_df['home_score'] == 0, shots_df['away_score'] == 0), 1, 0)

In [169]:
#Adding opposing team -- Whenever it's a home shot we want the away team, otherwise (away shot) we want the home team
shots_df['opposing_team'] = np.where(shots_df['is_home_shot'] == 1, shots_df['away_team'], shots_df['home_team'])

In [188]:
#Replace blank values with nulls
shots_df['xG'] = shots_df['xG'].replace('', np.nan)

#Then drop them
shots_df = shots_df.dropna(subset=['xG'])

In [192]:
shots_df['xG'] = shots_df['xG'].astype('float64') 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  shots_df['xG'] = shots_df['xG'].astype('float64')


# Adding xG while nil-nil to league_table_df

In [197]:
# Filter shots_df where is_nil_nil is 0
filtered_shots_df = shots_df[shots_df['is_nil_nil'] == 1]

# Calculate the sum of 'xG' for each 'team'
team_xG_sum = filtered_shots_df.groupby('team')['xG'].sum().reset_index()

# Create a dictionary to map team names to their respective summed 'xG' values
team_xG_dict = dict(zip(team_xG_sum['team'], team_xG_sum['xG']))

# Add 'nil_nil_xG' column to league_table_df by mapping team names to their summed 'xG' values
league_table_df['nil_nil_xG'] = league_table_df['Squad'].map(team_xG_dict)

# Fill NaN values with 0 if any team didn't have any 'xG' when is_nil_nil is 0
league_table_df['nil_nil_xG'].fillna(0, inplace=True)


In [200]:
team_xG_sum

Unnamed: 0,team,xG
0,Bristol City,0.88
1,Plymouth Argyle,0.75
2,Sheffield Weds,0.36
3,Southampton,0.07
4,Stoke City,0.2


# To Do

1a. Fix Stadium
1b. fix nil nil logic
1. EDA
2. Start Modeling