<a href="https://colab.research.google.com/github/chorltonm/fa-cup-upsets/blob/main/notebooks/source_apis/rivalries.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install fuzzywuzzy

import requests
import math
import json
import subprocess
import os
import re
import pandas as pd

from google.cloud import bigquery
from google.oauth2 import service_account
from google.colab import drive
from google.colab import userdata

from bs4 import BeautifulSoup
from fuzzywuzzy import fuzz


Collecting fuzzywuzzy
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl.metadata (4.9 kB)
Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Installing collected packages: fuzzywuzzy
Successfully installed fuzzywuzzy-0.18.0




In [2]:
# Mount Google Drive so the service-account key and output folder are reachable
drive.mount('/content/drive')

# Change the default output directory
os.chdir('/content/drive/MyDrive/birkbeck_msc-project/output_files')

# Google Service Account

# Load the JSON key from the mounted Google Drive. The context manager closes
# the file handle as soon as the key has been read instead of leaving it open.
with open('/content/drive/MyDrive/service_account.json', 'r') as key_file:
    key = json.load(key_file)

# Authenticate using the loaded key
credentials = service_account.Credentials.from_service_account_info(key)

# Set up the BigQuery client with the credentials for the project
client = bigquery.Client(credentials=credentials, project='birkbeck-msc-project-422917')

Mounted at /content/drive


In [3]:
# Fetch the webpage
url = "https://en.wikipedia.org/wiki/List_of_association_football_rivalries_in_the_United_Kingdom"
# The timeout stops the cell hanging indefinitely on a stalled connection, and
# raise_for_status() makes an HTTP error fail here rather than letting an
# error page be parsed as if it were the article.
response = requests.get(url, timeout=30)
response.raise_for_status()
html_content = response.content

# Parse the HTML
soup = BeautifulSoup(html_content, "html.parser")

# Find the headings that bracket the rivalries for England and Wales
rivalries_section_start = soup.find("h2", string="England and Wales")
rivalries_section_end = soup.find("h2", string="Northern Ireland")

# soup.find returns None when a heading is absent; fail with a clear message
# instead of slicing the page on the literal text "None".
if rivalries_section_start is None or rivalries_section_end is None:
    raise ValueError(
        "Could not locate the 'England and Wales' / 'Northern Ireland' headings; "
        "the page layout may have changed."
    )

# Extract the HTML content between the start and end markers.
# The document is serialised once and reused for both lookups and the slice.
page_html = str(soup)
start_index = page_html.index(str(rivalries_section_start))
end_index = page_html.index(str(rivalries_section_end))
rivalries_content = page_html[start_index:end_index]

# Parse the extracted HTML content
rivalries_soup = BeautifulSoup(rivalries_content, "html.parser")

# Build one record per "<team> vs. <team>" line found in the section's list items.
rivalries = []
for rivalry_item in rivalries_soup.find_all("li"):
    # Text before the first ":" is the rivalry name; only the text between the
    # first and second ":" is kept as the details. Items without a ":" get an
    # empty rivalry name.
    rivalry_text = rivalry_item.text.split(":")
    if len(rivalry_text) > 1:
        rivalry_name = rivalry_text[0].strip()
        rivalry_details = rivalry_text[1].strip()
    else:
        rivalry_name = ""
        rivalry_details = rivalry_text[0].strip()

    # str.split always returns at least one element, so the former
    # `if not rivalry_lines` fallback could never run and has been removed.
    # Lines without " vs. " are skipped, exactly as before.
    rivalry_lines = rivalry_details.split('\n')
    for line in rivalry_lines:
        if ' vs. ' not in line:
            continue
        # Only the first two teams on a line are used
        team_names = line.split(' vs. ')
        # Drop citation markers such as "[12]" from the team names
        team1_name = re.sub(r'\[\d+\]', '', team_names[0].strip())
        team2_name = re.sub(r'\[\d+\]', '', team_names[1].strip())
        rivalries.append({"rivalry_name": rivalry_name, "team1_name": team1_name, "team2_name": team2_name})
def remove_sq_brackets(rivalry_name):
    """Return ``rivalry_name`` with every ``[<digits>]`` marker deleted.

    Only brackets that contain one or more digits are removed; any other
    bracketed text is left untouched.
    """
    numeric_marker = re.compile(r'\[\d+\]')
    return numeric_marker.sub('', rivalry_name)

def remove_fc_afc(team_name):
    """Strip whitespace-prefixed "A.F.C." and "F.C." tokens from a team name.

    The "A.F.C." pass runs first and the "F.C." pass runs on its result.
    A token is removed only when a whitespace character directly precedes it,
    and that whitespace character is removed with it.
    """
    without_afc = re.sub(r'\sA\.F\.C\.', '', team_name)
    without_fc = re.sub(r'\sF\.C\.', '', without_afc)
    return without_fc

# Strip citation markers from the rivalry names and " F.C." / " A.F.C."
# tokens from the team names.
rivalries_cleaned = [
    {
        "rivalry_name": remove_sq_brackets(rivalry["rivalry_name"]),
        "team1_name": remove_fc_afc(rivalry["team1_name"]),
        "team2_name": remove_fc_afc(rivalry["team2_name"]),
    }
    for rivalry in rivalries
]

rivalries_df = pd.DataFrame(rivalries_cleaned)

# Manual updates for outliers: rows whose scraped team names need to be
# overwritten by hand. Each statement rewrites one column for every row
# matching the condition on the left.

rivalries_df.loc[rivalries_df['rivalry_name'] == 'Northamptonshire derby', 'team2_name'] = 'Rushden & Diamonds'
rivalries_df.loc[rivalries_df['rivalry_name'] == 'East London derby', 'team2_name'] = 'West Ham United'
rivalries_df.loc[rivalries_df['rivalry_name'] == 'Merseyside derby', 'team2_name'] = 'Liverpool'
rivalries_df.loc[rivalries_df['rivalry_name'] == 'Rules derby', 'team2_name'] = 'Sheffield'
rivalries_df.loc[rivalries_df['rivalry_name'] == 'Chesterfield-Sheffield rivalry', 'team2_name'] = 'Sheffield United'
rivalries_df.loc[rivalries_df['rivalry_name'] == 'Severnside derby', 'team1_name'] = 'Bristol City'
rivalries_df.loc[rivalries_df['rivalry_name'] == 'Severnside derby', 'team2_name'] = 'Cardiff City'
rivalries_df.loc[rivalries_df['rivalry_name'] == 'Brentford-Watford rivalry', 'team2_name'] = 'Watford'
rivalries_df.loc[rivalries_df['rivalry_name'] == 'QPR-Watford rivalry', 'team2_name'] = 'Watford'
rivalries_df.loc[rivalries_df['rivalry_name'] == 'Everton–Millwall rivalry', 'team2_name'] = 'Millwall'
# NOTE: an earlier assignment of 'Liversedge' to the West Yorkshire derby
# team1_name was removed; it was always overwritten by the line below, so the
# resulting data is unchanged.
rivalries_df.loc[rivalries_df['rivalry_name'] == 'West Yorkshire derby', 'team1_name'] = 'Leeds United'
rivalries_df.loc[rivalries_df['rivalry_name'] == 'West Yorkshire derby', 'team2_name'] = 'Bradford City'
rivalries_df.loc[rivalries_df['team1_name'] == 'Millwall–Nottingham Forest rivalry Millwall', 'team1_name'] = 'Millwall'
rivalries_df.loc[rivalries_df['rivalry_name'] == 'Gillingham-Millwall Rivalry', 'team2_name'] = 'Millwall'
rivalries_df.loc[rivalries_df['rivalry_name'] == 'Shropshire derby', 'team2_name'] = 'Telford United'
rivalries_df.loc[rivalries_df['team1_name'] == 'Wolverhampton Wanderers', 'team1_name'] = 'Wolverhampton'
rivalries_df.loc[rivalries_df['team2_name'] == 'Wolverhampton Wanderers', 'team2_name'] = 'Wolverhampton'

# Extra pairings added by hand for the multi-club derbies patched above.
new_rows = [
    {'rivalry_name': 'Northamptonshire derby', 'team1_name': 'Kettering', 'team2_name': 'Northampton Town'},
    {'rivalry_name': 'Northamptonshire derby', 'team1_name': 'Kettering', 'team2_name': 'Corby Town'},
    {'rivalry_name': 'Northamptonshire derby', 'team1_name': 'Rushden & Diamonds', 'team2_name': 'Northampton Town'},
    {'rivalry_name': 'Northamptonshire derby', 'team1_name': 'Rushden & Diamonds', 'team2_name': 'Corby Town'},
    {'rivalry_name': 'East London derby', 'team1_name': 'West Ham United', 'team2_name': 'Dagenham & Redbridge'},
    # Fixed: this name previously began with a stray tab character, which was
    # stored verbatim in the DataFrame.
    {'rivalry_name': 'East London derby', 'team1_name': 'Leyton Orient', 'team2_name': 'Dagenham & Redbridge'},
    {'rivalry_name': 'Merseyside derby', 'team1_name': 'Liverpool', 'team2_name': 'Tranmere Rovers'},
    {'rivalry_name': 'Merseyside derby', 'team1_name': 'Liverpool', 'team2_name': 'Southport'},
    {'rivalry_name': 'Merseyside derby', 'team1_name': 'Everton', 'team2_name': 'Tranmere Rovers'},
    {'rivalry_name': 'Merseyside derby', 'team1_name': 'Everton', 'team2_name': 'Southport'},
    {'rivalry_name': 'Chesterfield-Sheffield rivalry', 'team1_name': 'Chesterfield', 'team2_name': 'Sheffield Wednesday'},
    {'rivalry_name': 'Severnside derby', 'team1_name': 'Bristol City', 'team2_name': 'Bristol Rovers'},
    {'rivalry_name': 'Severnside derby', 'team1_name': 'Bristol City', 'team2_name': 'Newport County'},
    {'rivalry_name': 'Severnside derby', 'team1_name': 'Cardiff City', 'team2_name': 'Bristol Rovers'},
    {'rivalry_name': 'Severnside derby', 'team1_name': 'Cardiff City', 'team2_name': 'Newport County'},
    {'rivalry_name': 'West Yorkshire derby', 'team1_name': 'Leeds United', 'team2_name': 'Halifax Town'},
    {'rivalry_name': 'West Yorkshire derby', 'team1_name': 'Leeds United', 'team2_name': 'Huddersfield Town'},
    {'rivalry_name': 'West Yorkshire derby', 'team1_name': 'Bradford City', 'team2_name': 'Halifax Town'},
    {'rivalry_name': 'West Yorkshire derby', 'team1_name': 'Bradford City', 'team2_name': 'Huddersfield Town'},
    {'rivalry_name': 'West Yorkshire derby', 'team1_name': 'Huddersfield Town', 'team2_name': 'Halifax Town'}
]

# Append the hand-written rows after the scraped ones and renumber the index
new_rows_df = pd.DataFrame(new_rows)
rivalries_df = pd.concat([rivalries_df, new_rows_df], ignore_index=True)
display(rivalries_df)


Unnamed: 0,rivalry_name,team1_name,team2_name
0,A420 derby,Oxford United,Swindon Town
1,Aylesbury derby,Aylesbury United,Aylesbury Vale Dynamos
2,Bedford derby,Bedford Town,Real Bedford
3,Berkshire derby,Maidenhead United,Slough Town
4,Buckinghamshire derby,Milton Keynes Dons,Wycombe Wanderers
...,...,...,...
269,West Yorkshire derby,Leeds United,Halifax Town
270,West Yorkshire derby,Leeds United,Huddersfield Town
271,West Yorkshire derby,Bradford City,Halifax Town
272,West Yorkshire derby,Bradford City,Huddersfield Town


In [4]:
# Read the whole preparation_layer.teams table from BigQuery into a DataFrame.
# The query has no placeholders, so a plain string is used rather than an f-string.
teams_query = "SELECT * FROM preparation_layer.teams"
teams_results = client.query(teams_query)
teams_df = teams_results.to_dataframe()
# Function to perform fuzzy string matching
def fuzzy_match(rivalry_team_name, teams_df, threshold=80):
    """Return the ``team_id`` whose ``team_name`` best matches the given name.

    Names are compared case-insensitively with ``fuzz.ratio``. A candidate is
    accepted only if its score is strictly greater than ``threshold``; among
    accepted candidates the highest score wins, and on a tie the earliest row
    in ``teams_df`` is kept.

    Args:
        rivalry_team_name: Team name to look up.
        teams_df: DataFrame with ``team_id`` and ``team_name`` columns.
        threshold: Score a candidate must exceed to count as a match.
            Defaults to 80, the previously hard-coded value.

    Returns:
        The matching ``team_id``, or -1 when no candidate beats the threshold
        or ``rivalry_team_name`` is not a string.
    """
    # A non-string name (e.g. a missing value) cannot be lower-cased or scored
    if not isinstance(rivalry_team_name, str):
        return -1

    # Lower-case the query once instead of once per candidate row
    query_name = rivalry_team_name.lower()

    best_match_score = threshold
    best_match_team_id = -1
    for team_id, candidate_name in zip(teams_df['team_id'], teams_df['team_name']):
        # Skip rows with a missing or non-string team name rather than crashing
        if not isinstance(candidate_name, str):
            continue
        match_score = fuzz.ratio(query_name, candidate_name.lower())
        if match_score > best_match_score:
            best_match_score = match_score
            best_match_team_id = team_id
    return best_match_team_id

# Resolve both sides of every rivalry to a team id by fuzzy-matching the
# scraped name against teams_df; the result goes in a new "<side>_id" column.
for side in ("team1", "team2"):
    rivalries_df[f"{side}_id"] = rivalries_df[f"{side}_name"].apply(fuzzy_match, args=(teams_df,))


# Show the rivalries dataframe with the new id columns
display(rivalries_df)



Unnamed: 0,rivalry_name,team1_name,team2_name,team1_id,team2_id
0,A420 derby,Oxford United,Swindon Town,6940,148
1,Aylesbury derby,Aylesbury United,Aylesbury Vale Dynamos,-1,-1
2,Bedford derby,Bedford Town,Real Bedford,-1,-1
3,Berkshire derby,Maidenhead United,Slough Town,13598,-1
4,Buckinghamshire derby,Milton Keynes Dons,Wycombe Wanderers,8201,4559
...,...,...,...,...,...
269,West Yorkshire derby,Leeds United,Halifax Town,4034,24
270,West Yorkshire derby,Leeds United,Huddersfield Town,4034,4594
271,West Yorkshire derby,Bradford City,Halifax Town,4547,24
272,West Yorkshire derby,Bradford City,Huddersfield Town,4547,4594


In [5]:
# Load data to BigQuery

# Load rivalries to BigQuery
# Specify the target table
load_dataset_name = 'extract_layer'
load_table_name = 'web_scrape_wikipedia_rivalries'
load_table_ref = f"{load_dataset_name}.{load_table_name}"

# Load the rivalries DataFrame into the target table. WRITE_TRUNCATE replaces
# the table's existing contents rather than appending to them.
job_config = bigquery.LoadJobConfig(
        write_disposition="WRITE_TRUNCATE")
load_job = client.load_table_from_dataframe(
        rivalries_df, load_table_ref, job_config=job_config)

load_job.result()  # Wait for the job to complete

# The message reports a replace, matching the WRITE_TRUNCATE disposition
# (it previously claimed the rows were appended).
num_rows_inserted = load_job.output_rows
print(f"{num_rows_inserted} rows loaded into table {load_table_ref} successfully (existing contents replaced).")

274 rows appended to table extract_layer.web_scrape_wikipedia_rivalries successfully.
