In [None]:
!pip install icecream
!pip install pint
!pip install bs4

## This script was used to scrape the data 
for this [UFC Fighters' Stats dataset](https://www.kaggle.com/datasets/asaniczka/ufc-fighters-statistics).
This was initially a regular Python script made up of a bunch of modules.

I did my best to turn it into a single-page notebook on Kaggle.

In [None]:
import string
import datetime
import re
import os
import logging
import time
from typing import Optional, Union

import pint
from bs4 import BeautifulSoup
from icecream import ic
import requests

## Helper functions
These functions are common and reusable, so I'm not going to include them as part of the main script

In [None]:
def setup_logger(log_file_path: str) -> logging.Logger:
    """Set up the console logger and return the logger instance.

    The logger accepts DEBUG and above, while the console handler attached
    here only emits INFO and above. Calling this function again (for example
    when a notebook cell is re-run) reuses the console handler that is already
    attached instead of stacking another one, so each record is printed once.

    Args:
        log_file_path (str): The path of the log file. Kept for interface
            compatibility; this function attaches no file handler, so nothing
            is written to that path.

    Returns:
        logging.Logger: The configured logger instance.

    Example Usage:
        `LOGGER = setup_logger("/path/to/log/file.log")`

        `LOGGER = setup_logger(log_file_path)`
    """

    logger = logging.getLogger(__name__)
    logger.setLevel(logging.DEBUG)  # the handler below applies its own, stricter level

    # exact type check: subclasses such as FileHandler are not console handlers
    has_console_handler = any(
        type(handler) is logging.StreamHandler for handler in logger.handlers)

    if not has_console_handler:
        log_format = logging.Formatter(
            '%(asctime)s :   %(levelname)s   :   %(message)s')

        # init the console logger
        stream_handler = logging.StreamHandler()
        stream_handler.setLevel(logging.INFO)
        stream_handler.setFormatter(log_format)  # add the format
        logger.addHandler(stream_handler)

    return logger


def setup_basic_file_paths(project_name: str) -> tuple[str, str, str, str, str]:
    """Creates the project folder layout under the current working directory.

    A folder named after the project is created (if missing) together with
    its `data`, `temp` and `logs` sub-folders. The log file path is only
    computed; the file itself is not created here.

    Args:
        project_name (str): The name of the project.

    Returns:
        tuple[str, str, str, str, str]: The paths of the project folder,
        data folder, temp folder, log folder, and log file, in that order.

    Example Usage:
        `project_folder, data_folder, temp_folder, log_folder, log_file_path = setup_basic_file_paths("MyProject")`
    """
    # pylint: disable=import-outside-toplevel
    import os

    # project root lives directly under the current working directory
    project_folder = os.path.join(os.getcwd(), project_name)
    os.makedirs(project_folder, exist_ok=True)

    # create the three sub-folders in a fixed order: data, temp, logs
    created = []
    for sub_name in ('data', 'temp', 'logs'):
        sub_path = os.path.join(project_folder, sub_name)
        os.makedirs(sub_path, exist_ok=True)
        created.append(sub_path)
    data_folder, temp_folder, log_folder = created

    # the log file is named after the project and sits in the logs folder
    log_file_path = os.path.join(log_folder, f'{project_name}.log')

    return (project_folder, data_folder, temp_folder, log_folder, log_file_path)


def save_temp_file(
        temp_folder: str,
        file_name: str,
        content: str | set | list | dict,
        extionsion: str) -> None:
    """Saves the given content to a temporary file in the specified temp folder.

    Args:
        `temp_folder (str)`: The path to the temporary folder.
        `file_name (str)`: The name of the temporary file.
        `content (str | set | list | dict)`: The content to be written to the temporary file. 
            Lists,sets will be formatted with newlines
        `extension (str)`: The file extension of the temporary file.

    Returns:
        None

    Example Usage:
        `save_temp_file("/path/to/temp/folder", "example_file", "This is the file content", "txt")`
    """
    # pylint: disable=import-outside-toplevel
    import os

    # format the content to a string
    if isinstance(content, list):
        string_content = '\n'.join([str(item) for item in content])
    elif isinstance(content, set):
        string_content = '\n'.join([str(item) for item in content])
    elif isinstance(content, dict):
        # pylint: disable=import-outside-toplevel
        import json
        string_content = json.dumps(content)
    else:
        string_content = content

    # now save the temp file
    with open(os.path.join(temp_folder, f'{file_name}.{extionsion}'),
              'w', encoding='utf-8') as temp_file:
        temp_file.write(string_content)


def format_error(error: str) -> str:
    """Flattens an error into a single line of text.

    The value is converted with `str()` first, so exception objects are
    accepted as well as plain strings.

    Args:
        `error (str)`: The error to be formatted.

    Returns:
        str: The error text with every newline character removed.

    Example Usage:
        `formatted_error = format_error(error)`
    """
    # splitting on the newline and gluing the pieces back drops every '\n'
    return ''.join(str(error).split('\n'))


def basic_request(
        url: str,
        logger: Optional[Union[None, logging.Logger]] = None,
        logger_level_debug: Optional[bool] = False) -> str:
    """Makes a basic HTTP GET request to the given URL, retrying on failure.

    Up to 5 attempts are made, one second apart. An attempt counts as failed
    when the response status is not 200 or when `requests` itself raises
    (timeout, connection error, ...), so network errors go through the same
    retry path as bad status codes.

    Args:
        `url (str)`: The URL to make the request to.
        `logger (Optional:None, logging.Logger)`: The logger instance to log failed attempts.
                (default: None)
        `logger_level_debug (Optional:bool)`: Log failed attempts at debug level
                instead of warning level. (default: False)

    Returns:
        str: The content of the response if the request was successful.

    Raises:
        RuntimeError: If the request failed on all 5 attempts. The message
            describes the last failure.

    Example Usage:
        `response_content = basic_request("https://example.com", logger)`
    """

    max_attempts = 5
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/119.0'
    }
    last_failure = ''

    for attempt in range(1, max_attempts + 1):
        try:
            response = requests.get(url, headers=headers, timeout=5)
        except requests.RequestException as error:
            # network-level failure: remember it and retry like a bad status code
            last_failure = f'Request error: {format_error(error)}'
        else:
            if response.status_code == 200:
                return response.text
            last_failure = (f'Status code {response.status_code}, '
                            f'Response text: {format_error(response.text)}')

        if logger is not None:
            # honour the requested level: debug when asked for, otherwise warning
            log = logger.debug if logger_level_debug else logger.warning
            log('Failed to get request (attempt %d of %d). %s',
                attempt, max_attempts, last_failure)

        # pause before the next attempt; there is nothing to wait for after the last one
        if attempt < max_attempts:
            time.sleep(1)

    raise RuntimeError(f'No response from website. Last failure: {last_failure}')


def save_ndjson(data: dict, file_path: str) -> None:
    """Appends `data` as one JSON line to the ndjson file at `file_path`.

    The file is opened in append mode, so it is created when missing and
    earlier lines are kept.
    """
    # pylint: disable=import-outside-toplevel
    import json

    with open(file_path, mode='a', encoding='utf-8') as ndjson_file:
        serialized = json.dumps(data)
        ndjson_file.write(serialized + '\n')


## Dev options

In [None]:
RUN_ONLY_ONE = False  # stop after the first successfully loaded letter page and the first fighter
RUN_25_ITR = True # stop after the 25th fighter in the loop (test run)
IS_IC_DEBUG = False  # set to True to print the parsed bio fields with ic()

## Global variables

In [None]:
PROJECT_NAME = 'scrape_ufc_stats'  # name of the project folder created under the working directory
LOGGER = None  # module-level logger; assigned by executor() before the scraping functions run

This function extracts the fighter page links from the given HTML.

In [None]:
def extract_fighter_pagelinks_from_serp(html: str) -> set[str]:
    """Collects the unique links found in the rows of a fighter listing page.

    Args:
        `html (str)`: The HTML of one listing page.

    Returns:
        set[str]: The `href` values of every anchor inside a
        `tr.b-statistics__table-row` row. Anchors without an `href` (or with
        an empty one) are left out, so the set only ever holds usable URLs.
    """

    soup = BeautifulSoup(html, 'html.parser')

    # every anchor inside a statistics table row
    row_anchors = soup.select('tr.b-statistics__table-row a')

    # tag.get() returns None when the attribute is absent; drop those entries
    hrefs = (anchor.get('href') for anchor in row_anchors)
    return {href for href in hrefs if href}

## This function collects the links to all the fighter pages
Returns a set of URLs

In [None]:
def get_fighters() -> set[str]:
    """Collects the fighter page links from the listing page of every letter.

    One listing page per lowercase ASCII letter is requested. A letter whose
    page cannot be loaded is logged as a warning and skipped, so a single bad
    page does not abort the run. When `RUN_ONLY_ONE` is set, the loop stops
    after the first letter that was loaded successfully.

    Returns:
        set[str]: The unique fighter links found across all loaded pages.
    """
    LOGGER.info('Starting to get fighters from SERPS')

    all_links = set()

    # get the fighter pages for each letter in the alphabet
    for letter in string.ascii_lowercase:
        LOGGER.info('Extracting letter %s', letter)

        # get the html for the letter
        url = f'http://ufcstats.com/statistics/fighters?char={letter}&page=all'
        try:
            html = basic_request(url, LOGGER)
        except (RuntimeError, requests.RequestException) as error:
            # make the gap visible instead of dropping the letter silently
            LOGGER.warning('Skipping letter %s, could not load %s: %s',
                           letter, url, format_error(error))
            continue

        # now parse the html to extract the links to the fighters
        all_links.update(extract_fighter_pagelinks_from_serp(html))

        if RUN_ONLY_ONE:
            break

    LOGGER.info('Found %d unique links', len(all_links))

    return all_links

## Extracts the bio data from the given html

In [None]:
def _bio_field(fields: list, index: int) -> Optional[str]:
    """Returns the token at `index`, or None when it is missing or is the "--" marker."""
    if index >= len(fields):
        return None
    value = fields[index]
    return value if value and value != '--' else None


def _weight_to_kg(weight: str) -> Optional[float]:
    """Converts a weight given in pounds to kg (2 decimals); None when it is not in 'lbs'."""
    leading_digits = re.match(r'\s*(\d+)', weight)
    if leading_digits is None or 'lbs' not in weight:
        return None
    return round(int(leading_digits.group(1)) * 0.453592, 2)


def _height_to_cm(height: str) -> Optional[float]:
    """Converts a feet-and-inches height such as 5' 11" to cm (2 decimals), else None."""
    feet_and_inches = re.match(r'\s*(\d+)\D+(\d+)', height)
    if feet_and_inches is None:
        return None
    feet, inches = (int(part) for part in feet_and_inches.groups())
    # 1 inch = 2.54 cm, 1 foot = 12 inches
    return round((feet * 12 + inches) * 2.54, 2)


def _reach_to_cm(reach: str) -> Optional[float]:
    """Converts a reach given in inches to cm (2 decimals), else None."""
    inches = re.match(r'\s*(\d+(?:\.\d+)?)', reach)
    if inches is None:
        return None
    return round(float(inches.group(1)) * 2.54, 2)


def _dob_to_iso(dob: str) -> Optional[str]:
    """Converts a date such as 'Jul 19, 1987' to 'YYYY-MM-DD', else None."""
    try:
        return str(datetime.datetime.strptime(dob, '%b %d, %Y').date())
    except ValueError:
        return None


def extract_bio_data(soup: "BeautifulSoup", fighter_name: str) -> dict:
    """Extracts the physical/bio data of a fighter from the parsed fighter page.

    The text of the `.b-list__info-box_style_small-width` box is flattened
    into a list of tokens, and the values are read from the odd positions
    1, 3, 5, 7 and 9 (height, weight, reach, stance, date of birth). Each
    field is converted independently, so a value that cannot be parsed only
    turns that one field into None.

    NOTE(review): the positional layout assumes every label is followed by a
    value; a completely blank value would shift the later positions. Confirm
    against live pages.

    Args:
        `soup (BeautifulSoup)`: The parsed fighter page.
        `fighter_name (str)`: The fighter's name, used only in the warning log.

    Returns:
        dict: The keys `height_cm`, `weight_in_kg`, `reach_in_cm`, `stance`
        and `date_of_birth`. Missing or unparsable values are None; all are
        None when the box itself cannot be read (a warning is logged).
    """
    try:
        # convert the data card to a list and extract the data
        physical_data = soup.select_one(
            '.b-list__info-box.b-list__info-box_style_small-width')\
            .get_text(strip=True, separator='_').split('_')
        if IS_IC_DEBUG:
            ic(physical_data)

        height = _bio_field(physical_data, 1)
        weight = _bio_field(physical_data, 3)
        reach = _bio_field(physical_data, 5)
        stance = _bio_field(physical_data, 7)
        dob = _bio_field(physical_data, 9)

        height_cm = _height_to_cm(height) if height else None
        weight_in_kg = _weight_to_kg(weight) if weight else None
        reach_in_cm = _reach_to_cm(reach) if reach else None
        date_of_birth = _dob_to_iso(dob) if dob else None

    # pylint: disable=broad-except
    except Exception as error:
        LOGGER.warning('Exception for %s on extract_bio_data(): %s', fighter_name,
                       format_error(error))
        height_cm = weight_in_kg = reach_in_cm = stance = date_of_birth = None

    bio_data = {
        "height_cm": height_cm,
        "weight_in_kg": weight_in_kg,
        "reach_in_cm": reach_in_cm,
        "stance": stance,
        "date_of_birth": date_of_birth
    }

    return bio_data

## Extracts the career data from the given html

In [None]:
def extract_career_data(soup: "BeautifulSoup", fighter_name: str) -> dict:
    """Extracts the career statistics of a fighter from the parsed fighter page.

    The text of the `.b-list__info-box_style_middle-width` box is flattened
    into a list of tokens and each statistic is read from a fixed position.
    The values sit two tokens apart, starting at position 2, so the eighth
    statistic (submissions) is read from position 16.

    NOTE(review): position 16 follows the two-token stride of the seven
    statistics before it; confirm against a live page.

    Args:
        `soup (BeautifulSoup)`: The parsed fighter page.
        `fighter_name (str)`: The fighter's name, used only in the warning log.

    Returns:
        dict: One float per statistic (percent signs removed). If the box is
        missing or any value cannot be converted, a warning is logged and
        every statistic is None.
    """
    # (result key, position of the value in the token list)
    stat_positions = (
        ('significant_strikes_landed_per_minute', 2),
        ('significant_striking_accuracy', 4),
        ('significant_strikes_absorbed_per_minute', 6),
        ('significant_strike_defence', 8),
        ('average_takedowns_landed_per_15_minutes', 10),
        ('takedown_accuracy', 12),
        ('takedown_defense', 14),
        ('average_submissions_attempted_per_15_minutes', 16),
    )

    try:
        career_data = soup.select_one(
            '.b-list__info-box.b-list__info-box_style_middle-width')\
            .get_text(strip=True, separator='_').split('_')

        # stripping '%' is a no-op for the statistics that are not percentages
        career_dict = {
            key: float(career_data[position].replace('%', ''))
            for key, position in stat_positions
        }

    # pylint: disable=broad-except
    except Exception as error:
        LOGGER.warning('Exception for %s on extract_career_data(): %s', fighter_name,
                       format_error(error))
        career_dict = {key: None for key, _ in stat_positions}

    return career_dict

## Extracts fighter data from the given html

In [None]:
def extract_fighter_data(fighter_html: str) -> dict:
    """Builds the flat record of one fighter from the HTML of the fighter page.

    Name, nickname and the win/loss/draw record are read here; the bio and
    career fields come from `extract_bio_data()` and `extract_career_data()`.
    Anything that is missing or cannot be parsed is stored as None.

    Args:
        `fighter_html (str)`: The HTML of the fighter page.

    Returns:
        dict: The fighter record, in the key order name, nickname, wins,
        losses, draws, bio fields, career fields.
    """
    soup = BeautifulSoup(fighter_html, 'html.parser')

    # name: None when the title element is absent
    name_tag = soup.select_one('.b-content__title-highlight')
    fighter_name = name_tag.get_text(strip=True) if name_tag is not None else None

    # record: drop the text before the first space, keep the "W-L-D" token
    try:
        record_text = soup.select_one('.b-content__title-record').get_text(strip=True)
        after_label = record_text.split(' ', maxsplit=1)[-1].strip()
        tally = after_label.split(' ')[0].strip()
        win, loss, draw = (int(part) for part in tally.split('-'))
    except (AttributeError, ValueError):
        win = loss = draw = None

    # nickname: an empty string counts as "no nickname"
    nickname_tag = soup.select_one('.b-content__Nickname')
    if nickname_tag is None:
        nickname = None
    else:
        nickname = nickname_tag.get_text(strip=True) or None

    bio_data = extract_bio_data(soup, fighter_name)
    career_data = extract_career_data(soup, fighter_name)

    bio_keys = ("height_cm", "weight_in_kg", "reach_in_cm", "stance", "date_of_birth")
    career_keys = (
        'significant_strikes_landed_per_minute',
        'significant_striking_accuracy',
        'significant_strikes_absorbed_per_minute',
        'significant_strike_defence',
        'average_takedowns_landed_per_15_minutes',
        'takedown_accuracy',
        'takedown_defense',
        'average_submissions_attempted_per_15_minutes',
    )

    fighter_data = {
        "name": fighter_name,
        "nickname": nickname,
        "wins": win,
        "losses": loss,
        "draws": draw,
    }
    fighter_data.update({key: bio_data[key] for key in bio_keys})
    fighter_data.update({key: career_data[key] for key in career_keys})

    return fighter_data

## Loads the fighter page and handles extracting the fighter data

In [None]:
def get_n_extract_fighter_data(link: str) -> dict:
    """Downloads one fighter page and extracts the fighter record from it.

    Args:
        `link (str)`: The URL of the fighter page.

    Returns:
        dict: The record built by `extract_fighter_data()`.

    Raises:
        RuntimeError: Raised by `basic_request()` when the page cannot be
            loaded. It is left to propagate unchanged so the caller keeps the
            original error message.
    """
    # first get the page
    page = basic_request(link, LOGGER)

    # now parse the page
    return extract_fighter_data(page)

## This is the main handler for this script

In [None]:
def executor() -> None:
    """Runs the whole scrape: set up folders and logger, collect links, save fighters.

    Every fighter record is appended as one line to `fighter_data.ndjson` in
    the data folder. A fighter whose page cannot be loaded is logged as a
    warning and skipped. The loop stops early after the first fighter when
    `RUN_ONLY_ONE` is set, or after the 25th fighter when `RUN_25_ITR` is set;
    skipped fighters count towards that limit.
    """
    # pylint: disable=global-statement
    global LOGGER

    # setup folders
    _, data_folder, _, _, log_file_path = setup_basic_file_paths(
        PROJECT_NAME)

    # setup a logger
    LOGGER = setup_logger(log_file_path)

    fighter_links = get_fighters()
    total_fighters = len(fighter_links)

    # ndjson save file for all the fighters
    ndjson_file_path = os.path.join(data_folder, 'fighter_data.ndjson')

    # get data for each individual fighter; positions are 1-based for readable logs
    for position, fighter in enumerate(fighter_links, start=1):
        LOGGER.info('Processing %d out of %d. Fighter url: %s',
                    position, total_fighters, fighter)
        try:
            fighter_data = get_n_extract_fighter_data(fighter)
        except (RuntimeError, requests.RequestException) as error:
            # warning level so the skip is visible on the INFO-level console handler
            LOGGER.warning(
                'Skipping a fighter due to a requests error: %s (%s)',
                fighter, format_error(error))
        else:
            save_ndjson(fighter_data, ndjson_file_path)

        # dev limits apply whether the fighter was saved or skipped
        if RUN_ONLY_ONE or (RUN_25_ITR and position >= 25):
            break

    LOGGER.info('Finished the executor')

In [None]:
# Run the scraper; RUN_ONLY_ONE / RUN_25_ITR decide how many fighters are processed.
executor()

## This script only ran 25 iterations
That is because I already have the data extracted.

Feel free to use this script to learn how to create a scraper

<img src="https://i.giphy.com/5K7ngCtszoxxbaBieC.gif" style="margin:auto;width:40%;height:auto;">