# TOC

- [Libraries](#libraries)
- [Scraping STANOX codes](#scraping-stanox-codes)
   * [Scrape Railway Codes pages](#scrape-railway-codes-pages)
   * [Clean Stanox Mapping Dataframe](#clean-stanox-mapping-dataframe)
   * [Save Stanox Mapping Dictionary to .CSV](#save-stanox-mapping-dictionary-to-csv)
   * [Package code into functions](#package-code-into-functions)
- [Scraping TOC codes](#scraping-toc-codes)
   * [Scrape Railway Codes pages](#scrape-railway-codes-pages-1)
   * [Clean TOC Codes Mapping Dataframe](#clean-toc-codes-mapping-dataframe)
   * [Save TOC Codes Mapping to .CSV](#save-toc-codes-mapping-to-csv)
   * [Package code into functions](#package-code-into-functions-1)


<!-- TOC --><a name="libraries"></a>
# Libraries


In [5]:
import os
import string
import requests
import pandas as pd
from bs4 import BeautifulSoup


<!-- TOC --><a name="scraping-stanox-codes"></a>
# Scraping STANOX codes

<!-- TOC --><a name="scrape-railway-codes-pages"></a>
## Scrape Railway Codes pages

In [24]:
# initialize list of dfs to fill with each letter dataframe
list_dfs = []
for letter in list(string.ascii_lowercase):
    url = f"http://www.railwaycodes.org.uk/crs/crs{letter}.shtm"
    response = requests.get(url=url, headers={"Accept-Language":"en-US"})
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, "html.parser")
        # in each page select the table identified by the id "tablesort"
        table = soup.find('table', {'id': 'tablesort'})
        # initialize empty data and headers list
        data = []
        headers = []
        # extract header row
        header_row = table.find('tr')
        for th in header_row.find_all('th'):
            headers.append(th.text.strip())
        # extract rows
        for row in table.find_all('tr')[1:]:
            row_data = [td.text.strip() for td in row.find_all('td')]
            data.append(row_data)
        # create df
        df = pd.DataFrame(data, columns=headers)
        list_dfs.append(df)
    else:
        print(f"Letter {url} could not be scraped.")

stanox_mapping_df = pd.concat(list_dfs)


<!-- TOC --><a name="clean-stanox-mapping-dataframe"></a>
## Clean Stanox Mapping Dataframe

In [42]:
#drop rows where Stanox codes are null
null_mask = stanox_mapping_df['STANOX'].isnull()
stanox_mapping_df = stanox_mapping_df[~null_mask]
print(f"Null Values: Dropping {stanox_mapping_df[null_mask].shape[0]} rows")
#drop rows where Stanox codes are empty
empty_mask = stanox_mapping_df['STANOX'] == ""
stanox_mapping_df = stanox_mapping_df[~empty_mask]
print(f"Empty Values: Dropping {stanox_mapping_df[empty_mask].shape[0]} rows")
#drop redundant cols
stanox_mapping_df = stanox_mapping_df[['Location', 'STANOX']]


Null Values: Dropping 0 rows


  print(f"Empty Values: Dropping {stanox_mapping_df[empty_mask].shape[0]} rows")


Empty Values: Dropping 0 rows


<!-- TOC --><a name="save-stanox-mapping-dictionary-to-csv"></a>
## Save Stanox Mapping Dictionary to .CSV

In [47]:
stanox_mapping_df.to_csv("../raw_data/stanox_locations_mapping.csv")


<!-- TOC --><a name="package-code-into-functions"></a>
## Package code into functions

In [111]:
def scrape_stanox_codes():
    """
    Scrapes CRS, NLC, TIPLOC and STANOX Codes from http://www.railwaycodes.org.uk/.
    Returns:
        dataframe : dataframe containing the output scraped from all the letters.
    """
    # create a list of all letters
    letters = list(string.ascii_lowercase)
    # initialize list of dfs to fill with each letter dataframe
    list_dfs = []
    for letter in letters:
        # for each letter build a custom URL
        url = f"http://www.railwaycodes.org.uk/crs/crs{letter}.shtm"
        # send request
        response = requests.get(url=url, headers={"Accept-Language":"en-US"})
        # if the request is successful
        if response.status_code == 200:
            # parse html page
            soup = BeautifulSoup(response.content, "html.parser")
            # select the table identified by the id "tablesort" from the soup
            table = soup.find('table', {'id': 'tablesort'})
            # initialize empty data and headers list
            data = []
            headers = []
            # extract header row
            header_row = table.find('tr')
            for th in header_row.find_all('th'):
                headers.append(th.text.strip())
            # extract rows
            for row in table.find_all('tr')[1:]:
                row_data = [td.text.strip() for td in row.find_all('td')[:6]]
                data.append(row_data)
            # create df
            df = pd.DataFrame(data, columns=headers)
            # append df to list of dfs
            list_dfs.append(df)
        else:
            # print the status code if the request failed
            print(f"Letter {url} could not be scraped. Status Code: {response.status_code}")
    # return a dataframe with all the information scraped for each page
    return pd.concat(list_dfs)

def clean_stanox_mapping_df(df):
    """
    Cleans a dataframe obtained from scraping stanox codes.
    Args:
        df (dataframe) = DataFrame obtained by scraping stanox codes.

    Returns:
        dataframe : cleaned dataframe containing only the Location and STANOX columns.
    """
    #drop rows where Stanox codes are null
    null_mask = (df['STANOX'].isnull())
    df = df[~null_mask]
    #drop rows where Stanox codes are empty
    empty_mask = (df['STANOX'] == "")
    df = df[~empty_mask]
    # drop rows where Stanox codes are "-""
    invalid_mask = (df['STANOX'] == "-")
    df = df[~invalid_mask]
    #drop redundant cols
    df = df[['Location', 'STANOX']]
    return df

def create_stanox_location_mapping(clean = True, to_csv = True):
    """
    Creates a dataframe mapping STANOX codes and locations name.
    Args:
        clean (bool) = If true, rows with empty and null STANOX codes are dropped.
                        Only STANOX and Location columns are kept.
        to_csv (bool) = If true, the dataframe is exported in CSV to the ../raw_data folder.
                        If folder does not exists, it is created.
    Returns:
        dataframe : cleaned dataframe containing only the Location and STANOX columns.
    """
    # create a dataframe with all the data scraped from the http://www.railwaycodes.org.uk/
    df = scrape_stanox_codes()
    # if param clean is true, it removes empty, invalid and null STANOX codes. Keep only Location and Stanox Code column.
    if clean == True:
        df = clean_stanox_mapping_df(df)
    if to_csv == True:
    # if param csv is true, dataframe is also saved as a .csv in the ../raw_data folder.
        folder_path = "../raw_data"
        file_name = "stanox_locations_mapping.csv"
        # if raw_data folder does not exists, create it.
        if not os.path.exists(folder_path):
            os.makedirs(folder_path)
        df.to_csv(os.path.join(folder_path, file_name))
        print(f"{file_name} saved in {folder_path}")
    return df


<!-- TOC --><a name="scraping-toc-codes"></a>
# Scraping TOC codes

<!-- TOC --><a name="scrape-railway-codes-pages-1"></a>
## Scrape Railway Codes pages

In [6]:
url = "http://www.railwaycodes.org.uk/operators/toccodes.shtm"
response = requests.get(url=url, headers={"Accept-Language":"en-US"})
if response.status_code == 200:
    soup = BeautifulSoup(response.content, "html.parser")
    # in each page select the table identified by the id "tablesort"
    table = soup.find('table', {'id': 'tablesort'})
    # initialize empty data and headers list
    data = []
    headers = []
    # extract header row
    header_row = table.find('tr')
    for th in header_row.find_all('th'):
        headers.append(th.text.strip())
    # extract rows
    for row in table.find_all('tr')[1:]:
        row_data = [td.text.strip() for td in row.find_all('td')]
        data.append(row_data)
    # create df
    df = pd.DataFrame(data, columns=headers)
else:
    print(response.status_code)


<!-- TOC --><a name="clean-toc-codes-mapping-dataframe"></a>
## Clean TOC Codes Mapping Dataframe

In [19]:
# Keep only Code and Train Operator columns
df = df[['Code','Train operator']]
df['Code'].unique()

array(['AM', 'AN', 'AR', 'AW', 'ATW', 'CA', 'CC', 'CH', 'CS', 'CT', 'CX',
       'DC', 'EC', 'EM', 'EP', 'ES', 'EU', 'FC', 'GC', 'GE', 'GL', 'GM',
       'GN', 'GR', 'GW', 'GWR', 'GX', 'HB', 'HC', 'HT', 'HX', 'IL', 'LD',
       'LE', 'LF', 'LG', 'LM', 'LN', 'LO', 'LR', 'LS', 'LT', 'ME', 'ML',
       'MV', 'NL', 'NR', 'NS', 'NT', 'NW', 'NY', 'QC', 'RE', 'RT', 'SC',
       'SE', 'SJ', 'SN', 'SO', 'SP', 'SR', 'SS', 'SW', 'SX', 'TL', 'TP',
       'TS', 'TT', 'TW', 'VL', 'VT', 'WB', 'WC', 'WE', 'WM', 'WN', 'WR',
       'WS', 'WW', 'XC', 'XM', 'XP', 'XR', 'XS', 'XX', 'YG', 'ZZ', '#|'],
      dtype=object)

<!-- TOC --><a name="save-toc-codes-to-csv"></a>
## Save TOC Codes Mapping to .CSV


In [125]:
df.to_csv("../raw_data/toc_operators_mapping.csv")


<!-- TOC --><a name="package-code-into-functions-1"></a>
## Package code into functions

In [2]:
def create_toc_operator_mapping(clean = True, to_csv = True):
    """
    Creates a dataframe mapping STANOX codes and locations name.
    Args:
        clean (bool) = If true, only Code and Train operators columns are kept.
        to_csv (bool) = If true, the dataframe is exported in CSV to the ../raw_data folder.
                        If folder does not exists, it is created.
    Returns:
        dataframe : cleaned dataframe containing only the Code and Train operators columns.
    """
    url = "http://www.railwaycodes.org.uk/operators/toccodes.shtm"
    response = requests.get(url=url, headers={"Accept-Language":"en-US"})
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, "html.parser")
        # in each page select the table identified by the id "tablesort"
        table = soup.find('table', {'id': 'tablesort'})
        # initialize empty data and headers list
        data = []
        headers = []
        # extract header row
        header_row = table.find('tr')
        for th in header_row.find_all('th'):
            headers.append(th.text.strip())
        # extract rows
        for row in table.find_all('tr')[1:]:
            row_data = [td.text.strip() for td in row.find_all('td')]
            data.append(row_data)
        # create df
        df = pd.DataFrame(data, columns=headers)
    else:
        print(response.status_code)
    if clean == True:
        df = df[['Code','Train operator']]
    if to_csv:
    # if param csv is true, dataframe is also saved as a .csv in the ../raw_data folder.
        folder_path = "../raw_data"
        file_name = "toc_operators_mapping.csv"
        # if raw_data folder does not exists, create it.
        if not os.path.exists(folder_path):
            os.makedirs(folder_path)
        df.to_csv(os.path.join(folder_path, file_name))
        print(f"{file_name} saved in {folder_path}")
    return df
