## Experiment: scraping a single organisation

In [4]:
# import modules
import pandas as pd
from bs4 import BeautifulSoup
import requests
import urllib.request


In [5]:
# SRA number of the organisation to look up (DAC BEACHCROFT CLAIMS LIMITED).
sraNumber = '509760'

# Per-organisation values read from the register page: name, regulation
# label and the displayed number of people.
org_names, sra_regs, num_people_org = [], [], []

# Role labels of the listed people, split by whether the label is
# 'SRA-regulated solicitor' or anything else.
fee_earner, non_fee_earner = [], []

# Sizes of the two role lists above, recorded after each lookup.
no_fe, no_non_fe = [], []

In [6]:
with requests.Session() as s:
    # One session is reused for both requests below.
    # NOTE(review): the GetPeople URL carries no SRA number, so it presumably
    # relies on session state established by the first request - confirm.
    home_page = s.get(f'https://www.sra.org.uk/consumers/register/organisation/?sraNumber={sraNumber}')
    # Name the parser explicitly so the parse tree does not depend on which
    # optional parsers happen to be installed (and to avoid bs4's warning).
    home_soup = BeautifulSoup(home_page.content, 'html.parser')

    # get org name, whether it's sra regulated, and number of people
    org_name = str(home_soup.find('h1', {'class': 'reg__detail__h1'}).string)
    org_names.append(org_name)
    sra_reg = str(home_soup.find('span', {'class': 'GlossaryLink'}).string)
    sra_regs.append(sra_reg)
    # The slice drops the first and last character of the displayed count,
    # e.g. "(364)" -> "364".
    num_people = str(home_soup.find('div', {'id': 'headingPracticePeople'}).find('span').string)[1:-1]
    num_people_org.append(num_people)

    # check whether all the people are loaded in one page
    show_more = home_soup.find('div', {'class': 'lookup__search__result__listctrl__btn'})
    if show_more is None:
        # No "show more" control: the home page already holds the full list.
        people_soup = home_soup
    else:
        # Ask for all people at once instead of paging through the list.
        people_page = s.get(f'https://www.sra.org.uk/consumers/register/organisation/GetPeople/?numberOfResults={num_people}')
        people_soup = BeautifulSoup(people_page.content, 'html.parser')

    table = people_soup.find('ul', {'class': 'lookup__person__result__list'})
    table_rows = table.find_all('li')
    for tr in table_rows:
        # Always strip the label: previously only the multi-page branch did,
        # so a padded label on a single-page organisation was misclassified.
        role = str(tr.find('p').string).strip()
        if role == 'SRA-regulated solicitor':
            fee_earner.append(role)
        else:
            non_fee_earner.append(role)

    no_fe.append(len(fee_earner))
    no_non_fe.append(len(non_fee_earner))
    # No explicit s.close(): the with-statement closes the session on exit.

DAC BEACHCROFT CLAIMS LIMITED


In [7]:
# Map each output column name to the list filled in by the scraping cell
# (one entry per scraped organisation).
data = {}
data['organisation'] = org_names
data['sra_regulation'] = sra_regs
data['num_people'] = num_people_org
data['num_fee_earners'] = no_fe
data['num_non_fee_earners'] = no_non_fe

# Build the results table and display it as the cell output.
df = pd.DataFrame(data)
df

Unnamed: 0,organisation,sra_regulation,num_people,num_fee_earners,num_non_fee_earners
0,DAC BEACHCROFT CLAIMS LIMITED,SRA-regulated firm,364,362,2


## Full code version (by a peer)

In [1]:
class OrganisationNotFound(Exception):
    """Raised when a lookup by SRA number does not lead to an organisation."""

In [2]:
# Column stores used to build the results frame; every one starts out empty.
SRA_NUMBERS, ORG_NAMES, SRA_REGS = [], [], []
NUM_PEOPLE_ORG, NUM_FEE_EARNERS, NUM_NON_FEE_EARNERS = [], [], []

In [5]:
# Build the results frame from the module-level column lists.
data =  {
    'sra_numbers': SRA_NUMBERS,
    'organisation': ORG_NAMES,
    'sra_regulation': SRA_REGS,
    'num_people': NUM_PEOPLE_ORG,
    'num_fee_earners': NUM_FEE_EARNERS,
    'num_non_fee_earners': NUM_NON_FEE_EARNERS
}
# astype returns a new frame rather than converting in place, so the result
# must be kept; otherwise the dtypes shown below would not describe `df`.
df = pd.DataFrame(data).astype({'num_people': int, 'num_fee_earners': int, 'num_non_fee_earners': int})
df.dtypes

sra_numbers            float64
organisation           float64
sra_regulation         float64
num_people               int32
num_fee_earners          int32
num_non_fee_earners      int32
dtype: object

In [6]:
def count_solicitors(table_rows):
    """
    Count list entries by role label.

    Parameters
    ----------
    table_rows: iterable
        Entries whose ``find('p')`` result exposes the role label as ``.text``.

    Returns
    -------
    tuple of (int, int)
        Number of entries labelled 'SRA-regulated solicitor', followed by the
        number of entries with any other label.
    """
    # Read every label first (whitespace-trimmed), then tally.
    labels = [row.find('p').text.strip() for row in table_rows]
    solicitor_total = sum(1 for label in labels if label == 'SRA-regulated solicitor')
    return solicitor_total, len(labels) - solicitor_total

def get_page_data(soup, tag, class_=None, id_=None):
    """
    Return the string content of the first ``tag`` element matching a class or id.

    Parameters
    ----------
    soup: object with a ``find(tag, attrs)`` method (e.g. BeautifulSoup)
        Parsed document to search.
    tag: str
        Tag name to look for.
    class_: str, optional
        Value of the ``class`` attribute to match. Takes precedence over ``id_``.
    id_: str, optional
        Value of the ``id`` attribute to match; used when ``class_`` is None.

    Returns
    -------
    str
        ``str()`` of the matched element's ``.string``.

    Raises
    ------
    AttributeError
        If no element matches (``find`` returned None).
    """
    # Previously the id branch still passed `class_` (always None there) as the
    # value, so lookups by id never searched for the requested id.
    if class_ is not None:
        attrs = {'class': class_}
    else:
        attrs = {'id': id_}
    return str(soup.find(tag, attrs).string)

def get_data_by_sra(df, sra_number, timeout=30):
    """
    Get the data required to add a new row of data to the dataframe.

    Parameters
    ----------
    df: pandas.DataFrame
        Frame the row is destined for. Only ``df.columns`` is read, to label
        the returned Series; the frame itself is not modified.
    sra_number: str or int
        SRA Number to lookup
    timeout: float, optional
        Seconds to wait on each HTTP request before giving up (default 30).

    Returns
    -------
    pandas.Series
        ``[sra_number, org_name, sra_reg, num_people, fee_earners,
        non_fee_earners]`` indexed by ``df.columns``. ``num_people`` is the
        string taken from the page; the two counts are ints.

    Raises
    ------
    OrganisationNotFound
        If the returned page's title starts with
        "SRA | 500 Internal Server Error".
    """
    with requests.Session() as s:
        # The same session is used for the follow-up request below.
        # NOTE(review): the GetPeople URL carries no SRA number, so it
        # presumably relies on session state from this first request - confirm.
        home_page = s.get(
            f'https://www.sra.org.uk/consumers/register/organisation/?sraNumber={sra_number}',
            allow_redirects=True,
            timeout=timeout,
        )
        # Explicit parser: keeps results independent of which optional parsers
        # are installed and avoids bs4's "no parser specified" warning.
        home_soup = BeautifulSoup(home_page.content, 'html.parser')

        # A page without a <title> is not treated as the error page; guarding
        # here avoids an unrelated AttributeError on `None.get_text()`.
        page_title = home_soup.find('title')
        if page_title is not None and page_title.get_text().startswith("SRA | 500 Internal Server Error"):
            raise OrganisationNotFound(f"Organisation with SRA Number {sra_number} does not exist.")

        # get org name, whether it's sra regulated, and number of people
        org_name = get_page_data(home_soup, 'h1', class_='reg__detail__h1')
        sra_reg = get_page_data(home_soup, 'span', class_='GlossaryLink')
        # e.g.: returns 360 instead of (360)
        num_people = str(home_soup.find('div', {'id': 'headingPracticePeople'}).find('span').string)[1:-1]

        # check whether all the people are loaded in one page
        show_more = home_soup.find('div', {'class': 'lookup__search__result__listctrl__btn'})
        if show_more is None:
            people_soup = home_soup
        else:
            # Request the full list in one go instead of paging.
            people_page = s.get(
                f'https://www.sra.org.uk/consumers/register/organisation/GetPeople/?numberOfResults={num_people}',
                allow_redirects=True,
                timeout=timeout,
            )
            people_soup = BeautifulSoup(people_page.content, 'html.parser')

        table_rows = people_soup.find('ul', {'class': 'lookup__person__result__list'}).find_all('li')
        fee_earners, non_fee_earners = count_solicitors(table_rows)
        # No explicit s.close(): the with-statement closes the session on exit.

    return pd.Series([sra_number, org_name, sra_reg, num_people, fee_earners, non_fee_earners], index=df.columns)

In [None]:
# Look up SRA numbers 1..19 and keep one row per organisation that exists.
rows = []
for i in range(1, 20):
    print(i)
    try:
        rows.append(get_data_by_sra(df, i))
    except OrganisationNotFound:
        # The original `try` had no handler at all (a SyntaxError); skipping
        # numbers with no organisation is the only failure handled here.
        continue

# Add the collected rows in a single step: DataFrame.append was removed in
# pandas 2.0, and growing a frame row by row copies it on every iteration.
if rows:
    new_rows = pd.DataFrame(rows, columns=df.columns)
    # Skip the concat when `df` is still empty so its placeholder dtypes do not
    # influence the result.
    df = new_rows if df.empty else pd.concat([df, new_rows], ignore_index=True)
df