In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

# Collect the link to every hall's detail page from the paginated search results.
all_hyperlinks = []

# The same URL template serves every results page; only `pageIndex` changes,
# so one loop covers both pages instead of hard-coding two separate links.
for page_num in range(1, 3):
    url = f"https://www.lse.ac.uk/student-life/accommodation/search-accommodation?collection=lse-accommodation&pageIndex={page_num}&sort=metaavailability"
    try:
        # Without a timeout a stalled connection would block this cell forever.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
    except requests.RequestException as e:
        # Only network/HTTP failures are tolerated; report them and move on to
        # the next page. Programming errors are no longer silently swallowed.
        print("An error occurred:", e)
        continue

    soup = BeautifulSoup(response.text, 'html.parser')
    for title in soup.find_all('h2', class_='card__title'):
        link = title.find('a')
        # A card heading without a usable link is skipped on its own, so it
        # cannot discard the other links found on the same page.
        if link is None or not link.get('href'):
            continue
        all_hyperlinks.append(link['href'])

print(all_hyperlinks)

['http://www.lse.ac.uk/student-life/accommodation/halls/college-hall/home.aspx', 'http://www.lse.ac.uk/student-life/accommodation/halls/international-hall/home.aspx', 'http://www.lse.ac.uk/student-life/accommodation/halls/butlers-wharf-residence/home.aspx', 'http://www.lse.ac.uk/student-life/accommodation/halls/bankside-house/home.aspx', 'http://www.lse.ac.uk/student-life/accommodation/halls/carr-saunders-hall/home.aspx', 'http://www.lse.ac.uk/student-life/accommodation/halls/connaught-hall/home.aspx', 'http://www.lse.ac.uk/student-life/accommodation/halls/high-holborn-residence/home.aspx', 'http://www.lse.ac.uk/student-life/accommodation/halls/urbanest-westminster-bridge/home.aspx', 'http://www.lse.ac.uk/student-life/accommodation/halls/lilian-knowles-house/home.aspx', 'http://www.lse.ac.uk/student-life/accommodation/halls/passfield-hall/home.aspx', 'http://www.lse.ac.uk/student-life/accommodation/halls/nutford-house/home.aspx', 'http://www.lse.ac.uk/student-life/accommodation/halls/r

In [2]:
# For every hall page, record the hall name and the list of room types it offers.
accommodation_info = []
for hyperlink in all_hyperlinks:
    # Without a timeout a stalled connection would block this cell forever.
    response = requests.get(hyperlink, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    accomodation_title = soup.find('h1', class_='heroBanner__title')
    roomtype = soup.find_all('h2', class_='roomlist__title')
    # Some headings carry stray surrounding whitespace (e.g. 'Single room '),
    # so normalise them here rather than in every later step.
    room_texts = [room.text.strip() for room in roomtype]
    accommodation_info.append({
        'hyperlink': hyperlink,
        'accomodation_title': accomodation_title.text.strip(),
        'room_type': room_texts
    })

for info in accommodation_info:
    print("Hyperlink:", info['hyperlink'])
    print("Name:", info['accomodation_title'])
    print("Room Type:", info['room_type'])

Hyperlink: http://www.lse.ac.uk/student-life/accommodation/halls/college-hall/home.aspx
Name: College Hall
Room Type: ['Single room ', 'Single en suite room ', 'Double en suite room ']
Hyperlink: http://www.lse.ac.uk/student-life/accommodation/halls/international-hall/home.aspx
Name: International Hall
Room Type: ['Single room', 'Single studio', 'Double studio']
Hyperlink: http://www.lse.ac.uk/student-life/accommodation/halls/butlers-wharf-residence/home.aspx
Name: Butler's Wharf Residence
Room Type: ['Single room', 'Twin room', 'Double room']
Hyperlink: http://www.lse.ac.uk/student-life/accommodation/halls/bankside-house/home.aspx
Name: Bankside House
Room Type: ['Single room', 'Single en suite room', 'Twin en suite']
Hyperlink: http://www.lse.ac.uk/student-life/accommodation/halls/carr-saunders-hall/home.aspx
Name: Carr-Saunders Hall
Room Type: ['Single room', 'Single with queen bed', 'Twin room', 'Twin en suite room']
Hyperlink: http://www.lse.ac.uk/student-life/accommodation/halls/

In [3]:
# One row per hall; `room_type` holds the list of room names scraped for it.
room_df = pd.DataFrame(accommodation_info)

# Unpack each list so every (hall, room type) pair gets its own row, then
# renumber the rows from 0.
room_df_exploded = (
    room_df
    .explode('room_type')
    .reset_index(drop=True)
)
room_df_exploded

Unnamed: 0,hyperlink,accomodation_title,room_type
0,http://www.lse.ac.uk/student-life/accommodatio...,College Hall,Single room
1,http://www.lse.ac.uk/student-life/accommodatio...,College Hall,Single en suite room
2,http://www.lse.ac.uk/student-life/accommodatio...,College Hall,Double en suite room
3,http://www.lse.ac.uk/student-life/accommodatio...,International Hall,Single room
4,http://www.lse.ac.uk/student-life/accommodatio...,International Hall,Single studio
5,http://www.lse.ac.uk/student-life/accommodatio...,International Hall,Double studio
6,http://www.lse.ac.uk/student-life/accommodatio...,Butler's Wharf Residence,Single room
7,http://www.lse.ac.uk/student-life/accommodatio...,Butler's Wharf Residence,Twin room
8,http://www.lse.ac.uk/student-life/accommodatio...,Butler's Wharf Residence,Double room
9,http://www.lse.ac.uk/student-life/accommodatio...,Bankside House,Single room


In [4]:
# For every hall page, collect the text of all `roomlist__position` paragraphs.
bathroom_info = []
for hyperlink in all_hyperlinks:
    # Without a timeout a stalled connection would block this cell forever.
    response = requests.get(hyperlink, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    bathroom = soup.find_all('p', class_="roomlist__position")
    bathroom_texts = [room.text for room in bathroom]
    bathroom_info.append({
        'hyperlink': hyperlink,
        'bathroom_types': bathroom_texts
    })

# Keep only the entries at even positions (0, 2, 4, ...); in the captured
# output these are the bathroom descriptions.
# NOTE(review): presumably the odd-positioned paragraphs describe something
# else about the room - confirm against the page markup.
result = []
for item in bathroom_info:
    result.append({
        'hyperlink': item['hyperlink'],
        'even_indexed_types': item['bathroom_types'][::2],
    })

for info in result:
    print("Hyperlink:", info['hyperlink'])
    print("Bathroom Type:", info['even_indexed_types'])

Hyperlink: http://www.lse.ac.uk/student-life/accommodation/halls/college-hall/home.aspx
Bathroom Type: ['Shared bathroom', 'Private bathroom', 'Private bathroom']
Hyperlink: http://www.lse.ac.uk/student-life/accommodation/halls/international-hall/home.aspx
Bathroom Type: ['Shared bathroom', 'Private bathroom', 'Private bathroom']
Hyperlink: http://www.lse.ac.uk/student-life/accommodation/halls/butlers-wharf-residence/home.aspx
Bathroom Type: ['Shared bathroom', 'Shared bathroom', 'Shared bathroom']
Hyperlink: http://www.lse.ac.uk/student-life/accommodation/halls/bankside-house/home.aspx
Bathroom Type: ['Shared bathroom', 'Private bathroom', 'Private bathroom']
Hyperlink: http://www.lse.ac.uk/student-life/accommodation/halls/carr-saunders-hall/home.aspx
Bathroom Type: ['Shared bathroom', 'Shared bathroom', 'Shared bathroom', 'Private bathroom']
Hyperlink: http://www.lse.ac.uk/student-life/accommodation/halls/connaught-hall/home.aspx
Bathroom Type: ['Shared bathroom']
Hyperlink: http://w

In [5]:
# One row per hall; `even_indexed_types` holds that hall's list of bathroom labels.
bathroom_df = pd.DataFrame(result)

# Unpack each list into one row per bathroom label and renumber the rows from 0.
bathroom_df_exploded = (
    bathroom_df
    .explode('even_indexed_types')
    .reset_index(drop=True)
)
bathroom_df_exploded

Unnamed: 0,hyperlink,even_indexed_types
0,http://www.lse.ac.uk/student-life/accommodatio...,Shared bathroom
1,http://www.lse.ac.uk/student-life/accommodatio...,Private bathroom
2,http://www.lse.ac.uk/student-life/accommodatio...,Private bathroom
3,http://www.lse.ac.uk/student-life/accommodatio...,Shared bathroom
4,http://www.lse.ac.uk/student-life/accommodatio...,Private bathroom
5,http://www.lse.ac.uk/student-life/accommodatio...,Private bathroom
6,http://www.lse.ac.uk/student-life/accommodatio...,Shared bathroom
7,http://www.lse.ac.uk/student-life/accommodatio...,Shared bathroom
8,http://www.lse.ac.uk/student-life/accommodatio...,Shared bathroom
9,http://www.lse.ac.uk/student-life/accommodatio...,Shared bathroom


In [6]:
# Pair each room row with the bathroom row that has the same index value.
# The join is purely positional, so both frames must list rooms in the same order.
combined_df = room_df_exploded.merge(
    bathroom_df_exploded, how='left', left_index=True, right_index=True
)

# Both inputs have a `hyperlink` column, which the merge suffixes with _x/_y;
# neither copy is kept.
combined_df = combined_df.drop(columns=['hyperlink_x', 'hyperlink_y'])

# Trim the ' and kitchen' wording so only the bathroom part of the label remains.
bathroom_labels = combined_df['even_indexed_types'].str.replace(' and kitchen', '')
combined_df['even_indexed_types'] = bathroom_labels
combined_df

Unnamed: 0,accomodation_title,room_type,even_indexed_types
0,College Hall,Single room,Shared bathroom
1,College Hall,Single en suite room,Private bathroom
2,College Hall,Double en suite room,Private bathroom
3,International Hall,Single room,Shared bathroom
4,International Hall,Single studio,Private bathroom
5,International Hall,Double studio,Private bathroom
6,Butler's Wharf Residence,Single room,Shared bathroom
7,Butler's Wharf Residence,Twin room,Shared bathroom
8,Butler's Wharf Residence,Double room,Shared bathroom
9,Bankside House,Single room,Shared bathroom


In [7]:
# For every hall page, record the headline price figure of each room summary box.
price_info = []
for hyperlink in all_hyperlinks:
    # Without a timeout a stalled connection would block this cell forever.
    response = requests.get(hyperlink, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    accomodation_title = soup.find('h1', class_='heroBanner__title')
    room_divs = soup.find_all('div', class_='roomataGlance')
    for room_div in room_divs:
        figure_span = room_div.find(class_='roomataGlance__figure')
        # The raw text still contains the 'p/wk' suffix and, for price ranges,
        # embedded newlines and a '£'; it is cleaned after being tabulated.
        figure_text = figure_span.get_text(strip=True)
        price_info.append({'price': figure_text,
                           'accomodation_title': accomodation_title.text.strip()})

for info in price_info:
    print("Name:", info['accomodation_title'])
    print("Price:", info['price'])

Name: College Hall
Price: 289.73p/wk
Name: College Hall
Price: 332.43p/wk
Name: College Hall
Price: 392.63p/wk
Name: International Hall
Price: 266.28p/wk
Name: International Hall
Price: 294.98

-£301.28p/wk
Name: International Hall
Price: 321.93p/wk
Name: Butler's Wharf Residence
Price: 194.60

-£267.40p/wk
Name: Butler's Wharf Residence
Price: 127.40

-£153.65p/wk
Name: Butler's Wharf Residence
Price: 278.95p/wk
Name: Bankside House
Price: 259.70p/wk
Name: Bankside House
Price: 277.20

-£297.15p/wk
Name: Bankside House
Price: 167.65

-£186.20p/wk
Name: Carr-Saunders Hall
Price: 257.25p/wk
Name: Carr-Saunders Hall
Price: 259.35p/wk
Name: Carr-Saunders Hall
Price: 173.25p/wk
Name: Carr-Saunders Hall
Price: 185.50p/wk
Name: Connaught Hall
Price: 273.63p/wk
Name: High Holborn Residence
Price: 317.80p/wk
Name: High Holborn Residence
Price: 333.90p/wk
Name: High Holborn Residence
Price: 333.20p/wk
Name: High Holborn Residence
Price: 184.45p/wk
Name: urbanest Westminster Bridge
Price: 285.23

In [8]:
# Tabulate the scraped prices: one row per price entry, alongside its hall name.
price_df = pd.DataFrame(data=price_info)
price_df

Unnamed: 0,price,accomodation_title
0,289.73p/wk,College Hall
1,332.43p/wk,College Hall
2,392.63p/wk,College Hall
3,266.28p/wk,International Hall
4,294.98\n\n-£301.28p/wk,International Hall
5,321.93p/wk,International Hall
6,194.60\n\n-£267.40p/wk,Butler's Wharf Residence
7,127.40\n\n-£153.65p/wk,Butler's Wharf Residence
8,278.95p/wk,Butler's Wharf Residence
9,259.70p/wk,Bankside House


In [9]:
# Attach the price at the same row position to every room row.
combined_df1 = combined_df.merge(price_df, how='left', left_index=True, right_index=True)

# Reduce the price text to bare figures by removing, in this order, the
# per-week suffix, the line breaks inside ranges, and the pound sign.
price_clean = combined_df1['price']
for junk in ('p/wk', '\n\n', '£'):
    price_clean = price_clean.str.replace(junk, '')
combined_df1['price'] = price_clean

# Transposing turns columns into rows, so drop_duplicates removes any column
# whose values repeat an earlier column (here the second copy of the hall name).
combined_df1 = combined_df1.T.drop_duplicates().T
combined_df1

Unnamed: 0,accomodation_title_x,room_type,even_indexed_types,price
0,College Hall,Single room,Shared bathroom,289.73
1,College Hall,Single en suite room,Private bathroom,332.43
2,College Hall,Double en suite room,Private bathroom,392.63
3,International Hall,Single room,Shared bathroom,266.28
4,International Hall,Single studio,Private bathroom,294.98-301.28
5,International Hall,Double studio,Private bathroom,321.93
6,Butler's Wharf Residence,Single room,Shared bathroom,194.60-267.40
7,Butler's Wharf Residence,Twin room,Shared bathroom,127.40-153.65
8,Butler's Wharf Residence,Double room,Shared bathroom,278.95
9,Bankside House,Single room,Shared bathroom,259.70


In [10]:
# For every hall page, keep the text of each paragraph that mentions
# "Contract cost"; the cost, length and size are parsed out of it later.
contract_costs = []
for hyperlink in all_hyperlinks:
    # Without a timeout a stalled connection would block this cell forever.
    response = requests.get(hyperlink, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    accomodation_title = soup.find('h1', class_='heroBanner__title')
    paragraphs = soup.find_all('p')
    for p in paragraphs:
        # Extract the text once and reuse it for both the check and the value.
        paragraph_text = p.get_text()
        if "Contract cost" in paragraph_text:
            # NOTE(review): in the captured output the stored text still begins
            # with "Contract cost: £...", so this split on the literal
            # '&pound;' did not cut anything there. It is kept unchanged so
            # the stored text stays the same.
            contract_cost = paragraph_text.split('&pound;')[-1].strip()
            contract_costs.append({'cost': contract_cost,
                                   'accomodation_title': accomodation_title.text.strip()})

for info in contract_costs:
    print("Name:", info['accomodation_title'])
    print("Cost:", info['cost'])

Name: College Hall
Cost: Contract cost: £11,547.81Contract length: 40 weeksSize approx: 12m²
Name: College Hall
Cost: Contract cost: £13,249.71Contract length: 40 weeksSize approx: 14m²
Name: College Hall
Cost: Contract cost: £15,649.11Contract length: 40 weeksSize approx: 17m²
Name: International Hall
Cost: Contract cost: £10,613.16Contract length: 40 weeksSize approx: 8.95m²
Name: International Hall
Cost: Contract cost: £11,757.06 to £12,008.16Contract length: 40 weeksSize approx: 16.4m² to 18.9m²
Name: International Hall
Cost: Contract cost: £12,831.21Contract length: 40 weeksSize approx: 18.9m²
Name: Butler's Wharf Residence
Cost: Contract cost: £9,702.20 to £13,331.80Contract length: 50 weeksSize approx: 10m²
Name: Butler's Wharf Residence
Cost: Contract cost: £6,351.80 to £7,660.55Contract length: 50 weeksSize approx: 14m²
Name: Butler's Wharf Residence
Cost: Contract cost: £13,907.65Contract length: 50 weeksSize approx: 15m²
Name: Bankside House
Cost: Contract cost: £10,091.20Co

In [11]:
# Split the scraped "Contract cost: ...Contract length: ...Size approx: ..."
# text into three clean columns, then apply hand-entered corrections.
contract = pd.DataFrame(contract_costs)

# Each pattern captures the text between one label and the next.
contract['Contract cost'] = contract['cost'].str.extract(r'Contract cost(.*?)Contract')[0]
contract['Contract length'] = contract['cost'].str.extract(r'Contract length: (.*?)Size')[0]
contract['Size approx'] = contract['cost'].str.extract(r'Size approx: (.*)')[0]

# Remove punctuation, currency and unit markers and write ranges as "a - b".
# The cost capture starts with ": £", which leaves a leading space once those
# characters are removed, hence the final strip. regex=False makes every
# replacement a literal one.
contract['Contract cost'] = (
    contract['Contract cost']
    .str.replace(':', '', regex=False)
    .str.replace('£', '', regex=False)
    .str.replace('to', '-', regex=False)
    .str.strip()
)
contract['Size approx'] = (
    contract['Size approx']
    .str.replace('m²', '', regex=False)
    .str.replace('to', '-', regex=False)
    .str.replace('m2', '', regex=False)
    .str.replace('m', '', regex=False)
)
contract['Contract length'] = contract['Contract length'].str.replace(' weeks', '', regex=False)

contract = contract.drop(columns=['cost'])

# Hand-entered corrections, keyed by row label and then by column.
# NOTE(review): the row labels depend on the order in which the pages were
# scraped - re-check them whenever the site content changes.
manual_fixes = {
    21: {'Size approx': '8.5'},
    22: {'Size approx': '13.4'},
    23: {'Size approx': '25.3'},
    34: {'Contract cost': '9,628.80 - 10,254.40', 'Contract length': '39', 'Size approx': '10.5 - 11'},
    35: {'Contract length': '39', 'Size approx': '15.5 - 17.5'},
    36: {'Contract length': '39', 'Size approx': '17.5'},
    42: {'Contract cost': '11,076.30', 'Contract length': '40', 'Size approx': '12.1'},
    45: {'Contract cost': '13,210.65', 'Contract length': '40', 'Size approx': '22.5'},
}
for row_label, fixes in manual_fixes.items():
    for column, value in fixes.items():
        # .loc writes straight into the frame. The chained form
        # contract[column][row_label] = value may write to a temporary copy
        # and has no effect under pandas Copy-on-Write.
        contract.loc[row_label, column] = value

# Insert a hand-entered row at position 34. The corrections above use the
# labels from before this insertion.
new_row = {'accomodation_title': 'Nutford House', 'Contract cost': '7,309.80', 'Contract length': '40', 'Size approx': '6.4'}
index_location = 34
contract = pd.concat(
    [contract.iloc[:index_location], pd.DataFrame([new_row]), contract.iloc[index_location:]]
).reset_index(drop=True)

contract

Unnamed: 0,accomodation_title,Contract cost,Contract length,Size approx
0,College Hall,11547.81,40.0,12
1,College Hall,13249.71,40.0,14
2,College Hall,15649.11,40.0,17
3,International Hall,10613.16,40.0,8.95
4,International Hall,"11,757.06 - 12,008.16",40.0,16.4 - 18.9
5,International Hall,12831.21,40.0,18.9
6,Butler's Wharf Residence,"9,702.20 - 13,331.80",50.0,10
7,Butler's Wharf Residence,"6,351.80 - 7,660.55",50.0,14
8,Butler's Wharf Residence,13907.65,50.0,15
9,Bankside House,10091.20,39.0,12.5


In [12]:
# Attach the contract details at the same row position to every room row.
df = pd.merge(combined_df1, contract, how='left', left_index=True, right_index=True)
# Transposing turns columns into rows, so drop_duplicates removes any column
# whose values repeat an earlier column (here the repeated hall name).
df = df.T.drop_duplicates().T

# Append a second copy of rows 21-27 at the end of the table.
# NOTE(review): presumably these rooms are offered with two contract lengths
# and so need two rows each - confirm against the hall pages.
indices_to_duplicate = range(21, 28)
rows_to_duplicate = df.loc[indices_to_duplicate]
df = pd.concat([df, rows_to_duplicate], ignore_index=True)

# Hand-entered (contract length, contract cost) per row label.
# NOTE(review): labels 47-51 assume the appended copies start at label 47 -
# re-check whenever the scraped data changes.
contract_overrides = {
    21: ('39', '11,123.97 - 12,119.64'),
    22: ('39', '12,224.55 - 12,722.58'),
    # The original entry had a stray leading '£'; removed to match the other values.
    23: ('39', '8,856.51 - 9,368.58'),
    24: ('50', '21,001.50 - 22,916.00'),
    25: ('39', '7,727.13 - 9,417.03'),
    26: ('50', '13,141.84 - 15,114.19'),
    27: ('50', '15,114.19 - 16,773.44'),
    47: ('50', '14,412.50'),
    48: ('50', '14,345.00 - 17,345.00'),
    49: ('50', '11,808.50'),
    # FIXME: '9,914.59.47' has two decimal points and is not a valid amount.
    # The intended figure cannot be determined from this notebook, so the
    # value is left exactly as entered.
    51: ('50', '9,914.59.47 - 12,082.88'),
}
for row_label, (length, cost) in contract_overrides.items():
    # .loc writes straight into the frame. The chained form
    # df['col'][row_label] = value may write to a temporary copy and has no
    # effect under pandas Copy-on-Write.
    df.loc[row_label, 'Contract length'] = length
    df.loc[row_label, 'Contract cost'] = cost

# Discard the appended copies that received no override.
indices_to_drop = [50, 52, 53]
df = df.drop(index=indices_to_drop)

In [13]:
# Take the row at position 47 out of the table and re-insert it at position 22.
# drop(47) removes by index label, which is assumed to match position 47 here.
row_to_move = df.iloc[47]
df = df.drop(47)
head, tail = df.iloc[:22], df.iloc[22:]
df = pd.concat([head, pd.DataFrame([row_to_move]), tail], ignore_index=True)

In [14]:
# Take the row at position 48 out of the table and re-insert it at position 24.
# drop(48) removes by index label, which is assumed to match position 48 here.
row_to_move = df.iloc[48]
df = df.drop(48)
head, tail = df.iloc[:24], df.iloc[24:]
df = pd.concat([head, pd.DataFrame([row_to_move]), tail], ignore_index=True)

In [15]:
# Take the row at position 49 out of the table and re-insert it at position 26.
# drop(49) removes by index label, which is assumed to match position 49 here.
row_to_move = df.iloc[49]
df = df.drop(49)
head, tail = df.iloc[:26], df.iloc[26:]
df = pd.concat([head, pd.DataFrame([row_to_move]), tail], ignore_index=True)

In [16]:
# Take the row at position 50 out of the table and re-insert it at position 29.
# drop(50) removes by index label, which is assumed to match position 50 here.
row_to_move = df.iloc[50]
df = df.drop(50)
head, tail = df.iloc[:29], df.iloc[29:]
df = pd.concat([head, pd.DataFrame([row_to_move]), tail], ignore_index=True)

In [18]:
# Give every column its final, human-readable heading in a single rename.
df = df.rename(columns={
    'accomodation_title_x': 'Name',
    'room_type': 'Room Type',
    'even_indexed_types': 'Bathroom Type',
    'price': 'Price(£/week)',
    'Contract cost': 'Contract Cost(£)',
    'Contract length': 'Contract Length(week)',
    'Size approx': 'Size Approximation(m²)',
})
df

Unnamed: 0,Name,Room Type,Bathroom Type,Price(£/week),Contract Cost(£),Contract Length(week),Size Approximation(m²)
0,College Hall,Single room,Shared bathroom,289.73,11547.81,40,12
1,College Hall,Single en suite room,Private bathroom,332.43,13249.71,40,14
2,College Hall,Double en suite room,Private bathroom,392.63,15649.11,40,17
3,International Hall,Single room,Shared bathroom,266.28,10613.16,40,8.95
4,International Hall,Single studio,Private bathroom,294.98-301.28,"11,757.06 - 12,008.16",40,16.4 - 18.9
5,International Hall,Double studio,Private bathroom,321.93,12831.21,40,18.9
6,Butler's Wharf Residence,Single room,Shared bathroom,194.60-267.40,"9,702.20 - 13,331.80",50,10
7,Butler's Wharf Residence,Twin room,Shared bathroom,127.40-153.65,"6,351.80 - 7,660.55",50,14
8,Butler's Wharf Residence,Double room,Shared bathroom,278.95,13907.65,50,15
9,Bankside House,Single room,Shared bathroom,259.70,10091.20,39,12.5


In [19]:
# Write the finished table to disk, leaving out the row index.
df.to_csv(path_or_buf='contract_data.csv', index=False)