## Scraping for all hyperlinks

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

all_hyperlinks = []

for page_num in range(1, 3):  
    url = f"https://www.lse.ac.uk/student-life/accommodation/search-accommodation?collection=lse-accommodation&pageIndex={page_num}&sort=metaavailability"
    try:
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        accommodation_titles = soup.find_all('h2', class_='card__title')
        hyperlinks = []
        for title in accommodation_titles:
            hyperlink = title.find('a')['href']
            hyperlinks.append(hyperlink)
        all_hyperlinks.extend(hyperlinks)

    except Exception as e:
        print("An error occurred:", e)

print(all_hyperlinks)

['http://www.lse.ac.uk/student-life/accommodation/halls/college-hall/home.aspx', 'http://www.lse.ac.uk/student-life/accommodation/halls/international-hall/home.aspx', 'http://www.lse.ac.uk/student-life/accommodation/halls/butlers-wharf-residence/home.aspx', 'http://www.lse.ac.uk/student-life/accommodation/halls/bankside-house/home.aspx', 'http://www.lse.ac.uk/student-life/accommodation/halls/carr-saunders-hall/home.aspx', 'http://www.lse.ac.uk/student-life/accommodation/halls/connaught-hall/home.aspx', 'http://www.lse.ac.uk/student-life/accommodation/halls/high-holborn-residence/home.aspx', 'http://www.lse.ac.uk/student-life/accommodation/halls/urbanest-westminster-bridge/home.aspx', 'http://www.lse.ac.uk/student-life/accommodation/halls/lilian-knowles-house/home.aspx', 'http://www.lse.ac.uk/student-life/accommodation/halls/passfield-hall/home.aspx', 'http://www.lse.ac.uk/student-life/accommodation/halls/nutford-house/home.aspx', 'http://www.lse.ac.uk/student-life/accommodation/halls/r

## Scraping for Name, Address, Distance, Price Range, Travel Time and Closest Stations

In [2]:
basic_info = []
for hyperlink in all_hyperlinks:  

        response = requests.get(hyperlink)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        accomodation_title = soup.find('h1', class_='heroBanner__title')
        accomodation_address = soup.find('p', class_='heroBanner__address')
        distance = soup.find('div', class_='accommKeyDetails__dist')
        price = soup.find('p', class_='accommKeyDetails__price')

        
        basic_info.append({
            'hyperlink': hyperlink,
            'accomodation_title': accomodation_title.text.strip(),
            'accomodation_address': accomodation_address.text.strip(),
            'distance':distance.text.strip().split(': ')[1],
            'price_range': price.text.strip()
        })

for info in basic_info:
    print("Hyperlink:", info['hyperlink'])
    print("Accomodation Name:", info['accomodation_title'])
    print("Address:", info['accomodation_address'])
    print("Distance to Campus:", info['distance'])
    print("Price Range:", info['price_range'])
    print()

Hyperlink: http://www.lse.ac.uk/student-life/accommodation/halls/college-hall/home.aspx
Accomodation Name: College Hall
Address: College Hall, University of London, Malet Street, London, WC1E 7HZ
Distance to Campus: 1.2km
Price Range: £289-392p/wk

Hyperlink: http://www.lse.ac.uk/student-life/accommodation/halls/international-hall/home.aspx
Accomodation Name: International Hall
Address: International Hall, University of London, Lansdowne Terrace, London, WC1N 1AS
Distance to Campus: 1km
Price Range: £266-321p/wk

Hyperlink: http://www.lse.ac.uk/student-life/accommodation/halls/butlers-wharf-residence/home.aspx
Accomodation Name: Butler's Wharf Residence
Address: Butler's Wharf Residence, 11 Gainsford Street, London, SE1 2NE
Distance to Campus: 3.2km
Price Range: £127-278p/wk

Hyperlink: http://www.lse.ac.uk/student-life/accommodation/halls/bankside-house/home.aspx
Accomodation Name: Bankside House
Address: Bankside House, 24 Sumner Street, London, SE1 9JA
Distance to Campus: 1.5km
Pric

In [3]:
travel_time_info = []
for hyperlink in all_hyperlinks:  

        response = requests.get(hyperlink)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        parent_section = soup.find('article', class_="pageContent accommContent")
        all_paragraphs = parent_section.find_all('p')
        target_text_time = "Travel time to campus"
        for paragraph in all_paragraphs:
            if target_text_time in paragraph.text:
                travel_time_info.append({
                    'hyperlink': hyperlink,
                    'travel_time': paragraph.text.strip().split('campus')[1]
                })
for info in travel_time_info:
    print("Hyperlink:", info['hyperlink'])
    print("Travel Time:", info['travel_time'])
    print()

Hyperlink: http://www.lse.ac.uk/student-life/accommodation/halls/college-hall/home.aspx
Travel Time: On foot: 22 mins By bike: 8 minsBy public transport: 16 mins

Hyperlink: http://www.lse.ac.uk/student-life/accommodation/halls/international-hall/home.aspx
Travel Time: On foot: 19 mins By bike: 8 minsBy public transport: 13 mins

Hyperlink: http://www.lse.ac.uk/student-life/accommodation/halls/butlers-wharf-residence/home.aspx
Travel Time: On foot: 51 minsBy bike: 22 minsBy public transport: 34 mins

Hyperlink: http://www.lse.ac.uk/student-life/accommodation/halls/bankside-house/home.aspx
Travel Time: On foot: 27 minsBy bike: 13 minsBy public transport: 24 mins

Hyperlink: http://www.lse.ac.uk/student-life/accommodation/halls/carr-saunders-hall/home.aspx
Travel Time: On foot: 28 mins By bike: 11 minsBy public transport: 23 mins

Hyperlink: http://www.lse.ac.uk/student-life/accommodation/halls/connaught-hall/home.aspx
Travel Time: On foot: 21 minsBy bike: 8 minsBy public transport: 14 m

In [4]:
station_info = []
for hyperlink in all_hyperlinks:  

        response = requests.get(hyperlink)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        parent_section = soup.find('article', class_="pageContent accommContent")
        all_paragraphs = parent_section.find_all('p')
        target_text_station = "Closest London Underground stations"
        for paragraph in all_paragraphs:
            if target_text_station in paragraph.text:
                station_info.append({
                    'hyperlink': hyperlink,
                    'closest_station': paragraph.text.strip().split('stations')[1]
                })
for info in station_info:
    print("Hyperlink:", info['hyperlink'])
    print("Closest Underground Station:", info['closest_station'])
    print()

Hyperlink: http://www.lse.ac.uk/student-life/accommodation/halls/college-hall/home.aspx
Closest Underground Station: Goodge Street, Euston Station, Russell Square and King’s Cross.

Hyperlink: http://www.lse.ac.uk/student-life/accommodation/halls/international-hall/home.aspx
Closest Underground Station: Russell Square, Holborn, Euston and King's Cross.

Hyperlink: http://www.lse.ac.uk/student-life/accommodation/halls/butlers-wharf-residence/home.aspx
Closest Underground Station: London Bridge, Tower Hill, Bermondsey

Hyperlink: http://www.lse.ac.uk/student-life/accommodation/halls/bankside-house/home.aspx
Closest Underground Station: London Bridge, Southwark

Hyperlink: http://www.lse.ac.uk/student-life/accommodation/halls/carr-saunders-hall/home.aspx
Closest Underground Station: Warren Street, Euston

Hyperlink: http://www.lse.ac.uk/student-life/accommodation/halls/connaught-hall/home.aspx
Closest Underground Station: Euston Station, Euston Square, King's Cross and Russell Square.

Hy

## Creating Dataframe 1 for Name, Address, Distance, Price Range, Travel Time and Closest Stations & Basic cleaning on Dataframe 1 before storing it into csv

In [5]:
basic_df = pd.DataFrame(basic_info)
basic_df

Unnamed: 0,hyperlink,accomodation_title,accomodation_address,distance,price_range
0,http://www.lse.ac.uk/student-life/accommodatio...,College Hall,"College Hall, University of London, Malet Stre...",1.2km,£289-392p/wk
1,http://www.lse.ac.uk/student-life/accommodatio...,International Hall,"International Hall, University of London, Lans...",1km,£266-321p/wk
2,http://www.lse.ac.uk/student-life/accommodatio...,Butler's Wharf Residence,"Butler's Wharf Residence, 11 Gainsford Street,...",3.2km,£127-278p/wk
3,http://www.lse.ac.uk/student-life/accommodatio...,Bankside House,"Bankside House, 24 Sumner Street, London, SE1 9JA",1.5km,£167-297p/wk
4,http://www.lse.ac.uk/student-life/accommodatio...,Carr-Saunders Hall,"Carr-Saunders Hall, 18-24 Fitzroy Street, Lond...",1.6km,£173-259p/wk
5,http://www.lse.ac.uk/student-life/accommodatio...,Connaught Hall,"Connaught Hall, University of London, 36-45 Ta...",1.3km,£273-273p/wk
6,http://www.lse.ac.uk/student-life/accommodatio...,High Holborn Residence,"High Holborn Residence, 178 High Holborn, Lond...",0.5km,£184-333p/wk
7,http://www.lse.ac.uk/student-life/accommodatio...,urbanest Westminster Bridge,"urbanest Westminster Bridge, 203 Westminster B...",1.5km,£227-458p/wk
8,http://www.lse.ac.uk/student-life/accommodatio...,Lilian Knowles House,"Lilian Knowles House, 50 Crispin Street, Londo...",2.9km,£198-336p/wk
9,http://www.lse.ac.uk/student-life/accommodatio...,Passfield Hall,"Passfield Hall, 1-7 Endsleigh Place, London, W...",1.5km,£129-287p/wk


In [6]:
travel_time_df = pd.DataFrame(travel_time_info)
travel_time_df

Unnamed: 0,hyperlink,travel_time
0,http://www.lse.ac.uk/student-life/accommodatio...,On foot: 22 mins By bike: 8 minsBy public tran...
1,http://www.lse.ac.uk/student-life/accommodatio...,On foot: 19 mins By bike: 8 minsBy public tran...
2,http://www.lse.ac.uk/student-life/accommodatio...,On foot: 51 minsBy bike: 22 minsBy public tran...
3,http://www.lse.ac.uk/student-life/accommodatio...,On foot: 27 minsBy bike: 13 minsBy public tran...
4,http://www.lse.ac.uk/student-life/accommodatio...,On foot: 28 mins By bike: 11 minsBy public tra...
5,http://www.lse.ac.uk/student-life/accommodatio...,On foot: 21 minsBy bike: 8 minsBy public trans...
6,http://www.lse.ac.uk/student-life/accommodatio...,On foot: 11 minsBy bike: 4 minsBy public trans...
7,http://www.lse.ac.uk/student-life/accommodatio...,On foot: 25 mins By bike: 9 minsBy public tran...
8,http://www.lse.ac.uk/student-life/accommodatio...,On foot: 45 minsBy bike: 19 minsBy public tran...
9,http://www.lse.ac.uk/student-life/accommodatio...,On foot: 23 mins By bike: 10 minsBy public tra...


In [7]:
station_df = pd.DataFrame(station_info)
station_df

Unnamed: 0,hyperlink,closest_station
0,http://www.lse.ac.uk/student-life/accommodatio...,"Goodge Street, Euston Station, Russell Square ..."
1,http://www.lse.ac.uk/student-life/accommodatio...,"Russell Square, Holborn, Euston and King's Cross."
2,http://www.lse.ac.uk/student-life/accommodatio...,"London Bridge, Tower Hill, Bermondsey"
3,http://www.lse.ac.uk/student-life/accommodatio...,"London Bridge, Southwark"
4,http://www.lse.ac.uk/student-life/accommodatio...,"Warren Street, Euston"
5,http://www.lse.ac.uk/student-life/accommodatio...,"Euston Station, Euston Square, King's Cross an..."
6,http://www.lse.ac.uk/student-life/accommodatio...,"Holborn, Covent Garden"
7,http://www.lse.ac.uk/student-life/accommodatio...,"Westminster, Waterloo, Lambeth North"
8,http://www.lse.ac.uk/student-life/accommodatio...,"Liverpool Street, Shoreditch High Street"
9,http://www.lse.ac.uk/student-life/accommodatio...,"Euston Square, Euston"


In [8]:
combined_df = pd.concat([basic_df, travel_time_df, station_df], axis=1)
combined_df = combined_df.loc[:, ~combined_df.columns.duplicated()]
combined_df

Unnamed: 0,hyperlink,accomodation_title,accomodation_address,distance,price_range,travel_time,closest_station
0,http://www.lse.ac.uk/student-life/accommodatio...,College Hall,"College Hall, University of London, Malet Stre...",1.2km,£289-392p/wk,On foot: 22 mins By bike: 8 minsBy public tran...,"Goodge Street, Euston Station, Russell Square ..."
1,http://www.lse.ac.uk/student-life/accommodatio...,International Hall,"International Hall, University of London, Lans...",1km,£266-321p/wk,On foot: 19 mins By bike: 8 minsBy public tran...,"Russell Square, Holborn, Euston and King's Cross."
2,http://www.lse.ac.uk/student-life/accommodatio...,Butler's Wharf Residence,"Butler's Wharf Residence, 11 Gainsford Street,...",3.2km,£127-278p/wk,On foot: 51 minsBy bike: 22 minsBy public tran...,"London Bridge, Tower Hill, Bermondsey"
3,http://www.lse.ac.uk/student-life/accommodatio...,Bankside House,"Bankside House, 24 Sumner Street, London, SE1 9JA",1.5km,£167-297p/wk,On foot: 27 minsBy bike: 13 minsBy public tran...,"London Bridge, Southwark"
4,http://www.lse.ac.uk/student-life/accommodatio...,Carr-Saunders Hall,"Carr-Saunders Hall, 18-24 Fitzroy Street, Lond...",1.6km,£173-259p/wk,On foot: 28 mins By bike: 11 minsBy public tra...,"Warren Street, Euston"
5,http://www.lse.ac.uk/student-life/accommodatio...,Connaught Hall,"Connaught Hall, University of London, 36-45 Ta...",1.3km,£273-273p/wk,On foot: 21 minsBy bike: 8 minsBy public trans...,"Euston Station, Euston Square, King's Cross an..."
6,http://www.lse.ac.uk/student-life/accommodatio...,High Holborn Residence,"High Holborn Residence, 178 High Holborn, Lond...",0.5km,£184-333p/wk,On foot: 11 minsBy bike: 4 minsBy public trans...,"Holborn, Covent Garden"
7,http://www.lse.ac.uk/student-life/accommodatio...,urbanest Westminster Bridge,"urbanest Westminster Bridge, 203 Westminster B...",1.5km,£227-458p/wk,On foot: 25 mins By bike: 9 minsBy public tran...,"Westminster, Waterloo, Lambeth North"
8,http://www.lse.ac.uk/student-life/accommodatio...,Lilian Knowles House,"Lilian Knowles House, 50 Crispin Street, Londo...",2.9km,£198-336p/wk,On foot: 45 minsBy bike: 19 minsBy public tran...,"Liverpool Street, Shoreditch High Street"
9,http://www.lse.ac.uk/student-life/accommodatio...,Passfield Hall,"Passfield Hall, 1-7 Endsleigh Place, London, W...",1.5km,£129-287p/wk,On foot: 23 mins By bike: 10 minsBy public tra...,"Euston Square, Euston"


In [9]:
combined_df[['On Foot(min)', 'By Bike(min)', 'By Public Transport(min)']] = combined_df['travel_time'].str.split('By', expand=True)
combined_df['On Foot(min)'] = combined_df['On Foot(min)'].str.split(': ', expand=True)[1].str.split(' mins', expand=True)[0]
combined_df['By Bike(min)'] = combined_df['By Bike(min)'].str.split(': ', expand=True)[1].str.split(' mins', expand=True)[0]
combined_df['By Public Transport(min)'] = combined_df['By Public Transport(min)'].str.split(': ', expand=True)[1].str.split(' mins', expand=True)[0]
combined_df.drop('travel_time', axis=1, inplace=True)

combined_df['distance'] = combined_df['distance'].str.split('km', expand=True)[0]
combined_df[['Station 1', 'Station 2', 'Station 3']] = combined_df['closest_station'].str.split(', ', expand=True)
combined_df[['Station 3', 'Station 4']] = combined_df['Station 3'].str.split(' and ', expand=True)
combined_df['Station 4'] = combined_df['Station 4'].str.split('.', expand=True)[0]
combined_df.drop('closest_station', axis=1, inplace=True)

combined_df['price_range'] = combined_df['price_range'].str.split('£|p', expand=True)[1]

combined_df['Station 1'] = combined_df['Station 1'].str.replace("King's Cross", "King's Cross/St Pancras")
combined_df['Station 2'] = combined_df['Station 2'].str.replace("King's Cross", "King's Cross/St Pancras")
combined_df['Station 3'] = combined_df['Station 3'].str.replace("King's Cross", "King's Cross/St Pancras")
combined_df['Station 4'] = combined_df['Station 4'].str.replace("King's Cross", "King's Cross/St Pancras")
combined_df['Station 4'] = combined_df['Station 4'].str.replace("King’s Cross", "King's Cross/St Pancras")
combined_df['Station 2'] = combined_df['Station 2'].str.replace("King’s Cross/St Pancras", "King's Cross/St Pancras")

combined_df.drop('hyperlink', axis=1, inplace=True)
combined_df

Unnamed: 0,accomodation_title,accomodation_address,distance,price_range,On Foot(min),By Bike(min),By Public Transport(min),Station 1,Station 2,Station 3,Station 4
0,College Hall,"College Hall, University of London, Malet Stre...",1.2,289-392,22,8,16,Goodge Street,Euston Station,Russell Square,King's Cross/St Pancras
1,International Hall,"International Hall, University of London, Lans...",1.0,266-321,19,8,13,Russell Square,Holborn,Euston,King's Cross/St Pancras
2,Butler's Wharf Residence,"Butler's Wharf Residence, 11 Gainsford Street,...",3.2,127-278,51,22,34,London Bridge,Tower Hill,Bermondsey,
3,Bankside House,"Bankside House, 24 Sumner Street, London, SE1 9JA",1.5,167-297,27,13,24,London Bridge,Southwark,,
4,Carr-Saunders Hall,"Carr-Saunders Hall, 18-24 Fitzroy Street, Lond...",1.6,173-259,28,11,23,Warren Street,Euston,,
5,Connaught Hall,"Connaught Hall, University of London, 36-45 Ta...",1.3,273-273,21,8,14,Euston Station,Euston Square,King's Cross/St Pancras,Russell Square
6,High Holborn Residence,"High Holborn Residence, 178 High Holborn, Lond...",0.5,184-333,11,4,10,Holborn,Covent Garden,,
7,urbanest Westminster Bridge,"urbanest Westminster Bridge, 203 Westminster B...",1.5,227-458,25,9,11,Westminster,Waterloo,Lambeth North,
8,Lilian Knowles House,"Lilian Knowles House, 50 Crispin Street, Londo...",2.9,198-336,45,19,25,Liverpool Street,Shoreditch High Street,,
9,Passfield Hall,"Passfield Hall, 1-7 Endsleigh Place, London, W...",1.5,129-287,23,10,15,Euston Square,Euston,,


In [10]:
combined_df.rename(columns={'accomodation_title': 'Name'}, inplace=True)
combined_df.rename(columns={'accomodation_address': 'Address'}, inplace=True)
combined_df.rename(columns={'distance': 'Distance to Campus(km)'}, inplace=True)
combined_df.rename(columns={'price_range': 'Price Range(£/week)'}, inplace=True)
clean_df = combined_df
pd.set_option('display.max_colwidth', None)
clean_df

Unnamed: 0,Name,Address,Distance to Campus(km),Price Range(£/week),On Foot(min),By Bike(min),By Public Transport(min),Station 1,Station 2,Station 3,Station 4
0,College Hall,"College Hall, University of London, Malet Street, London, WC1E 7HZ",1.2,289-392,22,8,16,Goodge Street,Euston Station,Russell Square,King's Cross/St Pancras
1,International Hall,"International Hall, University of London, Lansdowne Terrace, London, WC1N 1AS",1.0,266-321,19,8,13,Russell Square,Holborn,Euston,King's Cross/St Pancras
2,Butler's Wharf Residence,"Butler's Wharf Residence, 11 Gainsford Street, London, SE1 2NE",3.2,127-278,51,22,34,London Bridge,Tower Hill,Bermondsey,
3,Bankside House,"Bankside House, 24 Sumner Street, London, SE1 9JA",1.5,167-297,27,13,24,London Bridge,Southwark,,
4,Carr-Saunders Hall,"Carr-Saunders Hall, 18-24 Fitzroy Street, London, W1T 4BN",1.6,173-259,28,11,23,Warren Street,Euston,,
5,Connaught Hall,"Connaught Hall, University of London, 36-45 Tavistock Square, London, WC1H 9EX",1.3,273-273,21,8,14,Euston Station,Euston Square,King's Cross/St Pancras,Russell Square
6,High Holborn Residence,"High Holborn Residence, 178 High Holborn, London, WC1V 7AA",0.5,184-333,11,4,10,Holborn,Covent Garden,,
7,urbanest Westminster Bridge,"urbanest Westminster Bridge, 203 Westminster Bridge Road, London, SE1 7FR",1.5,227-458,25,9,11,Westminster,Waterloo,Lambeth North,
8,Lilian Knowles House,"Lilian Knowles House, 50 Crispin Street, London, E1 6HQ",2.9,198-336,45,19,25,Liverpool Street,Shoreditch High Street,,
9,Passfield Hall,"Passfield Hall, 1-7 Endsleigh Place, London, WC1H 0PW",1.5,129-287,23,10,15,Euston Square,Euston,,


In [11]:
clean_df['Distance to Campus(km)'] = pd.to_numeric(clean_df['Distance to Campus(km)'])
clean_df['On Foot(min)'] = pd.to_numeric(clean_df['On Foot(min)'])
clean_df['By Bike(min)'] = pd.to_numeric(clean_df['By Bike(min)'])
clean_df['By Public Transport(min)'] = pd.to_numeric(clean_df['By Public Transport(min)'])
clean_df.dtypes

Name                         object
Address                      object
Distance to Campus(km)      float64
Price Range(£/week)          object
On Foot(min)                  int64
By Bike(min)                  int64
By Public Transport(min)      int64
Station 1                    object
Station 2                    object
Station 3                    object
Station 4                    object
dtype: object

## Storing Dataframe 1 into "distance_data.csv"

In [12]:
clean_df.to_csv('data/distance_data.csv', index=False)

## Scraping for Room Type, Bathroom Type, Specific Prices and Room Size Approximations

In [13]:
room_info = []
for hyperlink in all_hyperlinks:  

        response = requests.get(hyperlink)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        accomodation_title = soup.find('h1', class_='heroBanner__title')
        roomtype = soup.find_all('h2', class_='roomlist__title')
        room_texts = [room.text for room in roomtype]
        room_info.append({
            'hyperlink': hyperlink,
            'accomodation_title': accomodation_title.text.strip(),
            'room_type': room_texts
        })
            
for info in room_info:
    print("Hyperlink:", info['hyperlink'])
    print("Name:", info['accomodation_title'])
    print("Room Type:", info['room_type'])

Hyperlink: http://www.lse.ac.uk/student-life/accommodation/halls/college-hall/home.aspx
Name: College Hall
Room Type: ['Single room ', 'Single en suite room ', 'Double en suite room ']
Hyperlink: http://www.lse.ac.uk/student-life/accommodation/halls/international-hall/home.aspx
Name: International Hall
Room Type: ['Single room', 'Single studio', 'Double studio']
Hyperlink: http://www.lse.ac.uk/student-life/accommodation/halls/butlers-wharf-residence/home.aspx
Name: Butler's Wharf Residence
Room Type: ['Single room', 'Twin room', 'Double room']
Hyperlink: http://www.lse.ac.uk/student-life/accommodation/halls/bankside-house/home.aspx
Name: Bankside House
Room Type: ['Single room', 'Single en suite room', 'Twin en suite']
Hyperlink: http://www.lse.ac.uk/student-life/accommodation/halls/carr-saunders-hall/home.aspx
Name: Carr-Saunders Hall
Room Type: ['Single room', 'Single with queen bed', 'Twin room', 'Twin en suite room']
Hyperlink: http://www.lse.ac.uk/student-life/accommodation/halls/

In [14]:
bathroom_info = []
for hyperlink in all_hyperlinks:  

        response = requests.get(hyperlink)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        bathroom = soup.find_all('p', class_="roomlist__position")
        bathroom_texts = [room.text for room in bathroom]
        bathroom_info.append({
            'hyperlink': hyperlink,
            'bathroom_types': bathroom_texts
        })

result = []
for item in bathroom_info:
    hyperlink = item['hyperlink']
    even_indexed_types = [item['bathroom_types'][i] for i in range(len(item['bathroom_types'])) if i % 2 == 0]
    result.append({'hyperlink': hyperlink, 'even_indexed_types': even_indexed_types})

for info in result:
    print("Hyperlink:", info['hyperlink'])
    print("Bathroom Type:", info['even_indexed_types'])

Hyperlink: http://www.lse.ac.uk/student-life/accommodation/halls/college-hall/home.aspx
Bathroom Type: ['Shared bathroom', 'Private bathroom', 'Private bathroom']
Hyperlink: http://www.lse.ac.uk/student-life/accommodation/halls/international-hall/home.aspx
Bathroom Type: ['Shared bathroom', 'Private bathroom', 'Private bathroom']
Hyperlink: http://www.lse.ac.uk/student-life/accommodation/halls/butlers-wharf-residence/home.aspx
Bathroom Type: ['Shared bathroom', 'Shared bathroom', 'Shared bathroom']
Hyperlink: http://www.lse.ac.uk/student-life/accommodation/halls/bankside-house/home.aspx
Bathroom Type: ['Shared bathroom', 'Private bathroom', 'Private bathroom']
Hyperlink: http://www.lse.ac.uk/student-life/accommodation/halls/carr-saunders-hall/home.aspx
Bathroom Type: ['Shared bathroom', 'Shared bathroom', 'Shared bathroom', 'Private bathroom']
Hyperlink: http://www.lse.ac.uk/student-life/accommodation/halls/connaught-hall/home.aspx
Bathroom Type: ['Shared bathroom']
Hyperlink: http://w

In [15]:
price_info = []
for hyperlink in all_hyperlinks:  

        response = requests.get(hyperlink)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        accomodation_title = soup.find('h1', class_='heroBanner__title')
        room_divs = soup.find_all('div', class_='roomataGlance')
        for room_div in room_divs:
            figure_span = room_div.find(class_='roomataGlance__figure')
            figure_text = figure_span.get_text(strip=True)
            price_info.append({'price': figure_text,
                              'accomodation_title': accomodation_title.text.strip()})
            
for info in price_info:
    print("Name:", info['accomodation_title'])
    print("Price:", info['price'])

Name: College Hall
Price: 289.73p/wk
Name: College Hall
Price: 332.43p/wk
Name: College Hall
Price: 392.63p/wk
Name: International Hall
Price: 266.28p/wk
Name: International Hall
Price: 294.98

-£301.28p/wk
Name: International Hall
Price: 321.93p/wk
Name: Butler's Wharf Residence
Price: 194.60

-£267.40p/wk
Name: Butler's Wharf Residence
Price: 127.40

-£153.65p/wk
Name: Butler's Wharf Residence
Price: 278.95p/wk
Name: Bankside House
Price: 259.70p/wk
Name: Bankside House
Price: 277.20

-£297.15p/wk
Name: Bankside House
Price: 167.65

-£186.20p/wk
Name: Carr-Saunders Hall
Price: 257.25p/wk
Name: Carr-Saunders Hall
Price: 259.35p/wk
Name: Carr-Saunders Hall
Price: 173.25p/wk
Name: Carr-Saunders Hall
Price: 185.50p/wk
Name: Connaught Hall
Price: 273.63p/wk
Name: High Holborn Residence
Price: 317.80p/wk
Name: High Holborn Residence
Price: 333.90p/wk
Name: High Holborn Residence
Price: 333.20p/wk
Name: High Holborn Residence
Price: 184.45p/wk
Name: urbanest Westminster Bridge
Price: 285.23

In [16]:
contract_costs=[]
for hyperlink in all_hyperlinks:  

        response = requests.get(hyperlink)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        accomodation_title = soup.find('h1', class_='heroBanner__title')
        paragraphs = soup.find_all('p')
        for p in paragraphs:
            if "Contract cost" in p.get_text():
                contract_cost = p.get_text().split('&pound;')[-1].strip()
                contract_costs.append({'cost': contract_cost,
                                       'accomodation_title': accomodation_title.text.strip()})
for info in contract_costs:
    print("Name:", info['accomodation_title'])
    print("Cost:", info['cost'])

Name: College Hall
Cost: Contract cost: £11,547.81Contract length: 40 weeksSize approx: 12m²
Name: College Hall
Cost: Contract cost: £13,249.71Contract length: 40 weeksSize approx: 14m²
Name: College Hall
Cost: Contract cost: £15,649.11Contract length: 40 weeksSize approx: 17m²
Name: International Hall
Cost: Contract cost: £10,613.16Contract length: 40 weeksSize approx: 8.95m²
Name: International Hall
Cost: Contract cost: £11,757.06 to £12,008.16Contract length: 40 weeksSize approx: 16.4m² to 18.9m²
Name: International Hall
Cost: Contract cost: £12,831.21Contract length: 40 weeksSize approx: 18.9m²
Name: Butler's Wharf Residence
Cost: Contract cost: £9,702.20 to £13,331.80Contract length: 50 weeksSize approx: 10m²
Name: Butler's Wharf Residence
Cost: Contract cost: £6,351.80 to £7,660.55Contract length: 50 weeksSize approx: 14m²
Name: Butler's Wharf Residence
Cost: Contract cost: £13,907.65Contract length: 50 weeksSize approx: 15m²
Name: Bankside House
Cost: Contract cost: £10,091.20Co

## Creating Dataframe 2 for Room Type, Bathroom Type, Specific Prices and Room Size Approximations & Basic cleaning on Dataframe 2 before storing it into csv

In [28]:
room_df = pd.DataFrame(room_info)
room_df_exploded = room_df.explode('room_type').reset_index(drop=True)
room_df_exploded['room_type'] = room_df_exploded['room_type'].str.replace('Single en suite room (Townhouse)', 'Single en suite room')
room_df_exploded['room_type'] = room_df_exploded['room_type'].str.replace('Single with queen bed', 'Single room with queen bed')
room_df_exploded['room_type'][11] = 'Twin en suite room'
room_df_exploded

Unnamed: 0,hyperlink,accomodation_title,room_type
0,http://www.lse.ac.uk/student-life/accommodation/halls/college-hall/home.aspx,College Hall,Single room
1,http://www.lse.ac.uk/student-life/accommodation/halls/college-hall/home.aspx,College Hall,Single en suite room
2,http://www.lse.ac.uk/student-life/accommodation/halls/college-hall/home.aspx,College Hall,Double en suite room
3,http://www.lse.ac.uk/student-life/accommodation/halls/international-hall/home.aspx,International Hall,Single room
4,http://www.lse.ac.uk/student-life/accommodation/halls/international-hall/home.aspx,International Hall,Single studio
5,http://www.lse.ac.uk/student-life/accommodation/halls/international-hall/home.aspx,International Hall,Double studio
6,http://www.lse.ac.uk/student-life/accommodation/halls/butlers-wharf-residence/home.aspx,Butler's Wharf Residence,Single room
7,http://www.lse.ac.uk/student-life/accommodation/halls/butlers-wharf-residence/home.aspx,Butler's Wharf Residence,Twin room
8,http://www.lse.ac.uk/student-life/accommodation/halls/butlers-wharf-residence/home.aspx,Butler's Wharf Residence,Double room
9,http://www.lse.ac.uk/student-life/accommodation/halls/bankside-house/home.aspx,Bankside House,Single room


In [29]:
bathroom_df = pd.DataFrame(result)
bathroom_df_exploded = bathroom_df.explode('even_indexed_types').reset_index(drop=True)
bathroom_df_exploded

Unnamed: 0,hyperlink,even_indexed_types
0,http://www.lse.ac.uk/student-life/accommodation/halls/college-hall/home.aspx,Shared bathroom
1,http://www.lse.ac.uk/student-life/accommodation/halls/college-hall/home.aspx,Private bathroom
2,http://www.lse.ac.uk/student-life/accommodation/halls/college-hall/home.aspx,Private bathroom
3,http://www.lse.ac.uk/student-life/accommodation/halls/international-hall/home.aspx,Shared bathroom
4,http://www.lse.ac.uk/student-life/accommodation/halls/international-hall/home.aspx,Private bathroom
5,http://www.lse.ac.uk/student-life/accommodation/halls/international-hall/home.aspx,Private bathroom
6,http://www.lse.ac.uk/student-life/accommodation/halls/butlers-wharf-residence/home.aspx,Shared bathroom
7,http://www.lse.ac.uk/student-life/accommodation/halls/butlers-wharf-residence/home.aspx,Shared bathroom
8,http://www.lse.ac.uk/student-life/accommodation/halls/butlers-wharf-residence/home.aspx,Shared bathroom
9,http://www.lse.ac.uk/student-life/accommodation/halls/bankside-house/home.aspx,Shared bathroom


In [30]:
combined_df = pd.merge(room_df_exploded, bathroom_df_exploded, how='left', left_index=True, right_index=True)
combined_df.drop(['hyperlink_x', 'hyperlink_y'], axis=1, inplace=True)
combined_df['even_indexed_types']=combined_df['even_indexed_types'].str.replace(' and kitchen','')
combined_df

Unnamed: 0,accomodation_title,room_type,even_indexed_types
0,College Hall,Single room,Shared bathroom
1,College Hall,Single en suite room,Private bathroom
2,College Hall,Double en suite room,Private bathroom
3,International Hall,Single room,Shared bathroom
4,International Hall,Single studio,Private bathroom
5,International Hall,Double studio,Private bathroom
6,Butler's Wharf Residence,Single room,Shared bathroom
7,Butler's Wharf Residence,Twin room,Shared bathroom
8,Butler's Wharf Residence,Double room,Shared bathroom
9,Bankside House,Single room,Shared bathroom


In [31]:
price_df = pd.DataFrame(price_info)
price_df

Unnamed: 0,price,accomodation_title
0,289.73p/wk,College Hall
1,332.43p/wk,College Hall
2,392.63p/wk,College Hall
3,266.28p/wk,International Hall
4,294.98\n\n-£301.28p/wk,International Hall
5,321.93p/wk,International Hall
6,194.60\n\n-£267.40p/wk,Butler's Wharf Residence
7,127.40\n\n-£153.65p/wk,Butler's Wharf Residence
8,278.95p/wk,Butler's Wharf Residence
9,259.70p/wk,Bankside House


In [32]:
combined_df1=pd.merge(combined_df, price_df, how='left', left_index=True, right_index=True)
combined_df1['price'] = combined_df1['price'].str.replace('p/wk', '')
combined_df1['price'] = combined_df1['price'].str.replace('\n\n', '').str.replace('£', '')
combined_df1 = combined_df1.T.drop_duplicates().T
combined_df1

Unnamed: 0,accomodation_title_x,room_type,even_indexed_types,price
0,College Hall,Single room,Shared bathroom,289.73
1,College Hall,Single en suite room,Private bathroom,332.43
2,College Hall,Double en suite room,Private bathroom,392.63
3,International Hall,Single room,Shared bathroom,266.28
4,International Hall,Single studio,Private bathroom,294.98-301.28
5,International Hall,Double studio,Private bathroom,321.93
6,Butler's Wharf Residence,Single room,Shared bathroom,194.60-267.40
7,Butler's Wharf Residence,Twin room,Shared bathroom,127.40-153.65
8,Butler's Wharf Residence,Double room,Shared bathroom,278.95
9,Bankside House,Single room,Shared bathroom,259.70


In [33]:
contract = pd.DataFrame(contract_costs)
data3 = contract['cost'].str.extract(r'Size approx: (.*)')

contract['Size approx'] = data3[0]

contract['Size approx'] = contract['Size approx'].str.replace('m²', '').str.replace('to', '-').str.replace('m2', '').str.replace('m', '')

contract.drop(columns=['cost'], inplace=True)

contract['Size approx'][45] = '22.5'
contract['Size approx'][42] = '12.1'

new_row = {'accomodation_title': 'Nutford House', 'Size approx': '6.4'}
index_location = 34
contract=pd.concat([contract.iloc[:index_location], pd.DataFrame([new_row]), contract.iloc[index_location:]]).reset_index(drop=True)

contract['Size approx'][35] = '10.5 - 11'
contract['Size approx'][36] = '15.5 - 17.5'
contract['Size approx'][37] = '17.5'

contract['Size approx'][21] = '8.5'
contract['Size approx'][22] = '13.4'
contract['Size approx'][23] = '25.3'

contract

Unnamed: 0,accomodation_title,Size approx
0,College Hall,12
1,College Hall,14
2,College Hall,17
3,International Hall,8.95
4,International Hall,16.4 - 18.9
5,International Hall,18.9
6,Butler's Wharf Residence,10
7,Butler's Wharf Residence,14
8,Butler's Wharf Residence,15
9,Bankside House,12.5


In [34]:
df=pd.merge(combined_df1, contract, how='left', left_index=True, right_index=True)
df = df.T.drop_duplicates().T
df.rename(columns={'accomodation_title_x': 'Name'}, inplace=True)
df.rename(columns={'room_type': 'Room Type'}, inplace=True)
df.rename(columns={'even_indexed_types': 'Bathroom Type'}, inplace=True)
df.rename(columns={'price': 'Price(£/week)'}, inplace=True)
df.rename(columns={'Size approx': 'Size Approximation(m²)'}, inplace=True)
df

Unnamed: 0,Name,Room Type,Bathroom Type,Price(£/week),Size Approximation(m²)
0,College Hall,Single room,Shared bathroom,289.73,12
1,College Hall,Single en suite room,Private bathroom,332.43,14
2,College Hall,Double en suite room,Private bathroom,392.63,17
3,International Hall,Single room,Shared bathroom,266.28,8.95
4,International Hall,Single studio,Private bathroom,294.98-301.28,16.4 - 18.9
5,International Hall,Double studio,Private bathroom,321.93,18.9
6,Butler's Wharf Residence,Single room,Shared bathroom,194.60-267.40,10
7,Butler's Wharf Residence,Twin room,Shared bathroom,127.40-153.65,14
8,Butler's Wharf Residence,Double room,Shared bathroom,278.95,15
9,Bankside House,Single room,Shared bathroom,259.70,12.5


## Storing Dataframe 2 into "contract_data.csv"

In [35]:
df.to_csv('data/contract_data.csv', index=False)

## Scraping for Total Bed Spaces and Facilities

In [25]:
from bs4 import BeautifulSoup
import requests

def extract_info(soup):
    room_types_title = soup.find('h3', class_='ataGlance__title--types')
    room_types_list = soup.find('ul', class_='ataGlance__list')
    room_types_items = room_types_list.find_all('li', class_='ataGlance__item')

    facilities_title = soup.find('h3', class_='ataGlance__title')
    facilities_list = soup.find_all('ul', class_='ataGlance__list')[1]  # Get the second ul tag
    facilities_items = facilities_list.find_all('li', class_='ataGlance__item')

    room_types_info = [(item.find('span', class_='ataGlance__name').text.strip(), item.find('span', class_='ataGlance__qty').text.strip()) for item in room_types_items]
    facilities_info = [item.find('span', class_='ataGlance__name').text.strip() for item in facilities_items]

    return room_types_info, facilities_info

accommodation_info = []

for hyperlink in all_hyperlinks:  

    try:
        response = requests.get(hyperlink)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        room_types, facilities = extract_info(soup)
        accommodation_info.append({
            'hyperlink': hyperlink,
            'room_types': room_types,
            'facilities': facilities
        })

    except Exception as e:
        print("An error occurred while processing", hyperlink, ":", e)

for info in accommodation_info:
    print("Hyperlink:", info['hyperlink'])
    print("Room Types:")
    for room_type, quantity in info['room_types']:
        print(room_type, quantity)
    print("Facilities:")
    for facility in info['facilities']:
        print(facility)
    print()

Hyperlink: http://www.lse.ac.uk/student-life/accommodation/halls/college-hall/home.aspx
Room Types:
Bed spaces in total 28
Double en suite 4
Single 8
Single en suite 16
Facilities:
24-hour staff cover
Accessible rooms
Bicycle storage
Catered
Common room
Communal TV
Lift access
Non-smoking
Secure entrance
Self-service laundry
WiFi

Hyperlink: http://www.lse.ac.uk/student-life/accommodation/halls/international-hall/home.aspx
Room Types:
Bed spaces in total 106
Double studio 15
Single 74
Single studio 17
Facilities:
24-hour staff cover
Accessible rooms
Bicycle storage
Catered
Common room
Communal TV
Non-smoking
Projector/Cinema room
Quiet study space
Secure entrance
Self-catered
Self-service laundry
WiFi

Hyperlink: http://www.lse.ac.uk/student-life/accommodation/halls/butlers-wharf-residence/home.aspx
Room Types:
Bed spaces in total 280
Double 2
Single 256
Twin (shared) 22
Facilities:
24-hour staff cover
Bicycle storage
Common room
Communal TV
Computer room
Lift access
Non-smoking
Printi

## Creating Dataframe 3 for Total Bed Spaces and Facilities & Basic cleaning on Dataframe 3 before storing it into csv

In [26]:
df = pd.DataFrame(accommodation_info)
df['Room Types'] = df['room_types'].apply(lambda x: ', '.join([room[0] for room in x if room[0] != 'Bed spaces in total']))
df['Total Bed Spaces'] = df['room_types'].apply(lambda x: [int(room[1]) for room in x if room[0] == 'Bed spaces in total'][0])
df.drop(['room_types'], axis=1, inplace=True)
df.drop(['Room Types'], axis=1, inplace=True)


unique_facilities = set(facility for sublist in df['facilities'] for facility in sublist)

for facility in unique_facilities:
    df[facility] = df['facilities'].apply(lambda x: 1 if facility in x else 0)
df.drop('facilities', axis=1, inplace=True)
df.rename(columns={'hyperlink': 'Hyperlink'}, inplace=True)
df

Unnamed: 0,Hyperlink,Total Bed Spaces,Computer room,Secure entrance,Non-smoking,24-hour staff cover,Self-service laundry,Printing facilities,Bicycle storage,Accessible rooms,Communal TV,Catered,Lift access,Quiet study space,WiFi,Self-catered,Car parking,Projector/Cinema room,Common room
0,http://www.lse.ac.uk/student-life/accommodation/halls/college-hall/home.aspx,28,0,1,1,1,1,0,1,1,1,1,1,0,1,0,0,0,1
1,http://www.lse.ac.uk/student-life/accommodation/halls/international-hall/home.aspx,106,0,1,1,1,1,0,1,1,1,1,0,1,1,1,0,1,1
2,http://www.lse.ac.uk/student-life/accommodation/halls/butlers-wharf-residence/home.aspx,280,1,1,1,1,1,1,1,0,1,0,1,0,1,1,0,0,1
3,http://www.lse.ac.uk/student-life/accommodation/halls/bankside-house/home.aspx,595,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,0,1
4,http://www.lse.ac.uk/student-life/accommodation/halls/carr-saunders-hall/home.aspx,155,1,1,1,1,1,1,1,0,1,1,1,0,1,0,0,1,1
5,http://www.lse.ac.uk/student-life/accommodation/halls/connaught-hall/home.aspx,26,0,1,1,1,1,0,1,0,1,1,1,1,1,0,0,0,1
6,http://www.lse.ac.uk/student-life/accommodation/halls/high-holborn-residence/home.aspx,446,1,1,1,1,1,1,0,1,1,0,1,0,1,1,0,0,1
7,http://www.lse.ac.uk/student-life/accommodation/halls/urbanest-westminster-bridge/home.aspx,669,1,1,1,1,1,1,1,1,1,0,1,1,1,1,0,1,1
8,http://www.lse.ac.uk/student-life/accommodation/halls/lilian-knowles-house/home.aspx,365,1,1,1,1,1,1,1,0,1,0,1,0,1,1,0,0,1
9,http://www.lse.ac.uk/student-life/accommodation/halls/passfield-hall/home.aspx,227,1,1,1,1,1,1,1,0,1,1,1,0,1,0,0,0,1


## Storing Dataframe 3 into "accommodation_info.csv"

In [27]:
df.to_csv('data/accommodation_info.csv', index=False)