In [164]:
import requests
from bs4 import BeautifulSoup as soup
import numpy as np
import pandas as pd
import math

In [209]:
# Function to get the number of result pages and listings of a specific borough
def number_of_search_pages(url, results_per_page=25):
    """Return how many search pages and listings a borough search URL has.

    Parameters
    ----------
    url : str
        Search URL to request, e.g. one built by ``get_borough_url``.
    results_per_page : int, default 25
        Number of listings shown on each search page; it should match the
        ``page_size`` value used in ``url``.

    Returns
    -------
    pandas.Series
        Two values, ``(pages, house_num)``. ``(0, 0)`` is returned (after
        printing a message) when the request does not answer with status 200
        or when the listing count cannot be read from the page, so callers
        that unpack the result into two columns always get a usable Series.
    """
    page = requests.get(url)
    if page.status_code != 200:
        print('Url provided not valid')
        return pd.Series((0, 0))

    bsobj = soup(page.content, 'html5lib')
    total_html = bsobj.find('p', {'data-testid': 'total-results'})
    if total_html is None:
        print('Total results not found')
        return pd.Series((0, 0))

    # The first whitespace-separated token of the element text is the count;
    # drop thousands separators so values such as "2,869" still parse.
    tokens = total_html.get_text().split()
    count_text = tokens[0].replace(',', '') if tokens else ''
    if not count_text.isdigit():
        print('Total results not found')
        return pd.Series((0, 0))

    house_num = int(count_text)
    pages = math.ceil(house_num / results_per_page)
    return pd.Series((pages, house_num))

In [191]:
# Function to build the url for each borough
def get_borough_url(borough):
    """Build the first-page for-sale search URL for a London borough.

    The words of the borough name are joined with ``-`` for the path segment
    (original capitalisation kept) and, lower-cased, with ``%20`` for the
    ``q`` query parameter. Surrounding or repeated whitespace in ``borough``
    is ignored.

    Parameters
    ----------
    borough : str
        Borough name, e.g. ``'Camden'`` or ``'Kensington and Chelsea'``.

    Returns
    -------
    str
        Search URL with ``page_size=25`` and ending in ``pn=1``.

    Raises
    ------
    ValueError
        If ``borough`` contains no words (empty or whitespace only).
    """
    words = borough.split()
    if not words:
        raise ValueError('borough name must not be empty')

    path_name = '-'.join(words)
    query_name = '%20'.join(word.lower() for word in words)
    return ('https://www.zoopla.co.uk/for-sale/property/' + path_name
            + '-london-borough/?page_size=25&q=' + query_name
            + '&radius=0&results_sort=newest_listings&pn=1')

In [222]:
# Function to scrape links and other details from main search page of each house
def get_main_house_details(url_initial, pages, house_num):
    """Scrape listing links, prices and listing dates from every search page.

    Parameters
    ----------
    url_initial : str
        Search URL that ends with the page number (e.g. ``...&pn=1``); the
        trailing digits are replaced by each page number in turn.
    pages : int
        Number of search pages to visit (pages ``1`` to ``pages`` inclusive).
    house_num : int
        Not used by this function; kept so existing callers keep working.

    Returns
    -------
    tuple of (list, list, list)
        ``(link, price, listed)``: the ``href`` of every listing link, the
        text of every price paragraph (``None`` when a price container has no
        such paragraph) and the text of every date-published span.

    Notes
    -----
    The three lists are collected independently of each other, so they are
    only the same length when every listing exposes all three elements.
    Pages that do not answer with status 200 are reported and skipped.
    """
    # Strip the whole trailing page number, not just its last character, so
    # URLs ending in a multi-digit page (e.g. "pn=12") are handled too.
    url_base = url_initial.rstrip('0123456789')
    link = []
    price = []
    listed = []
    for page_number in range(1, pages + 1):
        page = requests.get(url_base + str(page_number))
        if page.status_code != 200:
            print('Page ' + str(page_number) + ' could not be retrieved')
            continue
        bsobj = soup(page.content, 'html5lib')

        link.extend(
            anchor.get('href')
            for anchor in bsobj.find_all('a', {'data-testid': 'listing-details-link'}, href=True)
        )

        for container in bsobj.find_all('div', {'class': 'css-1e28vvi-PriceContainer e2uk8e8'}):
            price_tag = container.find('p', {'class': 'css-6v9gpl-Text eczcs4p0'})
            # Keep one entry per price container even when the paragraph is missing.
            price.append(price_tag.text if price_tag is not None else None)

        listed.extend(
            span.text
            for span in bsobj.find_all('span', {'data-testid': 'date-published'})
        )
    return link, price, listed

In [192]:
# London borough names to search for; each one is turned into a search URL
# by get_borough_url.
london_borough = [
    'Camden',
    'Greenwich',
    'Hackney',
    'Hammersmith',
    'Islington',
    'Kensington and Chelsea',
    'Lambeth',
    'Lewisham',
    'Southwark',
    'Tower Hamlets',
    'Wandsworth',
    'Westminster',
    'Barking',
    'Barnet',
    'Bexley',
    'Brent',
    'Bromley',
    'Croydon',
    'Ealing',
    'Enfield',
    'Haringey',
    'Harrow',
    'Havering',
    'Hillingdon',
    'Hounslow',
    'Kingston upon Thames',
    'Merton',
    'Newham',
    'Redbridge',
    'Richmond upon Thames',
    'Sutton',
    'Waltham Forest',
]

In [210]:
# One row per borough: its name, its search URL, and the number of result
# pages / listings reported for that URL (one HTTP request per borough).
df_boroughs = pd.DataFrame({'borough_name': london_borough})
df_boroughs['borough_url'] = df_boroughs['borough_name'].apply(get_borough_url)
df_boroughs[['pages','house_num']] = df_boroughs['borough_url'].apply(number_of_search_pages)
df_boroughs

Unnamed: 0,borough_name,borough_url,pages,house_num
0,Camden,https://www.zoopla.co.uk/for-sale/property/Cam...,115,2869
1,Greenwich,https://www.zoopla.co.uk/for-sale/property/Gre...,23,574
2,Hackney,https://www.zoopla.co.uk/for-sale/property/Hac...,77,1921
3,Hammersmith,https://www.zoopla.co.uk/for-sale/property/Ham...,40,978
4,Islington,https://www.zoopla.co.uk/for-sale/property/Isl...,106,2631
5,Kensington and Chelsea,https://www.zoopla.co.uk/for-sale/property/Ken...,157,3915
6,Lambeth,https://www.zoopla.co.uk/for-sale/property/Lam...,137,3408
7,Lewisham,https://www.zoopla.co.uk/for-sale/property/Lew...,81,2015
8,Southwark,https://www.zoopla.co.uk/for-sale/property/Sou...,105,2613
9,Tower Hamlets,https://www.zoopla.co.uk/for-sale/property/Tow...,167,4171


In [223]:
# Scrape every search page of the first borough in df_boroughs (row 0, Camden).
Camden_links, Camden_price, Camdem_listed = get_main_house_details(
    df_boroughs.at[0, 'borough_url'],
    df_boroughs.at[0, 'pages'],
    df_boroughs.at[0, 'house_num'],
)

In [230]:
# Put the scraped Camden lists side by side and tag every row with the borough.
Camden = pd.DataFrame(
    {'Link': Camden_links, 'Price': Camden_price, 'Listed': Camdem_listed},
    columns=['Link', 'Price', 'Listed'],
).assign(Brough='Camden')
Camden

Unnamed: 0,Link,Price,Listed,Brough
0,/for-sale/details/55756717/,"£445,000",Listed on 21st Jun 2021,Camden
1,/for-sale/details/58963984/,"£525,000",Listed on 21st Jun 2021,Camden
2,/for-sale/details/58963688/,"£1,275,000",Listed on 21st Jun 2021,Camden
3,/for-sale/details/58963510/,"£555,000",Listed on 21st Jun 2021,Camden
4,/for-sale/details/58962200/,"£555,000",Listed on 21st Jun 2021,Camden
...,...,...,...,...
2864,/for-sale/details/40915563/,"£1,095,000",Listed on 3rd Jul 2016,Camden
2865,/new-homes/details/36669950/,"£1,500,000",Listed on 27th May 2016,Camden
2866,/for-sale/details/55400494/,"£950,000",Listed on 24th May 2016,Camden
2867,/for-sale/details/39748209/,"£731,000",Listed on 9th Mar 2016,Camden


In [220]:
# Scratch check: fetch the first Camden search page and list the text of every
# date-published span found on it.
url = 'https://www.zoopla.co.uk/for-sale/property/Camden-london-borough/?page_size=25&q=camden&radius=0&results_sort=newest_listings&pn=1'
page = requests.get(url)
bsobj = soup(page.content, 'html5lib')
prueba = [
    span.text
    for span in bsobj.findAll('span', {'data-testid': 'date-published'})
]
prueba

['Listed on 21st Jun 2021',
 'Listed on 21st Jun 2021',
 'Listed on 21st Jun 2021',
 'Listed on 21st Jun 2021',
 'Listed on 21st Jun 2021',
 'Listed on 21st Jun 2021',
 'Listed on 21st Jun 2021',
 'Listed on 21st Jun 2021',
 'Listed on 21st Jun 2021',
 'Listed on 21st Jun 2021',
 'Listed on 21st Jun 2021',
 'Listed on 21st Jun 2021',
 'Listed on 21st Jun 2021',
 'Listed on 21st Jun 2021',
 'Listed on 21st Jun 2021',
 'Listed on 21st Jun 2021',
 'Listed on 21st Jun 2021',
 'Listed on 21st Jun 2021',
 'Listed on 21st Jun 2021',
 'Listed on 21st Jun 2021',
 'Listed on 21st Jun 2021',
 'Listed on 20th Jun 2021',
 'Listed on 20th Jun 2021',
 'Listed on 19th Jun 2021',
 'Listed on 19th Jun 2021']

In [216]:
# Show the full search URL stored for the first borough (row 0).
df_boroughs.at[0, 'borough_url']

'https://www.zoopla.co.uk/for-sale/property/Camden-london-borough/?page_size=25&q=camden&radius=0&results_sort=newest_listings&pn=1'

In [139]:
# Scratch check: collect the href of every full-card link inside each matching
# content div of the page held in bsobj, then count them.
# NOTE(review): the list is named `price` but it stores link hrefs.
price = [
    anchor.get('href')
    for card in bsobj.findAll('div', {'class': 'css-wfndrn-StyledContent e2uk8e18'})
    for anchor in card.findAll('a', {'class': 'e2uk8e4 css-15tydk8-StyledLink-Link-FullCardLink e33dvwd0'}, href=True)
]

len(price)

25