**Web scraping template**

In [None]:
# libraries
import pandas as pd
import numpy as np
import time
from bs4 import BeautifulSoup
import requests
import re
from re import sub
from decimal import Decimal
import io
from datetime import datetime


# functions to clean data
def to_num(price):
    '''
    Convert a price-formatted string into a plain float.

    Every character that is neither a digit nor a decimal point is
    removed (e.g. "£ 10,000,000" becomes 10000000.0); the remainder is
    parsed with Decimal and then converted to float.
    '''
    digits_only = sub(r'[^\d.]', '', price)
    return float(Decimal(digits_only))

def is_skipped(price):
    '''
    Decide whether a scraped price label has to be skipped.

    1. Detect price labels that are not actual prices (e.g. "POA").
    2. Return True if the label must be skipped, False if it is a real
       price that to_num() can convert.

    A label counts as a real price when it consists only of decimal
    digits, '£', ',', '.' and whitespace, contains at least one digit and
    at most one decimal point. Empty labels are therefore skipped.
    '''
    has_digit = False
    for ch in price:
        # isdecimal() covers the same characters that the \d pattern in
        # to_num() keeps.
        if ch.isdecimal():
            has_digit = True
        elif ch not in '£,.' and not ch.isspace():
            return True
    # Without any digit, or with several decimal points, to_num() would
    # raise while parsing, so such labels are skipped as well.
    return not has_digit or price.count('.') > 1


# Base link to scrape ads across London. The 1-based page number is appended
# directly to this string, so it has to end where the site expects that number.
url = '{{ENTER_WEBSITE_URL_OF_SEARCH_REQUEST_HERE}}'

# Scraped ads keyed by a running ad number.
# NOTE: `map` and `id` shadow Python builtins; the names are kept because
# the transform cell further down reads `map`.
map = {}
id = 0

# define how many pages to scrape for "London"
max_pages = 4

# Seconds to wait for a response; without a timeout requests.get() can
# block indefinitely on an unresponsive server.
request_timeout = 30

start = time.time()

for p in range(max_pages):

    page_number = p + 1
    cur_url = url + str(page_number)

    print("Scraping page: %d" % page_number)

    html_text = requests.get(cur_url, timeout=request_timeout).text
    soup = BeautifulSoup(html_text, 'lxml')

    ads = soup.find_all('div', class_ = '{{HTML CLASS TO TARGET AD}}')

    # Link inside the "next page" button; evaluated after the ads of this
    # page have been collected to detect the last results page.
    page_nav = soup.find_all('a', class_ = '{{HTML CLASS TO TARGET LINK IN NEXT BUTTON}}')

    for ad in ads:

        # Every ad container consumes a number, including the ones skipped
        # below, so an ad's number reflects its position in the crawl.
        id += 1

        # find section for address
        address_tag = ad.find('p', class_ = '{{HTML CLASS TO TARGET ADDRESS}}')

        # find price information
        # NOTE(review): this class is site-specific, unlike the placeholders
        # used for the other selectors; adjust it for the target site.
        price_tag = ad.find('p', class_ = 'css-6v9gpl-Text eczcs4p0')

        # find() returns None when the element is missing; such containers
        # cannot be recorded.
        if address_tag is None or price_tag is None:
            continue

        address = address_tag.text
        price = price_tag.text

        # drop if price section does not contain a real price value
        # (checked before the entry is stored, so skipped ads do not leave
        # empty entries behind in `map`)
        if is_skipped(price):
            continue

        # find public transport information
        transport_section = ad.find('div', class_ = '{{HTML CLASS}}')
        transport_type = ad.find_all('span', class_ = '{{HTML CLASS}}')
        if transport_section is None:
            # Ad without a transport section: keep it with empty lists.
            transport_information = []
        else:
            transport_information = transport_section.find_all('p', class_ = '{{HTML CLASS}}')

        entry = {
            "address": address,
            "price": to_num(price),
            # parallel lists: index i describes the i-th nearby station
            "distance": [],
            "station": [],
            "transport_type": [],
        }

        for i, info in enumerate(transport_information):
            # Expected text layout: "<distance> miles <station name>".
            # maxsplit=1 keeps station names that contain " miles " intact.
            distance, station = info.text.split(' miles ', 1)
            entry["distance"].append(float(distance))
            entry["station"].append(station)
            entry["transport_type"].append(transport_type[i]['testid'])

        map[id] = entry

    # No "next" link means this was the last results page. The check runs
    # after the ads above were collected so the final page is not dropped.
    if len(page_nav) == 0:
        print("max page number: %d" % page_number)
        break


print("Scraping task finished")
end = time.time()
print(str(round(end - start, 2)) + 'sec')

**Transform dictionary into list of lists**

In [None]:
# Flatten the nested dict into a list of rows, one row per (ad, station) pair.
# Row layout: [ad_id, price, address, distance, station, transport_type]
result = []

# `ad_id` is used as the loop variable so the global `id` counter of the
# scraping cell is not overwritten.
for ad_id, ad_info in map.items():
    # Defensive: entries lacking a price contribute no rows instead of
    # raising KeyError.
    if "price" not in ad_info:
        continue

    cur_price = float(ad_info["price"])
    cur_address = str(ad_info["address"])

    # The three transport lists are filled in lockstep, so zipping them
    # yields one tuple per station.
    for distance, station, transport in zip(ad_info["distance"],
                                            ad_info["station"],
                                            ad_info["transport_type"]):
        result.append([int(ad_id), cur_price, cur_address,
                       float(distance), str(station), str(transport)])

# Number of rows produced (kept for cells that may still read it).
cur_row = len(result)

**Transform to DataFrame**

In [None]:
# Build the tabular view of the flattened rows; the column order matches
# the order in which each row list was filled.
column_names = ["ad_id", "price", "address",
                "distance", "station", "transport_type"]
df = pd.DataFrame(data=result, columns=column_names)

**Export as CSV**

In [None]:
# Write the DataFrame to a CSV file; with the default settings the row
# index is written as the first column.
filename = 'test.csv'
df.to_csv(path_or_buf=filename)