## 0. Import Necessary Packages

In [None]:
import pandas as pd
import numpy as np
import requests as r
import time
import random
import copy
import pgeocode
import pickle
import spacy
import csv
import json
from datetime import datetime
from bs4 import BeautifulSoup as soup

## 1. Set the Landing Page for Scraping

In [None]:
# Site root, prepended to the relative links found while scraping
url_root = "https://www.tripadvisor.co.uk"
# First results page (p1): London restaurants filtered to
# "Establishment Type" = Coffee & Tea
url_land = "https://www.tripadvisor.co.uk/Restaurants-g186338-zfg9900-London_England.html"

## 2. Create a dictionary of London Cafes
### 2.1 Gather information from the search results for London Cafes

In [None]:
# Build `cafedic`: one nested dict per cafe, keyed by the rank shown in the
# search results, holding 'name', 'cafeurl' and (when listed) 'reviewcnt'.
cafedic = {}

# Link texts containing any of these markers are neither a cafe-name link
# nor a review-count link, so they are skipped
excl = ['“', "Reserve", "Order"]

# Set starting page number; scraping starts from the landing page (p1)
i = 1
page = url_land

# Limit of 100 pages of cafes
while i <= 100:

    # Collect page content (the timeout stops a stalled connection from
    # hanging the whole scrape)
    response = r.get(page, headers={'User-Agent': "Mozilla/5.0"}, timeout=30)
    content = soup(response.content, 'lxml')

    # Get cafe list; stop cleanly if the container is missing
    # (e.g. the page layout changed or the request was blocked)
    content_cafe = content.find('div', {'class': 'YtrWs'})
    if content_cafe is None:
        print("No cafe list found on page", i)
        break

    # Rank of the cafe whose name link was seen most recently on this page;
    # None until one has been seen, so stray review links are ignored
    temp_i = None
    for t in content_cafe.find_all('a'):
        # Extract the text
        txt = t.text
        if not txt or any(e in txt for e in excl):
            continue

        # Split "<rank>. <name>" or "<count> reviews" into leading token and rest
        parts = txt.split(" ", 1)
        if len(parts) < 2:
            # Single-word link text carries neither a name nor a count
            continue
        first = parts[0].replace(".", "").replace(",", "")
        rest = parts[1]

        if 'review' not in rest:
            # Name link: the leading token is the cafe's rank
            try:
                rank = int(first)
            except ValueError:
                # Unranked link: skip it, and make sure the review link that
                # follows is not attributed to the previous cafe
                temp_i = None
                continue
            # Create nested dict with rank as id and add name and cafe link
            entry = cafedic.setdefault(rank, {})
            entry['name'] = rest
            entry['cafeurl'] = url_root + t['href']
            temp_i = rank
        elif temp_i is not None:
            # Add # of reviews to nested dict (0 if the count is not numeric)
            try:
                cafedic[temp_i]['reviewcnt'] = int(first)
            except ValueError:
                cafedic[temp_i]['reviewcnt'] = 0

    # Update page link to next page; no such link means every page is done
    i += 1
    next_link = content.find('a', {'data-page-number': str(i)})
    if next_link is None or not next_link.get('href'):
        break
    page = url_root + next_link['href']

    # To avoid being blocked for scraping
    time.sleep(random.randint(1, 3))

### 2.2 Gather information from each of the Cafes' Pages

In [None]:
# Enrich every entry of `cafedic` with details scraped from the cafe's own
# page (rating, review counts per category, price range, postcode) plus
# borough and coordinates looked up from the postcode, then pickle the dict.
cafeids = list(cafedic.keys())
reviewcats = ['Excellent', 'Very good', 'Average', 'Poor', 'Terrible']

# Build the GB postcode geocoder once instead of once per cafe
nomi = pgeocode.Nominatim('GB')

for i in cafeids:
    print(i)
    cafe = cafedic[i]
    url = cafe['cafeurl']

    # Collect review page content (the timeout stops a stalled connection
    # from hanging the whole scrape)
    response = r.get(url, headers={'User-Agent': "Mozilla/5.0"}, timeout=30)
    content = soup(response.content, 'lxml')

    # Add cafe rating (NaN when the element is missing or not numeric)
    try:
        cafe['rating'] = float(content.find('h2').find_next('span').text)
    except (AttributeError, ValueError):
        cafe['rating'] = np.nan

    # Add review counts per category
    try:
        catcnts = [
            int(span.text.replace(',', ''))
            for span in content.find_all('span', {'class': 'row_num is-shown-at-tablet'})
        ]
        for cat in range(len(reviewcats)):
            attrname = 'rev_' + reviewcats[cat]
            cafe[attrname] = catcnts[cat]

        # Add the category with the most reviews
        topcat = reviewcats[catcnts.index(max(catcnts))]
        cafe['topcat'] = topcat
    except (ValueError, IndexError):
        # Best effort: counts missing or malformed, so these keys stay unset
        pass

    # Add price range (only accepted when the text contains a '£')
    price_link = content.find('a', {'class': 'dlMOJ'})
    if price_link is not None and '£' in price_link.text:
        cafe['pricerng'] = price_link.text
    else:
        cafe['pricerng'] = np.nan

    # Add cafe address. `postcode` is reset for every cafe so that a failed
    # parse can never reuse the previous cafe's postcode in the lookups below.
    postcode = None
    try:
        fullad = content.find(text = 'Location and contact').find_next('span', {'class' : 'yEWoV'}).get_text().split()
        # The postcode is taken as the two tokens before the last one
        candidate = ' '.join(fullad[-3:-1])
        # Longer strings are rejected as not being a postcode
        if 0 < len(candidate) <= 8:
            postcode = candidate
    except AttributeError:
        pass
    cafe['postcode'] = postcode if postcode is not None else np.nan

    # Add borough
    # NOTE(review): this stores the 'primary_care_trust' field as the borough;
    # confirm that is the intended postcodes.io field.
    cafe['borough'] = np.nan
    if postcode is not None:
        try:
            pc = postcode.replace(' ', '')
            borough_content = r.get(f'http://api.postcodes.io/postcodes/{pc}', timeout=30)
            cafe['borough'] = json.loads(borough_content.text)['result']['primary_care_trust']
        except (r.RequestException, ValueError, KeyError, TypeError):
            # Request failed, body was not JSON, or no result for the postcode
            pass

    # Add latitude and longitude from a single geocoder query, read by field
    # name rather than by position
    cafe['lat'] = np.nan
    cafe['long'] = np.nan
    if postcode is not None:
        try:
            geo = nomi.query_postal_code(postcode)
            cafe['lat'] = geo['latitude']
            cafe['long'] = geo['longitude']
        except (ValueError, KeyError):
            pass

    # To avoid being blocked for scraping
    time.sleep(random.randint(1, 3))

# Write cafe dictionary to binary file
with open("cafedict", "wb") as file:
    pickle.dump(cafedic, file)
    

In [None]:
# (Optional) Reload the cafe dictionary from the 'cafedict' pickle file
# instead of scraping it again.
cafedic = pd.read_pickle('cafedict')

## 3. Create a table with review data

In [None]:
# (Optional) Start again from an empty review list
revlst = list()

In [None]:
# Make review list: for every cafe, scrape the reviews shown on its page and
# store one row per review -- [cafe id, date, rating, text] -- both in
# reviews.csv and in `revlst`.

cafeids = list(cafedic.keys())

# Accumulates every row written to the CSV. It is reset here because the CSV
# is rewritten from scratch on each run, and it is kept separate from the
# per-cafe list of review texts (`revtexts`) so the two cannot mix.
revlst = []

# newline="" is what the csv module expects (avoids blank rows on Windows);
# utf-8 lets non-ASCII review text be written on any platform
with open("reviews.csv", "w", newline="", encoding="utf-8") as csvfile:
    writer = csv.writer(csvfile)

    for c in cafeids:

        url = cafedic[c]['cafeurl']

        # Collect review page content (the timeout stops a stalled connection
        # from hanging the whole scrape)
        response = r.get(url, headers={'User-Agent': "Mozilla/5.0"}, timeout=30)
        content = soup(response.content, 'lxml')

        try:
            # Review containers and review dates found on the page
            revinfo = content.find_all('div', {'class' : 'ui_column is-9'})
            dates = content.find_all('span', {'class' : 'ratingDate'})

            # Review texts, in page order
            revtexts = []
            for rev in revinfo:
                revtxt = rev.find('p', {'class' : 'partial_entry'})
                if revtxt is not None:
                    revtexts.append(revtxt.text)

            # Ratings: the last two characters of the second CSS class hold
            # the score times ten
            ratlst = []
            try:
                for rat in revinfo:
                    ratscore = rat.find('span', {'class': 'ui_bubble_rating'})
                    if ratscore is not None:
                        ratlst.append(int(ratscore['class'][1][-2:])/10)
            except (IndexError, KeyError, ValueError):
                print("no ratings")

            # Add details to list / csv.
            # NOTE(review): dates, ratings and texts are matched by position,
            # which assumes every review on the page has all three.
            for idx, date_tag in enumerate(dates):
                # Cafe id and review date
                rev_details = [c, pd.to_datetime(date_tag['title']).date()]

                # Review rating
                if idx < len(ratlst):
                    rev_details.append(ratlst[idx])
                else:
                    print("no rating")
                    rev_details.append(np.nan)

                # Review content
                if idx < len(revtexts):
                    rev_details.append(revtexts[idx])
                else:
                    print("no text")
                    rev_details.append(np.nan)

                # Write to csv / add to list
                writer.writerow(rev_details)
                revlst.append(rev_details)

        except (KeyError, ValueError) as err:
            # A date tag without a title, or an unparseable date, abandons
            # the remaining reviews of this cafe
            print("No reviews found", c, err)

        print("done:",c)
        # To avoid being blocked for scraping
        time.sleep(random.randint(1, 3))