# Importing necessary packages and creating parent links for TripAdvisor using a simple loop

In [12]:
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
import requests
import time
import pandas as pd
from datetime import datetime, timedelta
import re
import random

# Build one listing-page URL for each result offset 0, 30, ..., 300
links = [
    f'https://www.tripadvisor.dk/Restaurants-g189541-oa{offset}-Copenhagen_Zealand.html'
    for offset in range(0, 301, 30)
]

links

['https://www.tripadvisor.dk/Restaurants-g189541-oa0-Copenhagen_Zealand.html',
 'https://www.tripadvisor.dk/Restaurants-g189541-oa30-Copenhagen_Zealand.html',
 'https://www.tripadvisor.dk/Restaurants-g189541-oa60-Copenhagen_Zealand.html',
 'https://www.tripadvisor.dk/Restaurants-g189541-oa90-Copenhagen_Zealand.html',
 'https://www.tripadvisor.dk/Restaurants-g189541-oa120-Copenhagen_Zealand.html',
 'https://www.tripadvisor.dk/Restaurants-g189541-oa150-Copenhagen_Zealand.html',
 'https://www.tripadvisor.dk/Restaurants-g189541-oa180-Copenhagen_Zealand.html',
 'https://www.tripadvisor.dk/Restaurants-g189541-oa210-Copenhagen_Zealand.html',
 'https://www.tripadvisor.dk/Restaurants-g189541-oa240-Copenhagen_Zealand.html',
 'https://www.tripadvisor.dk/Restaurants-g189541-oa270-Copenhagen_Zealand.html',
 'https://www.tripadvisor.dk/Restaurants-g189541-oa300-Copenhagen_Zealand.html']

# Collecting individual links for restaurants

In [13]:
# Collect the unique restaurant review URLs found on every listing page in `links`.
# `unique_urls` gives fast membership checks; `restaurant_urls` keeps first-seen order.
unique_urls = set()
restaurant_urls = []

# The request headers never change, so build them once instead of on every iteration
headers = {
    'User-Agent': 'Jeppe Vanderhaegen, Data Science Student at University of Copehagen',
    'Email': 'wlr139@alumni.ku.dk'
}

# Loop through each listing page
for link in tqdm(links):
    time.sleep(1)  # Pause between requests

    # A timeout stops one stalled connection from hanging the whole run, and a failed
    # request skips that page (with a visible message) instead of aborting the loop.
    try:
        response = requests.get(link, headers=headers, timeout=30)
    except requests.RequestException as error:
        tqdm.write(f"Skipping {link}: {error}")
        continue
    if not response.ok:
        # An error page would otherwise be parsed as if it were a normal listing
        tqdm.write(f"Skipping {link}: HTTP {response.status_code}")
        continue

    # Parse the page content
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find all the restaurant link elements
    restaurant_elements = soup.find_all('a', class_='BMQDV _F Gv wSSLS SwZTJ FGwzt ukgoS')

    for element in restaurant_elements:
        # 'href' holds a relative URL; only review-page links are of interest
        relative_url = element.get('href')
        if not relative_url or not relative_url.startswith("/Restaurant_Review"):
            continue

        full_url = f"https://www.tripadvisor.dk{relative_url}"
        # Record each URL only the first time it is seen
        if full_url not in unique_urls:
            restaurant_urls.append(full_url)
            unique_urls.add(full_url)

# Output the list of restaurant URLs
for url in restaurant_urls:
    print(url)

print(f"Total unique URLs collected: {len(restaurant_urls)}")

100%|██████████| 11/11 [00:22<00:00,  2.02s/it]

https://www.tripadvisor.dk/Restaurant_Review-g189541-d1437917-Reviews-Magstraede_16-Copenhagen_Zealand.html
https://www.tripadvisor.dk/Restaurant_Review-g189541-d807089-Reviews-El_Meson-Copenhagen_Zealand.html
https://www.tripadvisor.dk/Restaurant_Review-g189541-d21382008-Reviews-C_ho_Fame-Copenhagen_Zealand.html
https://www.tripadvisor.dk/Restaurant_Review-g189541-d14098007-Reviews-Ark-Copenhagen_Zealand.html
https://www.tripadvisor.dk/Restaurant_Review-g189541-d23520766-Reviews-Pomodoro_D_oro-Copenhagen_Zealand.html
https://www.tripadvisor.dk/Restaurant_Review-g189541-d23133923-Reviews-Bistro_Lupa-Copenhagen_Zealand.html
https://www.tripadvisor.dk/Restaurant_Review-g189541-d19608198-Reviews-Maple_Casual_Dining-Copenhagen_Zealand.html
https://www.tripadvisor.dk/Restaurant_Review-g189541-d23651858-Reviews-Hooked_Christianshavn-Copenhagen_Zealand.html
https://www.tripadvisor.dk/Restaurant_Review-g189541-d8547060-Reviews-Alchemist-Copenhagen_Zealand.html
https://www.tripadvisor.dk/Restau




# Converting restaurant links to a DataFrame

In [15]:
# Wrap the collected URLs in a single-column DataFrame (the column is labelled 0)
df_restaurant_links = pd.DataFrame(data=restaurant_urls)

# Persist the links to disk, leaving out the row index
df_restaurant_links.to_csv(path_or_buf='restaurants_filtered', index=False)

# Show the resulting DataFrame
df_restaurant_links

Unnamed: 0,0
0,https://www.tripadvisor.dk/Restaurant_Review-g...
1,https://www.tripadvisor.dk/Restaurant_Review-g...
2,https://www.tripadvisor.dk/Restaurant_Review-g...
3,https://www.tripadvisor.dk/Restaurant_Review-g...
4,https://www.tripadvisor.dk/Restaurant_Review-g...
...,...
290,https://www.tripadvisor.dk/Restaurant_Review-g...
291,https://www.tripadvisor.dk/Restaurant_Review-g...
292,https://www.tripadvisor.dk/Restaurant_Review-g...
293,https://www.tripadvisor.dk/Restaurant_Review-g...


# Web scraping ratings and dates for the first 15 reviews on all restaurants

In [16]:
# Scrape review ratings and review dates from every restaurant page, then combine them
# into `df_combined` with columns 'Rating' (int) and 'Date' ('dd-mm-yyyy' string).

# Convert the DataFrame of links to a list
restaurant_links_list = df_restaurant_links[0].tolist()

# Accumulators for the (rating, date) pairs gathered across all restaurants
ratings = []
dates = []

# Pages that could not be fetched; kept so they can be inspected or retried afterwards
failed_links = []

# The request headers never change, so build them once instead of on every iteration
headers = {
    'User-Agent': 'Jeppe Vanderhaegen, Data Science Student at University of Copehagen',
    'Email': 'wlr139@alumni.ku.dk'
}

# The patterns are loop-invariant as well, so compile them once up front
rating_pattern = re.compile(r'\d,\d ud af 5 bobler')
date_pattern = re.compile(r'\d{1,2}\. \w+ \d{4}')

# Loop through each restaurant page
for link in tqdm(restaurant_links_list):
    time.sleep(random.uniform(2, 4))  # Add a delay to avoid getting blocked

    # A timeout stops one stalled connection from hanging the whole run, and a failed
    # request skips that restaurant instead of throwing away everything collected so far.
    try:
        response = requests.get(link, headers=headers, timeout=30)
    except requests.RequestException as error:
        tqdm.write(f"Skipping {link}: {error}")
        failed_links.append(link)
        continue
    if not response.ok:
        # An error page would otherwise be parsed as if it were a normal restaurant page
        tqdm.write(f"Skipping {link}: HTTP {response.status_code}")
        failed_links.append(link)
        continue

    # Parse the page content
    soup = BeautifulSoup(response.content, 'html.parser')

    # Scrape ratings: the first two matching <svg> elements are skipped as irrelevant
    rating_icons = soup.find_all('svg', class_='UctUV d H0', attrs={'height': '16'})[2:]

    temp_ratings = []
    for icon in rating_icons:
        for title in icon.find_all('title', string=rating_pattern):
            # The title reads like "4,0 ud af 5 bobler"; keep the part before the comma
            numeric_rating = title.get_text().split(",")[0]
            temp_ratings.append(int(numeric_rating))

    # Scrape dates: keep the first "d. month yyyy" match found in each candidate div
    temp_dates = []
    for div in soup.find_all('div', class_='biGQs _P pZUbB ncFvv osNWb'):
        full_text = div.get_text(separator=' ', strip=True)  # Join lines and remove excess spaces
        match = date_pattern.search(full_text)
        if match:
            temp_dates.append(match.group())

    # Pair ratings and dates by position; zip stops at the shorter list.
    # NOTE(review): this assumes the n-th rating belongs to the n-th date on the page.
    # If a page yields different counts the surplus is dropped silently - confirm alignment.
    for rating, date in zip(temp_ratings, temp_dates):
        ratings.append(rating)
        dates.append(date)

# Replace Danish month names with English ones for datetime parsing
df_dates = pd.DataFrame(dates, columns=['Date'])
df_dates['Date'] = df_dates['Date'].replace({
    'januar': 'January',
    'februar': 'February',
    'marts': 'March',
    'april': 'April',
    'maj': 'May',
    'juni': 'June',
    'juli': 'July',
    'august': 'August',
    'september': 'September',
    'oktober': 'October',
    'november': 'November',
    'december': 'December'
}, regex=True)

# Convert the date to datetime format (unparseable values become NaT)
df_dates['Date'] = pd.to_datetime(df_dates['Date'], format='%d. %B %Y', errors='coerce')

# Convert the date to the desired format (dd-mm-yyyy)
df_dates['Date'] = df_dates['Date'].dt.strftime('%d-%m-%Y')

# Combine the ratings and dates into a single DataFrame
df_combined = pd.DataFrame({
    'Rating': ratings,
    'Date': df_dates['Date']
})

# Display the cleaned DataFrame
df_combined

100%|██████████| 295/295 [26:06<00:00,  5.31s/it]


Unnamed: 0,Rating,Date
0,2,02-04-2024
1,5,16-12-2023
2,5,04-03-2023
3,3,19-10-2022
4,3,20-02-2022
...,...,...
4257,5,02-08-2020
4258,5,24-07-2020
4259,5,03-07-2020
4260,5,28-02-2020


# Handling and processing the gathered data using Pandas

In [24]:
# Parse the 'Date' column into datetimes. The scraping cell stores dates as 'dd-mm-yyyy'
# strings, so the format must be day-first. Parsing them with '%Y-%m-%d' and
# errors='coerce' turns every value into NaT and leaves the filter below with no rows.
# Re-running this cell is safe: values that are already datetimes pass through unchanged.
df_combined['Date'] = pd.to_datetime(df_combined['Date'], format='%d-%m-%Y', errors='coerce')

# Keep only observations from 2021-01-01 onwards; rows whose date failed to parse (NaT)
# compare as False and are dropped as well
df_filtered = df_combined[df_combined['Date'] >= '2021-01-01']

# Display the filtered DataFrame
df_filtered

Unnamed: 0,Rating,Date
0,2,2024-04-02
1,5,2023-12-16
2,5,2023-03-04
3,3,2022-10-19
4,3,2022-02-20
...,...,...
4251,5,2023-04-29
4252,5,2023-01-08
4253,5,2022-02-13
4254,5,2022-02-05


In [27]:
# Write the filtered ratings to disk, leaving out the row index
df_filtered.to_csv(path_or_buf='ratings_final', index=False)