# Scraping British Airways Reviews from Skytrax 

---

## Objectives

Understanding customer insights, needs, and feedback is crucial to improving the quality of British Airways' (BA) hospitality and service. Before performing the analysis, I'll collect customer feedback data from a third-party source, Skytrax, in order to gain insights into BA's service.

In [78]:
## import required libraries

import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
from urllib.parse import urljoin

In [103]:
# config

# destination CSV for the scraped reviews
output_file = "D:/Create/FORAGE [BA]/ba_reviews_adv.csv"

# seconds to wait between two page requests
req_delay = 2

# how many listing pages to fetch (it is used as the upper bound of the page loop)
page_size = 300

# first listing page; later pages are requested as <base_url>page/<n>/
base_url = "https://www.airlinequality.com/airline-reviews/british-airways/"

In [101]:
# define the functions used to scrape the website

## scraping single review article

def scrape_single_review(article):
    """Extract the fields of a single review into a flat dict.

    Every lookup is guarded, so a review that lacks an element yields ``None``
    for that field instead of raising. This matters because ``scrape_page``
    catches any exception and returns an empty list, which means one
    malformed review would otherwise discard the whole page.

    Parameters
    ----------
    article : bs4.element.Tag
        The ``article`` element of one review, as returned by
        ``soup.find_all("article", itemprop="review")``.

    Returns
    -------
    dict
        Always contains ``title``, ``author``, ``country``, ``date``,
        ``verified``, ``ratings`` and ``review_text``. In addition there is
        one key per row of the ``review-ratings`` table, named after the
        row header (lower-cased, with spaces and ``/`` replaced by ``_`` and
        ``&`` by ``and``). Star rows hold the number of filled stars as an
        int; other rows hold the cell text. Missing values are ``None``
        (``verified`` is ``False``).
    """

    def _text(tag):
        # stripped text of a tag, or None when the tag was not found
        return tag.get_text(strip=True) if tag is not None else None

    def _nth_string(tag, index):
        # index-th text node below `tag`, or None when the tag is missing
        # or holds fewer text nodes than expected
        if tag is None:
            return None
        strings = tag.find_all(string=True)
        return strings[index] if len(strings) > index else None

    review_data = {}

    review_data['title'] = _text(article.find('h2', class_='text_header'))

    # author info
    author_info = article.find('h3', class_='text_sub_header')
    name_tag = author_info.find('span', itemprop='name') if author_info is not None else None
    review_data['author'] = _text(name_tag)

    # NOTE(review): the country is read positionally (4th text node of the
    # sub-header) -- confirm this still matches the site's markup.
    country_raw = _nth_string(author_info, 3)
    # trim surrounding blanks (regular and non-breaking) and the parentheses
    review_data['country'] = country_raw.strip(' \xa0()') if country_raw is not None else None

    time_tag = article.find('time')
    review_data['date'] = time_tag.get('datetime') if time_tag is not None else None

    # flown / trip verified: taken from the first <strong> in the article
    verified_tag = article.find('strong')
    review_data['verified'] = (
        'Trip Verified' in verified_tag.get_text() if verified_tag is not None else False
    )

    # rating out of 10 (kept as the text found on the page)
    review_data['ratings'] = _text(article.find('span', itemprop='ratingValue'))

    # review box
    # NOTE(review): the body is also read positionally (4th text node of the
    # tc_mobile container) and is stored unstripped, exactly as found.
    review_box = article.find('div', class_='tc_mobile')
    review_data['review_text'] = _nth_string(review_box, 3)

    # table stats
    rating_table = article.find('table', class_="review-ratings")
    rows = rating_table.find_all('tr') if rating_table is not None else []
    for row in rows:
        header = row.find('td', class_="review-rating-header")
        if header is None:
            continue

        # normalise the header so it can be used as a clean column name
        header_text = (header.get_text(strip=True)
                       .lower()
                       .replace(' ', '_')
                       .replace('&', 'and')
                       .replace('/', '_'))

        value_cell = header.find_next_sibling('td')
        if value_cell is None:
            # header without a value cell: keep the column, mark it missing
            review_data[header_text] = None
        elif 'stars' in value_cell.get('class', []):
            # star rating value = number of filled star spans
            review_data[header_text] = len(value_cell.find_all('span', class_='star fill'))
        else:
            review_data[header_text] = value_cell.get_text(strip=True)

    return review_data

## scraping a page

def scrape_page(url, timeout=30):
    """Download one review-listing page and parse every review on it.

    Parameters
    ----------
    url : str
        Address of the listing page to fetch.
    timeout : float, optional
        Seconds to wait for the server before giving up; passed to
        ``requests.get``. Defaults to 30.

    Returns
    -------
    list of dict
        One dict per ``article`` element with ``itemprop="review"`` on the
        page, as produced by ``scrape_single_review``. An empty list is
        returned when the page holds no reviews or when the download or the
        parsing fails; the failure is printed, not raised.
    """
    try:
        # requests does not time out on its own; a stalled connection would
        # otherwise block the whole run
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')
        review_articles = soup.find_all("article", itemprop="review")

        return [scrape_single_review(article) for article in review_articles]

    except Exception as e:
        # best-effort scraping: report the actual cause and let the caller
        # carry on with the next page
        print(f"error scraping page {url}: {e}")
        return []

## scraping multiple pages

def scrape_multiple_pages(base_url, page_size, delay=None):
    """Scrape consecutive listing pages and pool their reviews.

    Parameters
    ----------
    base_url : str
        Address of the first listing page. Page ``n`` (n > 1) is requested
        as ``<base_url>/page/<n>/``; a missing trailing slash is tolerated.
    page_size : int
        Number of pages to fetch: pages 1 through ``page_size`` are
        requested. Despite the name this is a page count.
    delay : float, optional
        Seconds to pause between two consecutive requests. When omitted the
        module-level ``req_delay`` is used.

    Returns
    -------
    list of dict
        The reviews of all pages, in page order. Pages for which
        ``scrape_page`` returns nothing simply contribute no entries.
    """
    if delay is None:
        delay = req_delay

    # make sure ".../page/<n>/" is appended after exactly one slash
    paged_root = base_url if base_url.endswith('/') else base_url + '/'

    all_reviews = []

    for page_num in range(1, page_size + 1):
        print(f"Scraping page {page_num}/{page_size}...", end='\r')

        # the first page lives at the base address itself
        page_url = base_url if page_num == 1 else f"{paged_root}page/{page_num}/"

        all_reviews.extend(scrape_page(page_url))

        # pause only between requests; nothing follows the last page
        if page_num < page_size:
            time.sleep(delay)

    print(f"\nScraped {len(all_reviews)} reviews in total")
    return all_reviews



In [104]:
# main scraping process

# announce the run, using the values set in the config cell
print(f"Starting to scrape up to {page_size} pages from {base_url}")

# fetch every page and keep the pooled reviews for the save step
reviews = scrape_multiple_pages(base_url=base_url, page_size=page_size)

print(f"Scraped {len(reviews)} reviews in total")

Starting to scrape up to 300 pages from https://www.airlinequality.com/airline-reviews/british-airways/
Scraping page 300/300...
Scraped 3000 reviews in total
Scraped 3000 reviews in total


In [105]:
# save the data

# one row per review; the dict keys become the columns
df = pd.DataFrame(data=reviews)

# write without the positional index so the CSV holds only review fields
df.to_csv(path_or_buf=output_file, index=False)

print(f"Saved {len(df)} reviews to {output_file}")

Saved 3000 reviews to D:/Create/FORAGE [BA]/ba_reviews_adv.csv
