In [None]:
!pip install html5-parser

Collecting html5-parser
  Downloading html5-parser-0.4.12.tar.gz (270 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/270.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━[0m [32m245.8/270.9 kB[0m [31m7.6 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m270.9/270.9 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: html5-parser
  Building wheel for html5-parser (setup.py) ... [?25l[?25hdone
  Created wheel for html5-parser: filename=html5_parser-0.4.12-cp310-cp310-linux_x86_64.whl size=465647 sha256=a16e7dc4542ea750b4c052ad30251bab53da253c2af5d70e9253c7861daa2e1a
  Stored in directory: /root/.cache/pip/wheels/ac/44/65/cfd4a1f4178d2892a595c330bc6b83d6f6b8ff727f6c4cf030
Successfully built html5-parser
Installing collected packages: html5-parser
Successfully

In [None]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
import re

In [None]:

def get_all_website_reviews():
  """Scrape every page of reviews and return them as a single DataFrame.

  Pages are requested in order, starting at page 1, through ``review_page``.
  The first page that yields zero reviews ends the crawl.

  Returns:
    pandas.DataFrame with one row per review and a fresh 0..n-1 index. The
    expected review columns always come first; if no page yields any review,
    an empty frame with those columns is returned.
  """
  columns = ['overall_rating', 'name', 'country', 'date_created', 'review_title','review_text', 'aircraft', 'Value For Money']
  page_frames = []
  page = 1
  while True:
    print("reviewing page: " + str(page))
    reviews_df = review_page(page)
    num_items = len(reviews_df)
    if num_items == 0:
      # An empty page means we have walked past the last page of reviews.
      break
    print("This page has " + str(num_items) + " reviews")
    page_frames.append(reviews_df)
    page += 1

  if not page_frames:
    return pd.DataFrame(columns=columns)

  # Concatenate once at the end: growing a frame inside the loop re-copies all
  # previously collected rows on every page, and concatenating onto an empty
  # template frame triggers pandas' empty-entry deprecation warning.
  combined = pd.concat(page_frames, ignore_index=True)
  extra_columns = [c for c in combined.columns if c not in columns]
  return combined.reindex(columns=columns + extra_columns)



def review_page(page_number, timeout=30):
  """Download one listing page and scrape every review on it.

  Args:
    page_number: 1-based index of the listing page to request.
    timeout: Seconds to wait for the server before ``requests`` gives up.
      Without a timeout a stalled connection would block the crawl forever.

  Returns:
    pandas.DataFrame with one row per review found on the page. When the page
    contains no reviews, an empty frame with the expected columns is returned.
  """
  columns = ['overall_rating', 'name', 'country', 'date_created', 'review_title','review_text', 'aircraft', 'Value For Money']
  page = requests.get(
    f'https://www.airlinequality.com/airline-reviews/british-airways/page/{page_number}/?sortby=post_date%3ADesc&pagesize=100',
    timeout=timeout,
  )
  html_content = BeautifulSoup(page.content, 'html.parser')

  # Scrape each review into its own one-row frame, then combine them in a
  # single concat instead of re-copying the accumulated frame per review.
  review_frames = [scrape_review(review) for review in get_all_reviews(html_content)]
  if not review_frames:
    return pd.DataFrame(columns=columns)

  combined = pd.concat(review_frames, ignore_index=True)
  extra_columns = [c for c in combined.columns if c not in columns]
  return combined.reindex(columns=columns + extra_columns)


def get_all_reviews(html_content):
  """Collect every review container from a parsed listing page.

  Args:
    html_content: Parsed page object exposing ``find_all``.

  Returns:
    The result of ``find_all`` for ``<article>`` tags whose ``itemprop``
    attribute is ``review``.
  """
  review_articles = html_content.find_all('article', attrs={'itemprop': 'review'})
  return review_articles

def scrape_review(review):
  """Turn a single review element into a one-row DataFrame.

  Each field is delegated to its dedicated ``scrape_*`` helper; this function
  only assembles the results.

  Args:
    review: The element holding one review.

  Returns:
    pandas.DataFrame with exactly one row and the columns overall_rating,
    name, country, date_created, review_title, review_text, aircraft and
    "Value For Money".
  """
  rating = scrape_overall_rating(review)
  author = scrape_user_details(review)

  record = {
    "overall_rating": rating,
    # The author name is trimmed of surrounding newlines.
    "name": author["name"].strip("\n"),
    "country": author["country"],
    "date_created": author["date_created"],
    "review_title": scrape_review_title(review),
    "review_text": scrape_review_text(review),
    "aircraft": scrape_table_item(review, "Aircraft"),
    "Value For Money": scrape_table_item_rating(review, "Value For Money"),
  }
  return pd.DataFrame([record])



def scrape_overall_rating(review):
  """Read the overall score of a review.

  Args:
    review: The element holding one review.

  Returns:
    The text of the ``ratingValue`` span converted to ``int``, or the string
    "N/A" when the review has no such span.
  """
  rating_span = review.find('span', itemprop='ratingValue')
  if rating_span:
    return int(rating_span.text)
  return "N/A"



def scrape_user_details(review):
  """Extract the reviewer's name, country and posting date.

  The country is taken from the first parenthesised fragment of the header
  text; when the header contains no parentheses the string "None" is used.

  Args:
    review: The element holding one review.

  Returns:
    dict with the keys "name", "country" and "date_created".
  """
  header = review.find("h3", class_="text_sub_header userStatusWrapper")

  parenthesised = re.search(r'\((.*?)\)', header.text)
  country = parenthesised.group(1) if parenthesised else "None"

  author = header.find("span", itemprop="author").text
  published = header.find("time", itemprop="datePublished")['datetime']
  return {"name": author, "country": country, "date_created": published}

def scrape_review_title(review):
  """Return the review headline without surrounding double quotes.

  Args:
    review: The element holding one review.

  Returns:
    Text of the ``h2`` header with leading/trailing ``"`` characters removed.
  """
  heading = review.find("h2", class_="text_header")
  title = heading.text
  return title.strip('"')

def scrape_review_text(review):
  """Assemble the body text of a review from its direct text children.

  Only direct children of the ``reviewBody`` div that are strings are used;
  nested elements are skipped. Each fragment is trimmed of spaces, double
  quotes, pipes, newlines, tabs and the check-mark symbol, and fragments that
  end up empty are dropped.

  Args:
    review: The element holding one review.

  Returns:
    The surviving fragments joined with single spaces.
  """
  body = review.find("div", itemprop="reviewBody")

  # Trim every string child first, then keep only the non-empty results.
  trimmed = (
      child.strip(' "|\n\t✅')
      for child in body.contents
      if isinstance(child, str)
  )
  return ' '.join(fragment for fragment in trimmed if fragment)

def scrape_table_item(review, table_item):
  """Read the text value stored next to a labelled cell of the review table.

  Args:
    review: The element holding one review.
    table_item: Exact text of the label cell to look for (e.g. "Aircraft").

  Returns:
    Text of the ``<td>`` that follows the label cell, or "" when the label is
    not present or has no following ``<td>``.
  """
  label_cell = review.find('td', string=table_item)
  if label_cell is None:
    return ""

  value_cell = label_cell.find_next_sibling('td')
  # A label without a value cell is treated like a missing label instead of
  # crashing the whole scrape on a None attribute access.
  if value_cell is None:
    return ""
  return value_cell.text

def scrape_table_item_rating(review, table_item):
  """Read the star rating stored next to a labelled cell of the review table.

  Args:
    review: The element holding one review.
    table_item: Exact text of the label cell to look for
      (e.g. "Value For Money").

  Returns:
    Whatever ``scrape_rating`` yields for the ``<td>`` following the label
    cell, or "" when the label is not present or has no following ``<td>``.
  """
  label_cell = review.find('td', string=table_item)
  if label_cell is None:
    return ""

  rating_cell = label_cell.find_next_sibling('td')
  # Without a rating cell there is nothing to hand to scrape_rating; report it
  # the same way as a missing label rather than failing on None.
  if rating_cell is None:
    return ""
  return scrape_rating(rating_cell)

def scrape_rating(td_element, max_rating=5):
  """Convert a cell of star spans into a numeric rating.

  The cell is expected to hold one ``<span>`` per star whose text is the star
  number; filled stars carry "fill" among their classes. The rating is the
  highest-numbered star that is filled.

  Args:
    td_element: The table cell containing the star spans.
    max_rating: Highest star number to look for (default 5).

  Returns:
    The rating as an ``int`` between 1 and ``max_rating``, or the string "N/A"
    when the cell text is exactly "N/A" or no filled star is found.
  """
  if td_element.text == "N/A":
    return "N/A"

  # Walk down from the top star. The range is bounded so a cell without any
  # filled star ends the search instead of indexing into a missing span.
  for rating in range(max_rating, 0, -1):
    star = td_element.find("span", string=str(rating))
    if star is None:
      continue
    if "fill" in (star.get("class") or []):
      return rating
  return "N/A"



# Run the full crawl: pages are fetched one after another until a page comes
# back with no reviews.
df = get_all_website_reviews()
# Debugging aid: uncomment to scrape a single page instead of the whole site.
#df = review_page(32)
# Keep the frame as the cell's last expression so the notebook displays it.
df

reviewing page: 1
This page has 100 reviews
reviewing page: 2
This page has 100 reviews
reviewing page: 3
This page has 100 reviews
reviewing page: 4
This page has 100 reviews
reviewing page: 5
This page has 100 reviews
reviewing page: 6
This page has 100 reviews
reviewing page: 7
This page has 100 reviews
reviewing page: 8
This page has 100 reviews
reviewing page: 9
This page has 100 reviews
reviewing page: 10
This page has 100 reviews
reviewing page: 11
This page has 100 reviews
reviewing page: 12
This page has 100 reviews
reviewing page: 13
This page has 100 reviews
reviewing page: 14
This page has 100 reviews
reviewing page: 15
This page has 100 reviews
reviewing page: 16
This page has 100 reviews
reviewing page: 17
This page has 100 reviews
reviewing page: 18
This page has 100 reviews
reviewing page: 19
This page has 100 reviews
reviewing page: 20
This page has 100 reviews
reviewing page: 21
This page has 100 reviews
reviewing page: 22
This page has 100 reviews
reviewing page: 23


Unnamed: 0,overall_rating,name,country,date_created,review_title,review_text,aircraft,Value For Money
0,9,R Dayle,[United Kingdom],2024-01-07,we were pleased with the service,First time using BA business class but we were...,Boeing 777 -200,4
1,6,K Higgins,[United States],2024-01-03,Gate agent was extremely rude,Extremely rude ground service. We were non-rev...,,2
2,1,E Gan,[China],2024-01-02,incredibly heartless and incompetent company,My son and I flew to Geneva last Sunday for a ...,,1
3,8,S Dayle,[United Kingdom],2023-12-29,their service was hit-and-miss,For the price paid (bought during a sale) it w...,A320,4
4,6,S Neale,[United Kingdom],2023-12-29,Worse than a low-cost carrier,Flight left on time and arrived over half an h...,A320,2
...,...,...,...,...,...,...,...,...
3725,6,W Benson,[United Kingdom],2012-08-29,British Airways customer review,HKG-LHR in New Club World on Boeing 777-300 - ...,,
3726,9,Nick Berry,[United Kingdom],2012-08-28,British Airways customer review,LHR to HAM. Purser addresses all club passenge...,,3
3727,5,Avril Barclay,[United Kingdom],2011-10-12,British Airways customer review,My son who had worked for British Airways urge...,,4
3728,4,C Volz,[United States],2011-10-11,British Airways customer review,London City-New York JFK via Shannon on A318 b...,,1
