In [4]:
import requests
from bs4 import BeautifulSoup
import pandas as pd


In [6]:
# Scrape the British Airways review listing page by page and collect the raw
# text of every review into `reviews`.
base_url = "https://www.airlinequality.com/airline-reviews/british-airways"
pages = 10
page_size = 100
request_timeout = 30  # seconds; without a timeout requests.get can block indefinitely

reviews = []

for i in range(1, pages + 1):
    print(f"Scraping page {i}")
    url = f"{base_url}/page/{i}/?sortby=post_date%3ADesc&pagesize={page_size}"
    response = requests.get(url, timeout=request_timeout)
    # Fail loudly on HTTP errors instead of silently parsing an error page
    # that contributes zero reviews.
    response.raise_for_status()
    content = response.content
    parsed_content = BeautifulSoup(content, 'html.parser')
    # Each review body is rendered inside <div class="text_content">.
    for para in parsed_content.find_all("div", {"class": "text_content"}):
        reviews.append(para.get_text())
    print(f"   ---> {len(reviews)} total reviews")

    

Scraping page 1
   ---> 100 total reviews
Scraping page 2
   ---> 200 total reviews
Scraping page 3
   ---> 300 total reviews
Scraping page 4
   ---> 400 total reviews
Scraping page 5
   ---> 500 total reviews
Scraping page 6
   ---> 600 total reviews
Scraping page 7
   ---> 700 total reviews
Scraping page 8
   ---> 800 total reviews
Scraping page 9
   ---> 900 total reviews
Scraping page 10
   ---> 1000 total reviews


In [7]:
# Build the frame in a single constructor call (one review per row, in the
# column "reviews") rather than creating an empty frame and filling it afterwards.
df = pd.DataFrame({"reviews": reviews})
df.head()

Unnamed: 0,reviews
0,✅ Trip Verified | Flew British Airways on BA ...
1,✅ Trip Verified | BA cancelled the flight fro...
2,✅ Trip Verified | I strongly advise everyone t...
3,✅ Trip Verified | My partner and I were on the...
4,Not Verified | We had a Premium Economy retur...


In [8]:
# Display the frame; the rendered output is truncated to the first and last rows.
df

Unnamed: 0,reviews
0,✅ Trip Verified | Flew British Airways on BA ...
1,✅ Trip Verified | BA cancelled the flight fro...
2,✅ Trip Verified | I strongly advise everyone t...
3,✅ Trip Verified | My partner and I were on the...
4,Not Verified | We had a Premium Economy retur...
...,...
995,"✅ Trip Verified | Boston to London Heathrow, ..."
996,✅ Trip Verified | London to Cape Town in Firs...
997,✅ Trip Verified | This review is specifically...
998,✅ Trip Verified | London to Aberdeen. Before ...


In [None]:
# Drop the leading "<verification status> |" marker from each review.
# n=1 splits at the first pipe only, so a "|" inside the review body is kept,
# and taking the last piece keeps the full text when a review has no pipe
# at all (instead of turning it into a missing value or raising KeyError).
df["reviews"] = df["reviews"].str.split("|", n=1).str[-1]

In [11]:
# Display the frame again to inspect the "reviews" column after the split on "|".
df

Unnamed: 0,reviews
0,Flew British Airways on BA 434 London Heathr...
1,BA cancelled the flight from Tokyo to LHR. I...
2,I strongly advise everyone to never fly Briti...
3,My partner and I were on the BA2166 return fl...
4,We had a Premium Economy return flight Los A...
...,...
995,"Boston to London Heathrow, was excited to ex..."
996,London to Cape Town in First and our first t...
997,This review is specifically aimed at the exc...
998,London to Aberdeen. Before boarding the flig...


In [12]:
import re

# Define a function to clean the text
def clean(text):
    """Replace every run of non-alphabetic characters with a single space.

    Parameters
    ----------
    text : object
        Value to clean. Non-string values are converted with ``str()``.

    Returns
    -------
    str
        ``text`` with only ASCII letters kept; each run of any other
        characters (digits, punctuation, whitespace, symbols) becomes one
        space. Missing values (``None``, NaN, ``pd.NA``) give ``''``.
    """
    # str(None) / str(nan) would otherwise inject the literal words
    # "None" / "nan" into the cleaned text as if they were review content.
    if pd.api.types.is_scalar(text) and pd.isna(text):
        return ''
    return re.sub('[^A-Za-z]+', ' ', str(text))

# Run every review through clean() and store the result in a new column.
df['Cleaned Reviews'] = df['reviews'].map(clean)
df.head()

Unnamed: 0,reviews,Cleaned Reviews
0,Flew British Airways on BA 434 London Heathr...,Flew British Airways on BA London Heathrow to...
1,BA cancelled the flight from Tokyo to LHR. I...,BA cancelled the flight from Tokyo to LHR I w...
2,I strongly advise everyone to never fly Briti...,I strongly advise everyone to never fly Briti...
3,My partner and I were on the BA2166 return fl...,My partner and I were on the BA return flight...
4,We had a Premium Economy return flight Los A...,We had a Premium Economy return flight Los An...


In [13]:
import nltk
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize

# The punkt tokenizer divides a text into a list of sentences by using an
# unsupervised algorithm to build a model for abbreviation words,
# collocations, and words that start sentences.
for resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(resource)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\GANYA\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\GANYA\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\GANYA\AppData\Roaming\nltk_data...


In [14]:
for resource in ('omw-1.4', 'averaged_perceptron_tagger'):
    nltk.download(resource)

# Lookup from the first letter of a POS tag to the matching WordNet constant.
pos_dict = {
    'J': wordnet.ADJ,
    'V': wordnet.VERB,
    'N': wordnet.NOUN,
    'R': wordnet.ADV,
}
def token_stop_pos(text):
    """Tokenize ``text``, POS-tag it, and drop English stopwords.

    Parameters
    ----------
    text : str
        Text to process; passed unchanged to ``word_tokenize``.

    Returns
    -------
    list of tuple
        ``(word, pos)`` pairs in their original order. ``pos`` is the
        ``pos_dict`` entry for the first character of the word's tag, or
        ``None`` when that character has no entry in ``pos_dict``.
    """
    # Build the stopword set once per call; rebuilding it for every token is
    # loop-invariant work.
    stop_words = set(stopwords.words('english'))
    # Tag the full token sequence first; stopwords are filtered afterwards.
    tags = pos_tag(word_tokenize(text))
    return [
        (word, pos_dict.get(tag[0]))
        for word, tag in tags
        if word.lower() not in stop_words
    ]

# Tokenize, tag and stopword-filter each cleaned review into a new column.
df['POS tagged'] = df['Cleaned Reviews'].map(token_stop_pos)
df.head()

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\GANYA\AppData\Roaming\nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\GANYA\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


Unnamed: 0,reviews,Cleaned Reviews,POS tagged
0,Flew British Airways on BA 434 London Heathr...,Flew British Airways on BA London Heathrow to...,"[(Flew, n), (British, n), (Airways, n), (BA, n..."
1,BA cancelled the flight from Tokyo to LHR. I...,BA cancelled the flight from Tokyo to LHR I w...,"[(BA, n), (cancelled, v), (flight, n), (Tokyo,..."
2,I strongly advise everyone to never fly Briti...,I strongly advise everyone to never fly Briti...,"[(strongly, r), (advise, v), (everyone, n), (n..."
3,My partner and I were on the BA2166 return fl...,My partner and I were on the BA return flight...,"[(partner, n), (BA, n), (return, n), (flight, ..."
4,We had a Premium Economy return flight Los A...,We had a Premium Economy return flight Los An...,"[(Premium, a), (Economy, n), (return, n), (fli..."
