### Installing and Importing Libraries

In [1]:
# %pip install Unidecode
# %pip install selenium-stealth
# %pip install undetected-chromedriver
# %pip install webdriver-manager
# %pip install python-Levenshtein

In [8]:
import csv
import io
import os
import random
import time
from pprint import pprint
import re

import Levenshtein
import numpy as np
import pandas as pd
import unidecode
from fake_useragent import UserAgent
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, TimeoutException, WebDriverException
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium_stealth import stealth
import undetected_chromedriver as uc

import pickle
from selenium.webdriver.support.ui import WebDriverWait
from selenium_stealth import stealth
from webdriver_manager.chrome import ChromeDriverManager

from sklearn.model_selection import train_test_split

### Setting up webdrivers
- configures the driver to ignore SSL certificate errors
- open pages in incognito mode
- operate in headless mode (without opening a browser window)

In [3]:
def initialize_webdriver():
    """Create and return a headless, incognito Chrome WebDriver.

    The browser is configured to ignore SSL certificate errors, and the
    chromedriver binary is resolved through ``ChromeDriverManager``.
    """
    print("Starting to initialize webdriver...")

    chrome_options = webdriver.ChromeOptions()
    for flag in ('--ignore-certificate-errors', '--incognito', '--headless'):
        chrome_options.add_argument(flag)

    chromedriver_service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=chromedriver_service, options=chrome_options)

    print("Webdriver initialized.")
    return driver

### Scraping Links to Airline Review from Tripadvisor

*Note: 
In scraping airline review links, I initially targeted TripAdvisor but encountered robust security barriers like IP rate limiting, captchas, and bot detection. Despite efforts using proxies, rotating user agents, and advanced scraping tools, bypassing these measures proved unsuccessful. Consequently, I pivoted to TrustPilot. While TrustPilot's content is more diverse, making specific airline review scraping less straightforward, I utilized a pre-collected list of airline names from TripAdvisor to efficiently extract review links from TrustPilot.*

In [82]:
def scrape_page(driver, csv_writer):
    """Write one CSV row per airline card found on the current page.

    Args:
        driver: Selenium WebDriver already positioned on an airline listing page.
        csv_writer: ``csv.writer`` receiving ``[name, review_link, num_reviews]`` rows.

    A card that is missing one of the expected elements is skipped on its own,
    so a single malformed card no longer discards the remaining cards of the page.
    """
    try:
        airlines = WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".prw_rup.prw_airlines_airline_lander_card"))
        )
    except TimeoutException:
        # No airline cards appeared within the wait window.
        print("Error occurred while scraping.")
        return

    for airline in airlines:
        try:
            name = airline.find_element(By.CSS_SELECTOR, ".airlineName").text
            review_link_element = airline.find_element(By.CSS_SELECTOR, "a.review_button.ui_button.secondary.small")
            review_link = review_link_element.get_attribute("href")
            num_reviews = airline.find_element(By.CSS_SELECTOR, ".airlineReviews").text
        except NoSuchElementException:
            # Skip only this card; keep processing the rest of the page.
            print("Error occurred while scraping.")
            continue

        csv_writer.writerow([name, review_link, num_reviews])

In [83]:
def main1():
    """Scrape every page of the TripAdvisor airline listing into ``scraped_data.csv``.

    Each page is handed to ``scrape_page``; the loop then clicks the "next"
    button until it is disabled, missing, or the wait times out. The driver is
    closed in a ``finally`` block so that an unexpected error does not leave a
    browser process running.
    """
    driver = initialize_webdriver()
    try:
        driver.get("https://www.tripadvisor.com/Airlines")

        with open('scraped_data.csv', 'w', newline='', encoding='utf-8') as csv_file:
            csv_writer = csv.writer(csv_file)
            csv_writer.writerow(['Airline Name', 'Review Link', 'Number of Reviews'])

            page_number = 1
            while True:
                print(f"Scraping page {page_number}...")
                scrape_page(driver, csv_writer)

                try:
                    next_button = WebDriverWait(driver, 10).until(
                        EC.element_to_be_clickable((By.CSS_SELECTOR, "span.nav.next.ui_button.primary"))
                    )
                    if "disabled" in next_button.get_attribute("class"):
                        print("Reached the last page.")
                        break

                    next_button.click()
                    # The old button going stale signals that the next page has replaced it.
                    WebDriverWait(driver, 10).until(EC.staleness_of(next_button))
                    page_number += 1
                    time.sleep(1)

                except (NoSuchElementException, TimeoutException):
                    print("Reached the end of the pages or encountered an error.")
                    break
    finally:
        driver.quit()

In [86]:
# Run the TripAdvisor airline-listing scrape; rows are written to scraped_data.csv.
main1()

Scraping page 1...
Scraping page 2...
Scraping page 3...
Scraping page 4...
Scraping page 5...
Scraping page 6...
Scraping page 7...
Scraping page 8...
Scraping page 9...
Scraping page 10...
Scraping page 11...
Scraping page 12...
Scraping page 13...
Error occurred while scraping.
Scraping page 14...
Scraping page 15...
Scraping page 16...
Scraping page 17...
Scraping page 18...
Scraping page 19...
Scraping page 20...
Error occurred while scraping.
Scraping page 21...
Scraping page 22...
Scraping page 23...
Scraping page 24...
Scraping page 25...
Scraping page 26...
Scraping page 27...
Error occurred while scraping.
Scraping page 28...
Scraping page 29...
Scraping page 30...
Scraping page 31...
Scraping page 32...
Error occurred while scraping.
Scraping page 33...
Scraping page 34...
Scraping page 35...
Scraping page 36...
Scraping page 37...
Scraping page 38...
Scraping page 39...
Scraping page 40...
Error occurred while scraping.
Scraping page 41...
Scraping page 42...
Error occurred

#### ⚠️ Deprecated TripAdvisor Review Scraping Approach (Switched to TrustPilot)

Despite extensive efforts to navigate TripAdvisor's security, including user agent rotation, utilizing a variety of proxies (though limited by the efficacy of free proxies), a VPN for changing servers, and manual captcha resolution, the attempts faced significant roadblocks. The site's security measures effectively limited scraping activities to merely 3-4 page accesses (driver.get() operations) before necessitating an IP change via VPN. This made it very challenging to collect a lot of data, especially because each page only contained 5 reviews. Manual captcha interventions were of no use, as after solving the captcha I would immediately get blocked. Moreover, any progress made by scraping an initial page of reviews was quickly stopped by the bot detection when accessing subsequent pages. Using tools like selenium-stealth and undetected-chromedriver also proved to be ineffective.

One solution could have been to buy a list of paid proxies and rotate through them every time TripAdvisor blocked us from the page; however, these proxies are quite expensive.

In [None]:
def initialize_webdriver():
    """Create a non-headless Chrome WebDriver that presents a random Chrome user agent.

    Note: this redefines ``initialize_webdriver``; once this cell runs it
    replaces the headless variant defined earlier in the notebook.

    Returns:
        The configured ``webdriver.Chrome`` instance.
    """
    print("started to init")
    options = webdriver.ChromeOptions()
    ua = UserAgent()
    my_user_agent = ua.chrome
    # The generated user agent was previously never passed to Chrome,
    # so no rotation actually took place.
    options.add_argument(f'--user-agent={my_user_agent}')
    options.add_argument('--ignore-certificate-errors')
    options.add_argument('--incognito')
    options.add_argument('--start-maximized')
    options.add_argument('--disable-extensions')
    options.add_argument('--disable-popup-blocking')
    options.add_argument('--profile-directory=Default')
    options.add_argument('--disable-plugins-discovery')
    options.add_argument('--no-sandbox')
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
    print("init complete")
    return driver

In [None]:
def click_read_more(driver):
    """Click every "read more" control on the current page to expand review text.

    Args:
        driver: Selenium WebDriver positioned on a review page.

    If no expand buttons show up within the wait window the function returns
    quietly instead of raising, so pages without truncated reviews do not
    abort the scrape. Click failures are reported and skipped.
    """
    try:
        read_more_buttons = WebDriverWait(driver, 20).until(
            EC.presence_of_all_elements_located(
                (By.CSS_SELECTOR, "div.lszDU div.TnInx[data-test-target='expand-review']")
            )
        )
    except TimeoutException:
        print("No 'read more' buttons found on this page.")
        return

    for button in read_more_buttons:
        try:
            button.click()
            time.sleep(4)
            print("button clicked !!!")
            WebDriverWait(driver, 10).until(EC.staleness_of(button))
        except WebDriverException as e:
            # Best effort: report the failure (including wait timeouts) and move on.
            print(f"Warning: Button Click Error ({type(e).__name__})")

In [None]:
def random_mouse_movement(driver):
    """Move the mouse pointer by a random offset of 0-100 pixels on each axis."""
    pointer = ActionChains(driver)
    x_offset = random.randint(0, 100)
    y_offset = random.randint(0, 100)
    pointer.move_by_offset(x_offset, y_offset)
    pointer.perform()

In [None]:
def scrape_webpage(driver, url):
    """Scrape all review pages reachable from ``url`` and return the parsed reviews.

    Args:
        driver: Selenium WebDriver used for navigation.
        url: Address of the first review page of an airline.

    Returns:
        A list of review dicts as produced by ``parse_review``.

    Each page's reviews are appended to ``scraped_airline_reviews.csv`` as soon
    as they are parsed. Only the reviews of the current page are written, so
    earlier pages are not duplicated in the file, and reviews that
    ``parse_review`` rejects (``None``) are dropped.
    """
    print("started scraping")
    driver.get(url)
    all_reviews = []
    output_file = 'scraped_airline_reviews.csv'
    review_card_class = "lgfjP.Gi.z.pBVnE.MD.bZHZM"

    while True:
        click_read_more(driver)
        time.sleep(10)
        WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.CLASS_NAME, review_card_class)))
        elements = driver.find_elements(By.CLASS_NAME, review_card_class)
        elements_html = [element.get_attribute('outerHTML') for element in elements]

        page_reviews = []
        for html in elements_html:
            if html is None:
                continue
            review = parse_review(html, url)
            if review is not None:
                page_reviews.append(review)

        if page_reviews:
            save_reviews_to_csv(page_reviews, output_file)
            all_reviews.extend(page_reviews)

        try:
            next_button = driver.find_element(By.CLASS_NAME, "nav.next.primary")
            if 'disabled' in next_button.get_attribute('class'):
                break  # If the Next button is disabled, exit the loop
            next_button.click()
            WebDriverWait(driver, 10).until(EC.staleness_of(next_button))
            print("Navigated to next page")
        except NoSuchElementException:
            print("No more pages found.")
            break

        random_mouse_movement(driver)
        random_sleep(2, 5)

    print("scraping done")
    return all_reviews

In [None]:
# (url, user_name, review_date) triples already parsed; used to skip duplicates.
existing_reviews = set()


def _bubble_rating(tag):
    """Return the 0-5 rating encoded in a ``ui_bubble_rating`` tag, or None if unreadable."""
    if tag is None:
        return None
    try:
        # The second CSS class ends in the rating times ten (e.g. "..._40" -> 4).
        return int(tag['class'][1].split('_')[-1]) // 10
    except (KeyError, IndexError, ValueError):
        return None


def parse_review(html, url):
    """Parse the outer HTML of one review card into a dict.

    Args:
        html: Outer HTML of a single review element.
        url: Airline review page the card came from; stored as ``airline_url``.

    Returns:
        A dict with keys ``airline_url``, ``user_name``, ``review_date``,
        ``scraped_date``, ``travel_details``, ``review_title``, ``review_text``,
        ``overall_rating`` and ``subratings``; or ``None`` when the reviewer name
        or review date is missing, or when the same (url, user, date) triple has
        already been parsed.
    """
    # Imported locally: the notebook's import cell does not provide these names.
    from datetime import datetime
    from bs4 import BeautifulSoup

    soup = BeautifulSoup(html, 'html.parser')

    user_info = soup.find('a', class_='ui_header_link')
    if user_info is None:
        return None
    review_date_div = soup.find('div', class_='cRVSd')
    if review_date_div is None:
        return None

    user_name = user_info.get_text(strip=True)
    review_date = review_date_div.get_text()
    unique_id = (url, user_name, review_date)
    if unique_id in existing_reviews:
        return None
    existing_reviews.add(unique_id)

    review_data = {
        'airline_url': url,
        'user_name': user_name,
        'review_date': review_date,
        'scraped_date': datetime.today().strftime('%Y-%m-%d'),
    }

    travel_details = soup.find_all('div', class_='dmRSR n R2 S2')
    review_data['travel_details'] = [detail.get_text(strip=True) for detail in travel_details]

    review_title = soup.find('div', class_='KgQgP MC _S b S6 H5 _a')
    review_data['review_title'] = review_title.get_text(strip=True) if review_title else None
    review_text = soup.find('span', class_='QewHA H4 _a')
    review_data['review_text'] = review_text.get_text(strip=True) if review_text else None

    review_data['overall_rating'] = _bubble_rating(soup.find('span', class_='ui_bubble_rating'))

    review_data['subratings'] = {}
    for subrating in soup.find_all('div', class_='hemdC S2 H2'):
        rating = _bubble_rating(subrating.find('span', class_='ui_bubble_rating'))
        if rating is None:
            # A sub-rating row without a readable bubble span is skipped
            # instead of raising.
            continue
        category = subrating.get_text(strip=True)
        review_data['subratings'][category] = rating

    return review_data

In [None]:
def save_reviews_to_csv(reviews, file_path):
    """Append review dicts to a CSV file, writing a header if the file is new.

    Args:
        reviews: List of dicts; the keys of the first dict define the columns.
        file_path: Destination CSV path (opened in append mode, UTF-8).

    An empty ``reviews`` list is a no-op: nothing is written and no file is
    created (previously this raised ``IndexError`` after creating an empty file).
    """
    if not reviews:
        return

    file_exists = os.path.isfile(file_path)

    with open(file_path, 'a', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=reviews[0].keys())

        if not file_exists:
            writer.writeheader()

        writer.writerows(reviews)
            
def check_for_captcha(driver):
    """Return True if any iframe on the current page has "captcha" in its ``src``.

    Args:
        driver: Selenium WebDriver positioned on the page to inspect.

    Returns:
        ``True`` when a captcha iframe is present, ``False`` otherwise. All
        iframes are inspected (not only the first one), and iframes without a
        ``src`` attribute are ignored.
    """
    for iframe in driver.find_elements(By.TAG_NAME, "iframe"):
        source = iframe.get_attribute("src") or ""
        if "captcha" in source:
            return True
    return False

def create_batches(df, batch_size):
    """Lazily yield consecutive slices of ``df`` containing at most ``batch_size`` items."""
    total = len(df)
    yield from (df[start:start + batch_size] for start in range(0, total, batch_size))

def read_processed_links(file_path):
    """Return the set of stripped lines stored in ``file_path`` (empty set if the file is absent)."""
    if not os.path.exists(file_path):
        return set()

    with open(file_path, 'r') as handle:
        return {raw_line.strip() for raw_line in handle}

def add_processed_link(file_path, link):
    """Append ``link`` as a new line at the end of ``file_path``."""
    entry = link + '\n'
    with open(file_path, 'a') as handle:
        handle.write(entry)
        
def save_reviews_to_csv(reviews, file_path):
    """Append review dicts to a CSV file, writing a header if the file is new.

    Note: ``save_reviews_to_csv`` is defined twice in this cell; this later
    definition is the one that stays in effect.

    Args:
        reviews: List of dicts; the keys of the first dict define the columns.
        file_path: Destination CSV path (opened in append mode, UTF-8).

    An empty ``reviews`` list is a no-op: nothing is written and no file is
    created (previously this raised ``IndexError`` after creating an empty file).
    """
    if not reviews:
        return

    file_exists = os.path.isfile(file_path)

    with open(file_path, 'a', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=reviews[0].keys())

        if not file_exists:
            writer.writeheader()

        writer.writerows(reviews)

In [None]:
def get_next_button(driver):
    """Return the "next page" element of the current page, or None when it is absent."""
    next_button = None
    try:
        next_button = driver.find_element(By.CLASS_NAME, "nav.next.primary")
    except NoSuchElementException:
        pass
    return next_button

def random_sleep(min_sec, max_sec):
    """Pause for a random duration drawn uniformly between ``min_sec`` and ``max_sec`` seconds."""
    delay = random.uniform(min_sec, max_sec)
    time.sleep(delay)

def main():
    """Scrape TripAdvisor reviews for every airline link listed in ``scraped_data.csv``.

    Links recorded in ``processed_links.txt`` are skipped. For each remaining
    link the page is opened, the user is prompted to solve a captcha if one is
    detected, and ``scrape_webpage`` is run. ``scrape_webpage`` already walks
    every result page, parses the reviews and appends them to the output CSV,
    so its return value is used as-is rather than being re-parsed and re-saved
    here. The driver is always closed, even when scraping fails.
    """
    meta_df = pd.read_csv("./scraped_data.csv")
    batch_size = 1
    processed_links_file = 'processed_links.txt'
    processed_links = read_processed_links(processed_links_file)

    driver = initialize_webdriver()
    try:
        stealth(driver,
                languages=["en-US", "en"],
                vendor="Google Inc.",
                platform="Win32",
                webgl_vendor="Intel Inc.",
                renderer="Intel Iris OpenGL Engine",
                fix_hairline=True,
        )

        for batch in create_batches(meta_df, batch_size):
            for review_link in batch['Review Link']:
                if review_link in processed_links:
                    print(f"Skipping already processed link: {review_link}")
                    continue

                print(f'Starting to scrape: {review_link}')
                random_sleep(15, 20)

                driver.get(review_link)
                if check_for_captcha(driver):
                    input("CAPTCHA detected. Please solve it and then press Enter to continue...")
                    time.sleep(5)

                # scrape_webpage returns parsed review dicts (not raw HTML).
                reviews = [review for review in scrape_webpage(driver, review_link) if review]
                print(f"Collected {len(reviews)} reviews from {review_link}")

                add_processed_link(processed_links_file, review_link)
                processed_links.add(review_link)
    finally:
        driver.quit()

# Run the deprecated TripAdvisor review scraper defined above.
main()

### Collecting the links to each airline reviews from trustpilot

- The code defines a function that searches Trustpilot and collects links based on airline names from a CSV file, which was obtained by scraping TripAdvisor.
- It skips queries that were already processed; these are tracked in a set.
- Selenium is used to interact with the website and extract the links
- Levenshtein distance checks for close matches between query and result titles.
- Found links are appended to a CSV, maintaining a record of processed queries.
- A cleaning function is used to standardize text for better matching.
- The main part reads an airline dataset, initializes the WebDriver, identifies the last processed query, and runs the link-finding function for unprocessed queries.

In [4]:
# Load the airline list previously scraped from TripAdvisor and preview the first rows.
meta_df = pd.read_csv("./karthik_datasets/scraped_data.csv")
meta_df.head()

Unnamed: 0,Airline Name,Review Link,Number of Reviews
0,Adria Airways [no longer operating],https://www.tripadvisor.com/Airline_Review-d87...,670 reviews
1,Advanced Air,https://www.tripadvisor.com/Airline_Review-d17...,18 reviews
2,AEGEAN,https://www.tripadvisor.com/Airline_Review-d87...,"13,439 reviews"
3,Aer Lingus,https://www.tripadvisor.com/Airline_Review-d87...,"13,330 reviews"
4,Aero Mongolia,https://www.tripadvisor.com/Airline_Review-d13...,18 reviews


In [5]:
def find_links_for_query(driver, query, csv_file, processed_queries, start_index=0):
    """Search Trustpilot for each airline name and append matching links to ``csv_file``.

    Args:
        driver: Selenium WebDriver used to run the searches.
        query: A single airline name or an iterable of names (e.g. a pandas Series).
        csv_file: CSV path that receives ``Query``/``Link`` rows in append mode.
        processed_queries: Set of names already handled; it is updated in place.
        start_index: Position in ``query`` at which to start.

    The names are taken from the ``query`` argument. Previously the loop
    ignored the argument and read a global ``df`` instead.
    """
    queries = [query] if isinstance(query, str) else list(query)

    for airline_name in queries[start_index:]:
        if airline_name in processed_queries:
            print(f"Skipping query: {airline_name} (already processed)")
            continue

        driver.get("https://www.trustpilot.com/search?query=s")
        time.sleep(3)

        search_input = driver.find_element(By.CLASS_NAME, 'search-desktop_searchInputField__CHV3l')
        search_input.clear()
        search_input.send_keys(airline_name)
        search_input.send_keys(Keys.RETURN)

        time.sleep(3)

        elements = driver.find_elements(By.CLASS_NAME, 'paper_paper__1PY90.paper_outline__lwsUX.card_card__lQWDv.card_noPadding__D8PcU.styles_wrapper__2JOo2.styles_businessUnitResult__L3bbC')

        cleaned_query = clean_text(airline_name)
        for element in elements:
            try:
                title = element.find_element(By.CLASS_NAME, 'typography_heading-xs__jSwUz.typography_appearance-default__AAY17.styles_displayName__GOhL2').text
            except NoSuchElementException:
                # Result card without a display name: nothing to compare against.
                continue

            levenshtein_distance = Levenshtein.distance(cleaned_query, clean_text(title))
            # NOTE(review): Levenshtein.distance returns an integer edit count, so
            # "<= 0.3" only accepts exact matches after cleaning. If fuzzy matching
            # was intended, compare a normalized distance instead. TODO confirm.
            if levenshtein_distance <= 0.3:
                try:
                    link_element = element.find_element(By.XPATH, './/a[@class="link_internal__7XN06 link_wrapper__5ZJEx styles_linkWrapper__UWs5j"]')
                except NoSuchElementException:
                    continue
                link = link_element.get_attribute("href")
                data = {'Query': airline_name, 'Link': link}
                # Write the header only when the file is being created.
                pd.DataFrame([data]).to_csv(csv_file, mode='a', header=not os.path.exists(csv_file), index=False)

        processed_queries.add(airline_name)

def clean_text(text):
    """Normalize a name for matching.

    The text is transliterated to ASCII and lower-cased, bracketed segments
    ``(...)``, ``[...]`` and ``{...}`` are replaced by a space, and runs of
    whitespace are collapsed to a single space before trimming.
    """
    ascii_lower = unidecode.unidecode(text).lower()
    without_brackets = re.sub(r'\s*[\(\[\{][^\)\]\}]*[\)\]\}]\s*', ' ', ascii_lower)
    collapsed = re.sub(r'\s+', ' ', without_brackets)
    return collapsed.strip().lower()


# Load the queries that already have Trustpilot links so they are not searched again.
# On a first run the results file does not exist yet; start from an empty record
# instead of failing with FileNotFoundError.
if os.path.exists('./karthik_datasets/results.csv'):
    existing_data = pd.read_csv('./karthik_datasets/results.csv')
else:
    existing_data = pd.DataFrame(columns=['Query', 'Link'])
processed_queries = set(existing_data['Query'])

In [None]:
# Load the airline names and normalize them the same way the stored queries were.
df = pd.read_csv('karthik_datasets/scraped_data.csv')
df['Airline Name'] = df['Airline Name'].apply(clean_text)

# Resume from the last airline that already has results. When nothing has been
# processed yet there are no matching positions, so start from 0
# (``index.max() or 0`` evaluated to NaN in that case).
processed_positions = np.flatnonzero(df['Airline Name'].isin(processed_queries))
last_processed_index = int(processed_positions.max()) if processed_positions.size else 0

driver = initialize_webdriver()
try:
    find_links_for_query(driver, df["Airline Name"], 'results.csv', processed_queries, last_processed_index)
finally:
    # Always release the browser, even if a search fails midway.
    driver.quit()

Starting to initialize webdriver...
Webdriver initialized.
Skipping query: yeti airlines (already processed)


### Scraping the airline reviews from trustpilot

- navigate_to_reviews_page: Loads a given URL in the browser.
- scroll_to_element: Scrolls the browser to make the next button element visible.
- get_next_page_href: Checks for a "next page" button and returns its link if available; otherwise, returns None.
- extract_reviews_data: Collects review details from the current page and stores them in a list.
- main: Orchestrates the scraping process for reviews from a given URL, saving the data to a CSV file and navigating through pages until no more are found.
- update_status: Updates the status of a link in a DataFrame to track progress.
- process_links: Iterates through links in a provided CSV, scraping reviews for each and updating their status, indicating whether data collection was successful or encountered errors.

In [10]:
def navigate_to_reviews_page(driver, url):
    """Open ``url`` in the browser, then pause three seconds before returning."""
    pause_seconds = 3
    driver.get(url)
    time.sleep(pause_seconds)

In [11]:
def scroll_to_element(driver, element):
    """Scroll the page so that ``element`` is brought into view."""
    scroll_script = "arguments[0].scrollIntoView();"
    driver.execute_script(scroll_script, element)

def get_next_page_href(driver):
    """Return the URL of the next review page, or None when there is none.

    Waits up to 10 seconds for the pagination "next" link to become clickable.
    A link flagged ``aria-disabled='true'`` means there are no more pages. Any
    error is printed and treated as "no next page".
    """
    next_href = None
    try:
        locator = (By.XPATH, "//a[@name='pagination-button-next']")
        next_button = WebDriverWait(driver, 10).until(EC.element_to_be_clickable(locator))
        is_disabled = next_button.get_attribute('aria-disabled') == 'true'
        if not is_disabled:
            # Bring the button into view before reading its target.
            scroll_to_element(driver, next_button)
            next_href = next_button.get_attribute('href')
    except Exception as e:
        print(f"An error occurred: {str(e)}")
    return next_href

In [12]:
def _review_field(review, selector, attribute=None):
    """Return the text (or the given attribute) of the first match of ``selector``, or None if absent."""
    try:
        element = review.find_element(By.CSS_SELECTOR, selector)
    except NoSuchElementException:
        return None
    return element.get_attribute(attribute) if attribute else element.text


def extract_reviews_data(driver):
    """Collect the review cards on the current page as a list of dicts.

    Args:
        driver: Selenium WebDriver positioned on a Trustpilot review page.

    Returns:
        One dict per review card with the keys ``Stars``, ``Post Title``,
        ``Person``, ``Date Posted``, ``Date of Experience`` and ``Review Text``.
        A field whose element is missing from a card is stored as ``None``
        instead of raising and aborting the whole page.
    """
    all_reviews = []

    review_elements = driver.find_elements(By.CSS_SELECTOR, '.styles_cardWrapper__LcCPA')

    for review in review_elements:
        single_review = {
            'Stars': _review_field(review, '.star-rating_starRating__4rrcf img', 'alt'),
            'Post Title': _review_field(review, '.typography_heading-s__f7029'),
            'Person': _review_field(review, '.typography_heading-xxs__QKBS8'),
            'Date Posted': _review_field(review, 'time', 'title'),
            'Date of Experience': _review_field(review, '.typography_body-m__xgxZ_.typography_appearance-default__AAY17'),
            'Review Text': _review_field(review, '.typography_body-l__KUYFJ'),
        }
        all_reviews.append(single_review)

    return all_reviews

In [16]:
def main(url, output_file='testing.csv'):
    """Scrape every review page starting at ``url`` and append the rows to a CSV file.

    Args:
        url: First Trustpilot review page of a company.
        output_file: CSV file the rows are appended to. The default keeps the
            file name this function has always written to.

    The header row is written only when ``output_file`` is missing or empty.
    Previously the existence check looked at ``reviews.csv`` while the rows
    went to ``testing.csv``. The driver is always closed, and the completion
    message no longer depends on at least one review having been found.
    """
    driver = initialize_webdriver()
    try:
        navigate_to_reviews_page(driver, url)

        needs_header = not os.path.isfile(output_file) or os.path.getsize(output_file) == 0

        with open(output_file, 'a', newline='', encoding='utf-8') as file:
            csv_writer = csv.writer(file)

            if needs_header:
                csv_writer.writerow(['Stars', 'Post Title', 'Person', 'Date Posted', 'Date of Experience', 'Review Text', 'Link'])

            while True:
                for review in extract_reviews_data(driver):
                    csv_writer.writerow([review['Stars'], review['Post Title'], review['Person'], review['Date Posted'], review['Date of Experience'], review['Review Text'], url])

                next_page_url = get_next_page_href(driver)
                if not next_page_url:
                    print(url, "scraping finished")
                    break

                driver.get(next_page_url)
                time.sleep(3)
    finally:
        driver.quit()

In [17]:
# Smoke-test the scraper on a single Trustpilot review page.
test_Url = "https://www.trustpilot.com/review/www.butaairways.az"
main(test_Url)

Starting to initialize webdriver...
Webdriver initialized.
https://www.trustpilot.com/review/www.butaairways.az scraping finished


In [16]:
def update_status(df, link, status):
    """Set ``Status`` to ``status`` for every row whose ``Link`` equals ``link``; return the same frame."""
    matches_link = df['Link'] == link
    df.loc[matches_link, 'Status'] = status
    return df

def process_links(file_path):
    """Scrape reviews for every link in ``file_path`` that is not yet collected.

    Args:
        file_path: CSV with a ``Link`` column; a ``Status`` column is added
            when missing and the file is rewritten after every link.

    Returns:
        The DataFrame with the updated ``Status`` column.

    The status is saved after failures as well as successes, so an ``Error``
    mark is no longer lost if the run is interrupted. The current status is
    read from the frame rather than the row snapshot, so a link listed twice
    is scraped only once.
    """
    df = pd.read_csv(file_path)

    if 'Status' not in df.columns:
        df['Status'] = 'Not Collected'

    for index, row in df.iterrows():
        if df.at[index, 'Status'] == 'Collected':
            continue

        try:
            main(row['Link'])
            df = update_status(df, row['Link'], 'Collected')
        except Exception as e:
            # Best effort: record the failure and continue with the next link.
            print(f"Error processing {row['Link']}: {e}")
            df = update_status(df, row['Link'], 'Error')

        df.to_csv(file_path, index=False)

    return df

# Scrape reviews for every link in results.csv that is not yet marked as collected.
process_links('results.csv')

Starting to initialize webdriver...
Webdriver initialized.
https://www.trustpilot.com/review/m.vueling.com scraping finished
Starting to initialize webdriver...
Webdriver initialized.
https://www.trustpilot.com/review/www.westjet.com scraping finished
Starting to initialize webdriver...
Webdriver initialized.
https://www.trustpilot.com/review/westjet.ca scraping finished
Starting to initialize webdriver...
Webdriver initialized.
https://www.trustpilot.com/review/www.wideroe.no scraping finished
Starting to initialize webdriver...
Webdriver initialized.
https://www.trustpilot.com/review/www.wingo.ch scraping finished
Starting to initialize webdriver...
Webdriver initialized.
https://www.trustpilot.com/review/wingo.com scraping finished
Starting to initialize webdriver...
Webdriver initialized.
https://www.trustpilot.com/review/wingo.store scraping finished
Starting to initialize webdriver...
Webdriver initialized.
https://www.trustpilot.com/review/wizzair.uk scraping finished
Starting t

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Query,Link,Status
0,0,0,aer lingus,https://www.trustpilot.com/review/www.aerlingu...,Collected
1,1,1,aeroflot,https://www.trustpilot.com/review/aeroflot.com,Collected
2,2,2,aerolineas argentinas,https://www.trustpilot.com/review/aerolineas.c...,Collected
3,3,3,aeromexico,https://www.trustpilot.com/review/aeromexico.com,Collected
4,4,4,aeromexico,https://www.trustpilot.com/review/aeromexico.pro,Collected
...,...,...,...,...,...
326,334,334,wow air,https://www.trustpilot.com/review/wowair.is,Collected
327,335,335,wow air,https://www.trustpilot.com/review/wowair.com,Collected
328,336,336,wow air,https://www.trustpilot.com/review/wowair.dk,Collected
329,337,337,xiamen airlines,https://www.trustpilot.com/review/xiamenair.com,Collected


### Merging the Airline Name column into reviews.csv
- Merge DataFrames on the 'Link' column, retaining all rows from reviews and only the 'Link' and 'Query' (airline name) columns from results. The merge is done using a left join, meaning all entries from reviews will be kept, even if there's no matching entry in results.

In [21]:
reviews = pd.read_csv("reviews.csv")
results = pd.read_csv("results.csv")

if 'Link' in reviews.columns:
    # Keep one airline name per link so the left join cannot multiply review rows.
    link_to_airline = results[['Link', 'Query']].drop_duplicates(subset='Link')
    merged_df = pd.merge(reviews, link_to_airline, on='Link', how='left').drop(columns='Link')
    merged_df.to_csv("reviews.csv", index=False)
else:
    # reviews.csv no longer has a 'Link' column, i.e. this cell has already
    # rewritten it; reuse the file as-is instead of failing with KeyError.
    merged_df = reviews

In [90]:
# Preview the merged reviews with the airline name ('Query') attached.
merged_df.head()

Unnamed: 0,Stars,Post Title,Person,Date Posted,Date of Experience,Review Text,Query
0,Rated 1 out of 5 stars,One of the worst airline ever ever,sharam Salih,"Friday, January 26, 2024 at 12:22:25 AM","Date of experience: December 28, 2023",One of the worst airline ever\nShocked with 2 ...,aer lingus
1,Rated 1 out of 5 stars,They need to build stronger aircraft or…,C L,"Monday, January 22, 2024 at 11:36:05 PM","Date of experience: January 21, 2024",They need to build stronger aircraft or someth...,aer lingus
2,Rated 1 out of 5 stars,Avoid!! get insulted with customer service,Senhao Zhang,"Sunday, January 21, 2024 at 11:12:17 PM","Date of experience: January 21, 2024",The guy who picked up my was so rude and even ...,aer lingus
3,Rated 4 out of 5 stars,Cancelled flight,John Mc.,"Thursday, January 25, 2024 at 12:52:51 AM","Date of experience: January 07, 2024",I recently had a flight cancellation due to fo...,aer lingus
4,Rated 1 out of 5 stars,I love travelling with Aer Lingus,Mary Mckeegan,"Friday, January 19, 2024 at 11:18:02 PM","Date of experience: January 19, 2024","I love travelling with Aer Lingus, no problem ...",aer lingus


### Merging raw datasets (group portion)

Here the raw training and testing datasets are created. The only steps that were performed were: 
- renaming columns
- dropping unmatched columns
- appending the datasets

In [44]:
# Load the three raw datasets, one CSV per group member.
pin_shien = pd.read_csv("pinshien_airline_scraped_data.csv")
karthik = pd.read_csv("karthik_airline_scraped_data.csv")
junming = pd.read_csv("junming_airline_scrapped_data.csv")

In [45]:
# Show the column names before dropping and renaming.
karthik.columns

Index(['Stars', 'Post Title', 'Person', 'Date Posted', 'Date of Experience',
       'Review Text', 'Query'],
      dtype='object')

In [46]:
# Drop the columns that are not kept in the combined dataset.
karthik = karthik.drop(columns=["Post Title", "Date of Experience", "Person"])

In [47]:
# Show the column names before dropping and renaming.
pin_shien.columns

Index(['Passenger Name', 'Review Date', 'Airline Name',
       'Airline Average Rating', 'Cabin Flown', 'Passenger Overall Rating',
       'Overall Value for Money', 'Seat and Cabin Space', 'Customer Service',
       'In Flight Entertainment', 'Baggage Handling', 'Check-in Process',
       'Meals and Beverages', 'Recommend Airline', 'Passenger Review Text'],
      dtype='object')

In [48]:
# Drop the columns that are not kept in the combined dataset.
pin_shien = pin_shien.drop(
    columns=[
        "Airline Average Rating",
        "Cabin Flown",
        "Overall Value for Money",
        "Seat and Cabin Space",
        "Customer Service",
        "In Flight Entertainment",
        "Baggage Handling",
        "Check-in Process",
        "Meals and Beverages",
        "Recommend Airline",
        "Passenger Name",
    ]
)

In [49]:
# Rename the remaining columns to the shared snake_case schema.
pin_shien = pin_shien.rename(
    columns={
        "Review Date": "review_date",
        "Airline Name": "airline_name",
        "Passenger Overall Rating": "overall_rating",
        "Passenger Review Text": "review_text",
    }
)

In [50]:
# Rename the remaining columns to the shared snake_case schema.
# Keys that are not present in the frame are ignored by rename.
karthik = karthik.rename(
    columns={
        "Person": "passenger_name",
        "Date Posted": "review_date",
        "Query": "airline_name",
        "Stars": "overall_rating",
        "Review Text": "review_text",
    }
)

In [51]:
# Preview the first rows of the third dataset before renaming.
junming.head()

Unnamed: 0,status,aircraft,travel_type,travel_class,route,date,seating_comfort,staff_service,food_quality,entertainment,wifi,ground_service,value_for_money,recommended,overall_rating,review,airline_name
0,✅ Trip Verified,,Family Leisure,Business Class,London Heathrow to Miami,December 2023,2.0,1.0,2.0,2.0,,3.0,2,no,2,Stinking nappies being changed in business ca...,british airways
1,✅ Trip Verified,,Solo Leisure,Economy Class,Boston to Düsseldorf via London,January 2024,1.0,2.0,1.0,1.0,,1.0,1,no,1,Worst service ever. Lost baggage because of d...,british airways
2,✅ Trip Verified,A350,Business,Economy Class,Sao Paulo to London Heathrow,January 2024,3.0,4.0,3.0,3.0,3.0,3.0,3,no,6,BA 246 21JAN 2023 Did not appreciate the unp...,british airways
3,✅ Trip Verified,A320,Couple Leisure,Economy Class,London Heathrow to Lisbon,January 2024,1.0,4.0,1.0,,,3.0,4,no,3,Not a great experience. I could not check in ...,british airways
4,Not Verified,,Family Leisure,Economy Class,London to Hong Kong,January 2024,1.0,3.0,1.0,1.0,1.0,1.0,2,no,2,I was excited to fly BA as I'd not travelled ...,british airways


In [52]:
# Align the date and text column names with the shared schema.
junming = junming.rename(columns={"date": "review_date", "review": "review_text"})

In [53]:
# Show the column names after renaming.
junming.columns

Index(['status', 'aircraft', 'travel_type', 'travel_class', 'route',
       'review_date', 'seating_comfort', 'staff_service', 'food_quality',
       'entertainment', 'wifi', 'ground_service', 'value_for_money',
       'recommended', 'overall_rating', 'review_text', 'airline_name'],
      dtype='object')

In [55]:
# Drop the columns that are not kept in the combined dataset.
junming = junming.drop(
    columns=[
        "status",
        "aircraft",
        "travel_type",
        'travel_class',
        'route',
        'seating_comfort',
        'staff_service',
        'food_quality',
        'entertainment',
        'wifi',
        'ground_service',
        'value_for_money',
        'recommended',
    ]
)

In [56]:
# Stack the three datasets into one frame with a fresh 0..n-1 index.
appended_df = pd.concat([karthik, pin_shien, junming], ignore_index=True)
appended_df

Unnamed: 0,overall_rating,review_date,review_text,airline_name
0,Rated 1 out of 5 stars,"Friday, January 26, 2024 at 12:22:25 AM",One of the worst airline ever\nShocked with 2 ...,aer lingus
1,Rated 1 out of 5 stars,"Monday, January 22, 2024 at 11:36:05 PM",They need to build stronger aircraft or someth...,aer lingus
2,Rated 1 out of 5 stars,"Sunday, January 21, 2024 at 11:12:17 PM",The guy who picked up my was so rude and even ...,aer lingus
3,Rated 4 out of 5 stars,"Thursday, January 25, 2024 at 12:52:51 AM",I recently had a flight cancellation due to fo...,aer lingus
4,Rated 1 out of 5 stars,"Friday, January 19, 2024 at 11:18:02 PM","I love travelling with Aer Lingus, no problem ...",aer lingus
...,...,...,...,...
82173,1,December 2023,"Flight was cancelled, which is ok if you got ...",WestJet Airlines
82174,1,December 2023,"Flight was cancelled, which is ok if you got ...",WestJet Airlines
82175,1,December 2023,"Flight was cancelled, which is ok if you got ...",WestJet Airlines
82176,1,December 2023,"Flight was cancelled, which is ok if you got ...",WestJet Airlines


In [60]:
# Split the combined reviews 80/20 with a fixed seed so the split is reproducible.
df_80, df_20 = train_test_split(appended_df, test_size=0.2, random_state=42)

# to_csv is called without index=False, so the row index is written as the first column.
df_80.to_csv("training_data.csv")
df_20.to_csv("testing_data.csv")

In [91]:
# Preview the first rows of the training split.
df_80.head()

Unnamed: 0,overall_rating,review_date,review_text,airline_name
23743,Rated 1 out of 5 stars,"Monday, September 18, 2023 at 03:29:15 AM",EasyJet sent text at 4.00 am day of flight hom...,easyjet
82074,9,December 2023,Its been a few years when I flew a lot in A...,Vistara
75513,Rated 5 out of 5 stars,"Wednesday, September 6, 2023 at 01:58:27 AM",Useful,volotea
4699,Rated 1 out of 5 stars,"Sunday, September 9, 2018 at 08:37:11 PM",one of the worst experiences with Air France e...,air france
26212,Rated 1 out of 5 stars,"Monday, December 23, 2019 at 02:00:19 PM",Not a single star this airlines deserves .I lo...,egyptair
