In [94]:
import pandas as pd
import numpy as np
from selenium import webdriver

In [161]:
class SkytraxScraper():
    
    def __init__(self, url):
        self.driver = webdriver.Chrome()
        self.driver.get(url)
        self.data = pd.DataFrame()
      
    
    def get_reviews(self):
        self.current_reviews = self.driver.find_elements_by_xpath("//article[@itemprop='review']")
    
    
    @staticmethod
    def extract_content(review):
        date = review.find_element_by_xpath(".//time").get_attribute("datetime")
        title = review.find_element_by_xpath(".//h2[@class='text_header']").text.replace('"', "")
        content = review.find_element_by_xpath(".//div[@class='text_content ']").text.replace("✅ Trip Verified |", "")
        comment = ".".join([title, content])
        rating = review.find_element_by_xpath(".//span[@itemprop='ratingValue']").text
        
        return [date, comment, rating]
        
        
    def scrape_current_page(self):
        self.get_reviews()
        for review in self.current_reviews:
            content = self.extract_content(review)
            self.data = self.data.append([content])
        
        
    def go_to_next_page(self):
            self.driver.find_element_by_partial_link_text(">>").click()
    
    
    def scrape(self, n_pages=np.inf):
        current_page_no = 1
        while current_page_no <= n_pages:
            print(f"Scraping page no {current_page_no}")
            self.scrape_current_page()
            try:
                self.go_to_next_page()
            except:
                print("Last page reached.")
                break
            current_page_no += 1
        self.data.columns = ["date", "comment", "rating"]
        self.data.reset_index(inplace=True, drop=True)
        
        return self.data

In [140]:
scraper = SkytraxScraper("https://www.airlinequality.com/airline-reviews/air-france/")

In [141]:
scraper.scrape(n_pages=3)

Scraping page no 1
Scraping page no 2
Last page reached.


Unnamed: 0,date,comment,rating
0,2013-07-09,Air France customer review.CAI-CDG May 19 2013...,6
0,2013-07-08,Air France customer review.Although we bought ...,3
0,2013-07-07,Air France customer review.Travelled in premiu...,3
0,2013-07-07,Air France customer review.Went from JFK to CD...,9
0,2013-07-05,Air France customer review.Fly from Birmingham...,7
0,2013-07-04,Air France customer review.Flew from Rio/GIG t...,8
0,2013-07-04,Air France customer review.San Francisco to Pa...,2
0,2013-07-03,Air France customer review.BCN-CDG return flig...,6
0,2013-06-25,Air France customer review.Guangzhou - Paris 1...,6
0,2013-06-23,Air France customer review.BKK - LHR with shor...,8


In [126]:
scraper.get_reviews()

In [127]:
scraper.scrape_current_page()

In [128]:
scraper.data

Unnamed: 0,0,1,2
0,2020-10-19,refused to let my mother board a flight. On 10...,1
0,2020-10-15,I was very impressed. I was very impressed wit...,10
0,2020-10-15,not allowed on board. When buying tickets Ista...,2
0,2020-10-11,never fly with Air France again. I purchased a...,1
0,2020-10-09,I will have to wait for 3 months. I had a very...,1
0,2020-10-04,Air France can do much better on the food. So ...,7
0,2020-10-04,service was extremely nice and polite. There w...,9
0,2020-09-09,They will check your bag weight. If your 2 pie...,1
0,2020-09-08,no excuse for this horrible service. In the le...,1
0,2020-08-24,inflight service was good. Marseilles to Athen...,9


In [142]:
review = scraper.current_reviews[0]
content = scraper.extract_content(review)

In [34]:
scraper.go_to_next_page()

In [129]:
np.inf

inf

In [130]:
1 < np.inf

True

In [144]:
type(review)

selenium.webdriver.remote.webelement.WebElement

In [170]:
data = pd.DataFrame([], columns=["date", "comment", "rating"]) 

In [162]:
scraper = SkytraxScraper("https://www.airlinequality.com/airline-reviews/air-france/")
reviews = scraper.scrape(n_pages=3)

Scraping page no 1
Scraping page no 2
Scraping page no 3


In [165]:
reviews

Unnamed: 0,date,comment,rating
0,2020-10-19,refused to let my mother board a flight. On 10...,1
1,2020-10-15,I was very impressed. I was very impressed wit...,10
2,2020-10-15,not allowed on board. When buying tickets Ista...,2
3,2020-10-11,never fly with Air France again. I purchased a...,1
4,2020-10-09,I will have to wait for 3 months. I had a very...,1
5,2020-10-04,Air France can do much better on the food. So ...,7
6,2020-10-04,service was extremely nice and polite. There w...,9
7,2020-09-09,They will check your bag weight. If your 2 pie...,1
8,2020-09-08,no excuse for this horrible service. In the le...,1
9,2020-08-24,inflight service was good. Marseilles to Athen...,9


In [171]:
data = data.append(reviews, ignore_index=True)

In [167]:
data.head()

Unnamed: 0,date,comment,rating,index
0,2020-10-19,refused to let my mother board a flight. On 10...,1,0.0
1,2020-10-15,I was very impressed. I was very impressed wit...,10,0.0
2,2020-10-15,not allowed on board. When buying tickets Ista...,2,0.0
3,2020-10-11,never fly with Air France again. I purchased a...,1,0.0
4,2020-10-09,I will have to wait for 3 months. I had a very...,1,0.0


In [174]:
data.shape

(30, 3)

In [175]:
scraper = SkytraxScraper("https://www.airlinequality.com/airline-reviews/delta-air-lines")
reviews = scraper.scrape(n_pages=3)

Scraping page no 1
Scraping page no 2
Scraping page no 3


In [176]:
    data = data.append(reviews, ignore_index=True)

In [177]:
data.head()

Unnamed: 0,date,comment,rating
0,2020-10-19,refused to let my mother board a flight. On 10...,1
1,2020-10-15,I was very impressed. I was very impressed wit...,10
2,2020-10-15,not allowed on board. When buying tickets Ista...,2
3,2020-10-11,never fly with Air France again. I purchased a...,1
4,2020-10-09,I will have to wait for 3 months. I had a very...,1


In [178]:
data.shape

(60, 3)

In [179]:
data.to_csv("../data/test.csv", index=False)