In [46]:
import time
import numpy as np
import pandas as pd
import re

from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By

**Crawl từ 1 url api**

Cấu trúc URL API của web như sau: 

method: GET

url: https://www.traveloka.com/en-vn/flight/fullsearch?ap={}&dt={}&ps={}&sc={}

params: 

    ap: A.B trong đó A là địa điểm xuất phát, B là điểm đến. VD: DAD.SINA (Danang Airport to Singapore Airport)
    dt: S.E trong đó S là ngày xuất phát, E là ngày đến. VD: 11-5-2023.NA (khởi hành 11/5/2023, ko quan tâm ngày đến nên đặt NA)
    ps: x.y.z trong đó x là số người lớn, y là số trẻ em (4-11 tuổi), z là trẻ nhỏ (dưới 4 tuổi). VD: 1.0.0 (1 người lớn, 0 trẻ em, 0 trẻ nhỏ)
        ---crawl vé đơn nên mỗi lần set giá trị là 1
    sc: hạng chuyến bay, enum {
        "ECONOMY": hạng giá tiết kiệm,
        "PREMIUM_ECONOMY": hạng giá tiết kiệm nhưng "xịn" hơn,
        "BUSINESS": hạng thương gia,
        "FIRST": hạng nhất
    }

In [47]:
def is_ready(browser):
    return browser.execute_script(r"""
        return document.readyState === 'complete'
    """)

In [48]:
destList = [
    "HAN",  # Ha Noi
    "SGN",  # HCM
    "DAD",  # Da Nang
    "VDO",  # Van Don, Quang Ninh
    "HPH",  # Hai Phong
    "VII",  # Vinh, Nghe An
    "HUI",  # Hue
    "CXR",  # Cam Ranh, Khanh Hoa
    "DLI",  # Lam Dong, Da Lat
    "UIH",  # Binh Dinh
    "VCA",  # Can Tho
    "PQC",  # Phu Quoc
    "THD",  # Tho Xuan, Thanh Hoa
    "VDH",  # Dong Hoi, Quang Binh
    "VCL",  # Quang Nam
    "TBB",  # Tuy Hoa, Phu Yen
    "PXU",  # Pleiku, Gia Lai
    "BMV",  # Buon Ma Thuot, Dak Lak
    "VKG",  # Rach Gia, Kien Giang
    "CAH",  # Ca Mau
    "VCS",  # Con Dao, Ba Ria - Vung Tau
]

SClass = ["ECONOMY", "PREMIUM_ECONOMY", "BUSINESS"]

In [49]:
def crawl(sample_size, date):
    depart = "HAN"
    dest = "SGN"
    sclass = "ECONOMY"
    
    options = webdriver.ChromeOptions()
    options.add_argument("headless")
    browser = webdriver.Chrome(options=options)
    browser.get("https://www.traveloka.com/en-vn/flight/fullsearch?ap={}.{}&dt={}.NA&ps=1.0.0&sc={}".format(depart, dest, date, sclass))
    
    WebDriverWait(browser, 2000).until(is_ready)
    time.sleep(3) #web làm bằng react, có fetch data từ server nên phải đợi fetch xong mới load đc

    for _ in range(5):
        browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(1)
        
    elements = browser.find_elements(By.CLASS_NAME, "r-135wba7")
    data = []
    for elem in elements[15:-5]:
        if elem.text != "Don't miss out on our member benefits!" and elem.text != "" and elem.text != "No flights available":
            data.append(elem.text)
    data = np.array(data)
    data = data.reshape((-1, 3))
    data = np.array(data)
    print(data)
    
    elements = browser.find_elements(By.CLASS_NAME, "r-rjixqe")
    duration = []
    n_stop = []
    pattern = r'^(\d+h\s)?(\d+m)$'
    for elem in elements[50:]:

        match = re.match(pattern, elem.text)
        if match:
            hours = match.group(1)
            minutes = match.group(2)
    
            if hours:
                hours = int(hours[:-2])
            else:
                hours = 0

            minutes = int(minutes[:-1])
            total_minutes = hours * 60 + minutes
            duration.append(total_minutes)

        if elem.text == "Direct" or elem.text.endswith("stop"):
            strs = elem.text.split(" ")
            if len(strs) == 2:
                n_stop.append(strs[0])
            else:
                n_stop.append("0")
    n_stop = np.array(n_stop).astype(np.int32)
    
    elements = browser.find_elements(By.CLASS_NAME, "r-adyw6z")
    price = []
    for elem in elements:
        if elem.text.endswith("VND"):
            text = elem.text[:-4]
            price.append("".join(text.split(".")))
    price = np.array(price).astype(np.int32)

    arr_depart = [depart for _ in range(len(price))]
    arr_dest = [dest for _ in range(len(price))]
    arr_date = [date for _ in range(len(price))]
    arr_sclass = [sclass for _ in range(len(price))]
    
    df = pd.DataFrame(data=data, columns=["Airline", "Departure_Time", "Arrival_Time"])
    df["Duration"] = duration
    df["Departure"] = arr_depart
    df["Destination"] = arr_dest
    df["Date"] = arr_date
    df["Total_Stops"] = n_stop
    df["SClass"] = arr_sclass
    df["Price"] = price
    
    if (not df.empty) & (df.size > sample_size):
        data = df.loc[(df['Airline'].str.contains(",")) | (df['Arrival_Time'].str.contains("d"))].index
        df.drop(data, inplace=True)
        return df[:sample_size]
    else:
        return "Not enough data"
        

In [50]:
crawl(sample_size=10, date='19-5-2023')

[['Vietravel Airlines' '20:55' '23:10']
 ['Vietravel Airlines' '20:55' '23:10']
 ['VietJet Air' '16:30' '18:40']
 ['VietJet Air' '17:55' '20:05']
 ['VietJet Air' '15:50' '18:00']
 ['VietJet Air' '21:00' '23:10']
 ['VietJet Air' '23:00' '01:10 1d']
 ['VietJet Air' '19:40' '21:50']
 ['VietJet Air' '21:35' '23:45']
 ['VietJet Air' '12:30' '14:40']
 ['VietJet Air' '13:20' '15:30']
 ['VietJet Air' '13:45' '15:55']
 ['VietJet Air' '08:10' '10:20']
 ['Bamboo Airways' '10:45' '12:55']
 ['Bamboo Airways' '21:20' '23:30']
 ['Vietnam Airlines' '06:20' '08:35']
 ['Vietnam Airlines' '10:40' '12:55']
 ['Vietnam Airlines' '13:35' '08:50 1d']
 ['VietJet Air, Bamboo Airways' '11:40' '22:50']
 ['VietJet Air, Bamboo Airways' '05:55' '22:50']
 ['VietJet Air, Vietnam Airlines' '11:40' '22:55']
 ['VietJet Air, Vietnam Airlines' '05:55' '22:55']
 ['VietJet Air' '07:00' '09:10']
 ['VietJet Air' '08:50' '11:00']
 ['VietJet Air' '09:40' '11:50']
 ['Bamboo Airways, VietJet Air' '07:15' '09:55 1d']
 ['Bamboo Airw

Unnamed: 0,Airline,Departure_Time,Arrival_Time,Duration,Departure,Destination,Date,Total_Stops,SClass,Price
0,Vietravel Airlines,20:55,23:10,135,HAN,SGN,19-5-2023,0,ECONOMY,1419800
1,Vietravel Airlines,20:55,23:10,135,HAN,SGN,19-5-2023,0,ECONOMY,1419800
2,VietJet Air,16:30,18:40,130,HAN,SGN,19-5-2023,0,ECONOMY,1441900
3,VietJet Air,17:55,20:05,130,HAN,SGN,19-5-2023,0,ECONOMY,1441900
4,VietJet Air,15:50,18:00,130,HAN,SGN,19-5-2023,0,ECONOMY,1545300
5,VietJet Air,21:00,23:10,130,HAN,SGN,19-5-2023,0,ECONOMY,1545300
7,VietJet Air,19:40,21:50,130,HAN,SGN,19-5-2023,0,ECONOMY,1688000
8,VietJet Air,21:35,23:45,130,HAN,SGN,19-5-2023,0,ECONOMY,1688000
9,VietJet Air,12:30,14:40,130,HAN,SGN,19-5-2023,0,ECONOMY,1873900
10,VietJet Air,13:20,15:30,130,HAN,SGN,19-5-2023,0,ECONOMY,1873900
