# Before you start!

You need a driver that lets Selenium control your web browser automatically. This project is built for Microsoft Windows and Google Chrome, so if you are using another operating system or another browser, you will need to adjust a few things (for example, the Windows-style `\\` path separators and the `chromedriver.exe` file name used in the code).

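You can download ChromeDriver here: https://sites.google.com/a/chromium.org/chromedriver/downloads. Place `chromedriver.exe` in the current working directory, which is where the scraping code looks for it.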

# When you are ready to start:

## Import packages:

In [217]:
from selenium import webdriver
from datetime import datetime, timedelta
from pathlib import Path
import os
import pandas as pd
from bs4 import BeautifulSoup as bs

## Declare functions:

In [218]:
def ConfirmDepartureTime(time_str):
    """Return the confirmed departure time taken from a status string.

    If ``time_str`` contains the marker ``"Farin"``, only its last five
    characters are returned (presumably an ``HH:MM`` time that follows
    the marker -- confirm against the page markup). Any other string is
    returned unchanged.
    """
    has_marker = "Farin" in time_str
    return time_str[-5:] if has_marker else time_str

## Initialize parameters & variables:

In [233]:
# Today's date as a "YYYY-MM-DD" string.
current_date = f"{datetime.today():%Y-%m-%d}"

# Output folders live under the current working directory. They are kept as
# plain strings (Windows-style separators) because other cells build file
# paths from them with "+" concatenation.
main_directory = f"{os.getcwd()}\\FlightData"
directory_actual = f"{main_directory}\\Actual"
directory_plan = f"{main_directory}\\Plan"

## Initialize directories:

In [220]:
# Create the output folders up front. exist_ok=True makes re-running this
# cell harmless when the folders already exist.
Path(main_directory).mkdir(parents=True, exist_ok=True)
# Fixed: the variable is directory_actual; "directory_actuals" was never
# defined and raised NameError.
Path(directory_actual).mkdir(parents=True, exist_ok=True)
Path(directory_plan).mkdir(parents=True, exist_ok=True)

### Choose scenario: Actual Data or Plan Data.
#### IMPORTANT : Set is_actual_scenario to False if you want to fetch Plan Data

#### Actual-scenario: 
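Fetches the flight history that is accessible from the Isavia webpage (the last 2 months or so), going back one day at a time until it reaches the date of the most recent previously saved Actual file or a day returns no data.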

#### Plan-scenario: 
Fetches the flight plan that is accessible from the Isavia webpage (the next 6 months or so), going forward one day at a time until a day returns no data.

In [240]:
# Set to False to fetch Plan data instead of Actual data.
is_actual_scenario = True

# Defaults shared by both scenarios so that later cells never reference an
# undefined name, whichever branch runs below.
cutoff_date = ""
date_str = current_date

if is_actual_scenario:
    # Actual data is collected by walking backwards in time.
    day_adder = -1
    # Fixed: use directory_actual ("directory_actuals" was never defined).
    target_filename = directory_actual + f"\\flights_departures_actual_{current_date}.csv"
    # Only consider CSV files so stray entries (e.g. notebook checkpoint
    # folders) cannot distort the cutoff date.
    actual_filenames = [
        filename for filename in os.listdir(directory_actual)
        if filename.endswith(".csv")
    ]
    if actual_filenames:
        # Saved files end in "<YYYY-MM-DD>.csv"; the slice picks out that
        # date, and ISO dates compare correctly as strings.
        cutoff_date = max(filename[-14:-4] for filename in actual_filenames)
else:
    # Plan data is collected by walking forwards in time.
    day_adder = 1
    target_filename = directory_plan + f"\\flights_departures_plan_{current_date}.csv"
    plan_filenames = [
        filename for filename in os.listdir(directory_plan)
        if filename.endswith(".csv")
    ]

    if plan_filenames:
        # The lexicographically greatest filename carries the latest date.
        latest_plan_filename = max(plan_filenames)
        date_str = pd.read_csv(directory_plan + "\\" + latest_plan_filename)["Date"].max()
    


### Find cutoff-date:

In [222]:
# l = ["2021-05-11", "2020-09-01"]
# directory_actuals = os.listdir("FlightData\\actuals")
# directory_plan = os.listdir("FlightData\\plan")
# # cutoff_date = max([filename[-14: -4] for filename in directory_actuals]) if len(directory_actuals) > 0 else "0"
# # cutoff_date = min(l) if len(l) > 0 else "0"
# latest_plan_filename = max([filename for filename in directory_plan]) if len(directory_plan) > 0 else current_date
# date_str = pd.read_csv("FlightData\\plan\\" + latest_plan_filename)["Date"].max()
# test_df




## Say the magic words:

In [241]:
if is_actual_scenario:
    # Just in case...
    date_str = current_date

days_added = 0
Flight_dataset = []
is_fetching_data = True

# A single browser session is reused for every requested day instead of
# launching Chrome once per iteration. quit() in the finally-clause shuts
# down both the window and the chromedriver process even if scraping fails
# part-way (close() only closes the window).
browser = webdriver.Chrome(os.getcwd() + "\\chromedriver.exe")
try:
    while is_fetching_data:

        # Step one day backwards (Actual) or forwards (Plan) from today.
        # NOTE(review): this overwrites any date_str chosen earlier, so the
        # walk always starts next to today's date -- confirm this is intended
        # for the Plan scenario.
        days_added += day_adder
        date_str = (datetime.now() + timedelta(days=days_added)).strftime('%Y-%m-%d')

        browser.get(f"https://www.isavia.is/keflavikurflugvollur/flugupplysingar/brottfarir?d={date_str}")
        html = browser.page_source

        soup = bs(html, "html.parser")
        dataset = soup.find_all("tr", {"class": "schedule-items-entry"})

        print(f"Iteration: {abs(days_added)}, Date: {date_str}, Dataset-length: {len(dataset)}")

        for line in dataset:

            tmp_data = line.find_all("td")

            # Skip rows that do not have the five expected cells rather than
            # crashing with IndexError on an unexpected table row.
            if len(tmp_data) < 5:
                continue

            Flight_dataset.append({
                "CreationDate": current_date,
                "Date": date_str,
                "Type": "Departure",
                "Scheduled": tmp_data[0].text,
                "Destination": tmp_data[1].text,
                "FlightNumber": tmp_data[2].text,
                "Airline": tmp_data[3].text,
                "Confirmed": ConfirmDepartureTime(tmp_data[4].text),
            })

        if is_actual_scenario:
            # Stop at the newest date already saved, or when the site has
            # no more history to offer.
            is_fetching_data = (date_str > cutoff_date) and (len(dataset) > 0)
        else:
            # Stop at the first day for which the site returns no rows.
            is_fetching_data = len(dataset) > 0
finally:
    browser.quit()

print("Data sampling completed!")

df = pd.DataFrame(Flight_dataset)
# Fixed: the frame is named df ("df_actual" was never defined).
df = df.assign(Scenario="Actual" if is_actual_scenario else "Plan")
df.to_csv(target_filename, index=False)

print("Data saved in a file!")

Iteration: 1, Date: 2021-09-06, Dataset-length: 54
Iteration: 2, Date: 2021-09-05, Dataset-length: 52
Data sampling completed!
Data saved in a file!
