# Before you start!

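You need a driver so that Selenium can control your web browser automatically. This project was built for Microsoft Windows and Google Chrome, and the code expects `chromedriver.exe` to be in the directory the notebook is run from. If you are using another operating system or another browser, you will probably need to adjust a couple of things (for example, the driver file name).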

You can download ChromeDriver here: https://sites.google.com/a/chromium.org/chromedriver/downloads

# When you are ready to start:

## Import packages:

In [2]:
from selenium import webdriver
from datetime import datetime, timedelta
from pathlib import Path
import os
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup as bs

## Declare functions:

In [3]:
def ConfirmDepartureTime(time_str):
    """Reduce a departure-status cell to its time portion when applicable.

    If the text contains the marker "Farin", only its last five characters
    are returned (presumably the trailing HH:MM time -- confirm against the
    scraped page). Any other text is returned unchanged.
    """
    return time_str[-5:] if "Farin" in time_str else time_str

def IsFilenameAlreadyExisting(filename):
    """Return True when `filename` is present in either data directory.

    Both `directory_actual` and `directory_plan` (module-level paths) are
    listed, and the bare file name is looked up in the combined listing.
    """
    known_files = os.listdir(directory_actual) + os.listdir(directory_plan)
    return filename in known_files

    
def CalculatePunctuality(schedule, confirmed):
    """Return `confirmed - schedule` in minutes, element by element.

    Both arguments are pandas Series of strings. The first two characters
    are read as the hour and the last two as the minutes. A positive result
    means the confirmed time is later than the scheduled one.
    """
    def _minutes_since_midnight(times):
        hours = times.str[:2].astype('int32')
        minutes = times.str[-2:].astype('int32')
        return hours * 60 + minutes

    scheduled_total = _minutes_since_midnight(schedule)
    confirmed_total = _minutes_since_midnight(confirmed)
    return confirmed_total - scheduled_total

def IsMorning(scheduled_time):
    """Flag, element by element, whether a scheduled time is before noon.

    Args:
        scheduled_time: pandas Series of time strings whose first two
            characters are the zero-padded hour (e.g. "08:30").

    Returns:
        A boolean pandas Series, True where the two-character hour prefix
        sorts before "12".
    """
    hour_prefix = scheduled_time.str[:2]
    # Return the elementwise comparison directly: forcing a Series through
    # `True if ... else False` raises "truth value of a Series is ambiguous".
    return hour_prefix.astype(str) < "12"


## Initialize parameters & variables:

In [5]:
# current_date = datetime.today().strftime('%Y-%m-%d')
# main_directory = os.getcwd() + "\\FlightData"
# directory_actual = main_directory + "\\Actual"
# directory_plan = main_directory + "\\Plan"

## Initialize directories:

In [4]:
# Data folders, expressed relative to the current working directory.
directory_actual = os.path.join('FlightData', 'Actual')
directory_plan = os.path.join('FlightData', 'Plan')

# Make sure the whole folder tree exists; existing folders are left as-is.
for folder in ('FlightData', directory_actual, directory_plan):
    Path(folder).mkdir(parents=True, exist_ok=True)

### Choose scenario: Actual Data or Plan Data.
**IMPORTANT:** Set **is_actual_scenario** to **False** if you want to fetch Plan Data.

#### Actual-scenario: 
Will fetch the whole flight history that is accessible from the Isavia webpage (the last 2 months or so).

#### Plan-scenario: 
Will fetch the flight plan that is accessible from the Isavia webpage (the next 6 months or so). Note that the fetch loop below stops after at most 90 days.

In [5]:
# Today's date; it stamps the output file name and the CreationDate column.
current_date = datetime.today().strftime('%Y-%m-%d')

# True  -> fetch flight history, walking backwards one day at a time.
# False -> fetch the flight plan, walking forwards one day at a time.
is_actual_scenario = True

if is_actual_scenario:
    day_adder = -1
    target_filename = f"flights_departures_actual_{current_date}.csv"
    target_filename_path = os.path.join(directory_actual, target_filename)
    # Only consider files that follow our own naming pattern. Any stray entry
    # that happened to sort last would otherwise be picked by max() and sliced
    # into a meaningless cutoff date.
    actual_filenames = [
        name for name in os.listdir(directory_actual)
        if name.startswith("flights_departures_actual_")
        and name.endswith(".csv")
        and len(name) == len(target_filename)
    ]
    # ISO dates sort lexicographically, so max() is the newest file; the
    # slice extracts the YYYY-MM-DD part that sits just before ".csv".
    cutoff_date = max(actual_filenames)[-14:-4] if actual_filenames else ""
    date_str = current_date

else:
    day_adder = 1
    target_filename = f"flights_departures_plan_{current_date}.csv"
    target_filename_path = os.path.join(directory_plan, target_filename)
    plan_filenames = os.listdir(directory_plan)

    # Disabled alternative: resume from the newest date in the latest plan file.
#     if len(plan_filenames) > 0:
#         latest_plan_filename = max(plan_filenames)
#         date_str = pd.read_csv(os.path.join(directory_plan, latest_plan_filename))["Date"].max()
#     else:
    date_str = current_date
    # An empty cutoff compares lower than any date string.
    cutoff_date = ""


## Say the magic words:

In [6]:
days_added = 0
Flight_dataset = []
# Dates are ISO strings, so plain string comparison orders them correctly.
# With an empty cutoff_date this is always True.
is_fetching_data = cutoff_date < date_str
iteration_counter = 0

# Skip everything when today's file has already been written.
if not IsFilenameAlreadyExisting(target_filename):

    while is_fetching_data:

        iteration_counter += 1
        # Step one day backwards (actual) or forwards (plan) before fetching,
        # so the current date itself is never requested.
        date_str = (datetime.strptime(date_str, "%Y-%m-%d") + timedelta(days=day_adder)).strftime('%Y-%m-%d')

        browser = webdriver.Chrome(os.path.join(os.getcwd(), "chromedriver.exe"))
        try:
            browser.get(f"https://www.isavia.is/keflavikurflugvollur/flugupplysingar/brottfarir?d={date_str}")
            html = browser.page_source
        finally:
            # quit() ends the whole driver session, and the finally clause
            # guarantees the browser is released even if the page load fails.
            browser.quit()

        soup = bs(html, "html.parser")
        dataset = soup.find_all("tr", {"class": "schedule-items-entry"})

        print(f"Iteration: {iteration_counter}, Date: {date_str}, Dataset-length: {len(dataset)}")

        for line in dataset:

            # Cells are read by position: scheduled time, destination,
            # flight number, airline, status/confirmed time.
            tmp_data = line.find_all("td")

            data_dict = {
                "CreationDate": current_date,
                "Date": date_str,
                "Type": "Departure",
                "Scheduled": tmp_data[0].text,
                "Destination": tmp_data[1].text,
                "FlightNumber": tmp_data[2].text,
                "Airline": tmp_data[3].text,
                "Confirmed": ConfirmDepartureTime(tmp_data[4].text),
            }

            Flight_dataset.append(data_dict)

        if is_actual_scenario:
            # Stop once the cutoff date has been fetched or a day has no rows.
            is_fetching_data = (date_str > cutoff_date) and (len(dataset) > 0)
        else:
            # Stop at the first day without rows, or after 90 iterations.
            is_fetching_data = len(dataset) > 0 and iteration_counter < 90

    print("Data sampling completed!")

    if Flight_dataset:
        df = pd.DataFrame(Flight_dataset)
        df = df.assign(Scenario="Actual" if is_actual_scenario else "Plan")
        df.to_csv(target_filename_path, index=False)

        print("Data saved in a file!")

    else:
        print("Our database was already up-to-date.")

else:
    print(f"File already existing: {target_filename}")

Iteration: 1, Date: 2021-10-23, Dataset-length: 43
Iteration: 2, Date: 2021-10-22, Dataset-length: 39
Iteration: 3, Date: 2021-10-21, Dataset-length: 43
Iteration: 4, Date: 2021-10-20, Dataset-length: 22
Iteration: 5, Date: 2021-10-19, Dataset-length: 27
Iteration: 6, Date: 2021-10-18, Dataset-length: 43
Iteration: 7, Date: 2021-10-17, Dataset-length: 41
Iteration: 8, Date: 2021-10-16, Dataset-length: 41
Iteration: 9, Date: 2021-10-15, Dataset-length: 39
Iteration: 10, Date: 2021-10-14, Dataset-length: 46
Iteration: 11, Date: 2021-10-13, Dataset-length: 25
Iteration: 12, Date: 2021-10-12, Dataset-length: 30
Iteration: 13, Date: 2021-10-11, Dataset-length: 44
Data sampling completed!
Data saved in a file!
