In [1]:
import os
import logging
import datetime
import pandas as pd

from tqdm import *
from pyflightdata import FlightData
from pyflightdata.common_fr24 import *

In [2]:
START_DATE = "20210101"
END_DATE = "20210430"
use_country = True
countries = ["Canada"]

ERROR_LOG_FILE_NAME = 'logs/error.log'
DEBUG_LOG_FILE_NAME = 'logs/debug.log'
INFO_LOG_FILE_NAME = 'logs/info.log'

COUNTRIES_FILE_NAME = "datasets/countries.txt"
GRAPH_ATTRIBUTES_FILE_NAME = "datasets/graph_attributes.txt "
EDGE_LIST_FILE_PREFIX = "dataset/"

ADDITIONAL_AIRPORTS_FILE_NAME = "datasets/additional_airports.txt"

AIRPORTS_FILE_PREFIX = "datasets/airports/node_attributes"
UNIQUE_FLIGHTS_FILE_PREFIX = "datasets/unique_flights/unique_flights"
UNIQUE_FLIGHTS_LOG_FLIE_NAME = UNIQUE_FLIGHTS_FILE_PREFIX + ".log"
HISTORIC_FLIGHTS_FILE_PREFIX = "datasets/historic_flights/historic_flights"

AIRPORT_ATTRIBUTE_NAMES = ["iata", "latitude", "longitude", "country", "city"]
FLIGHTS_ATTRIBUTE_NAMES = ["flight_number", "status", "departure_airport", "arrival_airport", "departure_date", "duration"]

In [3]:
class InfoFilter(logging.Filter):
    def filter(self, rec):
        return rec.levelno == logging.INFO

def get_logger(info_log_file, error_log_file_name):
    log = logging.getLogger('default_logger')
    # set console to info
    console_handler = logging.StreamHandler()
    console_handler.addFilter(InfoFilter())
    log.addHandler(console_handler)
    # set info log to info
    file_handler_info = logging.FileHandler(info_log_file, mode='w')
    file_handler_info.addFilter(InfoFilter())
    log.addHandler(file_handler_info)
    # set error log to warn
    file_handler_error = logging.FileHandler(error_log_file_name, mode='w')
    file_handler_error.setLevel(logging.WARNING)
    log.addHandler(file_handler_error)
    log.setLevel(logging.DEBUG)
    return log

In [4]:
class CustomFR24(FR24):

    def __init__(self):
        super(CustomFR24, self).__init__()

    def get_airport_departures_total_page(self, url):
        while True:
            try:
                return self.get_raw_data_json(url, 'result.response.airport.pluginData.schedule.departures.page.total')[0]
            except:
                pass

    def get_airport_arrivals_total_page(self, url):
        while True:
            try:
                return self.get_raw_data_json(url, 'result.response.airport.pluginData.schedule.arrivals.page.total')[0]
            except:
                pass

    def get_data(self, url, by_tail=False):
        response = self.get_raw_data_json(url, 'result.response')
        response = self.filter_and_get_data(response) or []
        return response

In [5]:
class CustomFlightData(FlightData):

    _fr24 = CustomFR24()
    _FLT_BASE = 'https://api.flightradar24.com/common/v1/flight/list.json?query={0}&fetchBy=flight&filterBy=historic&page={2}&limit={3}&token={1}&timestamp={4}'

    def __init__(self, email=None, password=None):
        super(CustomFlightData, self).__init__(email, password)

    def get_airport_details(self, iata, page=1, limit=100):
        url = AIRPORT_DATA_BASE.format(iata, str(self.AUTH_TOKEN), page, limit)
        details = self._fr24.get_airport_details(url)
        return details

    def get_airport_departures_total_page(self, iata, page=1, limit=100):
        date = datetime.datetime.now()
        url = AIRPORT_DATA_BASE_EARLIER.format(iata, str(self.AUTH_TOKEN), page, limit, int(date.timestamp()), 0)
        return self._fr24.get_airport_departures_total_page(url)

    def get_airport_arrivals_total_page(self, iata, page=1, limit=100):
        date = datetime.datetime.now()
        url = AIRPORT_DATA_BASE_EARLIER.format(iata, str(self.AUTH_TOKEN), page, limit, int(date.timestamp()), 0)
        return self._fr24.get_airport_arrivals_total_page(url)

    def get_airport_departures(self, iata, page=1, limit=100, earlier_data=False):
        date = datetime.datetime.now()
        url = AIRPORT_DATA_BASE_EARLIER.format(iata, str(self.AUTH_TOKEN), page, limit, int(date.timestamp()), 0)
        return self._fr24.get_airport_departures(url)

    def get_airport_arrivals(self, iata, page=1, limit=100, earlier_data=False):
        date = datetime.datetime.now()
        url = AIRPORT_DATA_BASE_EARLIER.format(iata, str(self.AUTH_TOKEN), page, limit, int(date.timestamp()), 0)
        return self._fr24.get_airport_arrivals(url)

    def get_historical_flight_number(self, flight_number, last_updated, page=1, limit=100):
        url = self._FLT_BASE.format(flight_number, str(self.AUTH_TOKEN), page, limit, last_updated)
        return self._fr24.get_data(url)

In [6]:
def write_countries():
    logging.basicConfig(filename=ERROR_LOG_FILE_NAME, level=logging.WARNING)
    f = CustomFlightData()
    countries = [country['country'] for country in tqdm(f.get_countries())]
    df = pd.DataFrame(countries)
    df.to_csv(COUNTRIES_FILE_NAME, header=False, index=False)

if (not os.path.exists(COUNTRIES_FILE_NAME[:COUNTRIES_FILE_NAME.rfind("/")])):
    os.makedirs(COUNTRIES_FILE_NAME[:COUNTRIES_FILE_NAME.rfind("/")])
    os.makedirs(ERROR_LOG_FILE_NAME[:ERROR_LOG_FILE_NAME.rfind("/")])
    write_countries()

In [7]:
def to_airport(airport_details):
    try:
        airport_dict = {
            'iata': airport_details['code']['iata'],
            'latitude': airport_details['position']['latitude'],
            'longitude': airport_details['position']['longitude'],
            'country': airport_details['position']['country']['name'],
            'city': airport_details['position']['region']['city']
        }
        return airport_dict
    except (KeyError, TypeError):
        logging.error("Error parsing airport %s", airport_details)
        return None


def to_flight_number(departure):
    try:
        flight_number = departure['flight']['identification']['number']['default']
        return flight_number
    except (KeyError, TypeError):
        logging.error("Error parsing flight_number %s", departure)
        return None


def to_has_more(history_by_flight_number):
    try:
        has_more = history_by_flight_number['page']['more']
        return has_more
    except (KeyError, TypeError):
        logging.error("Error parsing has_more %s", history_by_flight_number)
        return False


def get_departure_date(flight):
    if flight['time']['real']["departure"] != 'None':
        date = flight['time']['real']['departure_date']
    else:
        date = flight['time']['scheduled']['departure_date']
    datetime.datetime.strptime(date, '%Y%m%d')
    return date


def to_flight(flight):
    try:
        flight_dict = {
            'flight_number': flight['identification']['number']['default'],
            'status': flight['status']['generic']['status']['text'],
            'departure_airport': flight['airport']['origin']['code']['iata'],
            'arrival_airport': flight['airport']['destination']['code']['iata'],
            'departure_date': get_departure_date(flight),
            'time_updated': flight['time']['other']['updated'],
            "duration": flight['time']['other']['duration']
        }
        return flight_dict
    except (KeyError, TypeError, ValueError):
        logging.error("Error parsing flight %s", flight)
        return None


def get_airport_by_iata(f, iata):
    airport_details = f.get_airport_details(iata)
    return to_airport(airport_details)


def fetch_airports_by_country(f, airports_file_prefix, country):
    filename = "%s-%s.txt" % (airports_file_prefix, country)
    if not os.path.exists(filename):
        airports = []
        for airport in f.get_airports(country):
            if airport['iata']:
                airport_dict = get_airport_by_iata(f, airport['iata'])
                if airport_dict is not None:
                    airports.append(airport_dict)
            else:
                logging.warning("IATA not found for airport: %s", airport)
        df = pd.DataFrame(airports)
        df.to_csv(filename, header=False)
    else:
        logging.warning("airports file %s already exists: " % filename)


def fetch_airports(countries,
                   airports_file_prefix=AIRPORTS_FILE_PREFIX,
                   error_log_file_name=ERROR_LOG_FILE_NAME):
    logging.basicConfig(filename=error_log_file_name, level=logging.WARNING)
    f = CustomFlightData()
    #f.login("username", "password")
    
    if (not os.path.exists(airports_file_prefix[:airports_file_prefix.rfind("/")])):
        os.makedirs(airports_file_prefix[:airports_file_prefix.rfind("/")])
        if not countries:
            countries = [country['country'] for country in f.get_countries()]
        for country in tqdm(countries):
            fetch_airports_by_country(f, airports_file_prefix, country)
    f.logout()


fetch_airports([], AIRPORTS_FILE_PREFIX, ERROR_LOG_FILE_NAME)

In [8]:
def get_unique_flights_logger(unique_flights_log_file_name):
    unique_flights_logger = logging.getLogger('unique_flights_logger')
    handler = logging.FileHandler(unique_flights_log_file_name, mode='a')
    handler.addFilter(InfoFilter())
    unique_flights_logger.addHandler(handler)
    unique_flights_logger.setLevel(logging.INFO)
    return unique_flights_logger


def fetch_unique_flights(countries,
                         unique_flights_file_prefix=UNIQUE_FLIGHTS_FILE_PREFIX,
                         airports_file_prefix=AIRPORTS_FILE_PREFIX):
    f = CustomFlightData()
    #f.login("username", "password")
    if not countries:
        default_logger.info("fetching countries")
        countries = [country['country'] for country in f.get_countries()]
    for country in countries:
        fetch_unique_flights_by_country(f, airports_file_prefix, unique_flights_file_prefix, country)
    f.logout()


def fetch_unique_flights_by_country(f, airports_file_prefix, unique_flights_file_prefix, country):
    default_logger.info("fetching the unique flights of: %s" % country)
    airports_file_name = "%s-%s.txt" % (airports_file_prefix, country)
    airports_file_name = airports_file_name
    unique_flights_filename = "%s-%s.txt" % (unique_flights_file_prefix, country)
    if os.path.exists(airports_file_name):
        default_logger.info("loading airports")
        airports = pd.read_csv(airports_file_name, names=AIRPORT_ATTRIBUTE_NAMES)
        default_logger.info("total airports: %d" % len(airports))
        new_flights = get_new_flights(f, airports)
        existing_flights_df = get_existing_flights(unique_flights_filename)
        flights_added, total_flights = append_new_flights(existing_flights_df, new_flights, unique_flights_filename)
        unique_flights_logger.info("%s, %s, %d, %d" % (pd.to_datetime('today'), country, flights_added, total_flights))
    else:
        default_logger.warning("airports file %s does not exist: " % airports_file_name)


def get_new_flights(f, airports):
    num_airports = len(airports)
    new_flights = set()
    for airport_idx, airport in enumerate(tqdm(airports['iata'])):
        fetch_unique_flights_by_airport(f, airport, new_flights)
    return new_flights


def fetch_unique_flights_by_airport(f, airport, new_flights):
    total_page = f.get_airport_departures_total_page(airport)
    for page in range(1, total_page + 1):
        for flight in f.get_airport_departures(airport, page=page, earlier_data=True):
            append_flight(flight, new_flights)

    total_page = f.get_airport_arrivals_total_page(airport)
    for page in range(1, total_page + 1):
        for flight in f.get_airport_arrivals(airport, page=page, earlier_data=True):
            append_flight(flight, new_flights)


def append_flight(flight, new_flights):
    flight_number = to_flight_number(flight)
    if flight_number is not None:
        new_flights.add(flight_number)


def get_existing_flights(unique_flights_filename):
    existing_flights_df = pd.DataFrame(columns=['flight', 'added'])
    if os.path.exists(unique_flights_filename):
        existing_flights_df = pd.read_csv(unique_flights_filename, names=['flight', 'added'])
        existing_flights_df.added = existing_flights_df.added.fillna(pd.to_datetime('today'))
    return existing_flights_df


def append_new_flights(existing_flights_df, new_flights, unique_flights_filename):
    default_logger.info("total new flights: %d" % len(new_flights))
    existing_flights = set(existing_flights_df['flight'].tolist())
    default_logger.info("total existing flights: %d" % len(existing_flights))
    flights_to_add = new_flights.difference(existing_flights)
    default_logger.info("total unique new flights: %d" % len(flights_to_add))
    flights_to_add_df = pd.DataFrame(flights_to_add, columns=['flight'])
    flights_to_add_df['added'] = pd.to_datetime('today')
    unique_flights_df = existing_flights_df.append(flights_to_add_df, ignore_index=True)
    unique_flights_df.to_csv(unique_flights_filename, header=False, index=False)
    return len(flights_to_add), len(unique_flights_df.index)

if (not os.path.exists(UNIQUE_FLIGHTS_LOG_FLIE_NAME[:UNIQUE_FLIGHTS_LOG_FLIE_NAME.rfind("/")])):
    os.makedirs(UNIQUE_FLIGHTS_LOG_FLIE_NAME[:UNIQUE_FLIGHTS_LOG_FLIE_NAME.rfind("/")])
    
    default_logger = get_logger(INFO_LOG_FILE_NAME, ERROR_LOG_FILE_NAME)
    unique_flights_logger = get_unique_flights_logger(UNIQUE_FLIGHTS_LOG_FLIE_NAME)
    fetch_unique_flights(countries, UNIQUE_FLIGHTS_FILE_PREFIX, AIRPORTS_FILE_PREFIX)

fetching the unique flights of: Canada
loading airports
total airports: 246
  4%|▎         | 9/246 [00:23<07:52,  2.00s/it]

HTML code 429 - Retry in 10 seconds...
HTML code 429 - Retry in 10 seconds...
HTML code 429 - Retry in 10 seconds...
HTML code 429 - Retry in 10 seconds...
HTML code 429 - Retry in 10 seconds...
HTML code 429 - Retry in 10 seconds...


 14%|█▍        | 35/246 [02:35<05:21,  1.53s/it]  

HTML code 429 - Retry in 10 seconds...
HTML code 429 - Retry in 10 seconds...
HTML code 429 - Retry in 10 seconds...
HTML code 429 - Retry in 10 seconds...
HTML code 429 - Retry in 10 seconds...
HTML code 429 - Retry in 10 seconds...


 30%|██▉       | 73/246 [05:02<05:39,  1.96s/it]  

HTML code 520 - Retry in 10 seconds...


 39%|███▉      | 97/246 [06:29<03:35,  1.45s/it]

HTML code 429 - Retry in 10 seconds...
HTML code 429 - Retry in 10 seconds...
HTML code 429 - Retry in 10 seconds...
HTML code 429 - Retry in 10 seconds...
HTML code 429 - Retry in 10 seconds...
HTML code 429 - Retry in 10 seconds...


100%|██████████| 246/246 [15:18<00:00,  3.73s/it]
total new flights: 1466
total existing flights: 0
total unique new flights: 1466


In [9]:
def update_count_by_flight_status(flight, count_by_flight_status):
    status = flight['status']
    if status not in count_by_flight_status.keys():
        count_by_flight_status[status] = 0
    count_by_flight_status[status] += 1


def append_historic_flights(flight, historic_flights):
    if flight is not None:
        historic_flights.append(flight)


def is_departure_before_date(date, departure_date):
    date = datetime.datetime.strptime(date, '%Y%m%d')
    departure_date = datetime.datetime.strptime(departure_date, '%Y%m%d')
    return departure_date <= date


def is_departure_after_date(date, departure_date):
    date = datetime.datetime.strptime(date, '%Y%m%d')
    departure_date = datetime.datetime.strptime(departure_date, '%Y%m%d')
    return departure_date > date


def fetch_flight_history_by_flight_number(f, flight_number, date_start, count_by_flight_status):
    historic_flights = []
    has_more = True
    page = 1
    time_updated = int(pd.to_datetime('today').timestamp())
    while has_more:
        response = f.get_historical_flight_number(flight_number, time_updated, page=page)
        if 'data' not in response:
            default_logger.warning("data not in response for flight number %s, page %d" % (flight_number, page))
            break
        has_more = to_has_more(response)
        page += 1
        for flight_response in response['data']:
            flight = to_flight(flight_response)
            if flight is not None:
                time_updated = flight['time_updated']
                departure_date = flight['departure_date']
                if is_departure_before_date(date_end, departure_date) and is_departure_after_date(date_start,
                                                                                                  departure_date):
                    historic_flights.append(flight)
                    update_count_by_flight_status(flight, count_by_flight_status)
                if is_departure_before_date(date_start, departure_date):
                    # default_logger.info("\t departure date: %s is before date start" % departure_date)
                    has_more = False
                    break
        if page > 10:
            break
    # default_logger.info("\t current count by flight status: %s" % count_by_flight_status)
    return historic_flights


def log_summary(flights_summary_logger, flight_number, new_flights):
    if not new_flights:
        return
    new_flights_df = pd.DataFrame(new_flights)
    groups = new_flights_df.groupby(['departure_airport', 'arrival_airport'])
    for keys, frame in groups:
        departure_dates = frame['departure_date'].to_list()
        flights_summary_logger.info("%s,%s,%s,%s,%d,%s,%s" % (
            pd.to_datetime('today'), flight_number, keys[0], keys[1], len(frame.index), departure_dates[-1],
            departure_dates[0]))


def get_start_date_by_flight_number(existing_summary, flight_number):
    if existing_summary is not None:
        summary = existing_summary[existing_summary[1] == flight_number]
        if not summary.empty:
            return str(sorted(summary[6].to_list(), reverse=True)[0])
    return START_DATE


def fetch_flight_history_by_country(f, unique_flights_file_prefix, historic_flights_file_prefix, country):
    default_logger.info("fetching flight history for country: %s" % country)
    unique_flights_filename = "%s-%s.txt" % (unique_flights_file_prefix, country)
    historic_flights_file_name = "%s-%s.txt" % (historic_flights_file_prefix, country)
    summary_file_name = "%s_summary-%s.txt" % (historic_flights_file_prefix, country)
    existing_summary = pd.read_csv(summary_file_name, usecols=[1, 7], header=None) if os.path.exists(
        summary_file_name) else None
    flights_summary_logger = get_flights_summary_logger(summary_file_name, country)
    if os.path.exists(unique_flights_filename):
        try:
            unique_flights = pd.read_csv(unique_flights_filename).iloc[:, 0].to_list()
            historic_flights = []
            count_by_flight_status = {}
            for f_idx, flight_number in enumerate(tqdm(unique_flights)):
                date_start = get_start_date_by_flight_number(existing_summary, flight_number)
                new_flights = fetch_flight_history_by_flight_number(f, flight_number, date_start,
                                                                    count_by_flight_status)
                new_df = pd.DataFrame(new_flights, columns=FLIGHTS_ATTRIBUTE_NAMES)
                new_df.to_csv(historic_flights_file_name, mode='a', header=False, index=False)
                log_summary(flights_summary_logger, flight_number, new_flights)
                historic_flights.extend(new_flights)
        except EmptyDataError:
            default_logger.exception("file %s is empty" % unique_flights_filename)
    else:
        default_logger.warning("unique flights file %s does not exist: " % unique_flights_filename)


def get_flights_summary_logger(file_name, country):
    summary_logger = logging.getLogger('summary_logger_' + country)
    handler = logging.FileHandler(file_name, mode='a')
    handler.addFilter(InfoFilter())
    summary_logger.addHandler(handler)
    summary_logger.setLevel(logging.INFO)
    return summary_logger


def fetch_flight_history(countries,
                         unique_flights_file_prefix=UNIQUE_FLIGHTS_FILE_PREFIX,
                         historic_flights_file_prefix=HISTORIC_FLIGHTS_FILE_PREFIX):
    f = CustomFlightData()
    #f.login("username", "password")
    if not countries:
        default_logger.info("fetching countries")
        countries = [country['country'] for country in f.get_countries()]
    for country in countries:
        fetch_flight_history_by_country(f, unique_flights_file_prefix, historic_flights_file_prefix, country)
    f.logout()
    
    
if (not os.path.exists(HISTORIC_FLIGHTS_FILE_PREFIX[:HISTORIC_FLIGHTS_FILE_PREFIX.rfind("/")])):
    os.makedirs(HISTORIC_FLIGHTS_FILE_PREFIX[:HISTORIC_FLIGHTS_FILE_PREFIX.rfind("/")])
    
    default_logger = get_logger('logs/h_info.log', 'logs/h_error.log')
    date_end = END_DATE
    fetch_flight_history(countries, UNIQUE_FLIGHTS_FILE_PREFIX, HISTORIC_FLIGHTS_FILE_PREFIX)

fetching flight history for country: Canada
fetching flight history for country: Canada
  1%|          | 8/1465 [00:11<39:39,  1.63s/it]

HTML code 402 - Retry in 10 seconds...


  1%|          | 15/1465 [00:32<38:05,  1.58s/it]  

HTML code 402 - Retry in 10 seconds...


  2%|▏         | 33/1465 [01:09<29:50,  1.25s/it]  

HTML code 402 - Retry in 10 seconds...


  3%|▎         | 47/1465 [01:37<27:19,  1.16s/it]  

HTML code 402 - Retry in 10 seconds...


  4%|▍         | 56/1465 [01:59<33:54,  1.44s/it]  

HTML code 402 - Retry in 10 seconds...


  5%|▍         | 66/1465 [02:27<29:47,  1.28s/it]  

HTML code 402 - Retry in 10 seconds...


  5%|▌         | 76/1465 [02:53<32:53,  1.42s/it]  

HTML code 402 - Retry in 10 seconds...


  6%|▌         | 88/1465 [03:22<30:13,  1.32s/it]  

HTML code 402 - Retry in 10 seconds...


  6%|▋         | 95/1465 [03:46<42:36,  1.87s/it]  

HTML code 402 - Retry in 10 seconds...


  8%|▊         | 110/1465 [04:19<24:47,  1.10s/it]  

HTML code 402 - Retry in 10 seconds...


  8%|▊         | 124/1465 [04:46<20:43,  1.08it/s]  

HTML code 402 - Retry in 10 seconds...


  9%|▉         | 132/1465 [05:09<38:07,  1.72s/it]  

HTML code 402 - Retry in 10 seconds...


 11%|█         | 155/1465 [05:44<29:50,  1.37s/it]  

HTML code 402 - Retry in 10 seconds...


 13%|█▎        | 186/1465 [06:33<20:57,  1.02it/s]  

HTML code 402 - Retry in 10 seconds...


 14%|█▎        | 200/1465 [07:05<32:26,  1.54s/it]  

HTML code 402 - Retry in 10 seconds...


 14%|█▍        | 209/1465 [07:27<23:53,  1.14s/it]  

HTML code 402 - Retry in 10 seconds...


 15%|█▍        | 216/1465 [07:47<39:53,  1.92s/it]  

HTML code 402 - Retry in 10 seconds...


 16%|█▌        | 233/1465 [08:17<17:41,  1.16it/s]  

HTML code 402 - Retry in 10 seconds...


 17%|█▋        | 251/1465 [08:51<14:20,  1.41it/s]  

HTML code 402 - Retry in 10 seconds...


 19%|█▉        | 281/1465 [09:50<33:12,  1.68s/it]  

HTML code 402 - Retry in 10 seconds...


 20%|█▉        | 287/1465 [10:16<51:27,  2.62s/it]  

HTML code 402 - Retry in 10 seconds...


 21%|██        | 301/1465 [10:46<21:58,  1.13s/it]  

HTML code 402 - Retry in 10 seconds...


 21%|██▏       | 314/1465 [11:14<29:12,  1.52s/it]  

HTML code 402 - Retry in 10 seconds...


 23%|██▎       | 336/1465 [12:00<22:56,  1.22s/it]  

HTML code 402 - Retry in 10 seconds...


 24%|██▍       | 356/1465 [12:39<27:56,  1.51s/it]  

HTML code 402 - Retry in 10 seconds...


 26%|██▌       | 374/1465 [13:09<17:38,  1.03it/s]  

HTML code 402 - Retry in 10 seconds...


 26%|██▌       | 382/1465 [13:28<23:38,  1.31s/it]  

HTML code 402 - Retry in 10 seconds...


 27%|██▋       | 393/1465 [13:55<23:49,  1.33s/it]  

HTML code 402 - Retry in 10 seconds...


 29%|██▊       | 418/1465 [14:53<26:36,  1.53s/it]  

HTML code 402 - Retry in 10 seconds...


 29%|██▉       | 425/1465 [15:14<31:47,  1.83s/it]  

HTML code 402 - Retry in 10 seconds...


 30%|██▉       | 436/1465 [15:42<19:37,  1.14s/it]  

HTML code 402 - Retry in 10 seconds...


 30%|███       | 444/1465 [16:03<30:13,  1.78s/it]  

HTML code 402 - Retry in 10 seconds...


 31%|███▏      | 458/1465 [16:34<18:39,  1.11s/it]  

HTML code 402 - Retry in 10 seconds...


 32%|███▏      | 472/1465 [17:00<15:11,  1.09it/s]  

HTML code 402 - Retry in 10 seconds...


 33%|███▎      | 490/1465 [17:42<36:22,  2.24s/it]  

HTML code 402 - Retry in 10 seconds...


 35%|███▍      | 509/1465 [18:29<18:32,  1.16s/it]  

HTML code 402 - Retry in 10 seconds...


 36%|███▌      | 528/1465 [18:58<16:53,  1.08s/it]  

HTML code 402 - Retry in 10 seconds...


 38%|███▊      | 563/1465 [20:08<18:15,  1.21s/it]  

HTML code 402 - Retry in 10 seconds...


 39%|███▉      | 570/1465 [20:28<22:59,  1.54s/it]  

HTML code 402 - Retry in 10 seconds...


 40%|███▉      | 579/1465 [20:49<22:06,  1.50s/it]  

HTML code 402 - Retry in 10 seconds...


 41%|████      | 594/1465 [21:23<22:32,  1.55s/it]  

HTML code 402 - Retry in 10 seconds...


 41%|████▏     | 607/1465 [21:51<15:32,  1.09s/it]  

HTML code 402 - Retry in 10 seconds...


 42%|████▏     | 613/1465 [22:10<30:55,  2.18s/it]  

HTML code 402 - Retry in 10 seconds...


 43%|████▎     | 635/1465 [22:50<19:00,  1.37s/it]  

HTML code 402 - Retry in 10 seconds...


 44%|████▍     | 651/1465 [23:21<15:17,  1.13s/it]  

HTML code 402 - Retry in 10 seconds...


 45%|████▌     | 661/1465 [23:50<25:28,  1.90s/it]  

HTML code 402 - Retry in 10 seconds...


 46%|████▌     | 673/1465 [24:16<16:02,  1.22s/it]  

HTML code 402 - Retry in 10 seconds...


 46%|████▌     | 676/1465 [24:34<47:16,  3.60s/it]  

HTML code 402 - Retry in 10 seconds...


 47%|████▋     | 689/1465 [25:02<13:49,  1.07s/it]  

HTML code 402 - Retry in 10 seconds...


 48%|████▊     | 697/1465 [25:26<22:38,  1.77s/it]

HTML code 402 - Retry in 10 seconds...


 48%|████▊     | 708/1465 [25:50<20:26,  1.62s/it]  

HTML code 402 - Retry in 10 seconds...


 49%|████▉     | 717/1465 [26:10<19:34,  1.57s/it]

HTML code 402 - Retry in 10 seconds...


 50%|████▉     | 730/1465 [26:41<16:48,  1.37s/it]

HTML code 402 - Retry in 10 seconds...


 51%|█████     | 741/1465 [27:06<13:24,  1.11s/it]

HTML code 402 - Retry in 10 seconds...


 52%|█████▏    | 759/1465 [27:38<10:51,  1.08it/s]

HTML code 402 - Retry in 10 seconds...


 53%|█████▎    | 783/1465 [28:28<12:50,  1.13s/it]

HTML code 402 - Retry in 10 seconds...


 54%|█████▍    | 795/1465 [28:52<10:34,  1.06it/s]

HTML code 402 - Retry in 10 seconds...


 55%|█████▍    | 800/1465 [29:10<24:14,  2.19s/it]

HTML code 402 - Retry in 10 seconds...


 55%|█████▍    | 804/1465 [29:27<31:14,  2.84s/it]

HTML code 402 - Retry in 10 seconds...


 55%|█████▌    | 810/1465 [29:46<21:13,  1.94s/it]  

HTML code 402 - Retry in 10 seconds...


 56%|█████▌    | 821/1465 [30:14<20:39,  1.92s/it]

HTML code 402 - Retry in 10 seconds...


 57%|█████▋    | 828/1465 [30:43<25:52,  2.44s/it]

HTML code 402 - Retry in 10 seconds...


 57%|█████▋    | 841/1465 [31:14<17:05,  1.64s/it]

HTML code 402 - Retry in 10 seconds...


 59%|█████▉    | 866/1465 [32:02<14:50,  1.49s/it]

HTML code 402 - Retry in 10 seconds...


 60%|██████    | 884/1465 [32:33<07:41,  1.26it/s]

HTML code 402 - Retry in 10 seconds...


 61%|██████    | 893/1465 [33:03<22:28,  2.36s/it]

HTML code 402 - Retry in 10 seconds...


 62%|██████▏   | 906/1465 [33:29<09:32,  1.02s/it]

HTML code 402 - Retry in 10 seconds...


 63%|██████▎   | 920/1465 [33:59<10:12,  1.12s/it]

HTML code 402 - Retry in 10 seconds...


 63%|██████▎   | 930/1465 [34:21<09:26,  1.06s/it]

HTML code 402 - Retry in 10 seconds...


 64%|██████▍   | 942/1465 [34:46<09:46,  1.12s/it]

HTML code 402 - Retry in 10 seconds...


 66%|██████▌   | 962/1465 [35:26<11:10,  1.33s/it]

HTML code 402 - Retry in 10 seconds...


 67%|██████▋   | 978/1465 [35:58<09:17,  1.14s/it]

HTML code 402 - Retry in 10 seconds...


 68%|██████▊   | 991/1465 [36:24<10:28,  1.33s/it]

HTML code 402 - Retry in 10 seconds...


 69%|██████▉   | 1013/1465 [37:04<09:33,  1.27s/it]

HTML code 402 - Retry in 10 seconds...


 70%|███████   | 1032/1465 [37:42<09:06,  1.26s/it]

HTML code 402 - Retry in 10 seconds...


 71%|███████   | 1041/1465 [38:06<10:34,  1.50s/it]

HTML code 402 - Retry in 10 seconds...


 72%|███████▏  | 1053/1465 [38:41<08:34,  1.25s/it]

HTML code 402 - Retry in 10 seconds...


 72%|███████▏  | 1058/1465 [38:58<15:28,  2.28s/it]

HTML code 402 - Retry in 10 seconds...


 73%|███████▎  | 1073/1465 [39:33<06:20,  1.03it/s]

HTML code 402 - Retry in 10 seconds...


 74%|███████▍  | 1084/1465 [40:01<07:39,  1.21s/it]

HTML code 402 - Retry in 10 seconds...


 74%|███████▍  | 1091/1465 [40:24<11:55,  1.91s/it]

HTML code 402 - Retry in 10 seconds...


 75%|███████▌  | 1106/1465 [40:55<05:32,  1.08it/s]

HTML code 402 - Retry in 10 seconds...


 77%|███████▋  | 1126/1465 [41:31<07:30,  1.33s/it]

HTML code 402 - Retry in 10 seconds...


 77%|███████▋  | 1134/1465 [41:50<07:41,  1.39s/it]

HTML code 402 - Retry in 10 seconds...


 79%|███████▉  | 1154/1465 [42:34<06:34,  1.27s/it]

HTML code 402 - Retry in 10 seconds...


 80%|███████▉  | 1167/1465 [43:03<07:15,  1.46s/it]

HTML code 402 - Retry in 10 seconds...


 81%|████████  | 1182/1465 [43:37<05:26,  1.16s/it]

HTML code 402 - Retry in 10 seconds...


 82%|████████▏ | 1205/1465 [44:18<07:10,  1.66s/it]

HTML code 402 - Retry in 10 seconds...


 83%|████████▎ | 1211/1465 [44:36<07:24,  1.75s/it]

HTML code 402 - Retry in 10 seconds...


 86%|████████▌ | 1258/1465 [45:52<04:41,  1.36s/it]

HTML code 402 - Retry in 10 seconds...


 86%|████████▋ | 1267/1465 [46:16<05:23,  1.64s/it]

HTML code 402 - Retry in 10 seconds...


 87%|████████▋ | 1276/1465 [46:42<05:48,  1.84s/it]

HTML code 402 - Retry in 10 seconds...


 88%|████████▊ | 1287/1465 [47:07<04:13,  1.42s/it]

HTML code 402 - Retry in 10 seconds...


 89%|████████▉ | 1305/1465 [47:49<04:16,  1.61s/it]

HTML code 402 - Retry in 10 seconds...


 90%|████████▉ | 1317/1465 [48:22<03:27,  1.40s/it]

HTML code 402 - Retry in 10 seconds...


 91%|█████████ | 1330/1465 [48:50<02:35,  1.15s/it]

HTML code 402 - Retry in 10 seconds...


 92%|█████████▏| 1351/1465 [49:28<02:29,  1.31s/it]

HTML code 402 - Retry in 10 seconds...


 94%|█████████▍| 1377/1465 [50:12<02:51,  1.95s/it]

HTML code 402 - Retry in 10 seconds...


 95%|█████████▌| 1399/1465 [50:55<01:42,  1.55s/it]

HTML code 402 - Retry in 10 seconds...


 97%|█████████▋| 1422/1465 [51:37<01:09,  1.61s/it]

HTML code 402 - Retry in 10 seconds...


 98%|█████████▊| 1438/1465 [52:09<00:32,  1.21s/it]

HTML code 402 - Retry in 10 seconds...


 99%|█████████▉| 1454/1465 [52:39<00:15,  1.39s/it]

HTML code 402 - Retry in 10 seconds...


100%|██████████| 1465/1465 [53:06<00:00,  2.18s/it]


In [10]:
def to_node_id(airport):
    node = airports.loc[(airports['iata'] == airport).idxmax()]
    return node.city if not use_country and node.country == 'Vietnam' else node.country


def append_flight_to_network(flight):
    if flight['status'] == 'landed' or flight['status'] == 'diverted':
        if flight['departure_date'] not in flight_network_by_date.keys():
            flight_network_by_date[flight['departure_date']] = {}
        adj = flight_network_by_date[flight['departure_date']]
        srcId = to_node_id(flight['departure_airport'])
        desId = to_node_id(flight['arrival_airport'])
        edge = (srcId, desId)
        if edge not in adj:
            adj[edge] = 0
        adj[edge] = adj[edge] + 1


def build_flight_network():
    total_flights = len(historic_flights.index)
    for idx, flight in tqdm(historic_flights.iterrows(), total = total_flights):
        append_flight_to_network(flight)
        update_count_by_flight_status(flight, count_by_flight_status)


def write_flight_network_to_file():
    edge_list = []
    for date in sorted(flight_network_by_date.keys()):
        formatted_date = datetime.datetime.strptime(str(date), '%Y%m%d')
        adj = flight_network_by_date[date]
        for edge in adj.keys():
            edge_list.append([formatted_date.strftime('%Y-%m-%d'), edge[0], edge[1], adj[edge]])
    edge_list = pd.DataFrame(edge_list)
    if use_country:
        edge_list.to_csv("datasets/edge_lists/edge_list_all-%s.txt" % countries[0], header=False, index=False)
    else:
        edge_list.to_csv("datasets/edge_lists/edge_list-%s.txt" % countries[0], header=False, index=False)


def flight_by_status_logger(unique_flights_log_file_name):
    unique_flights_logger = logging.getLogger('flight_by_status_logger')
    handler = logging.FileHandler(unique_flights_log_file_name, mode='a')
    handler.addFilter(InfoFilter())
    unique_flights_logger.addHandler(handler)
    unique_flights_logger.setLevel(logging.INFO)
    return unique_flights_logger

if (not os.path.exists("datasets/edge_lists")):
    os.makedirs("datasets/edge_lists")
    
    default_logger = get_logger('logs/e_info.log', 'logs/e_error.log')
    flight_by_status_logger = flight_by_status_logger("datasets/edge_lists/flight_by_status.txt")
    _countries = pd.read_csv(COUNTRIES_FILE_NAME, header=None)
    
    airports = []
    for c in _countries[0]:
        airport_file_name = "%s-%s.txt" % (AIRPORTS_FILE_PREFIX, c)
        airport = pd.read_csv(airport_file_name, names=AIRPORT_ATTRIBUTE_NAMES)
        airports.append(airport)
    airports = pd.concat(airports, axis=0, ignore_index=True)
    historic_flights_file_name = "%s-%s.txt" % (HISTORIC_FLIGHTS_FILE_PREFIX, countries[0])
    historic_flights = pd.read_csv(historic_flights_file_name, names=FLIGHTS_ATTRIBUTE_NAMES)
    flight_network_by_date = {}
    count_by_flight_status = {}
    build_flight_network()
    flight_by_status_logger.info(count_by_flight_status)
    write_flight_network_to_file()

100%|██████████| 32902/32902 [00:29<00:00, 1132.81it/s]


In [11]:
class generate_SIR_dataset():
    def __init__(self):
        self.name_to_pop = self.get_population_dict()
        self.confirmed_df = pd.read_csv("datasets/covid/time_series_covid19_confirmed_global.csv").fillna("")
        self.recovered_df = pd.read_csv("datasets/covid/time_series_covid19_recovered_global.csv").fillna("")
        self.deaths_df = pd.read_csv("datasets/covid/time_series_covid19_deaths_global.csv").fillna("")
        self.date = pd.to_datetime(list(self.confirmed_df.columns)[4:])
        
        result = self.to_df()
        result.to_csv("datasets/covid_SIR.csv", index = False)
        return None
    
    def padding(self, date, values):
        cur = 0
        results = []
        temp_values = 0
        
        for i in range(len(self.date)):
            if (self.date[i] == datetime.datetime.strptime(date[cur], '%Y-%m-%d')):
                results.append(values[cur])
                temp_values = values[cur]
                cur += 1
            else:
                results.append(temp_values)
        return results
        
    def process_Canada(self, p):
        Canada_df = pd.read_csv("datasets/covid/covid19-canada.csv").fillna(0)
        Canada_df = Canada_df[Canada_df["prname"] == p]
        if (len(Canada_df) == 0):
            return [], []
        else:
            ii = self.padding(Canada_df["date"].values.tolist(), Canada_df["numtotal"].values.tolist())
            ri = self.padding(Canada_df["date"].values.tolist(), (Canada_df["numtotal"].values - Canada_df["numactive"].values).tolist())
            return ii, ri
        
        
    def get_population_dict(self):
        population_df = pd.read_csv("datasets/covid/population.csv").fillna("")
        return {(i+j):v for i,j,v in zip(population_df["Country/Region"], population_df["Province/State"], population_df["Population"])}
    
    def to_df(self):
        date = []
        Country = []
        Province = []
        N = []
        I = []
        R = []
        
        for i, (C, P) in tqdm(enumerate(zip(self.confirmed_df["Country/Region"], self.confirmed_df["Province/State"])), total = len(self.confirmed_df)):
            try:
                confirmed_dfi = self.confirmed_df[self.confirmed_df["Country/Region"] == C]
                confirmed_dfi = confirmed_dfi[confirmed_dfi["Province/State"] == P].values[0]
                recovered_dfi = self.recovered_df[self.recovered_df["Country/Region"] == C]
                recovered_dfi = recovered_dfi[recovered_dfi["Province/State"] == P].values[0]
                deaths_dfi = self.deaths_df[self.deaths_df["Country/Region"] == C]
                if (len(deaths_dfi) == 0):
                    deaths_dfi = np.array([0]*(len(self.date + 4)))
                else:
                    deaths_dfi = deaths_dfi[deaths_dfi["Province/State"] == P].values[0]
                
                N += [self.name_to_pop[C+P]]*len(self.date)
                date += self.date
                Country += [C]*len(self.date)
                Province += [P]*len(self.date)
                I += confirmed_dfi[4:].tolist()
                R += (recovered_dfi[4:] + deaths_dfi[4:]).tolist()
            except:
                if (C == "Canada"):
                    ii, ri = self.process_Canada(P)
                    if (len(ii) == 0):
                        continue
                    else:
                        try:
                            N += [self.name_to_pop[C+P]]*len(self.date)
                            date += self.date
                            Country += [C]*len(self.date)
                            Province += [P]*len(self.date)
                            I += ii
                            R += ri
                        except:
                            pass
                continue
        
        df = pd.DataFrame()
        df["date"] = date
        df["Country/Region"] = Country
        df["Province/State"] = Province
        df["N"] = N
        df["I"] = I
        df["R"] = R
        return df
    
temp = generate_SIR_dataset()

100%|██████████| 279/279 [00:01<00:00, 184.93it/s]


In [12]:
NAME_CONVERSION = {"United States": "US",
                   "Trinidad And Tobago": "Trinidad and Tobago",
                   "Saint Pierre And Miquelon": "Saint Pierre and Miquelon",
                   "Antigua And Barbuda": "Antigua and Barbuda",
                   "Saint Vincent And The Grenadines": "Saint Vincent and the Grenadines",
                   "Saint Kitts And Nevis": "Saint Kitts and Nevis",
                   "Turks And Caicos Islands": "Turks and Caicos Islands",
                   "Sao Tome And Principe": "Sao Tome and Principe"}

BLACKLIST = []


def assert_country_in_flight_data_has_corresponding_entry_in_covid_data(df_flights, df_covid):
    for destination in pd.concat([df_flights['From'], df_flights['To']]).unique():
        if destination not in df_covid['Province/State'].unique() and destination not in df_covid['Country/Region'].unique():
            print("Not found:", destination)


# Remove country/regions in the blacklist from flights dataset
def remove_blacklist_countries(df_flights, initial_flight_count):
    for destination in pd.concat([df_flights['From'], df_flights['To']]).unique():
        if destination in BLACKLIST:
            df_flights = df_flights[df_flights['From'] != destination]
            df_flights = df_flights[df_flights['To'] != destination]
    print("removed # records", initial_flight_count - df_flights.shape[0])
    return df_flights


# Align country/region names in flights and covid dataset
def convert_country_region_names(df_flights):
    for key in NAME_CONVERSION.keys():
        df_flights = df_flights.replace({key: NAME_CONVERSION[key]})
    return df_flights


def process_flight_data():
    filename = "edge_list_all-"+countries[0]+".txt" if use_country else "edge_list-"+countries[0]+".txt"
    processed_filename = "edge_list_processed_all-"+countries[0]+".txt" if use_country else "edge_list_processed-"+countries[0]+".txt"
    df_flights_v3 = pd.read_csv("datasets/edge_lists/%s" % filename,
                                names=['date', 'From', 'To', 'number of flights'])
    df_flights_v3['date'] = pd.to_datetime(df_flights_v3['date'])
    df_covid = pd.read_csv("datasets/covid_SIR.csv")
    df_covid['date'] = pd.to_datetime(df_covid['date'])

    v3_initial_record_count = df_flights_v3.shape[0]
    print("v3 initial # records", v3_initial_record_count)
    print("v3 initial # flights", df_flights_v3['number of flights'].sum())

    df_flights_v3 = remove_blacklist_countries(df_flights_v3, v3_initial_record_count)
    df_flights_v3 = convert_country_region_names(df_flights_v3)
    print("v3 # records after name conversion and blacklist", df_flights_v3.shape[0])

    assert_country_in_flight_data_has_corresponding_entry_in_covid_data(df_flights_v3, df_covid)

    print("v3 # records before write to file", df_flights_v3.shape[0])
    print("v3 # flights before write to file", df_flights_v3['number of flights'].sum())
    df_flights_v3.to_csv("datasets/edge_lists/%s" % processed_filename, index=False, header=False)


process_flight_data()

v3 initial # records 3724
v3 initial # flights 32189
removed # records 0
v3 # records after name conversion and blacklist 3724
Not found: South Korea
v3 # records before write to file 3724
v3 # flights before write to file 32189
