In [1]:
#
# File: Assignment07_1d.py
# Name: Christopher M. Anderson
# Date: 10/18/2020
# Course: DSC650 Big Data
# Week: 7
# Assignment Number: 7.1d


# In this part of the assignment, you will
# partition a dataset using different strategies.
# You will use the routes.parquet dataset you
# created in a previous assignment. For this
# dataset, the key for each route will be the
# three-letter source airport code concatenated
# with the three-letter destination airport code
# and the two-letter airline. For instance, a
# route from Omaha Eppley Airfield (OMA) to Denver
# International Airport (DEN) on American Airlines
# (AA) has a key of OMADENAA.
#
# Create a Python function that takes as input a
# list of keys and the number of partitions and
# returns a list of keys sorted into the specified
# number of partitions. The partitions should be
# roughly equal in size. Furthermore, the partitions
# should have the property that each partition contains
# all the keys between the least key in the partition
# and the greatest key in the partition. In other words,
# the partitions should be ordered.


import os
import json
from pathlib import Path
import gzip
import pandas as pd

In [2]:
current_dir = Path(os.getcwd()).absolute()
results_dir = current_dir.joinpath('results')
# kv_dir = results_dir.joinpath('kv')
results_dir.mkdir(parents=True, exist_ok=True)

In [3]:
# 1): Load the dataset
def read_jsonl_data():
    src_data_path = 'data/routes.jsonl.gz'
    with gzip.open(src_data_path, 'rb') as f:
        records = [json.loads(line) for line in f.readlines()]

    return records

In [4]:
# 2): Flatten the records:
def flatten_record(record):
    flat_record = dict()
    for key, value in record.items():
        if key in ['airline', 'src_airport', 'dst_airport']:
            if isinstance(value, dict):
                for child_key, child_value in value.items():
                    flat_key = '{}_{}'.format(key, child_key)
                    flat_record[flat_key] = child_value
        else:
            flat_record[key] = value
    return flat_record


def create_flattened_dataset():
    records = read_jsonl_data()
    return pd.DataFrame.from_records([flatten_record(record) for record in records])


df = create_flattened_dataset()

In [5]:
# 2.5 ): Cleanup
df = df.dropna()

In [6]:
# 3): Add key column:
df['key'] = df['src_airport_iata'].map(str) + \
            df['dst_airport_iata'].map(str) + \
            df['airline_iata'].map(str)

In [7]:
# 4): Add kv_key column:
df['kv_key'] = df['key'].astype(str).str[0:4]


# Verify the updated kv_key column data:
# print(df)
# pd.set_option('display.max_columns', None)
# print(df.head())

In [8]:
# 5): DF to List:
key_data = df['kv_key']
data_keys = key_data.to_list()
# print(data_keys)

In [9]:
# 6): Inputs:
# input_keys = int(input('How many keys?: '))
input_num_partitions = int(input('How many partitions would you like to create?: '))
n = input_num_partitions

How many partitions would you like to create?: 5


In [10]:
# 7): Balance Partitions:
def balance_partitions(keys, num_partitions):
    for i in range(0, len(keys), num_partitions):
        yield keys[i:i + num_partitions]
    return balance_partitions(keys, num_partitions)


x = sorted(list(balance_partitions(data_keys, input_num_partitions)))
print(x)

[['AAEA', 'AAEC', 'AAEI', 'AAEL', 'AAEM'], ['AAEO', 'AAEO', 'ABJA', 'ALCA', 'ALCO'], ['AALA', 'AALB', 'AALC', 'AALO', 'AARC'], ['AALB', 'AALI', 'ABJI', 'ACCA', 'ACCI'], ['AALC', 'AALL', 'AALP', 'ACEL', 'AESA'], ['AANC', 'AUHC', 'AUHC', 'AUHM', 'AUHT'], ['AARB', 'AARG', 'AARO', 'ABVL', 'ABZL'], ['ABAD', 'AERD', 'ALAO', 'ALCD', 'AMMD'], ['ABAS', 'AERE', 'AERI', 'AERK', 'AERL'], ['ABDT', 'ACZM', 'ADUT', 'AFZT', 'ARNI'], ['ABEA', 'ABED', 'ABQA', 'ABQM', 'ABQS'], ['ABEA', 'ABQA', 'ABVC', 'ABVP', 'ABYA'], ['ABEP', 'ABID', 'ABQD', 'ABQL', 'ABQO'], ['ABEP', 'ABID', 'ABQD', 'ABQL', 'ABQO'], ['ABES', 'ATWA', 'ATWL', 'ATWS', 'AUSL'], ['ABJA', 'ABSA', 'ABVC', 'ACCA', 'ACCC'], ['ABJA', 'ACCA', 'ACCK', 'ACCT', 'ACCT'], ['ABJB', 'ABJL', 'ABJO', 'ABQD', 'ABQI'], ['ABJB', 'ABJL', 'ABJO', 'ABVF', 'ABVS'], ['ABJC', 'ABJL', 'ABJL', 'ABJO', 'ABVA'], ['ABJC', 'ABQA', 'ABZC', 'ACCF', 'AESA'], ['ABJD', 'ABJD', 'ABJL', 'ABJL', 'ABJO'], ['ABJL', 'ABJO', 'AGAB', 'AGPB', 'ALCB'], ['ABJR', 'ACCA', 'ACCF', 'DKRF', 