In [1]:
#
# File: Assignment07_1b.py
# Name: Christopher M. Anderson
# Date: 10/18/2020
# Course: DSC650 Big Data
# Week: 7
# Assignment Number: 7.1b


# In this part of the assignment, you will
# partition a dataset using different strategies.
# You will use the routes.parquet dataset you
# created in a previous assignment. For this
# dataset, the key for each route will be the
# three-letter source airport code concatenated
# with the three-letter destination airport code
# and the two-letter airline. For instance, a
# route from Omaha Eppley Airfield (OMA) to Denver
# International Airport (DEN) on American Airlines
# (AA) has a key of OMADENAA.
#
# Next, we are going to partition the dataset again,
# but this time we will partition by the hash value
# of the key. The following is a function that will
# create a SHA256 hash of the input key and return a
# hexadecimal string representation of the hash.


import os
import shutil
import json
from pathlib import Path
import gzip
import hashlib
import pandas as pd

In [2]:
current_dir = Path(os.getcwd()).absolute()
results_dir = current_dir.joinpath('results')
hash_dir = results_dir.joinpath('hash')
results_dir.mkdir(parents=True, exist_ok=True)
if os.path.exists(hash_dir):
    shutil.rmtree(hash_dir)
os.makedirs(hash_dir)

In [3]:
# 1): Load the dataset
def read_jsonl_data():
    src_data_path = 'data/routes.jsonl.gz'
    with gzip.open(src_data_path, 'rb') as f:
        records = [json.loads(line) for line in f.readlines()]

    return records

In [4]:
# 2): Flatten the records:
def flatten_record(record):
    flat_record = dict()
    for key, value in record.items():
        if key in ['airline', 'src_airport', 'dst_airport']:
            if isinstance(value, dict):
                for child_key, child_value in value.items():
                    flat_key = '{}_{}'.format(key, child_key)
                    flat_record[flat_key] = child_value
        else:
            flat_record[key] = value
    return flat_record


def create_flattened_dataset():
    records = read_jsonl_data()
    return pd.DataFrame.from_records([flatten_record(record) for record in records])


df = create_flattened_dataset()

In [5]:
# 3): Add key column:
df['key'] = df['src_airport_iata'].map(str) + \
            df['dst_airport_iata'].map(str) + \
            df['airline_iata'].map(str)

In [6]:
# 4): Hash the key data:
def hash_key(key):
    m = hashlib.sha256()
    m.update(str(key).encode('utf-8'))
    return m.hexdigest()

In [7]:
# 5): Create hash column:
df['hashed'] = df['key'].apply(hash_key)

In [8]:
# 6): Add hash_key column:
df['hash_key'] = df['hashed'].astype(str).str[0]

# Verify the hash_key column data:
print(df)
pd.set_option('display.max_columns', None)
print(df.head())

       airline_airline_id      airline_name           airline_alias  \
0                     410        Aerocondor  ANA All Nippon Airways   
1                     410        Aerocondor  ANA All Nippon Airways   
2                     410        Aerocondor  ANA All Nippon Airways   
3                     410        Aerocondor  ANA All Nippon Airways   
4                     410        Aerocondor  ANA All Nippon Airways   
...                   ...               ...                     ...   
67658                4178  Regional Express          Qantas Airways   
67659               19016        Apache Air                  Apache   
67660               19016        Apache Air                  Apache   
67661               19016        Apache Air                  Apache   
67662               19016        Apache Air                  Apache   

      airline_iata airline_icao airline_callsign airline_country  \
0               2B          ARD       AEROCONDOR        Portugal   
1          

In [9]:
# 7): Write flattened df to parquet:
def write_parquet_file():
    pq_flat_path = results_dir.joinpath('routes-flattened.parquet')
    df.to_parquet(pq_flat_path)


write_parquet_file()

In [11]:
# 8): Write flattened df to partitioned parquet:
def write_partitioned_parquet_files():
    hash_key_path = hash_dir.joinpath()
    df.to_parquet(hash_key_path, partition_cols=['hash_key'])


write_partitioned_parquet_files()