<h1>GRUPO ZAP: DATA SCIENCE CHALLENGE</h1>

In [15]:
import pandas as pd

# Directory for the raw data files. It is passed to download_json as the
# target directory and is prefixed to the JSON file names by plain string
# concatenation, so it must keep its trailing '/'.
RAW_PATH   = '../data-raw/'
# When True, the download cell fetches the train/test archives into RAW_PATH.
GET_DATA   = False
# When True, the parsing cell loads the raw JSON files into df_train / df_test.
PARSE_DATA = False

In [2]:
def download_json(url: str, path: str, filename: str) -> None:
    '''
    Automates JSON extraction from Grupo Zap raw data zipfiles,
    saving the result inside the path provided

    The archive is stored temporarily as `<path>/<filename>.zip` and is always
    removed afterwards. The first `.json` member of the archive (or, if there
    is none, the first file member) is written to `<path>/<filename>.json`,
    overwriting any previous file with that name.

    Args:
        url (str): url to get the zip from
        path (str): directory where to save the data (created when missing;
            a trailing separator is optional)
        filename (str): name of the resulting JSON (will concatenate with '.json')

    Raises:
        ValueError: if the downloaded archive contains no files
    '''

    import os
    import shutil
    from urllib import request
    from zipfile import ZipFile

    os.makedirs(path, exist_ok=True)

    # os.path.join tolerates `path` with or without a trailing separator
    zip_path  = os.path.join(path, filename + '.zip')
    json_path = os.path.join(path, filename + '.json')

    # Stream the response to disk and make sure the connection gets closed
    with request.urlopen(url) as response, open(zip_path, 'wb') as f:
        shutil.copyfileobj(response, f)

    try:
        with ZipFile(zip_path, 'r') as _zip:
            # Ask the archive what it contains instead of diffing the directory
            # listing, which breaks when the member already exists from a
            # previous run
            members = [info for info in _zip.infolist() if not info.is_dir()]

            if not members:
                raise ValueError(f'No files found inside the archive from {url}')

            is_json = lambda info: info.filename.lower().endswith('.json')
            member  = next(filter(is_json, members), members[0])

            # Write straight to the final name: no intermediate file, no rename
            with _zip.open(member) as src, open(json_path, 'wb') as dst:
                shutil.copyfileobj(src, dst)

    finally:
        # The zip is only a temporary artifact, even when extraction fails
        os.remove(zip_path)

In [3]:
def parse_json(path: str, verbose: bool = True) -> pd.DataFrame:
    '''
    Converts a nested JSON file to a pd.DataFrame

    The file is read as UTF-8 with one JSON document per line; blank lines are
    skipped. Nested objects are flattened into columns whose names join the
    nested keys with '_'.

    Args:
        path (str): path to json file
        verbose (bool): toggles loading percentage

    Returns:
        <pd.DataFrame>: one row per record (empty when the file has no records)
    '''

    import json

    if verbose:
        # Only needed for the progress display, so silent runs neither require
        # IPython nor wipe the cell output
        from IPython.display import clear_output

    # `pd.io.json.json_normalize` is deprecated in favour of the top-level
    # `pd.json_normalize`; keep the old location as a fallback for old pandas
    normalize = getattr(pd, 'json_normalize', None)
    if normalize is None:
        normalize = pd.io.json.json_normalize

    with open(path, encoding='utf-8') as f:
        document = [line for line in f if line.strip()]

    ttl = len(document)
    if ttl == 0:
        # pd.concat raises on an empty list
        return pd.DataFrame()

    out      = []
    last_pct = None
    for i, record in enumerate(document, start=1):
        out.append(normalize(json.loads(record), sep='_'))

        if verbose:
            pct = round((i / ttl) * 100, 2)

            # Redraw only when the displayed figure actually changes
            if pct != last_pct:
                clear_output(wait=True)
                print(f'Loading {path}: {pct}%')
                last_pct = pct

    return pd.concat(out, ignore_index=True, sort=False)

In [4]:
# Fetch the raw train/test archives into RAW_PATH (only when GET_DATA is set)
if GET_DATA:
    url_train, url_test = (
        'https://s3.amazonaws.com/grupozap-data-challenge/data/source-4-ds-train.json.zip',
        'https://s3.amazonaws.com/grupozap-data-challenge/data/source-4-ds-test.json.zip',
    )

    download_json(url=url_train, path=RAW_PATH, filename='train-raw')
    download_json(url=url_test, path=RAW_PATH, filename='test-raw')

In [5]:
# Load the raw JSON files into DataFrames (only when PARSE_DATA is set)
if PARSE_DATA:
    json_train, json_test = (
        RAW_PATH + json_name for json_name in ('train-raw.json', 'test-raw.json')
    )

    # Train is parsed before test, as each call reports its own progress
    df_train = parse_json(path=json_train)
    df_test  = parse_json(path=json_test)

Loading ../data-raw/test-raw.json: 100.0%
