# Dask DataFrame Experiment

## Part 1

Get fake data from mimesis for the dataframes that will be used in the experiment.

## Imports

In [1]:
# ! pip install mimesis

In [1]:
from mimesis import Person, Address, Datetime
from mimesis.enums import Gender

import os
import math
import time
import pickle
import random
import pandas as pd

from itertools import accumulate

# %config IPCompleter.greedy=True

## Create mockup data and accompanying layouts

In [2]:
# Module-level mimesis providers shared by generate_person, generate_address and generate_email.
person = Person('en')
address = Address('en')
# NOTE(review): the name `datetime` would shadow the stdlib module of the same name if it were ever imported here.
datetime = Datetime()

### Functions to generate the data

In [3]:
def generate_person():
    """Build one fake person ('PRSN') record.

    Returns:
        dict: the keys 'rec', 'id', 'first_name', 'last_name', 'gender'
        and 'ssn', in that order. 'id' and 'ssn' are random integers; the
        name and gender values come from the shared ``person`` provider.
    """
    account_id = random.randint(10000, 100000000)
    first = person.first_name()
    last = person.last_name()
    gender = person.gender()
    ssn = random.randint(1000000, 100000000)
    return dict(
        rec='PRSN',
        id=account_id,
        first_name=first,
        last_name=last,
        gender=gender,
        ssn=ssn,
    )

def generate_address(id, ssn):
    """Build one fake address ('ADDR') record tied to a person.

    Args:
        id: identifier of the owning person record, copied into 'id'.
        ssn: ssn of the owning person record, copied into 'ssn'.

    Returns:
        dict: the keys 'rec', 'id', 'ssn', 'address', 'city', 'state',
        'country' and 'begin', in that order. 'country' is always 'USA';
        'begin' is generated with start=2000, end=2019.
    """
    record = {'rec': 'ADDR', 'id': id, 'ssn': ssn}
    record['address'] = address.address()
    record['city'] = address.city()
    record['state'] = address.state()
    record['country'] = 'USA'
    record['begin'] = datetime.date(start=2000, end=2019)
    return record

def generate_email(id, ssn):
    """Build one fake email ('EMAL') record tied to a person.

    Args:
        id: identifier of the owning person record, copied into 'id'.
        ssn: ssn of the owning person record, copied into 'ssn'.

    Returns:
        dict: the keys 'rec', 'id', 'ssn', 'email' and 'begin', in that
        order. 'begin' is generated with start=2000, end=2019.
    """
    fields = ('rec', 'id', 'ssn', 'email', 'begin')
    values = (
        'EMAL',
        id,
        ssn,
        person.email(),
        datetime.date(start=2000, end=2019),
    )
    return dict(zip(fields, values))

def create_accounts(num=1):
    """Generate a flat list of records for ``num`` fake accounts.

    Each account contributes one person record followed by 1-5 address
    records and then 1-5 email records, all sharing the person's 'id'
    and 'ssn'.

    Args:
        num: number of person accounts to generate.

    Returns:
        list[dict]: the generated records, in generation order.
    """
    records = []
    for _ in range(num):
        owner = generate_person()
        owner_id, owner_ssn = owner['id'], owner['ssn']
        records.append(owner)
        # The address count is drawn before any address is generated,
        # and likewise for emails.
        records.extend(
            generate_address(owner_id, owner_ssn)
            for _ in range(random.randint(1, 5))
        )
        records.extend(
            generate_email(owner_id, owner_ssn)
            for _ in range(random.randint(1, 5))
        )
    return records

### Functions to create the layout from the generated data

In [4]:
# Pads a field width up to the next multiple of 5 to leave 'space' in the final layout
def round_up_5(num):
    """Return ``num`` rounded up to the nearest multiple of 5, as an int."""
    groups_of_five = math.ceil(num / 5)
    return int(groups_of_five * 5)

# Get the length of each column that will fit the max length values and pad for some extra width
def create_lengths(df):
    """Compute a padded field width for every column of ``df``.

    For a column whose non-null values are strings, the width is the
    longest value's length multiplied by 1.5, truncated to an int and
    rounded up to a multiple of 5. When that computation fails (for
    example the column has no ``.str`` accessor, or there are no non-null
    values so the maximum is NaN), a fixed fallback is used: 10 for
    object-dtype columns and 15 for everything else.

    Args:
        df: dataframe holding the generated records.

    Returns:
        dict: column name -> field width (int).
    """
    lengths = {}
    for col in df.columns:
        try:
            longest = df[col].dropna().str.len().max()
            width = round_up_5(int(longest * 1.5))
        except Exception:
            # Deliberate best-effort fallback for columns that cannot be
            # measured as strings. `Exception` (rather than a bare except)
            # lets KeyboardInterrupt/SystemExit propagate, so a long run
            # can still be interrupted.
            width = 10 if df[col].dtype == 'O' else 15
        lengths[col] = width
    return lengths

# Build the layout columns for one record type from its field widths
def layout_from_record(d: dict, length_d: dict):
    """Describe the fixed-width layout of a single record.

    Args:
        d: a sample record; its keys, in order, are the field names and
            its 'rec' value names the record type.
        length_d: mapping of field name -> field width.

    Returns:
        dict: column-oriented layout with the keys 'rec' ("DTL" + record
        type, repeated per field), 'name', 'start' (1-based start
        position of each field), 'length' and 'conversion' (always
        'CHAR').
    """
    names = list(d)
    widths = [length_d[name] for name in names]

    # The first field starts at position 1; each later field starts right
    # after the previous one ends.
    starts = [1]
    for width in widths[:-1]:
        starts.append(starts[-1] + width)

    return {
        'rec': ["DTL" + d['rec'] for _ in names],
        'name': names,
        'start': starts,
        'length': widths,
        'conversion': ['CHAR' for _ in names],
    }

# Derive the layout dataframe for all three record types from a data dataframe
def convert_data_to_layout(df):
    """Build the layout table describing every record type in ``df``.

    Field widths are measured from ``df``; the field names and order of
    each record type are taken from a freshly generated sample person,
    address and email record.

    Args:
        df: dataframe holding the generated records.

    Returns:
        pandas.DataFrame: one row per field, sorted by 'rec' then 'start'.
    """
    widths = create_lengths(df)

    # One sample builder per record type; each sample is generated just
    # before its layout is built.
    sample_builders = (
        generate_person,
        lambda: generate_address(1, 2),
        lambda: generate_email(1, 2),
    )
    per_record_layouts = [
        pd.DataFrame(layout_from_record(build(), widths))
        for build in sample_builders
    ]

    # Keep the fields of each record together and in positional order
    return pd.concat(per_record_layouts).sort_values(by=['rec', 'start'])

### Function to create a record definition table

Note: based on experience, we 'know' that the "rec" column is the one that determines which layout each row of the data follows.

Ideally, this file specification would be stored in a database or other documentation repository.

In [5]:
# Convert a data dataframe to a record definition table
def convert_data_to_record_definitions(df):
    """Build the table mapping each record label to its selection expression.

    Args:
        df: dataframe with a 'rec' column of string record types.

    Returns:
        pandas.DataFrame: columns 'Record' ("DTL" + record type) and
        'Definition' (the text of a filter expression on data_df['rec']),
        one row per unique 'rec' value in order of first appearance.
    """
    rows = [
        ("DTL" + rec, "data_df['rec']=='" + rec + "'")
        for rec in df['rec'].unique().tolist()
    ]
    return pd.DataFrame(rows, columns=['Record', 'Definition'])

### Generate the data

Generate data sets starting at 5,000 person accounts and increasing by an order of magnitude each time, up to 5,000,000.

In [6]:
# Data location
DATA = 'data'

def generate_data(count=5000):
    """Generate ``count`` fake accounts and pickle the resulting dataframes.

    Three files are written under ``<cwd>/DATA``: ``<count>_data.pickle``,
    ``<count>_layout.pickle`` and ``<count>_records.pickle``. The directory
    is created if it does not exist yet.

    Args:
        count: number of person accounts to generate.
    """
    data_dir = os.path.join(os.getcwd(), DATA)
    # Create the output directory up front so a fresh checkout does not
    # fail with FileNotFoundError after all the data has been generated.
    os.makedirs(data_dir, exist_ok=True)

    data_df = pd.DataFrame(create_accounts(count))
    frames = {
        'data': data_df,
        'layout': convert_data_to_layout(data_df),
        'records': convert_data_to_record_definitions(data_df),
    }

    for suffix, frame in frames.items():
        file_path = os.path.join(data_dir, str(count) + '_' + suffix + '.pickle')
        with open(file_path, 'wb') as f:
            pickle.dump(frame, f)
    

In [7]:
%%time
# Time the generation of the smallest data set; the larger sizes are left commented out.
generate_data(5000)
# generate_data(50000)
# generate_data(500000)
# generate_data(5000000)

Wall time: 30min 3s


### Time taken to create the data

- 5,000 person accounts takes approx. 0.5 seconds --> 34,932 records
- 50,000 person accounts takes approx. 5 seconds --> 350,196 records
- 500,000 person accounts takes approx. 57 seconds --> 3,500,295 records
- 5,000,000 person accounts takes approx. 1800* seconds --> 34,999,259 records

Note (*) - The generation of 5 million person accounts hit the RAM limit, which slowed down the generation of data.

In [10]:
# List the files currently present in the data directory
os.listdir(os.path.join(os.getcwd(), DATA))

['5000000_data.pickle',
 '5000000_layout.pickle',
 '5000000_records.pickle',
 '500000_data.pickle',
 '500000_layout.pickle',
 '500000_records.pickle',
 '50000_data.pickle',
 '50000_layout.pickle',
 '50000_records.pickle',
 '5000_data.pickle',
 '5000_layout.pickle',
 '5000_records.pickle']