In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import hashlib
plt.style.use('ggplot')


def extendDataframe(df):
    """Return `df` followed by a time-shifted copy of itself.

    The copy's 't' column is offset by ``df['t'].max() + 1`` so that the
    appended rows start right after the last timestamp of the original.
    The input frame is not modified; the result has a fresh 0..n-1 index.

    Args:
        df: DataFrame with a numeric 't' column.

    Returns:
        A new DataFrame with twice as many rows as `df`.
    """
    shifted = df.copy()
    shifted['t'] += df['t'].max() + 1
    # DataFrame.append was removed in pandas 2.0; pd.concat is the
    # supported equivalent and works on older versions as well.
    return pd.concat([df, shifted], ignore_index=True)

def duplicate(df, times):
    """Apply `extendDataframe` to `df` `times` times in a row.

    Each pass doubles the frame, so the result holds 2**times
    time-shifted repetitions of the input. With ``times == 0`` the
    input object is returned unchanged.
    """
    extended = df
    for _ in range(times):
        extended = extendDataframe(extended)
    return extended

def describe(df, name):
    """Print a one-line summary of the 'id' column and the row count of `df`.

    Args:
        df: DataFrame with an 'id' column.
        name: Label used to identify the frame in the printed line.
    """
    ids = df['id']
    # len(df) is the number of rows (tuples); df.size would be
    # rows * columns and overstate the count.
    print(f'- {name} has {ids.nunique()} IDs [{ids.min()} - {ids.max()}] and {len(df)} tuples')

    
def plotHist(df, key, bins=10):
    """Show a histogram of column `key` of `df` on a new figure.

    Args:
        df: DataFrame containing the column to plot.
        key: Column name; also used for the title and the x-axis label.
        bins: Number of histogram bins passed to pandas' `hist`.
    """
    figure, axes = plt.subplots()
    df[key].hist(ax=axes, bins=bins)
    axes.set_title(f'{key} distribution')
    axes.set_xlabel(key)
    axes.set_ylabel('count')
    plt.show()
    
def convertToFileID(key, parallelism):
    """Map `key` to a bucket number via SHA-1 of its string form.

    The key is converted with `str`, encoded as UTF-8 and hashed; the
    digest, read as a big-endian integer, is reduced modulo `parallelism`.

    Returns:
        An int; for a positive `parallelism` it lies in [0, parallelism).
    """
    digest = hashlib.sha1(str(key).encode('utf-8')).digest()
    # Big-endian interpretation of the raw digest equals parsing the
    # hexadecimal digest as a base-16 number.
    return int.from_bytes(digest, 'big') % parallelism

# Input dataset location; the file read is DATA_DIR/FILENAME.EXTENSION.
DATA_DIR = '../../data/input'
FILENAME = 'h1_server'
EXTENSION = 'txt'

# Zero-based positions of the timestamp and id columns in the raw file.
# Column layouts noted for the two dataset formats:
#   SG: timestamp, id, value
#   LR: type, time, id, ...
# The values below correspond to the LR layout (time at 1, id at 2).
TIMESTAMP_INDEX, ID_INDEX = 1, 2

In [None]:
# Load the header-less input file and name the timestamp and id columns
# 't' and 'id', which is what the helper functions look up.
df = pd.read_csv(f'{DATA_DIR}/{FILENAME}.{EXTENSION}', header=None).pipe(
    lambda raw: raw.rename(columns={
        raw.columns[TIMESTAMP_INDEX]: 't',
        raw.columns[ID_INDEX]: 'id',
    })
)

In [None]:
# Distribution of the raw ids before any splitting.
plotHist(df, key='id')

In [None]:
# Split configuration
# DUPLICATIONS: number of doubling passes applied to every split (0 = none).
DUPLICATIONS = 0
# DRY_RUN: when True the splits are computed and described but not written.
DRY_RUN = False
# PARALLELISMS: numbers of output files to split into -> 1, 2, 4, 8, 16.
PARALLELISMS = [2 ** exponent for exponent in range(5)]

In [None]:
# For every configured parallelism, assign each row to one of `parallelism`
# files by hashing its id, then describe, optionally duplicate and write
# each per-file split.
describe(df, 'Initial Dataframe')
# DUPLICATIONS does not change between iterations, so validate it once.
assert DUPLICATIONS >= 0
for parallelism in PARALLELISMS:
    assert parallelism >= 1
    print(f'> Processing parallelism {parallelism}')
    # The id -> file assignment and its histogram depend only on
    # `parallelism`, so they are computed once per level rather than once
    # per output file. The 'file' column is left on `df`, as before.
    df['file'] = df['id'].apply(convertToFileID, parallelism=parallelism)
    plotHist(df, 'file', bins=parallelism)
    for i in range(parallelism):
        # Rows hashed to file `i`, without the helper column.
        split = df[df['file'] == i].drop(columns=['file'])
        print(split.head(3))
        name = f'{FILENAME}_{parallelism}_{i}'
        # Describe the split before duplication.
        describe(split, name)
        split = duplicate(split, DUPLICATIONS)
        if not DRY_RUN:
            split.to_csv(f'{DATA_DIR}/{name}.txt', header=False, index=False)
    print('> Done!')
print('> All done!')