## Shuffle CSVs

Shuffle CSV files without loading them into memory. The code below creates an offsets file for each class CSV, recording the start and end byte offsets of every data row in that CSV. You only need to create the offsets file once per class CSV; after that, reuse the offsets files whenever you shuffle.

In [19]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import os
from tqdm import tqdm
import mmap
import random
import pickle
import re
from memory_map import MemoryMap

In [12]:
# Directory that is scanned for the per-class CSV files; the pickled
# ".offsets" files are read back from this directory as well.
INPUT_PATH = './train_simplified'

### Create Offsets

In [20]:
def get_filenames(input_path=None):
    """Return the base names of the ``.csv`` files found in a directory.

    Only the trailing ``.csv`` extension is removed, so a file name that
    itself contains dots keeps them (``"a.b.csv"`` -> ``"a.b"``).

    Args:
        input_path: Directory to scan. When omitted, the module-level
            ``INPUT_PATH`` is used.

    Returns:
        A sorted list of file names without the ``.csv`` extension.
    """
    if input_path is None:
        input_path = INPUT_PATH
    split_names = map(os.path.splitext, os.listdir(input_path))
    # os.listdir returns entries in arbitrary order; sorting keeps the
    # file -> index mapping stable from one run to the next.
    return sorted(stem for stem, ext in split_names if ext == '.csv')

In [7]:
# Collect the CSV base names once; later cells look files up by their
# position in this list.
filenames = get_filenames()
print(len(filenames))

340


In [8]:
# Open one MemoryMap per CSV, in the same order as `filenames`, so that a
# position in `memmaps` refers to the same file as that position in
# `filenames`.
memmaps = [MemoryMap(INPUT_PATH, csv_name) for csv_name in filenames]

In [9]:
def _line_offsets(data, size):
    """Return ``(start, end)`` byte offsets for every row after the first line.

    ``start`` is the offset of the row's first byte and ``end`` is one past
    the position of the newline that terminates the row, so the row text
    without its newline is the slice ``[start:end - 1]``.

    Args:
        data: Iterable yielding the file's bytes as integers.
        size: Total number of bytes in the file.

    Returns:
        A list of ``(start, end)`` tuples, one per row. The first line of
        the file is skipped (presumably the CSV header).
    """
    offsets = []
    num_of_lines = 0
    start_index = 0
    # NOTE(review): this walks the file one byte at a time in Python, which
    # is slow on large files. If the underlying object exposes mmap.find,
    # searching for b'\n' would be much faster -- confirm against MemoryMap.
    for index, char in enumerate(data):
        if char == 10:  # ord('\n')
            num_of_lines += 1
            # Line 1 is skipped; every later newline closes a data row.
            if num_of_lines > 1:
                offsets.append((start_index, index + 1))
            start_index = index + 1
    # A final row that lacks a trailing newline is not closed by the loop.
    # Record it only if it really exists: a file that ends in a newline has
    # start_index == size here, and unconditionally appending would add a
    # bogus empty pointer. The end offset is size + 1 (as if a newline sat
    # at position `size`) so the [start:end - 1] convention still yields the
    # whole row.
    if num_of_lines >= 1 and start_index < size:
        offsets.append((start_index, size + 1))
    return offsets


# Write one pickled "<name>.offsets" file next to each CSV. `total` lets
# tqdm show real progress instead of a bare iteration counter.
for file_index, memmap in tqdm(enumerate(memmaps), total=len(memmaps)):
    file_metadata = _line_offsets(memmap, memmap.size)

    # Use INPUT_PATH (the name the loading cell reads from); the lowercase
    # `input_path` is never defined in this notebook.
    file_metadata_path = os.path.join(INPUT_PATH, filenames[file_index] + ".offsets")
    with open(file_metadata_path, 'wb') as fp:
        pickle.dump(file_metadata, fp)

340it [3:37:30, 31.13s/it]


### Shuffle

In [13]:
# Build one flat list of (file_index, start, end) pointers covering every
# recorded row of every file, by loading each file's pickled offsets.
metadata = []
for file_no, csv_name in enumerate(filenames):
    offsets_path = os.path.join(INPUT_PATH, csv_name + ".offsets")
    with open(offsets_path, 'rb') as handle:
        file_offsets = pickle.load(handle)
    # Prefix each (start, end) pair with the index of the file it belongs to.
    metadata.extend((file_no,) + pair for pair in file_offsets)

In [14]:
def read_line(line_pointer):
    """Return the stored row that *line_pointer* refers to.

    Args:
        line_pointer: A ``(file_index, start, end)`` tuple; ``file_index``
            selects an entry of the global ``memmaps`` list.

    Returns:
        The slice ``[start:end - 1]`` of that entry's ``memmap`` attribute,
        i.e. the row without the final byte covered by ``end``.
    """
    which_file, begin, stop = line_pointer
    source = memmaps[which_file]
    return source.memmap[begin:stop - 1]

In [16]:
# Shuffle a copy so that `metadata` keeps its original file-by-file order.
shuffled = list(metadata)
random.shuffle(shuffled)
# Spot-check: display the row that ended up first after shuffling.
read_line(shuffled[0])

b'PH,"[[[80, 76, 79, 99, 128, 164, 176, 186, 189, 186], [86, 73, 56, 29, 14, 14, 25, 52, 95, 119]], [[83, 84, 103, 128, 158, 175, 186, 194, 199, 188], [86, 53, 23, 3, 0, 10, 20, 35, 91, 123]], [[88, 91, 183, 239, 253, 255, 255, 243, 234, 207, 182], [89, 85, 64, 63, 71, 77, 83, 97, 104, 116, 119]], [[83, 190], [91, 129]], [[79, 5, 0, 0, 7, 36, 135, 177, 182, 185], [88, 110, 117, 128, 131, 137, 141, 130, 132, 141]], [[6, 3, 3, 7, 20, 41, 59, 82, 156, 178, 192, 219, 230, 249], [132, 137, 150, 162, 183, 207, 215, 221, 224, 219, 212, 183, 165, 108]], [[43, 10], [139, 172]], [[85, 66, 41], [145, 161, 193]], [[122, 72], [147, 210]], [[169, 140, 110], [143, 172, 216]], [[220, 164, 141], [125, 186, 220]]]",6384796701818880,True,2017-03-23 11:58:07.536100,basket'

In [6]:
# Call close() on every MemoryMap that was opened earlier.
for m in memmaps:
    m.close()

In [18]:
# Total number of row pointers across all files.
len(shuffled)

49707919