-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathfake_data.py
More file actions
46 lines (34 loc) · 1.06 KB
/
fake_data.py
File metadata and controls
46 lines (34 loc) · 1.06 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
import uuid
import pandas as pd
import numpy as np
import os
from multiprocessing import Pool, cpu_count
import shutil
# Directory that is recreated on every run and receives the generated files.
PATH = "../resources/inline.parquet"
# Total number of client rows to generate across all files.
NUM_CLIENTS = 60 * 10**6
# Number of rows written into each individual file.
BATCH_SIZE = 2 * 10**6
# Number of ``class_<i>`` score columns in each file.
NUM_CLASSES = 30
class RandomData:
    """Generate batches of synthetic per-client class scores as parquet files.

    Every batch has ``num_rows`` rows: an ``id_client`` column of random
    UUID4 strings plus ``num_classes`` float columns named ``class_0`` ..
    ``class_<num_classes - 1>``, each sampled uniformly from [0, 1).
    """

    def __init__(self,
                 num_rows: int,
                 num_classes: int,
                 path: str) -> None:
        """Store the batch shape and the output directory.

        Args:
            num_rows: Number of rows in every generated file.
            num_classes: Number of ``class_<i>`` columns in every file.
            path: Directory the files are written into.
        """
        self.num_rows = num_rows
        self.num_classes = num_classes
        self.path = path

    def _build_frame(self) -> pd.DataFrame:
        """Return one freshly sampled batch as a DataFrame."""
        # Use a new Generator seeded from OS entropy on every call. The global
        # ``np.random`` state is copied as-is into forked worker processes, so
        # sampling from it makes different workers emit identical numbers.
        rng = np.random.default_rng()
        columns = {
            "id_client": [str(uuid.uuid4()) for _ in range(self.num_rows)],
        }
        for c in range(self.num_classes):
            columns[f"class_{c}"] = rng.uniform(size=self.num_rows)
        return pd.DataFrame(columns)

    def generate_file(self,
                      filename: str) -> None:
        """Sample one batch and write it to ``<path>/<filename>`` as parquet.

        Args:
            filename: Name of the file to create inside ``self.path``.
        """
        self._build_frame().to_parquet(f"{self.path}/{filename}")
def main() -> None:
    """Recreate the output directory and fill it with generated batch files.

    Writes ``NUM_CLIENTS // BATCH_SIZE`` files (any remainder rows are not
    generated), each under a random UUID4 name, using one worker process
    per CPU.
    """
    # Start from an empty directory so files from an earlier run never mix in.
    if os.path.isdir(PATH):
        shutil.rmtree(PATH)
    # makedirs, unlike mkdir, also creates a missing parent directory.
    os.makedirs(PATH)

    rd = RandomData(BATCH_SIZE, NUM_CLASSES, PATH)
    filenames = [str(uuid.uuid4()) for _ in range(NUM_CLIENTS // BATCH_SIZE)]
    with Pool(cpu_count()) as p:
        p.map(rd.generate_file, filenames)


# Guard the entry point: under the "spawn" start method each worker re-imports
# this module, and unguarded top-level code would wipe the directory and start
# a new Pool inside every child.
if __name__ == "__main__":
    main()