forked from dask/dask
/
datasets.py
142 lines (116 loc) · 4.3 KB
/
datasets.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
from __future__ import absolute_import, print_function, division
import random
from .utils import import_required
def timeseries(
    start='2000-01-01',
    end='2000-01-31',
    freq='1s',
    partition_freq='1d',
    dtypes={'name': str, 'id': int, 'x': float, 'y': float},
    seed=None,
):
    """ Build a timeseries dataframe filled with random data

    This is a thin convenience wrapper: every argument is forwarded,
    unchanged and by keyword, to
    ``dask.dataframe.io.demo.make_timeseries``.

    Parameters
    ----------
    start : datetime (or datetime-like string)
        Start of time series
    end : datetime (or datetime-like string)
        End of time series
    freq : string
        String like '2s' or '1H' or '12W' for the time series frequency
    partition_freq : string
        String like '1M' or '2Y' to divide the dataframe into partitions
    dtypes : dict
        Mapping of column names to types.
        Valid types include {float, int, str, 'category'}
    seed : int (optional)
        Randomstate seed

    Returns
    -------
    The object returned by ``make_timeseries``

    Examples
    --------
    >>> import dask
    >>> df = dask.datasets.timeseries()
    >>> df.head()  # doctest: +SKIP
              timestamp    id     name         x         y
    2000-01-01 00:00:00   967    Jerry -0.031348 -0.040633
    2000-01-01 00:00:01  1066  Michael -0.262136  0.307107
    2000-01-01 00:00:02   988    Wendy -0.526331  0.128641
    2000-01-01 00:00:03  1016   Yvonne  0.620456  0.767270
    2000-01-01 00:00:04   998   Ursula  0.684902 -0.463278
    """
    # Imported at call time so that importing this module does not pull in
    # the dataframe machinery.
    from dask.dataframe.io.demo import make_timeseries

    options = {
        'start': start,
        'end': end,
        'freq': freq,
        'partition_freq': partition_freq,
        'seed': seed,
        'dtypes': dtypes,
    }
    return make_timeseries(**options)
def _generate_mimesis(field, schema_description, records_per_partition, seed):
    """ Yield the records for a single partition of a dask bag

    Parameters
    ----------
    field: dict
        Keyword arguments used to build the ``mimesis.schema.Field``
    schema_description: callable
        Called with the constructed ``Field``; its result is what the
        ``mimesis.schema.Schema`` is built from
    records_per_partition: int
        Number of records to yield
    seed:
        Seed handed to the ``Field``

    Yields
    ------
    The first element of ``Schema.create(iterations=1)``, once per record

    See Also
    --------
    _make_mimesis
    """
    from mimesis.schema import Field, Schema

    provider = Field(seed=seed, **field)

    def describe():
        # Schema wants a zero-argument callable; bind the seeded provider.
        return schema_description(provider)

    record_factory = Schema(schema=describe)

    for _ in range(records_per_partition):
        batch = record_factory.create(iterations=1)
        yield batch[0]
def _make_mimesis(field, schema, npartitions, records_per_partition, seed=None):
    """
    Make a Dask Bag filled with data randomly generated by the mimesis project

    The bag's graph holds one ``_generate_mimesis`` task for each seed
    produced by ``dask.bag.core.random_state_data_python``.

    Parameters
    ----------
    field: dict
        keyword arguments to pass to ``mimesis.Field``; a falsy value is
        replaced by an empty dict
    schema: Callable[Field] -> dict
        The schema to use to generate the data
    npartitions: int
        Number of partitions declared on the resulting bag
    records_per_partition: int
        Number of records each task generates
    seed: int, None
        Seed for random data.  When None, a value from ``random.random()``
        is used instead.

    Returns
    -------
    Dask Bag

    See Also
    --------
    make_people
    """
    import dask.bag as db
    from dask.base import tokenize

    if not field:
        field = {}
    if seed is None:
        seed = random.random()

    partition_seeds = db.core.random_state_data_python(npartitions, seed)

    # The token is built from the top-level seed, not the per-partition ones.
    token = tokenize(field, schema, npartitions, records_per_partition, seed)
    name = 'mimesis-' + token

    graph = {}
    for index, partition_seed in enumerate(partition_seeds):
        graph[(name, index)] = (
            _generate_mimesis,
            field,
            schema,
            records_per_partition,
            partition_seed,
        )

    return db.Bag(graph, name, npartitions)
def make_people(npartitions=10, records_per_partition=1000, seed=None, locale='en'):
    """ Make a dataset of random people

    This makes a Dask Bag with dictionary records of randomly generated people.
    This requires the optional library ``mimesis`` to generate records.

    Parameters
    ----------
    npartitions : int
        Number of partitions
    records_per_partition : int
        Number of records in each partition
    seed : int, (optional)
        Random seed
    locale : str
        Language locale, like 'en', 'fr', 'zh', or 'ru'

    Returns
    -------
    b: Dask Bag
        Each record is a dict with the keys ``'age'``, ``'name'`` (a
        ``(name, surname)`` tuple), ``'occupation'``, ``'telephone'``,
        ``'address'`` (a dict with ``'address'`` and ``'city'``) and
        ``'credit-card'`` (a dict with ``'number'`` and
        ``'expiration-date'``).
    """
    # Check for the optional dependency up front, using the message below.
    import_required('mimesis',
                    'The mimesis module is required for this function. Try:\n'
                    ' pip install mimesis')

    def schema(field):
        # ``field`` is the mimesis Field; each call looks up one provider
        # method by its dotted name.
        return {
            'age': field('person.age'),
            'name': (field('person.name'), field('person.surname')),
            'occupation': field('person.occupation'),
            'telephone': field('person.telephone'),
            'address': {
                'address': field('address.address'),
                'city': field('address.city'),
            },
            'credit-card': {
                'number': field('payment.credit_card_number'),
                'expiration-date': field('payment.credit_card_expiration_date'),
            },
        }

    return _make_mimesis({'locale': locale}, schema, npartitions,
                         records_per_partition, seed)