In [33]:
import dask.bag as db

## Read text file to bag

In [34]:
# Read the text file into a bag, strip surrounding whitespace from every
# element, and spread the result across two partitions.
b = db.read_text("animals.txt").map(str.strip).repartition(npartitions=2)

In [35]:
# Trigger execution and pull the bag's elements back as a Python list.
b.compute()

['dog', 'cat', 'snake', 'mouse']

In [36]:
# Echo the bag itself: only its lazy description is shown, not its contents.
b

dask.bag<repartition, npartitions=2>

In [51]:
# Confirm what kind of object a computed bag comes back as.
type(b.compute())

list

## Convert bag to DataFrame

In [53]:
# Turn the bag of strings into a Dask DataFrame with a single named column.
ddf = b.to_dataframe(columns=["animal_type"])

In [54]:
# Materialize the DataFrame and print it. The index labels repeat
# (0, 1, 0, 1) in the output, which looks like a separate 0-based index per
# partition rather than one global index.
print(ddf.compute())

  animal_type
0         dog
1         cat
0       snake
1       mouse


In [55]:
# Check how many partitions the DataFrame has (the bag it came from had two).
ddf.npartitions

2

In [61]:
# Add a column holding the length of each animal name.
# `meta` declares the name and dtype of the resulting Series up front; it is
# named after the column being created so the metadata matches the output.
# `len` is passed directly instead of being wrapped in a lambda.
ddf["len_animal_type"] = ddf["animal_type"].apply(
    len, meta=("len_animal_type", "int64")
)

In [63]:
# Materialize and print again to show the newly added length column.
print(ddf.compute())

  animal_type  len_animal_type
0         dog                3
1         cat                3
0       snake                5
1       mouse                5


## Write text files

In [48]:
# Compute each name's length, then render it as a string. The two chained
# maps yield the same elements as one combined function would.
animals_length = b.map(len).map(str)

In [49]:
# Inspect the stringified lengths before writing them out.
animals_length.compute()

['3', '3', '5', '5']

In [64]:
# Write the bag out as text. The returned paths show one numbered `.part`
# file per partition inside the target directory.
animals_length.to_textfiles("../tmp/animals_length")

['/Users/powers/Documents/code/coiled/coiled-resources/local/../tmp/animals_length/0.part',
 '/Users/powers/Documents/code/coiled/coiled-resources/local/../tmp/animals_length/1.part']

## Read JSON to bag

In [80]:
import json

In [81]:
# Load the file as raw text; nothing is parsed as JSON yet.
# NOTE(review): the next step parses each element separately, so the file is
# presumably one JSON object per line - confirm.
pets = db.read_text("pets1.json")

In [82]:
# Decode each text element into a Python object with json.loads.
lines = pets.map(json.loads)

In [83]:
# Keep only the records whose species is "cat".
cats = lines.filter(lambda pet: pet["species"] == "cat")

In [84]:
# Materialize the filtered records to check the filter's result.
cats.compute()

[{'name': 'fluffy', 'age': 5, 'species': 'cat'},
 {'name': 'harvey', 'age': 8, 'species': 'cat'},
 {'name': 'chunkers', 'age': 9, 'species': 'cat'}]

In [85]:
# Extract just the "name" field from every cat record and materialize it.
cats.pluck("name").compute()

['fluffy', 'harvey', 'chunkers']

In [86]:
# Preview the first two parsed records; the result comes back as a tuple.
lines.take(2)

({'name': 'fluffy', 'age': 5, 'species': 'cat'},
 {'name': 'fido', 'age': 2, 'species': 'dog'})

In [87]:
# Materialize every parsed record as a list of dicts.
lines.compute()

[{'name': 'fluffy', 'age': 5, 'species': 'cat'},
 {'name': 'fido', 'age': 2, 'species': 'dog'},
 {'name': 'harvey', 'age': 8, 'species': 'cat'},
 {'name': 'chunkers', 'age': 9, 'species': 'cat'},
 {'name': 'lola', 'age': 1, 'species': 'bird'}]

## Read large text file to bag