# Using Dask for parallelization of in-core data

This example shows how Dask can drastically increase performance even for data fitting into RAM. 

Let's say we have a big list of companies names (~7.5 million names) stored in memory.

Our goal is to construct a frequency table of all words in the company names.

In [2]:
len(names)

7592219

In [3]:
names[:20]

(u'CHICKEN SHAK LTD',
 u'Quality Installation Services',
 u'Pine Shadows Elementary School',
 u'Clay County Information',
 u'Denessen Contracting',
 u'VERITY SMITH FACE AND BODY LIMITED',
 u'AllNations Bank',
 u'LEARNING ABOUT DOGS LIMITED',
 u'Moss Printing Inc.',
 u'Touei Housing Corp (8875)',
 u'Fts Travel',
 u'Jencraft',
 u'Dee Empresa Hotel',
 u'PLAYOUT MARKETING LIMITED',
 u'GUTSTECH',
 u'Honey Crust Bakery Co Limited',
 u'Off Site Manufacturing',
 u'Lapatra Architects',
 u'Island Rent A Car',
 u'Compton Nursery')

## Naive approach

Take each name, split into words, normalize them and count frequencies using `Counter`.

In [None]:
import re
import nltk
from collections import Counter
from nltk.corpus import stopwords

def count_words():
    counter = Counter()
    for name in names:
        counter.update([word for word in nltk.word_tokenize(name.lower()) 
                        if word not in stopwords.words('english') 
                        and re.search("^[0-9a-zA-Z]+$", word) is not None])
%time count_words()

## Dask.Bag approach

In [9]:
import re
import nltk
import pandas as pd
from dask.bag as bag
from nltk.corpus import stopwords

In [13]:
names_bag = bag.from_sequence(names)  # create a Dask.Bag from a list of names 

In [4]:
names_bag.take(20)  # take a look at the data

(u'CHICKEN SHAK LTD',
 u'Quality Installation Services',
 u'Pine Shadows Elementary School',
 u'Clay County Information',
 u'Denessen Contracting',
 u'VERITY SMITH FACE AND BODY LIMITED',
 u'AllNations Bank',
 u'LEARNING ABOUT DOGS LIMITED',
 u'Moss Printing Inc.',
 u'Touei Housing Corp (8875)',
 u'Fts Travel',
 u'Jencraft',
 u'Dee Empresa Hotel',
 u'PLAYOUT MARKETING LIMITED',
 u'GUTSTECH',
 u'Honey Crust Bakery Co Limited',
 u'Off Site Manufacturing',
 u'Lapatra Architects',
 u'Island Rent A Car',
 u'Compton Nursery')

In [34]:
# define helper functions
no_stopwords = lambda x: x not in stopwords.words('english')
is_word = lambda x: re.search("^[0-9a-zA-Z]+$", x) is not None

In [40]:
# the main pipeline
names_counts = (names_bag.map(nltk.word_tokenize).concat()  # tokenize each name and concatenate all sublists of words into one list
                .map(lambda x: x.lower())
                .filter(no_stopwords)
                .filter(is_word).frequencies())

In [43]:
%time values = names_counts.compute()

CPU times: user 2min 11s, sys: 18.7 s, total: 2min 29s
Wall time: 7min 47s


In [44]:
len(values)

1712054

In [45]:
sorted_counts = sorted(values, key=lambda x: x[1], reverse=True)

In [46]:
sorted_counts[:100]

[(u'inc', 630006),
 (u'limited', 467549),
 (u'ltd', 448123),
 (u'llc', 276294),
 (u'group', 175037),
 (u'services', 166412),
 (u'solutions', 138319),
 (u'consulting', 119824),
 (u'co', 100037),
 (u'design', 89707),
 (u'de', 84081),
 (u'management', 80216),
 (u'company', 76966),
 (u'international', 70972),
 (u'systems', 66150),
 (u'media', 64676),
 (u'associates', 58710),
 (u'gmbh', 58630),
 (u'construction', 57867),
 (u'marketing', 57505),
 (u'school', 56117),
 (u'business', 53091),
 (u'corp', 51425),
 (u'center', 50980),
 (u'technologies', 50114),
 (u'service', 43261),
 (u'technology', 42886),
 (u'studio', 39990),
 (u'home', 39407),
 (u'care', 39240),
 (u'pvt', 38573),
 (u'corporation', 38489),
 (u'enterprises', 37888),
 (u'engineering', 37386),
 (u'pty', 36433),
 (u'health', 35891),
 (u'insurance', 34082),
 (u'capital', 33891),
 (u'law', 33846),
 (u'global', 33352),
 (u'financial', 33057),
 (u'development', 32527),
 (u'new', 32163),
 (u'e', 32020),
 (u'church', 31601),
 (u'production

In [None]:
names_ex = names_bag.take(20)
