In [14]:
import dask.bag as db
import re
import pandas as pd
import numpy as np
import unidecode as uni

In [2]:
ok_bag = db.from_url('https://www.gutenberg.org/files/1342/1342-0.txt')

In [6]:
ok_bag.take(5)

(b'\xef\xbb\xbf\r\n',
 b'The Project Gutenberg EBook of Pride and Prejudice, by Jane Austen\r\n',
 b'\r\n',
 b'This eBook is for the use of anyone anywhere at no cost and with\r\n',
 b'almost no restrictions whatsoever.  You may copy it, give it away or\r\n')

In [8]:
ok_strip = ok_bag.map(lambda x: x.strip())

In [9]:
ok_strip.take(10)

(b'\xef\xbb\xbf',
 b'The Project Gutenberg EBook of Pride and Prejudice, by Jane Austen',
 b'',
 b'This eBook is for the use of anyone anywhere at no cost and with',
 b'almost no restrictions whatsoever.  You may copy it, give it away or',
 b're-use it under the terms of the Project Gutenberg License included',
 b'with this eBook or online at www.gutenberg.org',
 b'',
 b'',
 b'Title: Pride and Prejudice')

In [10]:
def encode_ascii(x):
    return x.decode('ascii','ignore')

In [11]:
ok_strip_encode = ok_strip.map(encode_ascii)

In [12]:
ok_strip_encode.take(19)

('',
 'The Project Gutenberg EBook of Pride and Prejudice, by Jane Austen',
 '',
 'This eBook is for the use of anyone anywhere at no cost and with',
 'almost no restrictions whatsoever.  You may copy it, give it away or',
 're-use it under the terms of the Project Gutenberg License included',
 'with this eBook or online at www.gutenberg.org',
 '',
 '',
 'Title: Pride and Prejudice',
 '',
 'Author: Jane Austen',
 '',
 'Release Date: August 26, 2008 [EBook #1342]',
 'Last Updated: November 12, 2019',
 '',
 '',
 'Language: English',
 '')

In [15]:
def f_unidecode(x):
    return uni.unidecode(x)

In [20]:
ok_strip_encode_unidecode = ok_strip_encode.map(f_unidecode)

In [21]:
ok_strip_encode_unidecode.take(10)

('',
 'The Project Gutenberg EBook of Pride and Prejudice, by Jane Austen',
 '',
 'This eBook is for the use of anyone anywhere at no cost and with',
 'almost no restrictions whatsoever.  You may copy it, give it away or',
 're-use it under the terms of the Project Gutenberg License included',
 'with this eBook or online at www.gutenberg.org',
 '',
 '',
 'Title: Pride and Prejudice')

In [26]:
ok_strip_encode_unidecode_lower = ok_strip_encode_unidecode.map(str.lower)

In [27]:
ok_strip_encode_unidecode_lower.take(10)

('',
 'the project gutenberg ebook of pride and prejudice, by jane austen',
 '',
 'this ebook is for the use of anyone anywhere at no cost and with',
 'almost no restrictions whatsoever.  you may copy it, give it away or',
 're-use it under the terms of the project gutenberg license included',
 'with this ebook or online at www.gutenberg.org',
 '',
 '',
 'title: pride and prejudice')

In [29]:
ok_strip_encode_unidecode_lower_notnull = ok_strip_encode_unidecode_lower.filter(lambda x: x!='')

In [30]:
ok_strip_encode_unidecode_lower_notnull.take(10)

('the project gutenberg ebook of pride and prejudice, by jane austen',
 'this ebook is for the use of anyone anywhere at no cost and with',
 'almost no restrictions whatsoever.  you may copy it, give it away or',
 're-use it under the terms of the project gutenberg license included',
 'with this ebook or online at www.gutenberg.org',
 'title: pride and prejudice',
 'author: jane austen',
 'release date: august 26, 2008 [ebook #1342]',
 'last updated: november 12, 2019',
 'language: english')

In [39]:
ok = ok_strip_encode_unidecode_lower_notnull.map(str.split)

In [40]:
ok_flatten = ok.flatten()

In [41]:
ok_flatten.take(10)

('the',
 'project',
 'gutenberg',
 'ebook',
 'of',
 'pride',
 'and',
 'prejudice,',
 'by',
 'jane')

In [43]:
ok_tuple_1 = ok_flatten.map(lambda x: (x,1))

In [45]:
ok_tuple_1.take(10)

(('the', 1),
 ('project', 1),
 ('gutenberg', 1),
 ('ebook', 1),
 ('of', 1),
 ('pride', 1),
 ('and', 1),
 ('prejudice,', 1),
 ('by', 1),
 ('jane', 1))

In [47]:
group_words = ok_flatten.groupby(lambda x: x)

In [48]:
group_words.take(10)

(('the',
  ['the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
   'the',
 

In [49]:
group_words_count = group_words.map(lambda x: (x[0],len(x[1])))

In [50]:
group_words_count.take(10)

(('the', 4495),
 ('project', 87),
 ('gutenberg', 28),
 ('ebook', 10),
 ('of', 3688),
 ('pride', 30),
 ('and', 3495),
 ('prejudice,', 2),
 ('by', 653),
 ('jane', 155))

In [56]:
def add_bin_op(count,x):
    return count + x[1]
def add_combine_op(x,y):
    return x+y

In [57]:
ok_fold = ok_tuple_1.foldby(lambda x: x[0], add_bin_op,0, add_combine_op)

In [58]:
ok_fold.take(10)

(('the', 4495),
 ('project', 87),
 ('gutenberg', 28),
 ('ebook', 10),
 ('of', 3688),
 ('pride', 30),
 ('and', 3495),
 ('prejudice,', 2),
 ('by', 653),
 ('jane', 155))

In [63]:
from spacy.lang.en import STOP_WORDS

In [68]:
ok_sin_stop_words = ok_flatten.filter(lambda x: x not in STOP_WORDS)

In [70]:
ok_frequencies = ok_sin_stop_words.frequencies()

In [74]:
ok_frequencies.topk(100,key=lambda x: x[1]).compute()

[('mr.', 782),
 ('elizabeth', 400),
 ('said', 343),
 ('mrs.', 343),
 ('miss', 283),
 ('darcy', 216),
 ('soon', 200),
 ('know', 191),
 ('think', 191),
 ('lady', 172),
 ('bennet', 168),
 ('little', 167),
 ('good', 165),
 ('shall', 161),
 ('jane', 155),
 ('bingley', 150),
 ('elizabeth,', 140),
 ('time', 136),
 ('it.', 135),
 ('great', 132),
 ('it,', 130),
 ('young', 126),
 ('chapter', 122),
 ('dear', 117),
 ('however,', 117),
 ('her,', 115),
 ('and,', 106),
 ('him,', 104),
 ('darcy,', 102),
 ('her.', 101),
 ('collins', 101),
 ('bennet,', 98),
 ('him.', 97),
 ('thought', 97),
 ('hope', 95),
 ('having', 95),
 ('long', 94),
 ('saw', 94),
 ('wish', 93),
 ('till', 90),
 ('felt', 90),
 ('sister', 90),
 ('wickham', 90),
 ('man', 89),
 ('project', 87),
 ('you,', 87),
 ('lydia', 82),
 ('sisters', 81),
 ('catherine', 81),
 ('me,', 79),
 ('day', 79),
 ('cried', 77),
 ('came', 76),
 ('jane,', 76),
 ('let', 75),
 ('come', 75),
 ('heard', 74),
 ('family', 74),
 ('like', 73),
 ('replied', 72),
 ('father