In [257]:
import dask.bag as db

Helper function that displays all the functions that can be invoked on an object.

In [258]:
def display_object_method(obj):
    object_methods = [method_name for method_name in dir(obj)
                  if callable(getattr(obj, method_name))]
    print(object_methods)

Stand in function for creating the json files.

Will be updated later

In [259]:
def create_json_file(data, filename):
    print(data)
    print(filename)

Computes the top K words from the bag of words for each of the subproject.

In [260]:
def compute_top_k(words, k=40):
    word_frequencies = words.frequencies()
    top40_words = word_frequencies.topk(k, key=1)
    word_counts = top40_words.compute()
    print(word_counts)
    return word_counts

## Subproject 1
### Top 40 words across all files
- Find top 40 words
- No need to filter out stop words

In [261]:
word_bag = db.read_text([
    "handout/data/pg36.txt",
    "handout/data/pg3207.txt",
    "handout/data/4300-0.txt",
    "handout/data/pg19033.txt",
    "handout/data/pg1497.txt",
    "handout/data/pg42671.txt",
    "handout/data/pg514.txt",
    "handout/data/pg6130.txt"
])

In [279]:
stripped_words = word_bag.str.strip()

In [280]:
split_words = stripped_words.str.split(" ")

In [281]:
word_array = split_words.flatten()

In [277]:
lowercase_words = word_array.map(lambda x: x.lower())

In [278]:
top_40 = compute_top_k(lowercase_words, k=40)

[('the', 78837), ('and', 45168), ('of', 44739), ('', 35057), ('to', 33436), ('a', 24234), ('in', 22126), ('that', 14818), ('he', 13019), ('is', 12918), ('his', 12270), ('i', 11044), ('with', 10296), ('for', 10036), ('as', 9639), ('be', 8834), ('was', 8787), ('not', 8141), ('it', 8123), ('but', 7856), ('by', 7701), ('or', 7407), ('her', 7403), ('they', 6735), ('which', 6517), ('you', 6354), ('on', 6214), ('from', 5811), ('at', 5695), ('are', 5590), ('she', 5458), ('all', 5437), ('their', 5285), ('have', 5146), ('had', 4647), ('this', 4090), ('my', 3841), ('so', 3710), ('we', 3629), ('no', 3620)]


## Subproject 2
### Top 40 filtered words
- Find top 40 words
- Filter out the stopwords from the handout file.

In [267]:
def get_filtered_words(words):
    stopwords = open("handout/data/stopwords.txt", "r").read().split("\n")
    return words.filter(lambda x: x not in stopwords)

In [268]:
filtered_words = get_filtered_words(lowercase_words)
top_40_filtered = compute_top_k(filtered_words, 40)

[('i', 11044), ('not', 8141), ('you', 6354), ('have', 5146), ('no', 3620), ('one', 3498), ('like', 2253), ('more', 2087), ('out', 2021), ('up', 1831), ('man', 1783), ('now', 1579), ('only', 1555), ('must', 1523), ('little', 1485), ('those', 1447), ('good', 1444), ('should', 1417), ('after', 1379), ('great', 1358), ('every', 1356), ('first', 1318), ('own', 1289), ('did', 1271), ('how', 1266), ('see', 1251), ('these', 1244), ('men', 1233), ('over', 1209), ('where', 1205), ('make', 1196), ('upon', 1188), ('nor', 1181), ('never', 1177), ('much', 1167), ('time', 1166), ('said,', 1163), ('two', 1142), ('old', 1140), ('made', 1128)]


## Subproject 3
### Top 40 words without punctuations
- Find top 40 words
- Remove leading and trailing punctuation marks
- Filter out the stopwords from the handout file.

In [269]:
from string import punctuation

all_punctuations = punctuation + "—"

words_without_punctuations = lowercase_words.map(lambda x: x.strip(all_punctuations))

In [270]:
filtered_words = get_filtered_words(words_without_punctuations)
top_40_filtered = compute_top_k(filtered_words, 400)

[('i', 12361), ('not', 8833), ('you', 7780), ('have', 5294), ('no', 4073), ('one', 4026), ('man', 2655), ('more', 2445), ('like', 2414), ('out', 2354), ('now', 2249), ('up', 2217), ('good', 1917), ('men', 1886), ('mr', 1820), ('god', 1756), ('time', 1738), ('only', 1730), ('say', 1670), ('first', 1659), ('must', 1581), ('little', 1569), ('own', 1549), ('see', 1526), ('those', 1525), ('how', 1460), ('after', 1457), ('did', 1453), ('great', 1451), ('should', 1437), ('us', 1406), ('these', 1384), ('know', 1377), ('every', 1372), ('before', 1367), ('over', 1353), ('much', 1341), ('where', 1332), ('same', 1313), ('well', 1309), ('two', 1292), ('jo', 1248), ('made', 1247), ('never', 1238), ('make', 1238), ('old', 1230), ('way', 1226), ('upon', 1221), ('also', 1219), ('down', 1211), ('nor', 1208), ('power', 1199), ('another', 1179), ('too', 1168), ('life', 1163), ('though', 1157), ('most', 1153), ('being', 1149), ('day', 1137), ('yet', 1131), ('again', 1117), ('because', 1103), ('without', 10