In [103]:
import dask.bag as db

Helper function that displays all the functions that can be invoked on an object.

In [104]:
def display_object_method(obj):
    object_methods = [method_name for method_name in dir(obj)
                  if callable(getattr(obj, method_name))]
    print(object_methods)

Takes an array of lists and filename as input and creates a json file from the data.

In [105]:
import json
def create_json_file(data, filename):
    json_object = dict(data)
    json_string = json.dumps(json_object)
    open("output/%s" % filename, "w").write(json_string)
    

Computes the top K words from the bag of words for each of the subproject.

In [106]:
def compute_top_k(words, k=40):
    word_frequencies = words.frequencies()
    words_with_count_greater_than_two = word_frequencies.filter(lambda x: x[1] > 1 )
    top40_words = word_frequencies.topk(k, key=1)
    word_counts = top40_words.compute()
    print(word_counts)
    return word_counts

## Subproject 1
### Top 40 words across all files
- Find top 40 words
- No need to filter out stop words

In [144]:
booklist = [
    "handout/data/pg36.txt",
    "handout/data/pg3207.txt",
    "handout/data/4300-0.txt",
    "handout/data/pg19033.txt",
    "handout/data/pg1497.txt",
    "handout/data/pg42671.txt",
    "handout/data/pg514.txt",
    "handout/data/pg6130.txt"
]
word_bag = db.read_text(booklist, encoding="utf-8-sig")

In [145]:
def get_lowercase_words(word_bag):    
    split_words = word_bag.str.split()
    stripped_words = split_words.flatten().map(lambda x: x.strip())
    word_array = stripped_words.filter(lambda x: x!= "")
    return word_array.map(lambda x: x.lower())

In [146]:
lowercase_words = get_lowercase_words(word_bag)

In [147]:
top_40 = compute_top_k(lowercase_words, k=40)

[('the', 78844), ('and', 45168), ('of', 44739), ('to', 33436), ('a', 24234), ('in', 22126), ('that', 14818), ('he', 13019), ('is', 12918), ('his', 12270), ('i', 11044), ('with', 10296), ('for', 10036), ('as', 9639), ('be', 8834), ('was', 8787), ('not', 8141), ('it', 8123), ('but', 7856), ('by', 7701), ('or', 7407), ('her', 7403), ('they', 6735), ('which', 6517), ('you', 6354), ('on', 6214), ('from', 5811), ('at', 5695), ('are', 5590), ('she', 5458), ('all', 5437), ('their', 5285), ('have', 5146), ('had', 4647), ('this', 4090), ('my', 3841), ('so', 3710), ('we', 3629), ('no', 3620), ('if', 3571)]


In [137]:
create_json_file(top_40, "sp1.json")

## Subproject 2
### Top 40 filtered words
- Find top 40 words
- Filter out the stopwords from the handout file.

In [148]:
def get_filtered_words(words):
    stopwords = open("handout/data/stopwords.txt", "r").read().split("\n")
    return words.filter(lambda x: x not in stopwords)

In [149]:
filtered_words = get_filtered_words(lowercase_words)
top_40_filtered = compute_top_k(filtered_words, 40)

[('i', 11044), ('not', 8141), ('you', 6354), ('have', 5146), ('no', 3620), ('one', 3498), ('like', 2253), ('more', 2087), ('out', 2021), ('up', 1831), ('man', 1783), ('now', 1579), ('only', 1555), ('must', 1523), ('little', 1485), ('those', 1447), ('good', 1444), ('should', 1417), ('after', 1379), ('great', 1358), ('every', 1356), ('first', 1318), ('own', 1289), ('did', 1271), ('how', 1266), ('see', 1251), ('these', 1244), ('men', 1233), ('over', 1209), ('where', 1205), ('make', 1196), ('upon', 1188), ('nor', 1181), ('never', 1177), ('much', 1167), ('time', 1166), ('said,', 1163), ('two', 1142), ('old', 1140), ('made', 1128)]


In [140]:
create_json_file(top_40_filtered, "sp2.json")

## Subproject 3
### Top 40 words without punctuations
- Find top 40 words
- Remove leading and trailing punctuation marks
- Filter out the stopwords from the handout file.

In [170]:
from string import punctuation

def puntuation_stripper(word):
    all_punctuations = ".,:;’!?"
    if word[0] in all_punctuations:
        word = word[1:]
    if word[-1] in all_punctuations:
        word = word[:-1]
    return word

In [171]:
long_words = filtered_words.filter(lambda x: len(x) > 1)
words_without_punctuations = long_words.map(puntuation_stripper)

filtered_words_without_punctuations = get_filtered_words(words_without_punctuations)
top_40_without_punctuations = compute_top_k(filtered_words_without_punctuations, 40)

[('not', 8648), ('you', 7230), ('have', 5252), ('one', 3954), ('no', 3871), ('man', 2601), ('more', 2402), ('like', 2374), ('out', 2313), ('up', 2186), ('now', 2104), ('men', 1852), ('good', 1830), ('mr', 1784), ('only', 1703), ('time', 1697), ('god', 1665), ('first', 1645), ('say', 1611), ('must', 1569), ('little', 1551), ('own', 1527), ('those', 1514), ('see', 1494), ('after', 1437), ('great', 1434), ('should', 1429), ('did', 1399), ('us', 1377), ('these', 1366), ('every', 1357), ('before', 1341), ('over', 1340), ('know', 1334), ('how', 1308), ('much', 1307), ('same', 1305), ('where', 1275), ('two', 1263), ('made', 1243)]


In [163]:
create_json_file(top_40_without_punctuations, "sp3.json")

## Subproject 4
### TF-IDF Calculation

In [95]:
def get_words_for_document(document):
    word_bag = db.read_text(document)
    lowercase_words = get_lowercase_words(word_bag)
    long_words = lowercase_words.filter(lambda x: len(x) > 1)
    words_without_punctuations = get_words_without_punctuations(long_words)
    return get_filtered_words(words_without_punctuations)    

In [96]:
import math

def get_IDF(words_by_document):
    unique_words = []
    for words_for_single_document in words_by_document:
        unique_words.append(words_for_single_document.distinct())
    large_bag = db.concat(unique_words)
    frequencies = large_bag.frequencies()
    idf = frequencies.map(lambda x: (x[0], math.log(8/x[1])))
    return idf

In [18]:
def get_top5_TF_IDF(tf, idf):
    tf = tf.frequencies()
    tf_idf = tf.join(idf, lambda x: x[0], lambda x: x[0]).map(lambda x: (x[0][0], x[0][1] * x[1][1]))
    return tf_idf.topk(5, key=1).compute()

In [19]:
book_terms = []
for book in booklist:
    book_terms.append(get_words_for_document(book))

idf = get_IDF(book_terms)
tf_idf = []
for terms in book_terms:
    tf_idf.extend(get_top5_TF_IDF(terms, idf))

print(tf_idf)

[('martians', 330.6312051270939), ('martian', 106.74466580623158), ('woking', 103.97207708399179), ('pit', 77.48551098792638), ('heat-ray', 70.70101241711441), ('soveraign', 1023.0852385064792), ('common-wealth', 867.1271228804915), ('onely', 856.7299151720923), ('civill', 675.8185010459466), ('kingdome', 592.6408393787532), ('stephen', 1000.211381548001), ('bloom', 614.1284019761115), ('j', 365.9817113356511), ('don’t', 361.8228282522914), ('dedalus', 332.71064666877373), ('alice', 230.12486394590184), ('[illustration]', 45.747713916956386), ('rabbit', 29.424877590351784), ('mouse', 23.53990207228143), ('caterpillar', 19.408121055678468), ('republic', 228.73856958478194), ('thrasymachus', 178.8319725844659), ('glaucon', 178.8319725844659), ('plato', 151.81117224637262), ('adeimantus', 149.71979100094816), ('bennet', 576.0053070453145), ('bingley', 494.9070869198009), ('darcy', 474.11267150300256), ('elizabeth', 402.0253647247683), ('jane', 350.7324733633323), ('jo', 1651.0765840937897

In [21]:
create_json_file(tf_idf, "sp4.json")