In [1]:
import dask.bag as db

Helper function that displays all the functions that can be invoked on an object.

In [2]:
def display_object_method(obj):
    """Print the names of all callable attributes of ``obj``.

    Every name reported by ``dir(obj)`` is looked up with ``getattr``; the names
    whose attribute is callable are collected in ``dir`` order and printed as a
    single list. Nothing is returned.
    """
    callable_names = []
    for attr_name in dir(obj):
        attribute = getattr(obj, attr_name)
        if callable(attribute):
            callable_names.append(attr_name)
    print(callable_names)

Takes a list of (key, value) pairs and a filename as input, and writes the data as a JSON object to a file of that name in the `output` directory.

In [3]:
import json
def create_json_file(data, filename):
    """Write ``data`` as a JSON object to ``output/<filename>``.

    Args:
        data: Anything ``dict()`` accepts, such as a list of ``(word, count)``
            pairs. Because the data goes through ``dict()``, a key that occurs
            more than once keeps only its last value.
        filename: Name of the file to create inside the ``output`` directory.

    Raises:
        OSError: If the file cannot be opened for writing (for example when the
            ``output`` directory does not exist).
    """
    json_object = dict(data)
    json_string = json.dumps(json_object)
    # A context manager guarantees the handle is flushed and closed, instead of
    # relying on garbage collection of the anonymous file object.
    with open("output/%s" % filename, "w") as json_file:
        json_file.write(json_string)
    

Computes the top K words from the bag of words for each of the subprojects.

In [4]:
def compute_top_k(words, k=40):
    """Print and return the ``k`` most frequent entries of ``words``.

    Args:
        words: Bag-like collection exposing ``frequencies()``.
        k: Number of entries to keep (defaults to 40).

    Returns:
        The computed result of the top-k selection; in this notebook that is a
        list of ``(word, count)`` pairs ordered from most to least frequent.
    """
    # key=1 ranks the (word, count) pairs by their count field.
    top_k = words.frequencies().topk(k, key=1).compute()
    print(top_k)
    return top_k

## Subproject 1
### Top 40 words across all files
- Find top 40 words
- No need to filter out stop words

In [5]:
# The eight input texts under handout/data/ that all four subprojects work on.
booklist = [
    "handout/data/pg36.txt",
    "handout/data/pg3207.txt",
    "handout/data/4300-0.txt",
    "handout/data/pg19033.txt",
    "handout/data/pg1497.txt",
    "handout/data/pg42671.txt",
    "handout/data/pg514.txt",
    "handout/data/pg6130.txt"
]
# Read every listed file into one Dask bag (used by Subprojects 1-3).
word_bag = db.read_text(booklist)

In [6]:
def get_lowercase_words(word_bag):
    """Turn a bag of text into a bag of lowercase words.

    Each element of ``word_bag`` is split on whitespace, the per-element word
    lists are flattened into one bag, empty strings are discarded, and every
    remaining word is lowercased.
    """
    tokens = word_bag.str.split().flatten()
    non_empty_tokens = tokens.filter(lambda token: token != "")
    return non_empty_tokens.map(lambda token: token.lower())

In [7]:
# Lowercased words of the whole corpus; the starting point for Subprojects 1 and 2.
lowercase_words = get_lowercase_words(word_bag)

In [8]:
# Subproject 1: the 40 most frequent words, with no stop-word filtering.
top_40 = compute_top_k(lowercase_words, k=40)

[('the', 78837), ('and', 45168), ('of', 44739), ('to', 33436), ('a', 24234), ('in', 22126), ('that', 14818), ('he', 13019), ('is', 12918), ('his', 12270), ('i', 11044), ('with', 10296), ('for', 10036), ('as', 9639), ('be', 8834), ('was', 8787), ('not', 8141), ('it', 8123), ('but', 7856), ('by', 7701), ('or', 7407), ('her', 7403), ('they', 6735), ('which', 6517), ('you', 6354), ('on', 6214), ('from', 5811), ('at', 5695), ('are', 5590), ('she', 5458), ('all', 5437), ('their', 5285), ('have', 5146), ('had', 4647), ('this', 4090), ('my', 3841), ('so', 3710), ('we', 3629), ('no', 3620), ('if', 3571)]


In [9]:
# Save the Subproject 1 result; create_json_file writes it to output/sp1.json.
create_json_file(top_40, "sp1.json")

## Subproject 2
### Top 40 filtered words
- Find top 40 words
- Filter out the stopwords from the handout file.

In [10]:
def get_filtered_words(words):
    """Remove the stop words listed in ``handout/data/stopwords.txt`` from ``words``.

    Args:
        words: Bag-like collection of strings exposing ``filter()``.

    Returns:
        The result of ``words.filter(...)``, keeping only the words that are not
        an entry of the stop-word file.

    Raises:
        OSError: If the stop-word file cannot be opened.
    """
    # Read inside a context manager so the file handle is closed right away.
    with open("handout/data/stopwords.txt", "r") as stopwords_file:
        # Splitting on "\n" is kept on purpose: a trailing newline produces an
        # empty-string entry, so empty words are still filtered out as before.
        # A frozenset gives constant-time lookups instead of scanning a list
        # once for every word in the bag.
        stopwords = frozenset(stopwords_file.read().split("\n"))
    return words.filter(lambda x: x not in stopwords)

In [11]:
# Subproject 2: drop the stop words first, then take the 40 most frequent of what remains.
filtered_words = get_filtered_words(lowercase_words)
top_40_filtered = compute_top_k(filtered_words, 40)

[('i', 11044), ('not', 8141), ('you', 6354), ('have', 5146), ('no', 3620), ('one', 3498), ('like', 2253), ('more', 2087), ('out', 2021), ('up', 1831), ('man', 1783), ('now', 1579), ('only', 1555), ('must', 1523), ('little', 1485), ('those', 1447), ('good', 1444), ('should', 1417), ('after', 1379), ('great', 1358), ('every', 1356), ('first', 1318), ('own', 1289), ('did', 1271), ('how', 1266), ('see', 1251), ('these', 1244), ('men', 1233), ('over', 1209), ('where', 1205), ('make', 1196), ('upon', 1188), ('nor', 1181), ('never', 1177), ('much', 1167), ('time', 1166), ('said,', 1163), ('two', 1142), ('old', 1140), ('made', 1128)]


In [12]:
# Save the Subproject 2 result; create_json_file writes it to output/sp2.json.
create_json_file(top_40_filtered, "sp2.json")

## Subproject 3
### Top 40 words without punctuation
- Find top 40 words
- Remove leading and trailing punctuation marks
- Filter out the stopwords from the handout file.

In [13]:
from string import punctuation

def puntuation_stripper(word):
    """Strip at most one punctuation character from each end of ``word``.

    Only the characters ``.,:;'!?`` count as punctuation. One leading and one
    trailing character are removed at most, so ``"..."`` becomes ``"."``.

    Args:
        word: The string to clean; may be empty.

    Returns:
        The word without its leading/trailing punctuation character. This can
        be the empty string, for example for ``""``, ``"."`` or ``"''"``.
    """
    all_punctuations = ".,:;'!?"
    # Guard both index accesses: an empty input, or a word that becomes empty
    # after the leading strip (such as "."), used to raise IndexError.
    if word and word[0] in all_punctuations:
        word = word[1:]
    if word and word[-1] in all_punctuations:
        word = word[:-1]
    return word

def get_words_without_punctuations(words):
    """Apply ``puntuation_stripper`` to every element of ``words``.

    Args:
        words: Bag-like collection of strings exposing ``map()``.

    Returns:
        The result of ``words.map(puntuation_stripper)``.
    """
    # The punctuation set lives in puntuation_stripper; the unused local copy
    # and the commented-out alternative that used to sit here were dead code.
    return words.map(puntuation_stripper)

In [14]:
# Subproject 3: start from the stop-word-filtered words of Subproject 2, drop
# one-character tokens, strip punctuation, then filter stop words again because
# stripping can turn a token such as "the," into a stop word.
long_words = filtered_words.filter(lambda x: len(x) > 1)
words_without_punctuations = get_words_without_punctuations(long_words)

# Bind the result to a new name rather than overwriting `filtered_words`:
# rebinding the cell's own input made the cell non-idempotent, because every
# re-run stripped and filtered the previous run's output again.
filtered_words_without_punctuations = get_filtered_words(words_without_punctuations)
top_40_without_punctuations = compute_top_k(filtered_words_without_punctuations, 40)

[('not', 8659), ('you', 7245), ('have', 5256), ('one', 3958), ('no', 3883), ('man', 2602), ('more', 2403), ('like', 2378), ('out', 2313), ('up', 2190), ('now', 2106), ('men', 1855), ('good', 1832), ('mr', 1788), ('only', 1704), ('time', 1697), ('god', 1664), ('first', 1646), ('say', 1611), ('must', 1571), ('little', 1556), ('own', 1528), ('those', 1514), ('see', 1494), ('after', 1437), ('great', 1436), ('should', 1429), ('did', 1400), ('us', 1377), ('these', 1366), ('every', 1361), ('over', 1341), ('before', 1341), ('know', 1334), ('how', 1314), ('much', 1308), ('same', 1305), ('where', 1277), ('two', 1265), ('made', 1244)]


In [15]:
# Save the Subproject 3 result; create_json_file writes it to output/sp3.json.
create_json_file(top_40_without_punctuations, "sp3.json")

## Subproject 4
### TF-IDF Calculation

In [16]:
def get_words_for_document(document):
    """Build the cleaned word bag for a single document.

    The document is read with ``db.read_text``, lowercased and tokenized,
    reduced to tokens longer than one character, stripped of punctuation, and
    finally filtered against the stop-word list.

    Args:
        document: Path of the text file to read.

    Returns:
        The bag returned by ``get_filtered_words`` for this document.
    """
    lowered = get_lowercase_words(db.read_text(document))
    multi_char_tokens = lowered.filter(lambda token: len(token) > 1)
    stripped_tokens = get_words_without_punctuations(multi_char_tokens)
    return get_filtered_words(stripped_tokens)

In [17]:
import math

def get_IDF(words_by_document):
    """Compute the inverse document frequency of every word in the corpus.

    For each word, the IDF is ``log(N / df)``, where ``N`` is the number of
    documents passed in and ``df`` is the number of documents containing the
    word.

    Args:
        words_by_document: Iterable with one word bag per document.

    Returns:
        A bag of ``(word, idf)`` pairs.
    """
    # Materialize once so that any iterable works and the document count is
    # taken from the input instead of the previously hard-coded 8.
    documents = list(words_by_document)
    document_count = len(documents)
    # distinct() leaves each word once per document, so counting words across
    # the concatenated bag yields the document frequency.
    unique_words = [document_words.distinct() for document_words in documents]
    large_bag = db.concat(unique_words)
    frequencies = large_bag.frequencies()
    idf = frequencies.map(lambda x: (x[0], math.log(document_count / x[1])))
    return idf

In [18]:
def get_top5_TF_IDF(tf, idf):
    """Return the five highest TF-IDF scores for one document.

    Args:
        tf: Word bag of a single document; its ``frequencies()`` supply the
            term counts.
        idf: Bag of ``(word, idf)`` pairs.

    Returns:
        The computed top five ``(word, score)`` pairs, where ``score`` is the
        product of the two numeric fields of the joined records.
    """
    def word_of(record):
        # Join key: the word is the first field of both record kinds.
        return record[0]

    def to_score(joined):
        # `joined` holds the two matched records; multiply their numeric fields.
        first, second = joined[0], joined[1]
        return (first[0], first[1] * second[1])

    term_counts = tf.frequencies()
    scored = term_counts.join(idf, word_of, word_of).map(to_score)
    return scored.topk(5, key=1).compute()

In [None]:
# One cleaned word bag per book, in the same order as booklist.
book_terms = [get_words_for_document(book) for book in booklist]

# The IDF is computed once over all books and shared by every per-book score.
idf = get_IDF(book_terms)

# Collect the top five (word, score) pairs of every book into one flat list.
tf_idf = []
for terms in book_terms:
    tf_idf += get_top5_TF_IDF(terms, idf)

print(tf_idf)

In [None]:
# Save the Subproject 4 result to output/sp4.json. create_json_file passes the
# list through dict(), so a word that appears in the top five of several books
# keeps only its last score.
create_json_file(tf_idf, "sp4.json")