In [111]:
import dask.bag as db
import dask.array as da
import dask.dataframe as df
import math
import pandas as pd
import numpy as np
import requests

Helper function that displays all the functions that can be invoked on an object.

In [112]:
def display_object_method(obj):
    object_methods = [method_name for method_name in dir(obj)
                  if callable(getattr(obj, method_name))]
    print(object_methods)

In [113]:
def get_split_words(word_bag):    
    split_words = word_bag.str.split()
    stripped_words = split_words.flatten().map(lambda x: x.strip())
    word_array = stripped_words.filter(lambda x: x!= "")
    return word_array

In [114]:
def get_filtered_words(words):
    stopwords = open("data/stopwords.txt", "r").read().split("\n")
    return words.filter(lambda x: x not in stopwords)

In [115]:
def get_words_for_document(document):
    word_bag = db.read_text(document)
    split_words = get_split_words(word_bag)
    long_words = split_words.filter(lambda x: len(x) < 3)
#     words_without_punctuations = long_words.map(puntuation_stripper)
    return get_filtered_words(long_words)    
#     return long_words

In [116]:
def get_IDF(words_by_document):
    unique_words = []
    for words_for_single_document in words_by_document:
        unique_words.append(words_for_single_document.distinct())
    large_bag = db.concat(unique_words)
    frequencies = large_bag.frequencies()
    idf = frequencies.map(lambda x: (x[0], math.log((len(words_by_document) + 1)/x[1])))
    return idf

In [117]:
def get_TF_IDF(tf, idf):
    tf = tf.frequencies()
    joined_sets = idf.join(tf, lambda x: x[0], lambda x: x[0])
    computed_tf_idf = joined_sets.map(lambda x: (x[0][0], x[0][1] * x[1][1], 0))
    concated = db.concat([idf, computed_tf_idf])
    folded = concated.foldby(lambda x: x[0], lambda x,y: x[1] if len(x) == 3 else y[1])
    array_format = folded.map(lambda x: x[1][1] if isinstance(x[1], tuple) else x[1])
    return array_format

In [118]:
set_name = "X_small_train.txt"
# set_name = "X_train.txt"
# url = 'https://storage.googleapis.com/uga-dsp/project1/files/' + set_name
url = "data/train/" + set_name
file_names = pd.read_csv(url, header=None)[0].to_numpy()

In [119]:
all_file_words = []
for file_name in file_names:
#     file_name = "https://storage.googleapis.com/uga-dsp/project1/data/bytes/" + file_name + ".bytes"
    file_name = "data/train/" + file_name + ".bytes"
    all_file_words.append(get_words_for_document(file_name))
idf = get_IDF(all_file_words)

In [120]:
idf = idf.topk(256, key=0)

In [None]:
cols = np.arange(256).astype(str)
pd_df = pd.DataFrame(columns=cols)

tf_idf_df = df.from_pandas(pd_df, npartitions=1)

for file_words in all_file_words:
    tf_idf_bag_array = get_TF_IDF(file_words, idf)
    tf_idf_pandas = pd.DataFrame(data=[tf_idf_array], columns=cols)
    pandas_to_dask = df.from_pandas(tf_idf_pandas, npartitions=1)
    tf_idf_df = tf_idf_df.append(pandas_to_dask)

In [109]:
final_dataframe = tf_idf_df.compute()

In [110]:
final_dataframe

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,246,247,248,249,250,251,252,253,254,255
0,0.287682,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,...,0.693147,0.287682,0.693147,0.287682,0.693147,0.693147,0.693147,0.693147,0.693147,2.589139
0,0.287682,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,...,0.693147,0.287682,0.693147,0.287682,0.693147,0.693147,0.693147,0.693147,0.693147,2.589139
0,0.287682,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,0.693147,...,0.693147,0.287682,0.693147,0.287682,0.693147,0.693147,0.693147,0.693147,0.693147,2.589139
