In [1]:
import dask.bag as db
import dask.array as da
import dask.dataframe as df
import math
import pandas as pd
import numpy as np
import requests

In [2]:
from dask.distributed import Client
client = Client()  # set up local cluster on your laptop
client

0,1
Client  Scheduler: tcp://127.0.0.1:52337  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 4  Cores: 12  Memory: 16.90 GB


Helper function that displays all the functions that can be invoked on an object.

In [3]:
def display_object_method(obj):
    object_methods = [method_name for method_name in dir(obj)
                  if callable(getattr(obj, method_name))]
    print(object_methods)

In [4]:
def get_split_words(word_bag):    
    split_words = word_bag.str.split()
    stripped_words = split_words.flatten().map(lambda x: x.strip())
    word_array = stripped_words.filter(lambda x: x!= "")
    return word_array

In [5]:
def get_filtered_words(words):
    stopwords = open("data/stopwords.txt", "r").read().split("\n")
    return words.filter(lambda x: x not in stopwords)

In [6]:
def get_words_for_document(document):
    word_bag = db.read_text(document)
    split_words = get_split_words(word_bag)
    long_words = split_words.filter(lambda x: len(x) < 3)
#     words_without_punctuations = long_words.map(puntuation_stripper)
    return get_filtered_words(long_words)    
#     return long_words

In [7]:
def get_IDF(words_by_document):
    unique_words = []
    for words_for_single_document in words_by_document:
        unique_words.append(words_for_single_document.distinct())
    large_bag = db.concat(unique_words)
    frequencies = large_bag.frequencies()
    idf = frequencies.map(lambda x: (x[0], round(math.log((len(words_by_document) + 1)/x[1]), 10)))
    return idf

In [8]:
def get_TF_IDF(tf, idf):
    tf = np.asarray(tf.frequencies().compute())
    tf_idf = {}
    for idf_item in idf: 
        col_name = str(int(idf_item[0], 16))
        tf_idf[col_name] = idf_item[1]
        for i in range(len(tf)):
            tf_item = tf[i]
            if idf_item[0] == tf_item[0]:
                tf_idf[col_name] = float(idf_item[1]) * float(tf_item[1])
                np.delete(tf, i)
                break
    return tf_idf

In [9]:
set_name = "X_small_train.txt"
# set_name = "X_train.txt"
url = "data/train/" + set_name
url = 'https://storage.googleapis.com/uga-dsp/project1/files/' + set_name
file_names = pd.read_csv(url, header=None)[0].to_numpy()

In [10]:
all_file_words = []
for file_name in file_names:
    file_name = "data/train/" + file_name + ".bytes"
    file_name = "https://storage.googleapis.com/uga-dsp/project1/data/bytes/" + file_name + ".bytes"
    
    all_file_words.append(get_words_for_document(file_name))
idf = get_IDF(all_file_words)

In [11]:
idf = np.asarray(idf.topk(256, key=0).compute())

In [12]:
cols = np.arange(256).astype(str)
pd_df = pd.DataFrame(columns=cols)

tf_idf_df = df.from_pandas(pd_df, npartitions=1)

for file_words in all_file_words:
    tf_idf_bag_array = get_TF_IDF(file_words, idf)
    tf_idf_pandas = pd.DataFrame(tf_idf_bag_array, index=[0])
    pandas_to_dask = df.from_pandas(tf_idf_pandas, npartitions=1)
    tf_idf_df = tf_idf_df.append(pandas_to_dask)

In [13]:
final_dataframe = tf_idf_df.compute()

In [14]:
final_dataframe

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,246,247,248,249,250,251,252,253,254,255
0,19.017138529,11.403720393,10.664932399,10.406356601,10.174166088,10.000023204,5.0935470108,10.485512457,5.309620914,10.453850115,...,10.480235400,10.844352340,11.419551564,16.018577971,10.812689997,10.269153116,10.828521169,10.332477801,10.459127172,6.758897094
0,216091.74510,63073.977493,34469.061513,34757.231047,34459.900540,31780.073742,16951.324451,34235.198172,17442.104702,34570.882330,...,32687.854212,33660.869663,34772.534512,52524.917166,34730.360270,32429.985540,33200.245904,33735.540020,32862.577574,46866.192449
0,1944825.7059,63073.977493,34469.061513,34757.231047,34459.900540,31780.073742,16951.324451,34235.198172,17442.104702,34570.882330,...,32687.854212,33660.869663,34772.534512,52524.917166,34730.360270,32429.985540,33200.245904,33735.540020,32862.577574,46866.192449
0,10116983322.0,5045918.1994,1137479.0299,1216503.0866,1137176.7178,826281.91729,135610.59560,753174.35978,1622115.7372,172854.41165,...,915259.91793,437591.30561,486815.48316,840398.67465,451494.68350,518879.76864,1361210.0820,640975.26038,953014.74964,32665736.136
0,492474514148,16101524974.0,1647069635.2,2113065861.4,2555236084.8,566829395.26,92350815.603,466214928.70,2559698633.3,94378508.760,...,518952373.46,256428505.08,452738399.33,381540998.29,202269618.20,216372863.52,956930687.64,276260337.22,551795540.04,165909273834
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,734507221140,430176408872,232433883550,2548829.7880,861993661724,139244349784,281348503313,285658243323,753255949827,501818689308,...,201775407343,670858053100,162352603868,128961915635,522338985001,948511054415,309047557450,211343945234,104258725352,180951414812
0,616618812147,180975215212,722869377840,8298989789.7,264201057318,429708063433,872180360270,897252542277,240740601564,155864884899,...,617432746469,206087593912,490629568889,410356815550,162499658233,293469320236,950321239158,648191880032,318093371048,925566486763
0,2.8906473294,285289329260,3.9773935768,910150210236,325971264518,441653947596,1.0006525273,966161537523,257929480515,462918708150,...,222275788728,114996877402,563684311696,137059176393,339624285706,440203980354,1.1170075845,137416678566,284788995099,5.3933684750
0,74483.309736,196564347860,902.86834193,324923625054,798629598069,130287914540,161.10505689,202893922879,500383192199,661973752654,...,955785891530,554284949077,523099041253,955302459459,236378502851,332354005167,996.37076537,762662566041,168879874093,18650.268186
