In [10]:
import pandas as pd
# `tqdm` is the notebook (widget) progress bar, aliased so loops can call tqdm(...).
from tqdm import tqdm_notebook as tqdm
from tqdm._tqdm_notebook import tqdm_notebook
# Registers `progress_apply` on pandas objects; those bars are labelled 'apply'.
tqdm_notebook.pandas(desc='apply')
from scipy import sparse
import dill
import re
import os
import sys

In [11]:
def dense_row_to_sparse(dense, width):
    """Turn a single bag-of-words row into a 1 x ``width`` CSR matrix.

    Parameters
    ----------
    dense : iterable of (column_index, value) pairs
        The non-zero entries of the row.
    width : int
        Number of columns of the resulting matrix.

    Returns
    -------
    scipy.sparse.csr_matrix
        Matrix of shape ``(1, width)`` holding the given entries.
    """
    pairs = list(dense)
    col_idx = [col for col, _ in pairs]
    values = [val for _, val in pairs]
    # Every entry belongs to the one and only row, index 0.
    row_idx = [0] * len(pairs)
    return sparse.csr_matrix((values, (row_idx, col_idx)), shape=(1, width))

def dense_to_sparse(dense, width):
    """Convert a sequence of bag-of-words rows into a CSR matrix.

    Parameters
    ----------
    dense : sized iterable (e.g. list or pandas Series)
        One element per output row; each element is an iterable of
        ``(column_index, value)`` entries.
    width : int
        Number of columns of the resulting matrix.

    Returns
    -------
    scipy.sparse.csr_matrix
        Matrix of shape ``(len(dense), width)``; row ``i`` holds the entries
        of the ``i``-th element of ``dense`` in iteration order.
    """
    depth = len(dense)
    data = []
    column = []
    row = []
    # Pair each position with the element at that position in iteration order.
    # Indexing with ``dense[i]`` is a *label* lookup on a pandas Series, which
    # raises KeyError (or picks the wrong row) for any non-default index and is
    # much slower than plain iteration. ``zip`` stops as soon as the range is
    # exhausted, so an empty input is never iterated.
    for i, doc in zip(tqdm(range(depth), desc='encode row'), dense):
        for entry in doc:
            row.append(i)
            column.append(entry[0])
            data.append(entry[1])
    return sparse.csr_matrix((data, (row, column)), shape=(depth, width))

def _get_doc2bow(row, dictionary):
    """Split ``row`` on whitespace and return ``dictionary.doc2bow`` of the tokens.

    ``row`` is a string; ``dictionary`` is any object exposing ``doc2bow``.
    """
    return dictionary.doc2bow(row.split())

def get_sparse_doc_term_matrix(files, dictionary, column='title', save=False, filename='transforms/sparse_doc_term_matrix.pkd'):
    """Build one sparse document-term matrix from a sequence of feather files.

    Each file is read with ``pd.read_feather`` (only ``column`` is loaded), every
    value of that column is encoded with ``_get_doc2bow`` and the per-file
    matrices are stacked vertically in the order of ``files``.

    Parameters
    ----------
    files : iterable of str
        Paths of the feather files to process.
    dictionary : object supporting ``len()`` and ``doc2bow``
        Token dictionary (presumably a gensim ``Dictionary`` -- confirm);
        ``len(dictionary)`` is the matrix width.
    column : str
        Name of the text column to encode.
    save : bool
        When true, the result is serialised with ``dill`` to ``filename``.
    filename : str
        Destination of the serialised matrix.

    Returns
    -------
    scipy.sparse.csr_matrix
        One row per document, ``len(dictionary)`` columns. For an empty
        ``files`` sequence an empty ``(0, len(dictionary))`` matrix is returned.
    """
    width = len(dictionary)
    blocks = []
    for path in tqdm(files, desc='feathers'):
        df = pd.read_feather(path, columns=[column])
        # Apply on the column itself: same values as a row-wise apply over the
        # one-column frame, without building a Series object per row.
        dense_m = df[column].progress_apply(lambda text: _get_doc2bow(row=text, dictionary=dictionary))
        blocks.append(dense_to_sparse(dense_m, width=width))
        del df
        del dense_m
    if blocks:
        # Stack once at the end; stacking inside the loop recopies the whole
        # accumulated matrix on every iteration.
        sparse_m = sparse.vstack(blocks, format='csr')
    else:
        # No input files: return an empty matrix instead of failing on an
        # unbound local. Integer dtype matches the term counts stored otherwise.
        sparse_m = sparse.csr_matrix((0, width), dtype=int)
    if save:
        # Make sure the target directory exists so a long computation is not
        # lost to a missing folder at the very last step.
        target_dir = os.path.dirname(filename)
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)
        with open(filename, 'wb') as out:
            dill.dump(sparse_m, out)
    return sparse_m

In [12]:
%%time

folder = 'cleaned_cache'
files = [os.path.join(folder, 'data_{}.feather'.format(i)) for i in range(len(os.listdir(folder)))]

with open('transforms/dictionary.pkd', 'rb') as file:
    d = dill.load(file)
d.filter_extremes(keep_n=5000)

doc_term_matrix = get_sparse_doc_term_matrix(files, dictionary=d, save=True)

HBox(children=(IntProgress(value=0, description='feathers', max=41, style=ProgressStyle(description_width='ini…

HBox(children=(IntProgress(value=0, description='apply', max=2683916, style=ProgressStyle(description_width='i…

HBox(children=(IntProgress(value=0, description='encode row', max=2683916, style=ProgressStyle(description_wid…

HBox(children=(IntProgress(value=0, description='apply', max=2581474, style=ProgressStyle(description_width='i…

HBox(children=(IntProgress(value=0, description='encode row', max=2581474, style=ProgressStyle(description_wid…

HBox(children=(IntProgress(value=0, description='apply', max=1494096, style=ProgressStyle(description_width='i…

HBox(children=(IntProgress(value=0, description='encode row', max=1494096, style=ProgressStyle(description_wid…

HBox(children=(IntProgress(value=0, description='apply', max=2920704, style=ProgressStyle(description_width='i…

HBox(children=(IntProgress(value=0, description='encode row', max=2920704, style=ProgressStyle(description_wid…

HBox(children=(IntProgress(value=0, description='apply', max=2829413, style=ProgressStyle(description_width='i…

HBox(children=(IntProgress(value=0, description='encode row', max=2829413, style=ProgressStyle(description_wid…

HBox(children=(IntProgress(value=0, description='apply', max=335030, style=ProgressStyle(description_width='in…

HBox(children=(IntProgress(value=0, description='encode row', max=335030, style=ProgressStyle(description_widt…

HBox(children=(IntProgress(value=0, description='apply', max=1637789, style=ProgressStyle(description_width='i…

HBox(children=(IntProgress(value=0, description='encode row', max=1637789, style=ProgressStyle(description_wid…

HBox(children=(IntProgress(value=0, description='apply', max=2774958, style=ProgressStyle(description_width='i…

HBox(children=(IntProgress(value=0, description='encode row', max=2774958, style=ProgressStyle(description_wid…

HBox(children=(IntProgress(value=0, description='apply', max=1710598, style=ProgressStyle(description_width='i…

HBox(children=(IntProgress(value=0, description='encode row', max=1710598, style=ProgressStyle(description_wid…

HBox(children=(IntProgress(value=0, description='apply', max=2867479, style=ProgressStyle(description_width='i…

HBox(children=(IntProgress(value=0, description='encode row', max=2867479, style=ProgressStyle(description_wid…

HBox(children=(IntProgress(value=0, description='apply', max=2581474, style=ProgressStyle(description_width='i…

HBox(children=(IntProgress(value=0, description='encode row', max=2581474, style=ProgressStyle(description_wid…

HBox(children=(IntProgress(value=0, description='apply', max=2667549, style=ProgressStyle(description_width='i…

HBox(children=(IntProgress(value=0, description='encode row', max=2667549, style=ProgressStyle(description_wid…

HBox(children=(IntProgress(value=0, description='apply', max=1714149, style=ProgressStyle(description_width='i…

HBox(children=(IntProgress(value=0, description='encode row', max=1714149, style=ProgressStyle(description_wid…

HBox(children=(IntProgress(value=0, description='apply', max=2567327, style=ProgressStyle(description_width='i…

HBox(children=(IntProgress(value=0, description='encode row', max=2567327, style=ProgressStyle(description_wid…

HBox(children=(IntProgress(value=0, description='apply', max=2645516, style=ProgressStyle(description_width='i…

HBox(children=(IntProgress(value=0, description='encode row', max=2645516, style=ProgressStyle(description_wid…

HBox(children=(IntProgress(value=0, description='apply', max=2181246, style=ProgressStyle(description_width='i…

HBox(children=(IntProgress(value=0, description='encode row', max=2181246, style=ProgressStyle(description_wid…

HBox(children=(IntProgress(value=0, description='apply', max=1601517, style=ProgressStyle(description_width='i…

HBox(children=(IntProgress(value=0, description='encode row', max=1601517, style=ProgressStyle(description_wid…

HBox(children=(IntProgress(value=0, description='apply', max=1487397, style=ProgressStyle(description_width='i…

HBox(children=(IntProgress(value=0, description='encode row', max=1487397, style=ProgressStyle(description_wid…

HBox(children=(IntProgress(value=0, description='apply', max=335030, style=ProgressStyle(description_width='in…

HBox(children=(IntProgress(value=0, description='encode row', max=335030, style=ProgressStyle(description_widt…

HBox(children=(IntProgress(value=0, description='apply', max=2567327, style=ProgressStyle(description_width='i…

HBox(children=(IntProgress(value=0, description='encode row', max=2567327, style=ProgressStyle(description_wid…

HBox(children=(IntProgress(value=0, description='apply', max=1361631, style=ProgressStyle(description_width='i…

HBox(children=(IntProgress(value=0, description='encode row', max=1361631, style=ProgressStyle(description_wid…

HBox(children=(IntProgress(value=0, description='apply', max=2383862, style=ProgressStyle(description_width='i…

HBox(children=(IntProgress(value=0, description='encode row', max=2383862, style=ProgressStyle(description_wid…

HBox(children=(IntProgress(value=0, description='apply', max=2771874, style=ProgressStyle(description_width='i…

HBox(children=(IntProgress(value=0, description='encode row', max=2771874, style=ProgressStyle(description_wid…

HBox(children=(IntProgress(value=0, description='apply', max=2388760, style=ProgressStyle(description_width='i…

HBox(children=(IntProgress(value=0, description='encode row', max=2388760, style=ProgressStyle(description_wid…

HBox(children=(IntProgress(value=0, description='apply', max=879895, style=ProgressStyle(description_width='in…

HBox(children=(IntProgress(value=0, description='encode row', max=879895, style=ProgressStyle(description_widt…

HBox(children=(IntProgress(value=0, description='apply', max=2197294, style=ProgressStyle(description_width='i…

HBox(children=(IntProgress(value=0, description='encode row', max=2197294, style=ProgressStyle(description_wid…

HBox(children=(IntProgress(value=0, description='apply', max=2375352, style=ProgressStyle(description_width='i…

HBox(children=(IntProgress(value=0, description='encode row', max=2375352, style=ProgressStyle(description_wid…

HBox(children=(IntProgress(value=0, description='apply', max=2683916, style=ProgressStyle(description_width='i…

HBox(children=(IntProgress(value=0, description='encode row', max=2683916, style=ProgressStyle(description_wid…

HBox(children=(IntProgress(value=0, description='apply', max=2555366, style=ProgressStyle(description_width='i…

HBox(children=(IntProgress(value=0, description='encode row', max=2555366, style=ProgressStyle(description_wid…

HBox(children=(IntProgress(value=0, description='apply', max=2555649, style=ProgressStyle(description_width='i…

HBox(children=(IntProgress(value=0, description='encode row', max=2555649, style=ProgressStyle(description_wid…

HBox(children=(IntProgress(value=0, description='apply', max=727684, style=ProgressStyle(description_width='in…

HBox(children=(IntProgress(value=0, description='encode row', max=727684, style=ProgressStyle(description_widt…

HBox(children=(IntProgress(value=0, description='apply', max=373657, style=ProgressStyle(description_width='in…

HBox(children=(IntProgress(value=0, description='encode row', max=373657, style=ProgressStyle(description_widt…

HBox(children=(IntProgress(value=0, description='apply', max=2555366, style=ProgressStyle(description_width='i…

HBox(children=(IntProgress(value=0, description='encode row', max=2555366, style=ProgressStyle(description_wid…

HBox(children=(IntProgress(value=0, description='apply', max=1710598, style=ProgressStyle(description_width='i…

HBox(children=(IntProgress(value=0, description='encode row', max=1710598, style=ProgressStyle(description_wid…

HBox(children=(IntProgress(value=0, description='apply', max=2503773, style=ProgressStyle(description_width='i…

HBox(children=(IntProgress(value=0, description='encode row', max=2503773, style=ProgressStyle(description_wid…

HBox(children=(IntProgress(value=0, description='apply', max=1077185, style=ProgressStyle(description_width='i…

HBox(children=(IntProgress(value=0, description='encode row', max=1077185, style=ProgressStyle(description_wid…

HBox(children=(IntProgress(value=0, description='apply', max=2645286, style=ProgressStyle(description_width='i…

HBox(children=(IntProgress(value=0, description='encode row', max=2645286, style=ProgressStyle(description_wid…

HBox(children=(IntProgress(value=0, description='apply', max=2632243, style=ProgressStyle(description_width='i…

HBox(children=(IntProgress(value=0, description='encode row', max=2632243, style=ProgressStyle(description_wid…

HBox(children=(IntProgress(value=0, description='apply', max=1098258, style=ProgressStyle(description_width='i…

HBox(children=(IntProgress(value=0, description='encode row', max=1098258, style=ProgressStyle(description_wid…

HBox(children=(IntProgress(value=0, description='apply', max=2602043, style=ProgressStyle(description_width='i…

HBox(children=(IntProgress(value=0, description='encode row', max=2602043, style=ProgressStyle(description_wid…

HBox(children=(IntProgress(value=0, description='apply', max=2375352, style=ProgressStyle(description_width='i…

HBox(children=(IntProgress(value=0, description='encode row', max=2375352, style=ProgressStyle(description_wid…

CPU times: user 1h 23min 28s, sys: 4min 37s, total: 1h 28min 6s
Wall time: 1h 29min 12s


In [27]:
# Show the matrix repr (shape, dtype, stored-element count) as the cell output.
doc_term_matrix

<84639033x5000 sparse matrix of type '<class 'numpy.int64'>'
	with 336789133 stored elements in Compressed Sparse Row format>