# Implementation

## Bag-Of-Words

In [1]:
from glob import glob
import os

import numpy as np
import gensim
import nltk

from settings import project_root
from lib.topic_models import vector_models

In [2]:
# For debugging
import logging

logger = logging.getLogger('text_similar')
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)


In [3]:
cwd = project_root

data_folder = os.path.join(cwd, 'tmp', 'testFiles', '*')

tmp_folder = os.path.join(cwd, 'tmp', 'modeldir')

In [4]:
vectors = vector_models.VectorModels(data_folder, tmp_folder)

corpus, dictionary = vectors.load_corpus()

In [15]:
npcorpus = gensim.matutils.corpus2dense(corpus=corpus, num_terms=len(dictionary), num_docs=len(corpus))

In [16]:
npcorpus.shape

(19792, 10001)

In [7]:
npcorpus = np.delete(npcorpus,1,axis=1)

In [9]:
npcorpus = npcorpus.T

In [16]:
from sklearn.cluster import MiniBatchKMeans

In [14]:
k_means = MiniBatchKMeans(n_clusters=3)
for item in np.array_split(npcorpus,100, axis=0):
    k_means.partial_fit(item)

In [24]:
labels = k_means.predict(npcorpus.T)

In [25]:
print labels.shape

(10001,)


In [17]:
# Initialization
k_means = MiniBatchKMeans(n_clusters=3)
c_size = 1000

# Training
for item in gensim.utils.chunkize(corpus, chunksize=c_size, maxsize=3):
    npchunk = gensim.matutils.corpus2dense(corpus=item, num_terms=len(dictionary), num_docs=len(item))
    if npchunk.shape[1] == c_size:
        k_means.partial_fit(npchunk.T)
    else:
        continue

# Prediction
labels = np.array([])
for item in gensim.utils.chunkize(corpus, chunksize=c_size, maxsize=3):
    npchunk = gensim.matutils.corpus2dense(corpus=item, num_terms=len(dictionary), num_docs=len(item))
    labels = np.concatenate((labels,k_means.predict(npchunk.T)))

In [27]:
labels = np.array([])
for item in gensim.utils.chunkize(corpus, chunksize=c_size, maxsize=3):
    npchunk = gensim.matutils.corpus2dense(corpus=item, num_terms=len(dictionary), num_docs=len(item))
    labels = np.concatenate((labels,k_means.predict(npchunk.T)))

In [29]:
print labels

[ 0.  2.  2. ...,  2.  2.  2.]
