# Doc2Vec With GenSim

Or, creating fixed-size vectors from arbitrary-length paragraphs.

In [13]:
import nltk
import random
from random import shuffle
from nltk.corpus import reuters
from nltk.tokenize import word_tokenize
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from keras.callbacks import ModelCheckpoint
from keras.layers import Dense, Dropout
from keras.models import Sequential
from keras.optimizers import Adam

Using TensorFlow backend.


In [2]:
# Download the corpora next to the notebook and register that directory with
# NLTK, since the current directory is not on NLTK's default search path.
nltk_data_dir = "./"
if nltk_data_dir not in nltk.data.path:
    nltk.data.path.append(nltk_data_dir)

# Attempt every package: a list comprehension does not short-circuit, so a
# failed 'reuters' download no longer prevents 'punkt' from being tried.
download_results = [nltk.download(package_name, download_dir=nltk_data_dir)
                    for package_name in ("reuters", "punkt")]

if not all(download_results):
    print("Download Failed")

[nltk_data] Downloading package reuters to ./...
[nltk_data]   Package reuters is already up-to-date!
[nltk_data] Downloading package punkt to ./...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
# One seed for the whole notebook: it seeds Python's global `random` module
# here and is passed to Doc2Vec as `seed=` further down.
seed = 42
random.seed(a=seed)

## Exploring The Reuters Dataset

In [4]:
# Position of the article to inspect within the corpus' file-id listing.
article_index = 2
article_id = reuters.fileids()[article_index]

# Raw, untokenised text of the article.
article_text = reuters.raw(article_id)

# Category labels attached to the article.
article_tags = reuters.categories(article_id)

# Print the id, the comma-separated tags and the text, one heading each.
for heading, value in (("ID:", article_id),
                       ("\nTags:", ", ".join(article_tags)),
                       ("\nArticle:", article_text)):
    print(heading, value)

ID: test/14829

Tags: crude, nat-gas

Article: JAPAN TO REVISE LONG-TERM ENERGY DEMAND DOWNWARDS
  The Ministry of International Trade and
  Industry (MITI) will revise its long-term energy supply/demand
  outlook by August to meet a forecast downtrend in Japanese
  energy demand, ministry officials said.
      MITI is expected to lower the projection for primary energy
  supplies in the year 2000 to 550 mln kilolitres (kl) from 600
  mln, they said.
      The decision follows the emergence of structural changes in
  Japanese industry following the rise in the value of the yen
  and a decline in domestic electric power demand.
      MITI is planning to work out a revised energy supply/demand
  outlook through deliberations of committee meetings of the
  Agency of Natural Resources and Energy, the officials said.
      They said MITI will also review the breakdown of energy
  supply sources, including oil, nuclear, coal and natural gas.
      Nuclear energy provided the bulk of Japan's 

## Creating The Training Dataset

In [5]:
# Build one TaggedDocument per Reuters article: the tokenised raw text is the
# word list and the article's category labels are its tags.
# NOTE(review): because the tags are category names, articles in the same
# category share a tag - presumably intentional; confirm that per-category
# (rather than per-article) vectors are what is wanted.
tagged_documents = [
    TaggedDocument(words=word_tokenize(reuters.raw(file_id)),
                   tags=reuters.categories(file_id))
    for file_id in reuters.fileids()
]

# Shuffle in place (uses the seeded global `random` state) so the documents
# are no longer in corpus listing order.
shuffle(tagged_documents)

## Create The Doc2Vec Model

In [9]:
# Hyper-parameters and output path for the Doc2Vec model.
doc2vec_dimensions    = 300                    # passed as vector_size
ignore_word_count     = 2                      # passed as min_count
amount_doc2vec_epochs = 100                    # used by the training cell
amount_train_threads  = 12                     # passed as workers
doc2vec_location      = "./doc2vec_model.bin"  # where the model is saved

# NOTE(review): gensim documents that a fully reproducible run needs
# workers=1 in addition to a fixed seed; with 12 worker threads the seed alone
# presumably does not guarantee identical vectors - confirm if that matters.
doc2vec_settings = {
    "vector_size": doc2vec_dimensions,
    "min_count": ignore_word_count,
    "seed": seed,
    "workers": amount_train_threads,
}
doc2vec = Doc2Vec(**doc2vec_settings)

In [10]:
# Build the model's vocabulary from the tagged documents before training.
doc2vec.build_vocab(tagged_documents)

In [11]:
# Train on the full corpus for the configured number of epochs; the example
# count is the one recorded on the model itself.
doc2vec.train(
    tagged_documents,
    total_examples=doc2vec.corpus_count,
    epochs=amount_doc2vec_epochs,
)

In [12]:
# Persist the trained model to the path configured in `doc2vec_location`.
doc2vec.save(doc2vec_location)

## Create the Keras Classifier

In [None]:
def get_mlp(input_size,
            hidden_sizes,
            output_size,
            amount_dropout,
            activation='relu'):
    """Assemble an uncompiled feed-forward Keras ``Sequential`` network.

    Every entry of ``hidden_sizes`` becomes a ``Dense`` layer followed by a
    ``Dropout`` layer. A final ``Dense`` layer with ``output_size`` units is
    appended; no activation argument is passed to it.

    Args:
        input_size: Dimensionality of the input vectors; given as
            ``input_dim`` to the first layer of the model only.
        hidden_sizes: Sequence of hidden-layer widths, in order. May be
            empty, in which case the model is the single output layer.
        output_size: Number of units in the output layer.
        amount_dropout: Rate handed to each ``Dropout`` layer.
        activation: Activation used by every hidden ``Dense`` layer.

    Returns:
        The ``Sequential`` model; the caller is responsible for compiling it.
    """
    model = Sequential()

    # Only the first layer added to the model declares the input dimension.
    first_layer_kwargs = {'input_dim': input_size}

    # Iterate over *all* hidden sizes (the previous range() stopped one short
    # and dropped the last hidden layer). The unit count is passed
    # positionally so the call does not depend on the `output_dim`/`units`
    # keyword rename between Keras versions.
    for hidden_size in hidden_sizes:
        model.add(Dense(hidden_size,
                        activation=activation,
                        **first_layer_kwargs))
        model.add(Dropout(amount_dropout))
        first_layer_kwargs = {}

    # Output layer; it carries input_dim itself when there are no hidden layers.
    model.add(Dense(output_size, **first_layer_kwargs))
    return model


model = 