### Define Documents

In [1]:
df = pd.read_csv('../../pico_vectors/preprocess/pico_nonoverlapping_tokenized.csv')
docs = df.abstract.ix[:1000]

docs

0       in recent years the treatment of primary noctu...
1       monosymptomatic nocturnal enuresis is common i...
2       forty-five children aged qqq years with primar...
3       a double blind crossover trial of qqq microgra...
4       to measure the effect of intranasal desmopress...
5       the purpose of this study was to determine the...
6       the effect of qqq micrograms . desaminocystein...
7       the objective of this multicenter randomized c...
8       previous studies have suggested changes in sel...
9       desmopressin nasal spray has proved to be effi...
10      the response of desamino-d-arginine vasopressi...
11      the combination of desmopressin ( ddavp ) and ...
12      we tested the role of the bladder in the patho...
13      there are many therapeutic options against enu...
14      we evaluated the combination of alarm and desm...
15      we evaluated the efficacy and safety of qqq or...
16      desmopressin acetate , a synthetic antidiureti...
17      the ef

### Define Generator That Yields Documents

In [2]:
from gensim.models.doc2vec import LabeledSentence

class DocumentIterator:
    """Class for yielding documents for use with doc2vec
    
    See usage below.
    
    """
    def __init__(self, texts):
        """Prepares `texts` for use with doc2vec
        
        Parameters
        ----------
        texts : list of strings where each string is a document
        
        Assumes `texts` has already been tokenized so calling `split()` on each text produces correct tokenization
        
        """
        self.texts = [text.split() for text in texts]
        
    def __iter__(self):
        for i, text in enumerate(self.texts):
            yield LabeledSentence(words=text, tags=[i])

### Define `doc2vec` Model

In [3]:
%%time

from gensim.models import Doc2Vec

doc_iterator = DocumentIterator(docs)
model = Doc2Vec(doc_iterator)

model

CPU times: user 4.41 s, sys: 47.2 ms, total: 4.46 s
Wall time: 4.49 s


### Estimate Vectors

In [4]:
%%time

model.train(doc_iterator)

CPU times: user 3.81 s, sys: 22.5 ms, total: 3.84 s
Wall time: 3.78 s


1196020

### Access Learned Document Vectors

In [10]:
model.docvecs[0].shape

(300,)

### Infer Held-Out Document Vectors

In [12]:
unseen_docs = df.abstract.ix[1000:1005]

unseen_docs

1000    a randomized , single-center , double-blind , ...
1001    the objective of this randomized , controlled ...
1002    a randomized , double blind clinical trial was...
1003    a randomized , parallel , examiner-blind clini...
1004    a qqq study was conducted to evaluate the toot...
1005    use of higher peroxide concentrations for prof...
Name: abstract, dtype: object

In [21]:
inferred_vecs = unseen_docs.map(lambda doc: model.infer_vector(doc))
inferred_vecs = np.array(inferred_vecs.tolist())

inferred_vecs

array([[ 0.07970984, -0.14096403,  0.11800314, ...,  0.35160595,
         0.69981235, -0.662099  ],
       [ 0.32674381, -0.35907531, -0.00300364, ...,  0.51928061,
         1.12972021, -1.05375862],
       [ 0.21505025, -0.16708635,  0.09556082, ...,  0.46429375,
         0.93233228, -0.78965753],
       [ 0.21153228, -0.21771871,  0.0860255 , ...,  0.4791835 ,
         0.89987493, -0.75414461],
       [ 0.18389668, -0.17064688,  0.10758507, ...,  0.47385469,
         0.80411059, -0.69640261],
       [ 0.24771205, -0.29678506,  0.07840282, ...,  0.46096483,
         1.06304634, -0.94310045]], dtype=float32)

In [22]:
inferred_vecs.shape

(6, 300)