### Sample program for Doc2Vec using gensim  

#### Import libraries  

In [None]:
import numpy as np
import pandas as pd
from gensim.models.doc2vec import Doc2Vec
from gensim.models.doc2vec import TaggedDocument

#### Parameters  

In [None]:
# Input CSV file; read into a DataFrame with pd.read_csv below.
csv_in = 'newsgroups5-1.csv'

# Passed to Doc2Vec as `vector_size` (length of each document vector).
embed_size = 100
# Passed to Doc2Vec as `min_count`.
min_count = 5
# Path the trained model is written to via model.save().
model_file = 'doc2vec_newsgroups5-1.model'

#### Read CSV file  

In [None]:
# Load the input CSV; the first row of the file supplies the column names.
df = pd.read_csv(csv_in, delimiter=',', skiprows=0, header=0)
print(df.shape)
# DataFrame.info() prints its report itself and returns None, so it is
# called directly; wrapping it in print() would emit an extra "None" line.
df.info()
display(df.head())

#### Check the number of documents in each category  

In [None]:
# Tally how many rows carry each distinct value of the 'target' column.
category_counts = df['target'].value_counts()
print(category_counts)

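#### Assign a docID to each document according to its category  
- docID = 'd' + number, such as d0, d1, ..., d1000, d1001, ...
 - number = target * 1000 + j, where j is the 0-based position of the document within its category (this assumes fewer than 1000 documents per category; otherwise IDs of neighbouring categories would collide)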

In [None]:
# 0-based position of each document within its own category, in row order.
# groupby().cumcount() replaces hand-maintained per-category counters, so it
# works for any set of target labels and for any DataFrame index.
rank_in_category = df.groupby('target').cumcount()

# base of document ID:
#   0 for documents of target 0, 1000 for documents of target 1,
#   2000 for documents of target 2, ...
# NOTE: IDs are unique only while every category has at most 1000 documents.
doc_number = (df['target'] * 1000 + rank_in_category).astype(int)

# Keep `docID` as a plain list of strings ('d0', 'd1', ..., 'd1000', ...)
# so that later cells which refer to the list keep working.
docID = ('d' + doc_number.astype(str)).tolist()
df['docID'] = docID
display(df.head())

#### Calculation of Doc2Vec  

In [None]:
# Build one TaggedDocument per row: the whitespace-split 'content' text is
# the word list and the row's docID is its single tag.  Iterating the two
# columns in lockstep avoids positional df.at[i, ...] lookups, which only
# work when the DataFrame has the default 0..n-1 index.
docs = [
    TaggedDocument(words=content.split(), tags=[doc_id])
    for content, doc_id in zip(df['content'], df['docID'])
]

In [None]:
%%time

# Training algorithm flag handed to Doc2Vec as `dm`:
#   0 -> PV-DBOW (used here)
#   1 -> PV-DM
training_algo = 0

# Train on the tagged documents built above.
model = Doc2Vec(
    documents=docs,
    vector_size=embed_size,
    min_count=min_count,
    dm=training_algo,
    epochs=20,
)

# Write the trained model to disk.
model.save(model_file)

# To reuse a previously saved model instead of retraining:
# model = Doc2Vec.load(model_file)

In [None]:
# Show the learned vector for document 'd0' (its length, then its values).
# gensim 4 renamed `model.docvecs` to `model.dv` and deprecated the old name;
# prefer `dv` when it exists and fall back to `docvecs` on gensim 3.x.
doc_vectors = model.dv if hasattr(model, 'dv') else model.docvecs
d0_vector = doc_vectors['d0']
print(len(d0_vector))
print(d0_vector)

In [None]:
# Show the 3 most similar documents for d0 (first document of target 0)
# and d4000 (first document of target 4).

# gensim 4 renamed `model.docvecs` to `model.dv` and deprecated the old name;
# prefer `dv` when it exists and fall back to `docvecs` on gensim 3.x.
doc_vectors = model.dv if hasattr(model, 'dv') else model.docvecs

# Other possible samples: 'd1000', 'd2000', 'd3000'.
sample_docs = ['d0', 'd4000']

# Widen the column display only inside this block so long 'content' cells
# are not truncated; option_context restores the previous setting on exit,
# even if an exception is raised part-way through.
with pd.option_context('display.max_colwidth', 1000):
    for sample_doc in sample_docs:
        print('sample_doc:', sample_doc)
        display(df[df['docID'] == sample_doc])
        # Each hit is a (docID, similarity) pair.
        for hit in doc_vectors.most_similar(sample_doc, topn=3):
            print(hit)
            display(df[df['docID'] == hit[0]])