### Sample program for clustering of documents with doc2vec  
- Detect bigrams (frequent two-word phrases) and treat each as a single token  
- Apply preprocess_string to content strings

#### Import libraries  

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from gensim.models.doc2vec import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from gensim.parsing.preprocessing import preprocess_string
from gensim.models.phrases import Phrases, Phraser
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

#### Parameters  

In [None]:
# --- Input data ---
# CSV file loaded by the notebook; its 'content' and 'target' columns are used.
csv_in = 'newsgroups5-2.csv'
# Documents with fewer whitespace-separated words than this are discarded.
min_words = 50

# --- Doc2Vec settings ---
# Dimensionality of each document vector (passed to Doc2Vec as vector_size).
embed_size = 300
# Passed to Doc2Vec as min_count.
min_count = 10
# Path the trained Doc2Vec model is saved to.
model_file = 'doc2vec_newsgroups5-2.model'

#### Read CSV file  

In [None]:
# Load the corpus: a comma-separated file whose first row holds the column names.
df = pd.read_csv(csv_in, delimiter=',', skiprows=0, header=0)
print(df.shape)
# DataFrame.info() prints its report itself and returns None, so it must not
# be wrapped in print() (that would emit a stray "None" line).
df.info()
display(df.head())

#### Delete too short docs  

In [None]:
# Keep only documents with at least `min_words` whitespace-separated words.
# The vectorised .str accessor yields NaN for missing content (e.g. an empty
# CSV cell); NaN >= min_words is False, so such rows are dropped instead of
# raising AttributeError as a plain x.split() would.
word_counts = df['content'].str.split().str.len()
df = df[word_counts >= min_words]
# Renumber rows 0..n-1 so later cells can address them by position.
df = df.reset_index(drop=True)
print(df.shape)

#### Check the number of documents in each category  

In [None]:
# Tally how many documents remain for each 'target' value.
docs_per_target = df['target'].value_counts()
print(docs_per_target)

#### Remove stop words, punctuation, etc.  

In [None]:
# Run gensim's preprocess_string on every document and store the result back
# in the 'content' column, so each entry becomes the processed form of the
# original text (presumably a list of tokens -- later cells pass it to
# Phrases and TaggedDocument as a word sequence).
df['content'] = df['content'].map(preprocess_string)
display(df.head())

#### Detect Bi-gram  

In [None]:
# Gather every document's processed content into one list, in row order;
# this list is the corpus handed to the bigram detector.
words = df['content'].tolist()

In [None]:
%%time

# Fit a phrase (bigram) model on the whole corpus.
# NOTE: min_count=30 here is a literal specific to phrase detection; it is
# independent of the notebook-level `min_count` parameter used for Doc2Vec.
phrases_bi = Phrases(words, min_count=30, threshold=10.0)
# Wrap the fitted model in a Phraser, which is what gets applied to documents.
bigram = Phraser(phrases_bi)
# Rewrite each document through the phraser and store the result back.
df['content'] = df['content'].map(lambda x: bigram[x])
display(df.head())

##### Test of bigram  

In [None]:
# Sanity check: apply the phraser to a hand-made two-token document and print
# what it returns (presumably a single merged token if 'new' + 'york' was
# learned as a phrase, otherwise the two tokens unchanged -- verify on output).
print(bigram[ ['new', 'york'] ])

#### Assign docID according to its category  
- docID = 'd' + number, such as d0, d1, ..., d1000, d1001, ...
 - number = target * 1000 + j, where j is the 0-based running count of documents within that target

In [None]:
# Build one ID per document of the form 'd<number>' with
#   number = target * 1000 + j
# where j is the 0-based running count of documents within the same target
# (0.. for target 0, 1000.. for target 1, 2000.. for target 2, ...).
id_stride = 1000  # size of the ID range reserved for each target

# Position of each row among the rows sharing its target, in row order.
# Unlike a counter array indexed by target, this does not require the target
# values to be exactly 0..k-1.
seq_in_target = df.groupby('target').cumcount()

# With 1000 or more documents in one target the numbering would spill into the
# next target's range and produce duplicate IDs (e.g. 'd1000' twice), which
# would silently merge distinct documents under one tag. Fail loudly instead.
if (seq_in_target >= id_stride).any():
    raise ValueError(
        'A target has %d or more documents; docID numbers would collide.'
        % id_stride
    )

doc_numbers = df['target'].astype(int) * id_stride + seq_in_target
docID = ('d' + doc_numbers.astype(str)).tolist()
df['docID'] = docID
display(df.head())

#### Build tagged documents for Doc2Vec  

In [None]:
# Pair each document's content with its docID and wrap the pair in a
# TaggedDocument (the docID is the document's single tag), keeping row order.
docs = [
    TaggedDocument(words=tokens, tags=[tag])
    for tokens, tag in zip(df['content'], df['docID'])
]

#### Calculation of doc vectors  

In [None]:
%%time

# Alternative configuration (PV-DBOW, dm=0), kept for easy switching:
#model = Doc2Vec(documents=docs, vector_size=embed_size,
#                min_count=min_count, dm=0, epochs=20) # PV-DBOW
# Train on the tagged documents with dm=1 (PV-DM) for 20 epochs, producing
# vectors of size `embed_size`.
model = Doc2Vec(documents=docs, vector_size=embed_size,
                min_count=min_count, dm=1, epochs=20)  # PV-DM
print(model)

# NOTE(review): init_sims(replace=True) presumably L2-normalizes the vectors in
# place and discards the raw ones, so the clustering below would run on unit
# vectors and the saved model could not be trained further -- confirm against
# the installed gensim version (the call is deprecated in gensim 4).
model.init_sims(replace=True)
model.save(model_file)
 
# To reload the model saved above instead of retraining:
# model = Doc2Vec.load(model_file)

##### Check word set  

In [None]:
# Look up the model's vocabulary mapping in a way that works across gensim
# versions: gensim 4 exposes it as `wv.key_to_index` (and `wv.vocab` raises
# AttributeError there), while gensim 3.x only has `wv.vocab`.
vocab = getattr(model.wv, 'key_to_index', None)
if vocab is None:
    vocab = model.wv.vocab
print(len(vocab))  # number of words
print(list(vocab)[:10])  # show first 10 words

In [None]:
# Fetch the matrix of learned document vectors (one row per document tag).
# gensim 4 exposes it as `model.dv.vectors`; gensim 3.x as
# `model.docvecs.vectors_docs`. Use whichever the installed version provides.
if hasattr(model, 'dv'):
    docvecs = model.dv.vectors
else:
    docvecs = model.docvecs.vectors_docs
print(docvecs.shape)

#### Elbow method to determine the number of clusters  

In [None]:
%%time

# Elbow method: fit K-Means for k = 1..max_cls and plot the inertia
# (within-cluster sum of squared distances) against k.
max_cls = 7
cluster_counts = range(1, max_cls + 1)
distortions = []
for k in cluster_counts:
    print(k)  # progress indicator
    # Fixed seed (same as the final clustering cell) so the curve is
    # reproducible from run to run.
    km = KMeans(n_clusters=k, random_state=7)
    km.fit(docvecs)
    distortions.append(km.inertia_)
plt.plot(cluster_counts, distortions, marker='o')
plt.xlabel('Number of clusters')
plt.ylabel('Distortion')
plt.show()

#### K-Means clustering  

In [None]:
# Number of clusters to extract (chosen by hand; also reused by the plotting
# cells as the number of groups to draw).
n_cls = 5
# Fixed random_state so the cluster assignment is reproducible.
km = KMeans(n_clusters=n_cls, random_state=7)
# cls[i] is the cluster index assigned to row i of docvecs.
cls = km.fit_predict(docvecs)

#### Check correspondence of target and clusters  

In [None]:
# Contingency table: rows are the 'target' values, columns the K-Means cluster
# indices, cells the number of documents with that combination.
# Relies on cls being in the same row order as df.
display(pd.crosstab(df['target'], cls))

#### Visualization using PCA  

In [None]:
%%time

# Project the document vectors onto their first two principal components
# for 2-D plotting; Y_pca has one row per document and two columns.
pca = PCA(n_components=2)
Y_pca = pca.fit_transform(docvecs)

In [None]:
# Scatter the 2-D PCA projection, drawing one series per K-Means cluster.
plt.title("PCA (colored by cluster ID)")
for cluster_id in range(n_cls):
    in_cluster = (cls == cluster_id)
    pts = Y_pca[in_cluster]
    xs, ys = pts[:, 0], pts[:, 1]
    plt.scatter(xs, ys, marker='.', label=cluster_id)

plt.xlabel('PCA1')
plt.ylabel('PCA2')
plt.legend()
plt.show()

#### Plot of PCA colored by target  

In [None]:
# Scatter the 2-D PCA projection, drawing one series per 'target' category.
#plt.figure(figsize=(10,7))
plt.title("PCA (colored by target)")
# Plain ndarray so the boolean mask indexes Y_pca purely by position.
targets = df['target'].values
# Iterate over the target values actually present (sorted) rather than
# range(n_cls): n_cls is the number of *clusters* and need not equal the
# number of categories.
for tgt in np.unique(targets):
    pts = Y_pca[targets == tgt]
    plt.scatter(pts[:, 0], pts[:, 1], marker='.', label=tgt)

plt.xlabel('PCA1')
plt.ylabel('PCA2')
plt.legend()
plt.show()

#### Visualization using t-SNE  

In [None]:
%%time

# Embed the document vectors in 2-D with t-SNE (500 optimisation iterations).
# scikit-learn 1.5 renamed TSNE's `n_iter` argument to `max_iter` and later
# releases removed `n_iter`, so use whichever name the installed version has.
iter_arg = 'max_iter' if 'max_iter' in TSNE().get_params() else 'n_iter'
Y_tsne = TSNE(n_components=2,
              perplexity=30,
              random_state=0,
              **{iter_arg: 500}).fit_transform(docvecs)

In [None]:
# Scatter the 2-D t-SNE embedding, drawing one series per K-Means cluster.
#plt.figure(figsize=(10,7))
plt.title("t-SNE (colored by cluster ID)")
for cluster_id in range(n_cls):
    in_cluster = (cls == cluster_id)
    pts = Y_tsne[in_cluster]
    xs, ys = pts[:, 0], pts[:, 1]
    plt.scatter(xs, ys, marker='.', label=cluster_id)

plt.xlabel('t-SNE1')
plt.ylabel('t-SNE2')
plt.legend()
plt.show()

#### Plot of t-SNE colored by target  

In [None]:
# Scatter the 2-D t-SNE embedding, drawing one series per 'target' category.
#plt.figure(figsize=(10,7))
plt.title("t-SNE (colored by target)")
# Plain ndarray so the boolean mask indexes Y_tsne purely by position.
targets = df['target'].values
# Iterate over the target values actually present (sorted) rather than
# range(n_cls): n_cls is the number of *clusters* and need not equal the
# number of categories.
for tgt in np.unique(targets):
    pts = Y_tsne[targets == tgt]
    plt.scatter(pts[:, 0], pts[:, 1], marker='.', label=tgt)

plt.xlabel('t-SNE1')
plt.ylabel('t-SNE2')
plt.legend()
plt.show()