### Sample program for clustering of documents with doc2vec  
- Consider Bi-gram  
- Apply preprocess_string to content strings

In [208]:
import pandas as pd
from gensim.models.doc2vec import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from gensim.parsing.preprocessing import preprocess_string
from gensim.models.phrases import Phrases, Phraser
from sklearn.cluster import KMeans

In [209]:
pd.options.display.max_columns=999
pd.options.display.max_rows=999
pd.options.display.max_colwidth=999

seedval = 97
import os
os.environ['PYTHONHASHSEED'] = '97'

**(1)**  

In [210]:
df = pd.DataFrame([["A goal kick is a method of restarting play."],
                   ["There is no offside offence if a player receives the ball directly from a goal kick."],              
                   ["If the ball enters the opponents’ goal directly from a throw-in, the referee must award a goal kick."],
                   ["If an indirect free kick is kicked directly into the opponents’ goal, a goal kick is awarded"],
                  ], columns=['content'])
display(df)
n_docs = df.shape[0]
print(n_docs)

Unnamed: 0,content
0,A goal kick is a method of restarting play.
1,There is no offside offence if a player receives the ball directly from a goal kick.
2,"If the ball enters the opponents’ goal directly from a throw-in, the referee must award a goal kick."
3,"If an indirect free kick is kicked directly into the opponents’ goal, a goal kick is awarded"


4


**(2)**  

In [211]:
df['content'] = df['content'].map(preprocess_string)
display(df)

Unnamed: 0,content
0,"[goal, kick, method, restart, plai]"
1,"[offsid, offenc, player, receiv, ball, directli, goal, kick]"
2,"[ball, enter, opponents’, goal, directli, throw, refere, award, goal, kick]"
3,"[indirect, free, kick, kick, directli, opponents’, goal, goal, kick, award]"


**(3)**  

In [212]:
words = []
for i in range(n_docs):
    words.append(df.at[i, 'content'])
print(words)

[['goal', 'kick', 'method', 'restart', 'plai'], ['offsid', 'offenc', 'player', 'receiv', 'ball', 'directli', 'goal', 'kick'], ['ball', 'enter', 'opponents’', 'goal', 'directli', 'throw', 'refere', 'award', 'goal', 'kick'], ['indirect', 'free', 'kick', 'kick', 'directli', 'opponents’', 'goal', 'goal', 'kick', 'award']]


**(4)**  

In [213]:
phrases_bi = Phrases(words, min_count=2, threshold=1.0)
bigram = Phraser(phrases_bi)
df['content'] = df['content'].map(lambda x: bigram[x])
display(df.head())

Unnamed: 0,content
0,"[goal_kick, method, restart, plai]"
1,"[offsid, offenc, player, receiv, ball, directli, goal_kick]"
2,"[ball, enter, opponents’, goal, directli, throw, refere, award, goal_kick]"
3,"[indirect, free, kick, kick, directli, opponents’, goal, goal_kick, award]"


**(5)**  

In [214]:
docs = []
for i in range(n_docs):
    c = df.at[i, 'content']
    doc_id = i
    td = TaggedDocument(words=c, tags=[doc_id])
    docs.append(td)
display(docs)

[TaggedDocument(words=['goal_kick', 'method', 'restart', 'plai'], tags=[0]),
 TaggedDocument(words=['offsid', 'offenc', 'player', 'receiv', 'ball', 'directli', 'goal_kick'], tags=[1]),
 TaggedDocument(words=['ball', 'enter', 'opponents’', 'goal', 'directli', 'throw', 'refere', 'award', 'goal_kick'], tags=[2]),
 TaggedDocument(words=['indirect', 'free', 'kick', 'kick', 'directli', 'opponents’', 'goal', 'goal_kick', 'award'], tags=[3])]

**(6)**  

In [215]:
model = Doc2Vec(documents=docs, vector_size=20, min_count=1, dm=0)
print(model)
print(list(model.wv.vocab.keys()))

Doc2Vec(dbow,d20,n5,s0.001,t3)
['goal_kick', 'method', 'restart', 'plai', 'offsid', 'offenc', 'player', 'receiv', 'ball', 'directli', 'enter', 'opponents’', 'goal', 'throw', 'refere', 'award', 'indirect', 'free', 'kick']


**(7)**  

In [216]:
docvecs = model.docvecs.vectors_docs
print(docvecs.shape)

(4, 20)


**(8)**  

In [217]:
n_cls = 2
km = KMeans(n_clusters=n_cls, random_state=7)
cls = km.fit_predict(docvecs)
print(cls)

[0 0 1 0]
