### Sample program for Doc2Vec by gensim  

#### Import libraries  

In [10]:
import numpy as np
import pandas as pd
from gensim.models.doc2vec import Doc2Vec
from gensim.models.doc2vec import TaggedDocument

#### Parameters  

In [11]:
csv_in = 'newsgroups5-2.csv'

embed_size = 100
min_count = 5
model_file = 'doc2vec_newsgroups5-2.model'

#### Read CSV file  

In [12]:
df = pd.read_csv(csv_in, delimiter=',', skiprows=0, header=0)
print(df.shape)
print(df.info())
display(df.head())

(2927, 3)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2927 entries, 0 to 2926
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   content       2927 non-null   object
 1   target        2927 non-null   int64 
 2   target_names  2927 non-null   object
dtypes: int64(1), object(2)
memory usage: 68.7+ KB
None


Unnamed: 0,content,target,target_names
0,"From article <C5owCB.n3p@world.std.com>, by t...",1,sci.space
1,In article <1r1eu1$4t@transfer.stratus.com> c...,3,talk.politics.guns
2,In article <1993Apr23.184732.1105@aio.jsc.nas...,1,sci.space
3,What is the EXACT entry (parameter and synta...,2,comp.windows.x
4,In article <1993Apr20.151818.4319@samba.oit.u...,0,rec.sport.hockey


#### Check the number of documents in each category  

In [26]:
print(df['target'].value_counts())
print(df.groupby(['target','target_names']).count())

0    600
4    595
1    593
2    593
3    546
Name: target, dtype: int64
                           content  docID
target target_names                      
0      rec.sport.hockey        600    600
1      sci.space               593    593
2      comp.windows.x          593    593
3      talk.politics.guns      546    546
4      sci.crypt               595    595


#### Assign docID according to its category  
- docID = 'd' + number, such as d0, d1, ..., d1000, d1001, ...
 - number = target * 1000 + j

In [14]:
docID = []
j = np.zeros(len(df['target'].value_counts()))
for i in range(len(df)):
    tgt = df.at[i, 'target']
    # base of document ID:
    #   0 for documents of target 0, 1000 for documents of target 1,
    #   2000 for documents of target 2, ...
    docID.append('d'+str(int(tgt*1000+j[tgt])))
    # increment j for target "tgt"
    j[tgt] += 1
df['docID'] = docID
display(df.head())

Unnamed: 0,content,target,target_names,docID
0,"From article <C5owCB.n3p@world.std.com>, by t...",1,sci.space,d1000
1,In article <1r1eu1$4t@transfer.stratus.com> c...,3,talk.politics.guns,d3000
2,In article <1993Apr23.184732.1105@aio.jsc.nas...,1,sci.space,d1001
3,What is the EXACT entry (parameter and synta...,2,comp.windows.x,d2000
4,In article <1993Apr20.151818.4319@samba.oit.u...,0,rec.sport.hockey,d0


#### Calculation of Doc2Vec  

In [15]:
docs = []
for i in range(len(df)):
    c = df.at[i, 'content']
    doc_id = df.at[i, 'docID']
    # make TaggedDocument
    td = TaggedDocument(words=c.split(), tags=[doc_id])
    docs.append(td)

In [16]:
%%time

model = Doc2Vec(documents=docs, vector_size=embed_size,
                min_count=min_count, dm=0, epochs=20) # PV-DBOW
#model = Doc2Vec(documents=docs, vector_size=embed_size,
#                min_count=min_count, dm=1, epochs=20)  # PV-DM
model.save(model_file)
 
# If you want to read saved model
# model = Doc2Vec.load('saved_doc2vec.model')

CPU times: user 38.3 s, sys: 628 ms, total: 39 s
Wall time: 19.1 s


In [17]:
# show vectors for 'd0'
print(len(model.docvecs['d0']))
print(model.docvecs['d0'])

100
[-0.37661126 -0.07450546 -0.47666585 -0.12859029  0.0591739   0.666588
  0.7262975   0.02464759  0.6024782   0.20746313 -0.06813006 -0.57986104
  0.6085836  -0.76100177  0.52840996 -0.5870352  -0.56830823 -0.18069802
  0.12029023 -0.35603148  0.21328051  0.20123419  0.5635116  -0.23640211
  0.15766047 -0.12117846 -0.11898364  0.32813576  0.51085013 -0.909284
 -0.38562107 -0.2764377  -0.48819688  0.46417832  0.76088434  0.2592789
 -0.20791835  0.33964688 -0.26765227  0.42412695  0.49605897 -0.3403559
 -0.11098704  1.0164336   0.13919698 -0.18096957  0.27638832 -0.06128496
  0.20739323 -0.12256812  0.04105749  0.278833    0.1894799   0.1714155
  0.09460736  0.43144274  0.26755062  0.44871616  0.01298083  0.09536628
 -0.58270377 -0.89966625 -0.6203916   0.59038705 -0.16940452 -0.3392718
 -0.55265266  0.21697535  0.3681351  -0.23705108  0.05364915  0.8823749
 -0.00902858 -0.78093296 -0.339531    0.79057133 -0.49147198  0.11545987
 -0.28682217  0.47742596 -0.34997496  0.60384583  0.1507

In [18]:
# Show most similar document for d0 (target 0), d1000 (target 1)
max_colw = pd.options.display.max_colwidth
pd.options.display.max_colwidth = 1000

#for sample_doc in ['d0', 'd1000', 'd2000', 'd3000', 'd4000']:
for sample_doc in ['d0', 'd4000']:
    print('sample_doc:', sample_doc)
    display(df[ df['docID']==sample_doc ])
    for x in model.docvecs.most_similar(sample_doc, topn=3):
        print(x)
        display(df[ df['docID']==x[0] ])

pd.options.display.max_colwidth = max_colw

sample_doc: d0


Unnamed: 0,content,target,target_names,docID
4,"In article <1993Apr20.151818.4319@samba.oit.unc.edu> Scott.Marks@launchpad.unc.edu (Scott Marks)>>And of course, Mike Ramsey was (at one time) the captain in Buffalo prior to >>being traded to Pittsburgh. Currently, the Penguins have 3 former captains >>and 1 real captain (Lemieux) playing for them. They rotate the As during the >>season (and even the C while Mario was out). Even Troy Loney has worn the C >>for the Pens. > I think that Mike Foligno was the captain of the Sabres when he got traded to the Leafs. Also, wasnt Rick Vaive the captain of the Leafs when he got traded to Chicago (with Steve Thomas for Ed Olcyzk and someone). Speaking of the Leafs, I believe that Darryl Sittler was their captain (hed torn the ""C"" off his jersey but I think he re-claimed the captaincy later on) when he was traded to the Flyers. Oh yeah, of course, Gretzky was the captain of the Oilers before he was traded wasnt he? Gary",0,rec.sport.hockey,d0


('d399', 0.971803605556488)


Unnamed: 0,content,target,target_names,docID
1983,"In article <1993Apr20.113953.18879@jarvis.csri.toronto.edu> leunggm@odin.control.utoronto.ca (Gary Leung)>In article <1993Apr20.151818.4319@samba.oit.unc.edu> Scott.Marks@launchpad.unc.edu (Scott Marks)>>>And of course, Mike Ramsey was (at one time) the captain in Buffalo prior to >>>being traded to Pittsburgh. Currently, the Penguins have 3 former captains >>>and 1 real captain (Lemieux) playing for them. They rotate the As during the >>>season (and even the C while Mario was out). Even Troy Loney has worn the C >>>for the Pens. >> > >I think that Mike Foligno was the captain of the Sabres when he >got traded to the Leafs. Also, wasnt Rick Vaive the captain of >the Leafs when he got traded to Chicago (with Steve Thomas for >Ed Olcyzk and someone). Speaking of the Leafs, I believe that >Darryl Sittler was their captain (hed torn the ""C"" off his >jersey but I think he re-claimed the captaincy later on) when he >was traded to the Flyers. > >Oh yeah, of course, Gretzky was the cap...",0,rec.sport.hockey,d399


('d292', 0.8966962099075317)


Unnamed: 0,content,target,target_names,docID
1444,">And of course, Mike Ramsey was (at one time) the captain in Buffalo prior to >being traded to Pittsburgh. Currently, the Penguins have 3 former captains >and 1 real captain (Lemieux) playing for them. They rotate the As during the >season (and even the C while Mario was out). Even Troy Loney has worn the C >for the Pens. I had heard(perhaps incorrectly) that while Lemieux was out, noone wore a C on their jersey. The As took turns doing captain duties(whatever they are). Scott... scott.marks@launchpad.unc.edu scott.marks@launchpad.unc.edu -- The opinions expressed are not necessarily those of the University of North Carolina at Chapel Hill, the Campus Office for Information Technology, or the Experimental Bulletin Board Service.",0,rec.sport.hockey,d292


('d533', 0.892473578453064)


Unnamed: 0,content,target,target_names,docID
2603,"In article <1993Apr19.213015@IASTATE.EDU>, njdevils@IASTATE.EDU (Cire Y. Trehguad)|> Anna Matyas (am2x+@andrew.cmu.edu)|> : Michael Collingridge|> : >And, while we are on the subject, has a captain ever been traded, |> : >resigned, or been striped of his title during the season? Any other |> : >team captain trivia would be appreciated. |> ; |> : Wasnt Ron Francis captain of the Whalers when he was traded to |> : Pittsburgh? |> |> And Rick Tochett was the captain of the Flyers when traded to the Pens |> recently... |> And of course, Mike Ramsey was (at one time) the captain in Buffalo prior to being traded to Pittsburgh. Currently, the Penguins have 3 former captains and 1 real captain (Lemieux) playing for them. They rotate the As during the season (and even the C while Mario was out). Even Troy Loney has worn the C for the Pens. -Jay John W. Huber, Jr. - aka Jay | Penguins - 1991,1992 Stanley Cup Champions Software Engineering Institute | Pirates - 1990,1991,1992 ...",0,rec.sport.hockey,d533


sample_doc: d4000


Unnamed: 0,content,target,target_names,docID
6,"In article <1r466c$an3@news.intercon.com> amanda@intercon.com (Amanda Walker)>Agreed. Remember, I dont even think of Clipper as encryption in any real >sense--if I did, Id probably be a lot more annoyed about it. I agree with this assessment. Furthermore, its promotion as providing greater protection than bare voice is quite true, as far as it goes. However, the only way for it to fulfill its stated goal of letting LE wiretap ""terrorists and drug dealers"" is to restrict stronger techniques. Wiretap targets presently use strong encryption, weak encryption, or (the vast majority) no encryption. The latter two classes can be tapped. With weak encryption in every phone, the no-encryption class is merged into the weak-encryption class. Will the introduction of Clipper cause targets presently enjoying strong privacy to give up on it? that is, to rely for privacy on a system expressly designed to deny it to people like them? I doubt it. The mere introduction of this scheme ...",4,sci.crypt,d4000


('d4157', 0.7964264750480652)


Unnamed: 0,content,target,target_names,docID
778,"ebrandt@jarthur.claremont.edu (Eli Brandt)> Instead we have a deliberately brain-dead version of a cryptosystem > that has not even been peer reviewed. Yes, the NSA owns some smart > people. But if they pulled a FEAL, well, AT&T is going to be left with > a lot of dud phones on its hands. Agreed. Remember, I dont even think of Clipper as encryption in any real sense--if I did, Id probably be a lot more annoyed about it. > Heh heh. The government already gave it up for us. Remember in the > announcement they described this scheme as balancing the two > extremes of having no privacy and claiming that citizens had a > Constitutional right to encryption? Thats not for Clinton (or anyone under him) to say, though. Only the federal and supreme courts can say anything about the constitutionality. Anything the administration or any governmental agency says is opinion at best. Amanda Walker InterCon Systems Corporation",4,sci.crypt,d4157


('d4331', 0.7892059683799744)


Unnamed: 0,content,target,target_names,docID
1610,"uni@acs.bu.edu (Shaen Bernhardt)> I wish I could agree with you. Ask yourself this. Why would any > private sector entity wish to buy a crypto system that was KNOWN to be > at least partially compromised? (a) To use for sensitive but not strategically important traffic, (b) if the system was cheap. For example, I dont own a cordless phone. With Clipper, I would. If the local men in blue really want to listen to me talk to my friends or order pizza, Im no worse off than I am now, and I dont have to worry about local kids or nosy neighbors. That is to say, Clipper ""raises the bar"" on insecure channels. It doesnt make them secure, by any means, but a wall, even if the FBI can get a master key by court order, is still better than a ""keep off the grass"" sign. > The answer seems obvious to me, they wouldnt. There is other hardware > out there not compromised. DES as an example (triple DES as a better > one.) So, where can I buy a DES-encrypted cellular phone? How m...",4,sci.crypt,d4331


('d4466', 0.7609058618545532)


Unnamed: 0,content,target,target_names,docID
2283,"In article <ygoland.735123994@wright> \tygoland@wright.seas.ucla.edu (The Jester) >Ignoring for the moment the question of patented processes (such as >Public Keys), can the government stop me from using an encryption >process? Following precedent in other areas, the government is likely to put a tax on encryption technology. Once the tax is imposed, it becomes a federal matter and suspicision of an unlicensed cryptographic tool will bring the BATF or FBI tossing grenades into your house. (The BATF appears to be the logical agency to enforce suchand firearms is similary based on taxes.) Look at thebands that are supposed to be private. This has nothing to do with any desire to prevent harmful interference. If the government can make a radio receiver illegal what makes you think they wont claim the right to control encryption? -- John Carr (jfc@athena.mit.edu)",4,sci.crypt,d4466
