In [1]:
import pandas as pd

In [2]:
df = pd.read_pickle("results/test.pkl")

In [6]:
df.columns

Index(['availableLanguages', 'category', 'classification', 'fulltext',
       'identifier', 'keywords', 'lastUpdateDate', 'links', 'publicationDate',
       'referenceDocument', 'relatedProject', 'title', 'typesOfAction',
       'links2', 'entities', 'title_mentions', 'identifier_mentions', 'cites',
       'targets'],
      dtype='object')

In [7]:
from gensim import corpora
from gensim import corpora, models, similarities
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords



In [8]:
stop_en = set(stopwords.words('english'))

In [9]:
fulltexts = df['fulltext']
fulltexts = [" ".join(ft) for ft in fulltexts]

In [10]:
fulltexts = [ft.replace('\n  \n', '\n') for ft in fulltexts]
fulltexts = [ft.replace('\n \n', '\n') for ft in fulltexts]
fulltexts = [ft.replace('\n\n', '\n') for ft in fulltexts]
fulltexts = [ft.replace('\n', ' ') for ft in fulltexts]

In [11]:
paragraphs = [ft.split('\x0c') for ft in fulltexts]

In [12]:
texts = [[word for word in word_tokenize(document.lower()) if word not in stop_en] for document in fulltexts]

In [13]:
from collections import defaultdict
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1
texts = [[token for token in text if frequency[token] > 1] for text in texts]

In [14]:
dictionary = corpora.Dictionary(texts)

In [15]:
corpus = [dictionary.doc2bow(text) for text in texts]

In [16]:
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

In [17]:
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=10)
corpus_lsi = lsi[corpus_tfidf]
lsi.show_topics(num_words=5, formatted=False)

[(0,
  [('...', 0.61644413290674382),
   ('\uf0b7', 0.1118639297854481),
   (',', 0.10644634615637719),
   ('proposal', 0.08886384581831272),
   ('.', 0.084396931600508415)]),
 (1,
  [('...', 0.63754850228715443),
   ('macumba', -0.29972494164930075),
   ('marine', -0.14727756562083127),
   ('marieke', -0.12749434319732539),
   ('event', -0.1009524753189549)]),
 (2,
  [('macumba', 0.46955655526478579),
   ('...', 0.35201241154900437),
   ('marieke', 0.20120369240726085),
   ('marine', 0.19943491630488466),
   ('microorganisms', 0.15304407999654762)]),
 (3,
  [('\uf0b7', -0.31297363495796787),
   ('data', 0.1235511923062431),
   ('...', 0.12186676493489078),
   ('proposed', -0.11928783969398601),
   ('entities', -0.11923009779346731)]),
 (4,
  [('eloise', -0.47243898273921658),
   ('zone', -0.30452177837299088),
   ('coastal', -0.29887299521916255),
   ('land-ocean', -0.23671485438182374),
   ('meeting', -0.20208615363433127)]),
 (5,
  [('entities', 0.20098870240594369),
   ('legal', 0.

In [18]:
lda = models.LdaModel(corpus_tfidf, num_topics=10, id2word=dictionary)
corpus_lda = lda[corpus_tfidf]
lda.show_topics(num_words=5, formatted=False)

[(0,
  [('...', 0.0020200847250818571),
   (',', 0.00069720796661292494),
   ('\uf0d8', 0.00065580924233177827),
   ('mexican', 0.00063446806738865109),
   ('seminar', 0.00059304720590154264)]),
 (1,
  [('...', 0.0021320462466845958),
   ('costs', 0.00062364488900400628),
   ('o', 0.00059464727287741707),
   ('taiwanese', 0.00054818783278509902),
   ('taiwan', 0.0005189471394285533)]),
 (2,
  [('•', 0.00074635044536136216),
   ('elixir', 0.00053997002911585943),
   ('egee', 0.00050053526531217119),
   ('kong', 0.00048165629284222455),
   ('macao', 0.00047236162433439931)]),
 (3,
  [('arctic', 0.00068874651347397477),
   ('seafood', 0.00063677973963893427),
   ('canadian', 0.00061532521820474042),
   ('\uf02d', 0.0005830944782433202),
   ('macumba', 0.00052876559030802994)]),
 (4,
  [('macumba', 0.00082676306566963455),
   ('bon', 0.0006168487659643278),
   ('chinese', 0.00053918291320204429),
   ('...', 0.00051479171433799567),
   ('conference', 0.00043971328148868826)]),
 (5,
  [('mac

In [19]:
from itertools import chain

In [20]:
sentences = list(chain.from_iterable([sent_tokenize(ft) for ft in fulltexts]))
sentences = [word_tokenize(s) for s in sentences]

In [21]:
bigram_transformer = models.Phrases(sentences, min_count=4)

In [22]:
w2vModel = models.Word2Vec(bigram_transformer[sentences], size=100, window=5, min_count=3, workers=4, iter=50, sg=1)



In [27]:
w2vModel.most_similar_cosmul(topn=20, positive=['open_science'])

[('schooling', 0.8006308078765869),
 ('ideas', 0.7867004871368408),
 ('EC’s', 0.7857959270477295),
 ('formal_technology', 0.7840580344200134),
 ('quadruple', 0.7803259491920471),
 ('helix', 0.7745307087898254),
 ('innovation_chain', 0.7730200290679932),
 ('skill', 0.768718957901001),
 ('capacity_building', 0.7649006843566895),
 ('behaviours', 0.7635020017623901),
 ('Will', 0.7614822387695312),
 ('Open_Science', 0.7600595355033875),
 ('MIDAS', 0.7586833834648132),
 ('reputation', 0.7558361291885376),
 ('audience', 0.7545291185379028),
 ('Conference', 0.7540575265884399),
 ('ELOISE', 0.7533842921257019),
 ('intentions', 0.7525784969329834),
 ('RRI', 0.7524858713150024),
 ('widening', 0.7522560954093933)]

In [24]:
w2vModel.n_similarity(['open','science'], ['open','research'])

0.8502117656194651