In [1]:
import pandas as pd

In [2]:
df = pd.read_pickle("results/test.pkl")

In [6]:
df.columns

Index(['availableLanguages', 'category', 'classification', 'fulltext',
       'identifier', 'keywords', 'lastUpdateDate', 'links', 'publicationDate',
       'referenceDocument', 'relatedProject', 'title', 'typesOfAction',
       'links2', 'entities', 'title_mentions', 'identifier_mentions', 'cites',
       'targets'],
      dtype='object')

In [28]:
from gensim import corpora
from gensim import corpora, models, similarities
from nltk.tokenize import sent_tokenize, word_tokenize
import nltk
from nltk.corpus import stopwords

In [8]:
stop_en = set(stopwords.words('english'))

In [9]:
fulltexts = df['fulltext']
fulltexts = [" ".join(ft) for ft in fulltexts]

In [10]:
fulltexts = [ft.replace('\n  \n', '\n') for ft in fulltexts]
fulltexts = [ft.replace('\n \n', '\n') for ft in fulltexts]
fulltexts = [ft.replace('\n\n', '\n') for ft in fulltexts]
fulltexts = [ft.replace('\n', ' ') for ft in fulltexts]

In [11]:
paragraphs = [ft.split('\x0c') for ft in fulltexts]

In [29]:
texts = [[word for word in nltk.wordpunct_tokenize(document.lower()) if word not in stop_en] for document in fulltexts]

In [30]:
from collections import defaultdict
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1
texts = [[token for token in text if frequency[token] > 1] for text in texts]

In [31]:
dictionary = corpora.Dictionary(texts)

In [32]:
corpus = [dictionary.doc2bow(text) for text in texts]

In [33]:
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

In [34]:
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=10)
corpus_lsi = lsi[corpus_tfidf]
lsi.show_topics(num_words=5, formatted=False)

[(0,
  [('\uf0b7', 0.13126052013077624),
   ('.', 0.12462082921383286),
   (',', 0.11985459460860377),
   ('data', 0.10288781968707136),
   ('horizon', 0.101464791997896)]),
 (1,
  [('macumba', 0.57531005627455822),
   ('marine', 0.22720120006666869),
   ('macumbaproject', 0.21782464808876301),
   ('marieke', 0.21560031347444009),
   ('aquatt', 0.19523377957400459)]),
 (2,
  [('\uf0b7', 0.2767901706395805),
   ('data', -0.1404536446234233),
   ('macumba', 0.13263725954596187),
   ('cloud', -0.12630168320046375),
   ('\uf02d', 0.12304616448136306)]),
 (3,
  [('eloise', -0.47851637675382519),
   ('coastal', -0.28446229618499513),
   ('zone', -0.26391570334152964),
   ('ocean', -0.19342136096622711),
   ('meeting', -0.18020864474950773)]),
 (4,
  [('entities', 0.18625311199676245),
   ('eloise', -0.18582244217012667),
   ('\uf0b7', -0.15858054241735231),
   ('legal', 0.14506089486734439),
   ('proposal', -0.14011102050117205)]),
 (5,
  [('\uf0b7', -0.36546637225007272),
   ('helpdesk', 0.

In [35]:
lda = models.LdaModel(corpus_tfidf, num_topics=10, id2word=dictionary)
corpus_lda = lda[corpus_tfidf]
lda.show_topics(num_words=5, formatted=False)

[(0,
  [('costs', 0.00074410119802899982),
   ('biodiversity', 0.00070833330782532481),
   (',', 0.00067088568734501182),
   ('bon', 0.00065888854071757201),
   ('hnscicloud', 0.00063187492721236032)]),
 (1,
  [('shall', 0.00062430478057136395),
   ('mexican', 0.0006178280744962172),
   ('apc', 0.00061547173154362388),
   ('reds', 0.00060667308160117402),
   ('.', 0.00059962504180023635)]),
 (2,
  [('midas', 0.0007469386147397779),
   ('biodiversity', 0.00074245929646590608),
   ('seafood', 0.00066255603695516041),
   ('ghg', 0.00064843089518524502),
   ('data', 0.00061985995771790623)]),
 (3,
  [('macumba', 0.001521794808047128),
   ('trl', 0.00078310962911353442),
   ('marieke', 0.00077395188212482148),
   ('rri', 0.00075676844020191876),
   ('taiwan', 0.00075369685607518089)]),
 (4,
  [('\uf0b7', 0.00092085561240037503),
   ('2017', 0.00060529197493808143),
   ('infrastructures', 0.00059784954110667044),
   ('reds', 0.00053348535295046832),
   ('gdpr', 0.0005233419127213787)]),
 (5,

In [19]:
from itertools import chain

In [20]:
sentences = list(chain.from_iterable([sent_tokenize(ft) for ft in fulltexts]))
sentences = [word_tokenize(s) for s in sentences]

In [21]:
bigram_transformer = models.Phrases(sentences, min_count=4)

In [22]:
w2vModel = models.Word2Vec(bigram_transformer[sentences], size=100, window=5, min_count=3, workers=4, iter=50, sg=1)



In [27]:
w2vModel.most_similar_cosmul(topn=20, positive=['open_science'])

[('schooling', 0.8006308078765869),
 ('ideas', 0.7867004871368408),
 ('EC’s', 0.7857959270477295),
 ('formal_technology', 0.7840580344200134),
 ('quadruple', 0.7803259491920471),
 ('helix', 0.7745307087898254),
 ('innovation_chain', 0.7730200290679932),
 ('skill', 0.768718957901001),
 ('capacity_building', 0.7649006843566895),
 ('behaviours', 0.7635020017623901),
 ('Will', 0.7614822387695312),
 ('Open_Science', 0.7600595355033875),
 ('MIDAS', 0.7586833834648132),
 ('reputation', 0.7558361291885376),
 ('audience', 0.7545291185379028),
 ('Conference', 0.7540575265884399),
 ('ELOISE', 0.7533842921257019),
 ('intentions', 0.7525784969329834),
 ('RRI', 0.7524858713150024),
 ('widening', 0.7522560954093933)]

In [24]:
w2vModel.n_similarity(['open','science'], ['open','research'])

0.8502117656194651