In [None]:
# Identifier of the document used as the running example by the "test" and similarity cells below.
test_doc_id = 'ENCPOS_2002_29'

# Loading the dataset (`docs_structured`)

In [None]:
#download, unzip and reading encpos dataset
import glob

!wget https://github.com/chartes/encpos_similarities/raw/master/data/encpos_txt.zip
#!wget https://github.com/chartes/encpos_similarities/raw/master/data/encpos_sample_txt.zip
!unzip encpos_txt.zip -d /content/

docs = [] # list of documents (a single line string for each doc)
docs=[open(filename, "r").readlines() for filename in glob.glob("/content/encpos_txt/*.txt")]
docs=[" ".join(x).replace("\n", "") for x in docs]

In [None]:
# Extracting chapters structuration from texts

import re

# docs_structured: identifier -> {'metadata': <header text>, '<chapter title>': <chapter text>, ...}
# Documents without any chapter title (or whose header carries no identifier) are stored
# as {'title': <everything that follows "title: ">} instead.
docs_structured={}
for doc in docs:
  # chapter and sub-chapter titles follow the "== ... ==" / "=== ... ===" pattern
  chapters=re.findall(r"==.+?\==" , doc)
  identifier=None
  if chapters:
    # Split on the literal titles: the first part is the header that precedes the first title,
    # the following parts are the texts that follow each title.
    parts=re.split('|'.join(map(re.escape, chapters)), doc)
    text_structured=dict(zip(["metadata"]+chapters, parts))
    header_fields=text_structured["metadata"].split("identifier: ")
    if len(header_fields)>1:
      identifier=header_fields[1].split(" ", 1)[0]
      docs_structured[identifier]=text_structured
  if identifier is None:
    # Fallback, previously reached through a bare `except`: read the identifier and the title
    # from the whole document. A document without "identifier: " still raises IndexError here.
    identifier=doc.split("identifier: ")[1].split(" ",1)[0]
    resume=doc.split("title: ")[1]
    docs_structured[identifier]={"title":resume}

In [None]:
# test
# Only the value of the last expression of a cell is rendered, so the metadata lookup
# mainly checks that the key exists; the conclusion text is what gets displayed.
docs_structured[test_doc_id]['metadata']
docs_structured[test_doc_id]['== Conclusion ==']

'  L’abbaye de Fontenay, grandie sous le regard de saint Bernard, s’est dotée, par une activité de copie d’abord externe, puis dans l’abbaye même, d’une bibliothèque répondant aux exigences ascétiques et spirituelles de l’abbé de Clairvaux. Mais l’austérité polychrome a été remplacée par un monochromatisme splendide qui respectait plus la lettre que l’esprit de pauvreté tel qu’il a été formulé par saint Bernard. Le temps de la production des livres est assez bref et celui des réalisations monochromes encore plus. La production importante dans la seconde moitié du xiie siècle est poursuivie par un enrichissement sporadique fondé sur la production laïque, les dons des moines et leurs acquisitions à Paris. L’abbaye de Fontenay comptait probablement cent soixante-dix à deux cents manuscrits au xiiie siècle et près de cinq cents volumes à la fin du xve siècle ou au début du xvie siècle. Malgré une importance tout à fait honorable, l’abbaye ne semble pourtant pas avoir eu de rôle majeur dans

# Corpus preprocessing

## Sentences segmentation (`docs_structured_sents`)

#### 1. Télécharger le corpus segmenté en phrases

In [None]:
import json
!wget https://github.com/chartes/encpos_similarities/raw/master/data_structured/encpos_structured_sents.json

docs_structured_sents = {}  # dict of docs (with their 'metadata' + one item for each text div with a list of its sentences)

with open('/content/encpos_structured_sents.json') as json_file:
  docs_structured_sents = json.load(json_file)

# docs_structured_sents.keys()
# docs_structured_sents['ENCPOS_2002_29']['metadata']

--2022-07-20 11:57:54--  https://github.com/chartes/encpos_similarities/raw/master/data_structured/encpos_structured_sents.json
Resolving github.com (github.com)... 140.82.114.3
Connecting to github.com (github.com)|140.82.114.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/chartes/encpos_similarities/master/data_structured/encpos_structured_sents.json [following]
--2022-07-20 11:57:54--  https://raw.githubusercontent.com/chartes/encpos_similarities/master/data_structured/encpos_structured_sents.json
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 49706859 (47M) [text/plain]
Saving to: ‘encpos_structured_sents.json’


2022-07-20 11:57:55 (204 MB/s) - ‘encpos_structured_sents.json’ saved [4970

#### 2. (OU) Segmenter le corpus en phrases

In [None]:
#Option 2: extracting chapter structuration with sentences split (it can take a while)

# Spacy required for senteces segmentation
!pip install -U spacy[cuda92,transformers,lookups] # tout est requis ici ?
!python -m spacy download fr_core_news_lg
import spacy
nlp=spacy.load("fr_core_news_lg") #spacy linguistic model for french news large

# docs_structured_sents: identifier -> {'metadata': [sentences], '<chapter title>': [sentences], ...}
# Documents without any chapter title (or whose header carries no identifier) are stored
# as {'metadata': [sentences of the whole document]}.
docs_structured_sents={}
for index, doc in enumerate(docs):
  # chapter and sub-chapter titles follow the "== ... ==" / "=== ... ===" pattern
  chapters=re.findall(r"==.+?\==" , doc)
  identifier=None
  if chapters:
    # first part = header that precedes the first title, then one part per title
    parts=re.split('|'.join(map(re.escape, chapters)), doc)
    text_structured=dict(zip(["metadata"]+chapters, parts))
    header_fields=text_structured["metadata"].split("identifier: ")
    if len(header_fields)>1:
      identifier=header_fields[1].split(" ", 1)[0]
      # sentence-split every part (header included) with the spaCy pipeline;
      # an error raised by nlp() now surfaces instead of being hidden by a bare `except`
      docs_structured_sents[identifier]={title:[sent.text for sent in nlp(text).sents] for title,text in text_structured.items()}
  if identifier is None:
    # Fallback: no usable chapter structure, segment the whole document under 'metadata'.
    identifier=doc.split("identifier: ")[1].split(" ",1)[0]
    docs_structured_sents[identifier]={"metadata":[sent.text for sent in nlp(doc).sents]}
  if index%400==0: # progress indicator (there are >2900 documents)
    print(index)

0


In [None]:
# test
# Only the last expression is rendered: the list of sentences of the conclusion.
docs_structured_sents[test_doc_id]['metadata']
docs_structured_sents[test_doc_id]['== Conclusion ==']
# docs_structured_sents[test_doc_id]

In [None]:
# Export `docs_structured_sents`

# Serialise the chapters/sentences dictionary to an indented JSON file;
# ensure_ascii=False keeps accented characters readable in the output.
import json
with open('encpos_structured_sents.json', "w", encoding='utf8') as json_out:
    json_out.write(json.dumps(docs_structured_sents, indent=2, ensure_ascii=False))

## Keywords extraction (`keywords_by_doc`)

### Télécharger la liste des keywords par doc

In [None]:
import json
!wget https://github.com/chartes/encpos_similarities/raw/master/data_structured/encpos_keywords_by_doc.json
keywords_by_doc = {}        # dict of docs with their discriminant keywords
with open('/content/encpos_keywords_by_doc.json') as json_file:
  keywords_by_doc = json.load(json_file) 

--2022-07-20 11:58:02--  https://github.com/chartes/encpos_similarities/raw/master/data_structured/encpos_keywords_by_doc.json
Resolving github.com (github.com)... 140.82.113.4
Connecting to github.com (github.com)|140.82.113.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/chartes/encpos_similarities/master/data_structured/encpos_keywords_by_doc.json [following]
--2022-07-20 11:58:02--  https://raw.githubusercontent.com/chartes/encpos_similarities/master/data_structured/encpos_keywords_by_doc.json
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.108.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2256103 (2.2M) [text/plain]
Saving to: ‘encpos_keywords_by_doc.json’


2022-07-20 11:58:03 (33.3 MB/s) - ‘encpos_keywords_by_doc.json’ saved [2256103/

In [None]:
# Display the keyword string stored for the test document.
keywords_by_doc[test_doc_id]

'saint cisterciennes fontenay bibliothèques clairvaux pontigny abbaye inventaires documents manuscrits catalogue sources médiéval bibliothèque paris signatures cahier parchemin support peaux écriture manuscrits clairvaux littérature bibliothèque manuscrits bibliques liturgiques fontenay abbaye cloître emplacement sacristie claustri dépôts abbaye armarium bibliothèque bois seules recouverts abbaye huit siècle manuscrits médiévales libris livres fonds biblique bible bibliothèque bibliothécaire abbaye contenue liste moine médiévale bibliothèque abbaye manuscrit fontenay léon siècle libris apost lettres bibliothèque abbaye vatican françois pierre bibliothèque généalogie textes fontenay manuscrits abbaye bourgogne bibliothèque baptiste bibliothécaire manuscrits colbert abbé abbaye 1650 œuvres bibl saint fontenay baluze abbaye manuscrit liste bourgogne église témoignage livre littéraire ouvrages manuscrits bouhier font rédigée auteur bibliothèque catalogue liste manuscrits manuscrits bibliot

### (OU) Calculer les keywords

In [None]:
#Loading extraction functions

from sklearn.feature_extraction.text import CountVectorizer

# Bert
!pip install sentence-transformers
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('distiluse-base-multilingual-cased-v1') #multilingual model for keywork extraction, text summarization and sentence transformation (0.7 PR)

# nltk for stopwords and punct? Only HERE ?
import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords
stop_words = stopwords.words('french')

!pip install fuzzywuzzy
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

from sklearn.metrics.pairwise import cosine_similarity

# test, régler fuzz
# fuzz.ratio('bibliothécaires', 'bibliothèques')

"""
The idea is to compare the embedding of the full text with the embedding of each candidate term (the n-grams found in that text).
We then keep the top_n candidates (8 by default) that are closest to the full-text embedding by cosine similarity, as they are the most representative keywords.
"""
def key_extractor(doc, top_n=8, n_gram_range = (1, 1)):
  """Extract the keywords of `doc` that are closest to the text as a whole.

  Candidate terms are the n-grams of `doc` (sizes given by `n_gram_range`, French
  stop words removed). Candidates and the full text are embedded with the
  module-level sentence-transformers `model`; the `top_n` candidates with the
  highest cosine similarity to the text are kept, then near-duplicates
  (fuzzywuzzy ratio above 85 with an already kept keyword) are dropped.

  Args:
    doc: text to analyse.
    top_n: number of candidates kept before the near-duplicate filter.
    n_gram_range: (min_n, max_n) passed to CountVectorizer.

  Returns:
    List of keywords, in increasing order of similarity to the text; it may
    hold fewer than `top_n` items because of the near-duplicate filter.

  Errors raised by the vectorizer or the encoder are not caught here; the
  callers in this notebook wrap the call in try/except.
  """
  vectorizer = CountVectorizer(ngram_range=n_gram_range, stop_words=stop_words)
  vectorizer.fit([doc])
  candidates = vectorizer.get_feature_names_out()

  candidate_embeddings = model.encode(candidates)
  doc_embedding = model.encode([doc])

  # one row (the text) against every candidate; argsort is ascending,
  # so the best candidates are at the end of the ranking
  similarities = cosine_similarity(doc_embedding, candidate_embeddings)
  ranking = similarities.argsort()[0]
  keywords = [candidates[index] for index in ranking[-top_n:]]

  # drop keywords too close to one already kept (e.g. "bibliothèque" / "bibliothèques")
  keywords_fuzzy = []
  for keyword in keywords:
    is_near_duplicate = any(fuzz.ratio(kept, keyword) > 85 for kept in keywords_fuzzy)
    if not is_near_duplicate:
      keywords_fuzzy.append(keyword)

  return keywords_fuzzy



"""
TODO: find a better way to filter out keywords that are too close to each other (cf. the fuzzy method in key_extractor()).
"""
def max_sum_sim(doc_embedding, word_embeddings, words, top_n, nr_candidates):
    """Pick `top_n` keywords that are close to the document but far from each other.

    The `nr_candidates` words most similar to the document are shortlisted, then
    every combination of `top_n` shortlisted words is scored by the sum of its
    pairwise cosine similarities; the combination with the smallest sum wins.

    Args:
        doc_embedding: 2D array holding the document embedding (one row).
        word_embeddings: 2D array, one embedding per entry of `words`.
        words: candidate words, aligned with `word_embeddings`.
        top_n: number of keywords to return.
        nr_candidates: size of the shortlist.

    Returns:
        List of the selected words. When `top_n` exceeds the shortlist size,
        the whole shortlist is returned.

    The function now uses its own `word_embeddings` / `words` parameters; it used
    to read `candidate_embeddings` / `candidates`, which only exist as locals of
    `key_extractor`. `numpy` and `itertools` are imported locally because the
    notebook only imports them in later cells.
    """
    import itertools

    import numpy as np

    # Calculate distances and extract keywords
    distances = cosine_similarity(doc_embedding, word_embeddings)
    distances_candidates = cosine_similarity(word_embeddings, word_embeddings)

    # Shortlist the nr_candidates words most similar to the document
    words_idx = list(distances.argsort()[0][-nr_candidates:])
    words_vals = [words[index] for index in words_idx]
    distances_candidates = distances_candidates[np.ix_(words_idx, words_idx)]

    # No combination exists when top_n is larger than the shortlist: use the whole shortlist
    top_n = min(top_n, len(words_idx))

    # Find the combination of words that are the least similar to each other
    min_sim = np.inf
    candidate = None
    for combination in itertools.combinations(range(len(words_idx)), top_n):
        sim = sum(distances_candidates[i][j] for i in combination for j in combination if i != j)
        if sim < min_sim:
            candidate = combination
            min_sim = sim

    return [words_vals[idx] for idx in candidate]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting fuzzywuzzy
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Installing collected packages: fuzzywuzzy
Successfully installed fuzzywuzzy-0.18.0




In [None]:
# DEBUG VJ pour keywords
# store keywords vector for each doc

from collections import Counter

keywords_by_doc = {} # keywords of each doc, stored as one space-separated string

for count, doc_id in enumerate(docs_structured.keys(), start=1):
  # print(doc_id)
  chapter_keywords=[]
  for k,v in docs_structured[doc_id].items():
    # skip the header and the chapters that hold little more than line breaks
    if k=="metadata" or len(v)<=10:
      continue
    try:
      chapter_keywords.append(" ".join(key_extractor(v))+" ")
    except Exception:
      # best effort: a chapter that cannot be processed contributes no keyword.
      # `Exception` (not a bare except) so that interrupting this long loop still works.
      continue
  keywords_by_doc[doc_id] = "".join(chapter_keywords)

  if count%900==0: # progress indicator
    print(count)

In [None]:
# Export
# Serialise keywords_by_doc (doc identifier -> keyword string) to an indented JSON file;
# ensure_ascii=False keeps accented characters readable in the output.
import json
with open('encpos_keywords_by_doc.json', "w", encoding='utf8') as json_out:
    json_out.write(json.dumps(keywords_by_doc, indent=2, ensure_ascii=False))

In [None]:
# Sergio's test cell (candidate for deletion?)
# Keyword extraction for the test document: key_extractor is applied to each text div (chapter)
for k,v in docs_structured_sents[test_doc_id].items():
  bloc=" ".join(v)
  if len(bloc)<=10:
    # (almost) empty div: print its title and raw content only
    print(k,"\n")
    print(bloc)
    continue
  print("\t", k,"\n")
  #print("\t", bloc)
  print("\t", key_extractor(bloc), "\n\n")

## Entities extraction (`entities_by_doc`)

TODO. Expliquer pourquoi on travaille au niveau de la phrase (`docs_structured_sents`)

Long à calculer. On peut :

1. Télécharger la liste déjà calculée
1. Calculer la *liste*

#### 1. Télécharger la liste des entités par doc

In [None]:
import json
!wget https://github.com/chartes/encpos_similarities/raw/master/data_structured/encpos_entities_by_doc.json
entities_by_doc = {}        # dict of docs with their discriminant entities
with open('/content/encpos_entities_by_doc.json') as json_file:
  entities_by_doc = json.load(json_file) 


--2022-07-20 11:58:07--  https://github.com/chartes/encpos_similarities/raw/master/data_structured/encpos_entities_by_doc.json
Resolving github.com (github.com)... 140.82.114.3
Connecting to github.com (github.com)|140.82.114.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/chartes/encpos_similarities/master/data_structured/encpos_entities_by_doc.json [following]
--2022-07-20 11:58:08--  https://raw.githubusercontent.com/chartes/encpos_similarities/master/data_structured/encpos_entities_by_doc.json
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.108.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1309295 (1.2M) [text/plain]
Saving to: ‘encpos_entities_by_doc.json’


2022-07-20 11:58:08 (24.4 MB/s) - ‘encpos_entities_by_doc.json’ saved [1309295/

In [None]:
# Display the list of entities stored for the test document.
entities_by_doc[test_doc_id]

['Clairvaux',
 'Paris',
 'Fontenay',
 'Cîteaux',
 'La Ferté',
 'saint Bernard',
 'Yves de Chartres',
 'saint Ambroise',
 'bibliothèque de Fontenay',
 'Baluze',
 'Colbert',
 'saint Jérôme',
 'abbaye de Fontenay',
 'Apocalypse',
 'Geoffroy d’Auxerre',
 'Basile de Césarée',
 'Hugues de Saint-Victor',
 'Bouchu',
 'bibliothèque de Colbert',
 'Raban',
 'sanctae Crucis']

#### 2. (OU) Calculer les entités discriminantes

In [None]:
# Install spaCy and load the French pipeline as the module-level `nlp`
# (same install and model as in the sentence-segmentation cell above).
!pip install -U pip setuptools wheel # ?
!pip install -U spacy[cuda92,transformers,lookups]
import spacy

#Download language model for modern french. If you get a downloading error you must restart the runtime
!python -m spacy download fr_core_news_lg
nlp=spacy.load("fr_core_news_lg") #spacy linguistic model for french news large

In [None]:
"""
function to extract meaning relationships between entities based on (Spacy) dependencies
TODO
text = doc ? ou sentence ?
"""
def rel_extraction(text):
  """Return the tokens of `text` that carry its main relation.

  A token is kept when its dependency label is "ROOT" or "nsubj", or when it
  belongs to a named entity labelled "LOC", "PER" or "ORG".

  The text is parsed once with the module-level `nlp` pipeline and both the
  dependency labels and the entities are read from that single parse (it used
  to be parsed a second time just to get the entities).

  Args:
    text: text to analyse.

  Returns:
    List of token texts, in document order.
  """
  keys=["ROOT", "nsubj"]
  keys_2=["LOC", "PER", "ORG"]

  parsed = nlp(text)
  tokens = [(tok.text, tok.dep_) for tok in parsed]

  # one entity label per token, "O" for tokens outside any entity;
  # ent.start / ent.end are used as offsets into the token sequence
  token_labels = ["O"]*len(tokens)
  for ent in parsed.ents:
    token_labels[ent.start:ent.end] = [ent.label_]*(ent.end-ent.start)

  return [tok_text for (tok_text, dep), label in zip(tokens, token_labels)
          if dep in keys or label in keys_2]


"""
extract entities with their label and their start/end token offsets
TODO
"""
def spacy_large_ner(document, model):
  """Return the entities found by `model` in `document`, with label and position.

  Args:
    document: text to analyse.
    model: callable pipeline whose result exposes `.ents`.

  Returns:
    Set of (stripped text, label, start, end) tuples; `start`/`end` are the
    entity's `.start`/`.end` attributes (for a spaCy pipeline these are token
    offsets, not character offsets).
  """
  entities = set()
  for ent in model(document).ents:
    entities.add((ent.text.strip(), ent.label_, ent.start, ent.end))
  return entities

"""
extract entities
TODO
"""
def spacy_short_ner(document, model):
  """Return the stripped text of every entity found by `model` in `document`.

  Args:
    document: text to analyse.
    model: callable pipeline whose result exposes `.ents`.

  Returns:
    List of entity texts, in the order given by `.ents` (duplicates are kept).
  """
  entity_texts = []
  for ent in model(document).ents:
    entity_texts.append(ent.text.strip())
  return entity_texts


In [None]:
# Save most representative entities of each entire doc
# TODO ST redocumenter
import itertools

entities_by_doc={} # doc identifier -> its most frequent co-occurring entities
from collections import Counter

# To discuss with ST: loop over docs_structured rather than docs_structured_sents;
# this limits the iterations to the documents that were actually loaded.
#for doc_id in docs_structured_sents.keys():

for count, doc_id in enumerate(docs_structured.keys(), start=1):
  #print(doc_id)
  entity_pairs=[] # every pair of entities found in the same sentence
  for k,v in docs_structured_sents[doc_id].items():
    if k!="metadata":
      for sent in v:
        if len(sent)>10:
          entidades=spacy_short_ner(str(sent), nlp)
          # sentence level: only entities that share a sentence are paired,
          # so an entity alone in its sentence is not counted
          entity_pairs.extend(itertools.combinations(entidades, 2))

  # Flatten the pairs in linear time (sum(list_of_tuples, ()) copies the growing tuple at
  # every step), then keep the 25 most frequent entities whose text is longer than 4 characters.
  entity_counts=Counter(itertools.chain.from_iterable(entity_pairs))
  nodes=[item[0] for item in entity_counts.most_common(25)]
  nodes=[x for x in nodes if len(x)>4]
  entities_by_doc[doc_id]=nodes
  #print('\t', entities_by_doc[doc_id])
  if count%500==0: # progress indicator
    print(count)

ENCPOS_1971_12
	 ['CC 12', 'États du Mâçonnais', 'Dijon']
ENCPOS_1971_25
	 ['Johannes Mülberg', 'Beginarum', 'Lolhardorum', 'Positio pro', 'Beginarum du Mineur', 'Rudolphe Buchsmann', 'Materia', 'Constance Félix Hemmerlin', 'Sébastien Brant', 'De valido mendicante', 'Nota contra', 'Lolhardos', 'Beghardos ac alios', 'dicunt res', 'Strasbourg', 'Constance', 'Predicatores N 5', 'Staatsarchiv', 'Kantons Basel-Stadt', 'Beginenstreit', 'Oddon de Colonna', 'Jacobinus de Torso', 'Cologne', 'Mayence']
ENCPOS_1971_15
	 ['Jacques Despars', 'Faculté de médecine de Paris', 'Avicenne', 'Canon', 'Antidotarium', 'Guillaume Bernard', 'Galien', 'Canon d’', 'Tournai', 'Bibliothèque Nationale', 'I, fen 1', 'Albert le Grand', 'Johannitius', 'Isaac', 'Jean Damascène', 'Dictionnaire', 'France', 'Ernest Wickersheimer', 'Eudes de Creil', 'Guillaume', 'M. Jacques Monfrin', 'Synonyma', 'Simon de Gênes', 'Arabes']
ENCPOS_2002_29
	 ['Clairvaux', 'Fontenay', 'Cîteaux', 'Paris', 'La Ferté', 'Yves de Chartres', 'Colb

In [None]:
# Display the whole dictionary (one entry per processed document).
entities_by_doc

In [None]:
# TEST SKIP utile pour tester/redocumenter
#extract contextual entites to build a Knownledge graph

from collections import Counter
# Same computation as for entities_by_doc, restricted to the test document
entity_pairs=[] # every pair of entities found in the same sentence
for k,v in docs_structured_sents[test_doc_id].items():
  if k!="metadata":
    for sent in v:
      if len(sent)>10:
        entidades=spacy_short_ner(str(sent), nlp)
        entity_pairs.extend(itertools.combinations(entidades, 2))

# Flatten the pairs in linear time (sum(list_of_tuples, ()) copies the growing tuple at every
# step), then keep the 25 most frequent entities whose text is longer than 4 characters.
entity_counts=Counter(itertools.chain.from_iterable(entity_pairs))
nodes=[item[0] for item in entity_counts.most_common(25)]
nodes=[x for x in nodes if len(x)>4]
nodes

In [None]:
# Export
# Serialise entities_by_doc (doc identifier -> list of entities) to an indented JSON file;
# ensure_ascii=False keeps accented characters readable in the output.
import json
with open('encpos_entities_by_doc.json', "w", encoding='utf8') as json_out:
    json_out.write(json.dumps(entities_by_doc, indent=2, ensure_ascii=False))


In [None]:
#Loading libraries and packages

# nltk for stopwords and punct?
import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords
stop_words = stopwords.words('french')

from sklearn.metrics.pairwise import cosine_similarity

import numpy as np
import pandas as pd
import itertools
#import wikipedia
import json



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
#installing Flair, spacy, model languages and sentence transformers. This can take a while
#remember delete the displayed information after installing
#don't forget to switch to a GPU environment

# Sentence-transformers encoder, bound to the module-level `model`
# (same install and load as in the keyword-extraction cell above)
!pip install sentence-transformers
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('distiluse-base-multilingual-cased-v1')# multilingual model for keyword extraction, text summarization and sentence transformation (0.7 PR)

#!pip install Flair
#!pip install wikipedia

In [None]:
!nvidia-smi

# Vectors dicts and Models

In [None]:
# Sentence-transformers encoder, bound to the module-level `model`
# (this install and load also appears in earlier cells of the notebook)
!pip install sentence-transformers
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('distiluse-base-multilingual-cased-v1')# multilingual model for keyword extraction, text summarization and sentence transformation (0.7 PR)

### Keywords vectors (`keywords_vectors`)

---



#### Download

In [None]:
import numpy as np
!wget https://github.com/chartes/encpos_similarities/raw/master/models/encpos_keywords_vectors.npz
keywords_vectors_dump = np.load('encpos_keywords_vectors.npz')
keywords_vectors = {doc_id:keywords_vectors_dump[doc_id] for doc_id in keywords_vectors_dump.files}

--2022-07-20 11:58:58--  https://github.com/chartes/encpos_similarities/raw/master/models/encpos_keywords_vectors.npz
Resolving github.com (github.com)... 140.82.113.4
Connecting to github.com (github.com)|140.82.113.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/chartes/encpos_similarities/master/models/encpos_keywords_vectors.npz [following]
--2022-07-20 11:58:58--  https://raw.githubusercontent.com/chartes/encpos_similarities/master/models/encpos_keywords_vectors.npz
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.110.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 6852484 (6.5M) [application/octet-stream]
Saving to: ‘encpos_keywords_vectors.npz.1’


2022-07-20 11:58:58 (81.5 MB/s) - ‘encpos_keywords_vectors.npz.1’ saved [6852484/6852484]


In [None]:
# Print the keyword vector stored for the test document.
print(keywords_vectors[test_doc_id])

#### Compute

In [None]:
# DEBUG VJ pour keywords
# store keywords vector for each doc // LONG!

import sys
from collections import Counter

keywords_vectors = {}  # doc identifier -> embedding of the document's keyword string

for count, doc_id in enumerate(docs_structured.keys(), start=1):
  # the keyword string is encoded as a one-element batch
  keywords_vectors[doc_id]=model.encode([keywords_by_doc[doc_id]])

  if count%900==0: # progress indicator
    print(count)

In [None]:
# Display the keyword vector computed for the test document.
keywords_vectors[test_doc_id]

In [None]:
# Export: one array per document identifier, saved under the 'encpos_keywords_vectors' name
# (the download cell reads the file back as 'encpos_keywords_vectors.npz').
np.savez('encpos_keywords_vectors', **keywords_vectors)


In [None]:
# NOTE(review): where is this used? No cell visible in this notebook reads `full_text_keywords`.
full_text_keywords = list(keywords_by_doc.values())

## Entities vectors

TODO : expliquer que cette représentation du texte ne sert finalement plus par la suite

### Download

In [None]:
import numpy as np
!wget https://github.com/chartes/encpos_similarities/raw/master/models/encpos_entities_vectors.npz
entities_vectors_dump = np.load('encpos_entities_vectors.npz')
entities_vectors = {doc_id:entities_vectors_dump[doc_id] for doc_id in entities_vectors_dump.files}

In [None]:
# Display the entity vector loaded for the test document.
entities_vectors[test_doc_id]

### Compute

In [None]:
# DEBUG VJ pour entities – valider avec ST
# store entities vector for each doc

from collections import Counter
entities_vectors={} # doc identifier -> list of entity indexes

# entities_dict: entity text -> index, numbered in order of first appearance
# across the lists of entities_by_doc (so the first entity gets index 0)
entities_dict={}
for doc_entities in entities_by_doc.values():
  for entity in doc_entities:
    entities_dict.setdefault(entity, len(entities_dict))
# print(entities_dict)

for count, doc_id in enumerate(docs_structured.keys(), start=1):
  # entities_by_doc may come from the downloaded JSON (it is no longer necessarily computed here)
  entities_vectors[doc_id]=[entities_dict[x] for x in entities_by_doc[doc_id]]

  if count%900==0: # progress indicator
    print(count)

# entities_vectors

In [None]:
# Display the entity indexes computed for the test document.
entities_vectors[test_doc_id]

In [None]:
# Export: one array per document identifier, saved under the 'encpos_entities_vectors' name
# (the download cell reads the file back as 'encpos_entities_vectors.npz').
import numpy as np
np.savez('encpos_entities_vectors', **entities_vectors)

## Documents vectors

### Download

In [None]:
!wget https://github.com/chartes/encpos_similarities/raw/master/models/encpos_document_vectors.npz
document_vectors_dump = np.load('encpos_document_vectors.npz')
document_vectors = {doc_id:document_vectors_dump[doc_id] for doc_id in document_vectors_dump.files}

--2022-07-20 11:59:18--  https://github.com/chartes/encpos_similarities/raw/master/models/encpos_document_vectors.npz
Resolving github.com (github.com)... 140.82.114.3
Connecting to github.com (github.com)|140.82.114.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/chartes/encpos_similarities/master/models/encpos_document_vectors.npz [following]
--2022-07-20 11:59:19--  https://raw.githubusercontent.com/chartes/encpos_similarities/master/models/encpos_document_vectors.npz
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.109.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 6852484 (6.5M) [application/octet-stream]
Saving to: ‘encpos_document_vectors.npz’


2022-07-20 11:59:19 (85.5 MB/s) - ‘encpos_document_vectors.npz’ saved [6852484/6852484]



In [None]:
# Display the document vector loaded for the test document.
document_vectors[test_doc_id]

### Compute

In [None]:
# DEBUG VJ pour document_vectors – valider avec ST

from collections import Counter

document_vectors={} # doc identifier -> embedding of the document's full text

for count, doc_id in enumerate(docs_structured.keys(), start=1):
  # full text = every part of the document except its 'metadata' header, concatenated as-is
  bloc = "".join(v for k,v in docs_structured[doc_id].items() if k!="metadata")
  # the text is encoded as a one-element batch
  document_vectors[doc_id]=model.encode([bloc])

  if count%900==0: # progress indicator
    print(count)

In [None]:
# Display the identifiers of the documents that have a vector.
document_vectors.keys()

In [None]:
# Export: one array per document identifier, saved under the 'encpos_document_vectors' name
# (the download cell reads the file back as 'encpos_document_vectors.npz').
np.savez('encpos_document_vectors', **document_vectors)

## Doc2Vec (Optional)

### Download (`d2v_model`)

In [52]:
from gensim.models.doc2vec import Doc2Vec
# Load the published Doc2Vec model as `d2v_model`, the name read by the Doc2Vec similarity cell.
# NOTE(review): the model is loaded straight from an https URL; this assumes Doc2Vec.load can
# read remote files in the installed gensim version (TODO confirm).
# NOTE(review): the "Train" section below names its model `model_d2v`, not `d2v_model`.
d2v_model= Doc2Vec.load("https://github.com/chartes/encpos_similarities/raw/master/models/encpos_doc2vec.model")

### Train

In [None]:
# Doc2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')
nltk.download("stopwords")
from nltk.corpus import stopwords
stop_words = stopwords.words('french')


In [None]:
#transforming data
# data: doc identifier -> all the parts of the document concatenated into one string
# (the "=  " removal presumably cleans a marker left over from the chapter titles; TODO confirm)
data={k:"".join(v.values()).replace("=  ", "") for k,v in docs_structured.items()}
# Stop words are looked up once per token: a set makes each lookup constant-time,
# whereas the original list was scanned for every token.
stop_words_set=set(stop_words)
# tagged_data: one TaggedDocument per document (lower-cased tokens without stop words),
# tagged with the document identifier
tagged_data = [TaggedDocument(words=[x for x in word_tokenize(v.lower()) if x not in stop_words_set], tags=[str(k)]) for k,v in data.items()]

In [None]:
# tagged_data[10][-1]

In [None]:
#modeling vectors, this can take a while
max_epochs = 20
vec_size = 30
alpha = 0.025

model_d2v = Doc2Vec(vector_size=vec_size,
                alpha=alpha,
                min_alpha=0.00025,
                min_count=1,
                dm =1, window=10, workers=4)

model_d2v.build_vocab(tagged_data)

# NOTE(review): calling train() in a loop with a hand-managed learning rate is kept as-is so
# the training schedule is unchanged; the gensim documentation advises a single train() call.
for epoch in range(max_epochs):
    print('iteration {0}'.format(epoch))
    # `epochs` replaces the `iter` attribute, which was deprecated in gensim 3.x and removed in 4.0
    model_d2v.train(tagged_data,
                total_examples=model_d2v.corpus_count,
                epochs=model_d2v.epochs)
    # decrease the learning rate
    model_d2v.alpha -= 0.0002
    # fix the learning rate, no decay
    model_d2v.min_alpha = model_d2v.alpha

print("end modelization")

# The Doc2Vec similarity cell reads `d2v_model` (the name used by the download cell):
# expose the freshly trained model under that name too.
d2v_model = model_d2v

In [None]:
# Export the trained model
model_d2v.save("encpos_doc2vec.model")

## Memo Sergio

In [None]:
# Memo cell: single-pass version that builds the entity, document and keyword vectors together.
#transforming each entities list and each text position into a vector
#THIS IS A MANDATORY STEP AND CAN TAKE SEVERAL MINUTES
# NOTE(review): `entities` is not defined in any cell visible in this notebook; it is presumably
# the former name of `entities_by_doc` (TODO confirm before running this cell).
# NOTE(review): this cell fills `keyword_vectors`, while the other cells use `keywords_vectors`.

from collections import Counter
entities_vectors={}
document_vectors={}
keyword_vectors={}


count=0

# entity text -> index, numbered in order of first appearance
entities_dict={entity:i for i, entity in enumerate(Counter([y for x in entities.values() for y in x]).keys())}

full_text=[]
full_text_keys=[]

for item in entities.keys():
  bloc=""
  keyword_bloc=""
  #ents=np.array([entities_dict[x] for x in entities[item]], ndmin=2)#2D array
  ents=[entities_dict[x] for x in entities[item]]
  for k,v in docs_structured[item].items(): # why not iterate over docs_structured_sents?
    if k!="metadata":
      bloc+=v #join all chapters content without the title

      if len(v)>1:
        #print(key_extractor(v))
        try:
          keyword_bloc+=" ".join(key_extractor(v))+" "

        except:
          # bare except: any failure of key_extractor silently skips this chapter's keywords
          continue
  full_text.append(bloc) # what is this list used for afterwards, and why?
  full_text_keys.append(keyword_bloc)

  doc_embedding = model.encode([bloc])
  document_vectors[item]=doc_embedding
  entities_vectors[item]=ents
  keyword_vectors[item]=model.encode([keyword_bloc])
  count+=1
  if count%900==0:
    print(count)

  #print(doc_embedding)

# Similarities (vector intersection ?)

In [None]:
#cosine function to compute vectors similarity
from sklearn.metrics.pairwise import cosine_similarity

def cos_similarity(a,b):
  """Return the cosine similarity between the first row of `a` and the first row of `b`.

  Args:
    a, b: 2D array-likes accepted by sklearn's `cosine_similarity`.

  Returns:
    The top-left entry of the similarity matrix, converted to a plain Python number.
  """
  similarity_matrix = cosine_similarity(a, b, dense_output=False)
  first_row = similarity_matrix.tolist()[0]
  return first_row[0]

## Keywords

In [None]:
# Rank the documents by cosine similarity between KEYWORDS vectors
search_term=keywords_vectors[test_doc_id]

similar_terms=[]
for doc_id, vector in keywords_vectors.items():
  # documents stored with a single entry (no chapter structure) are left out
  if len(docs_structured[doc_id])<=1:
    continue
  score=cosine_similarity(search_term, vector)[0][0]
  # everything that follows "creator:" in the header (author, date, title)
  creator=docs_structured[doc_id]["metadata"].split("creator:")[1]
  similar_terms.append([doc_id, score, creator])

# keep the 10 best scores; the test document itself is part of the ranking
similar_terms.sort(key=lambda row: row[1], reverse=True)
similar_terms=similar_terms[:10]
for row in similar_terms:
  print(*row, sep='\t')

ENCPOS_2002_29	0.9999999	 Dominique Stutzmann date: 2002 title: La bibliothèque de l’abbaye cistercienne de Fontenay (Côte-d’Or). Constitution, gestion, dissolution (xiie-xviiie siècle).    
ENCPOS_1990_17	0.84728587	 Françoise Simeray date: 1990 title: Le scriptorium et la bibliothèque de l’abbaye Saint-Amand    
ENCPOS_2015_12	0.8383621	 Hélène Jacquemard date: 2015 title: L’abbaye cistercienne de Vauclair et sa bibliothèque. Lire et écrire dans une abbaye cistercienne du Moyen Âge au xviiie siècle    
ENCPOS_1969_09	0.794045	 Marie-Pierre Laffitte-Pochat date: 1969 title: La bibliothèque et le scriptorium de Saint-Thierry de Reims    
ENCPOS_1998_35	0.79267997	 Laurent Veyssière date: 1998 title: Recueil des chartes de l’abbaye de Clairvaux.    
ENCPOS_2009_25	0.78906024	 Cécile Roger date: 2009 title: Guillaume de Saint-Lô, un prédicateur à l’œuvre au xive siècle    
ENCPOS_2010_23	0.7857414	 Cécile Roger date: 2010 title: Guillaume de Saint-Lô un prédicateur à l’œuvre au xive sièc

## Entities

In [None]:
# Similarity using ENTITIES: Dice coefficient between the entity list of the test document
# and the entity list of every other document
cherche=entities_by_doc[test_doc_id]
similar_docs_number = 10
similar_docs = []

cherche_set=set(cherche) # built once instead of once per document
for doc_id in docs_structured.keys():
  if doc_id == test_doc_id:
    continue
  # The two cases that the former bare `except` silently skipped are now explicit:
  # a document without an entity list, and two empty lists (division by zero).
  doc_entities=entities_by_doc.get(doc_id)
  if doc_entities is None:
    continue
  total=len(cherche)+len(doc_entities)
  if total==0:
    continue
  coeff=(2*len(cherche_set & set(doc_entities)))/total
  if coeff>0.05:
    similar_docs.append([coeff, doc_id, doc_entities])

similar_docs.sort(key=lambda x:x[0], reverse=True)
#similar_docs[0:similar_docs_number]
for l in similar_docs[0:similar_docs_number]:
  print(l[0:3])

[0.14285714285714285, 'ENCPOS_1922_01', ['Paris', 'Delamare', 'Traité de la Police', 'roi Jean', 'Bureau de la ville', 'Édit du mois', 'Colbert']]
[0.13953488372093023, 'ENCPOS_1986_15', ['Cisterciens', 'Paris', 'Clairvaux', 'Cîteaux', 'collège Saint-Bernard', 'Morimond', 'Benoît XII', 'Belgique', 'Occident', 'Étienne de Lexington', 'abbé de Clairvaux', 'Innocent IV', 'Jean Tolet', 'Alphonse de Poitiers', 'Montpellier', 'Estella', 'Toulouse', 'Oxford', 'Pontigny', 'Moyen Age', 'ordre de Cîteaux', 'Jean de Cirey']]
[0.13636363636363635, 'ENCPOS_1968_07', ['Servien', 'Mazarin', 'Paris', 'Conseil', 'Richelieu', 'Cherasco', 'Condé', 'Affaires étrangères', 'Anjou', 'Longueville', 'union des Frondes', 'Poitiers', 'Baluze', 'Dupuy', 'Cinq-cents', 'Colbert', 'Bibliothèque nationale', 'Archives nationales', 'Abel Servien', 'Chavigny', 'Münster', 'l’Empire', 'comte d’Avaux']]
[0.13636363636363635, 'ENCPOS_1970_11', ['Arras', 'Saint-Vaast', 'Jérémie', 'Cambrai', 'saint Vaast', 'Paris', 'Liber mir

## Documents

In [None]:
#similarity using DOCUMENT vectors
# Score every document vector against the one of `test_doc_id`, then show the
# ten best matches as: id <TAB> cosine <TAB> text following "creator:".
search_term=document_vectors[test_doc_id]

similar_terms = []
for k, v in document_vectors.items():
  # entries of docs_structured holding a single item are skipped
  if len(docs_structured[k]) > 1:
    similar_terms.append([
        k,
        cosine_similarity(search_term, v)[0][0],
        docs_structured[k]["metadata"].split("creator:")[1],
    ])

similar_terms.sort(key=lambda row: row[1], reverse=True)
similar_terms = similar_terms[:10]
for x in similar_terms:
  print(*x, sep='\t')

ENCPOS_2002_29	0.9999999	 Dominique Stutzmann date: 2002 title: La bibliothèque de l’abbaye cistercienne de Fontenay (Côte-d’Or). Constitution, gestion, dissolution (xiie-xviiie siècle).    
ENCPOS_2016_21	0.73231614	 Clémentine Villien date: 2016 title: L'église abbatiale cistercienne Notre-Dame d’Acey. Étude historique, architecturale et archéologique    
ENCPOS_2015_12	0.70256317	 Hélène Jacquemard date: 2015 title: L’abbaye cistercienne de Vauclair et sa bibliothèque. Lire et écrire dans une abbaye cistercienne du Moyen Âge au xviiie siècle    
ENCPOS_1926_02	0.68688256	 Anne-Marie Aubert date: 1926 title: Histoire et développement économique d’une abbaye cistercienne, Bellevaux en Franche-Comté (xiie-xvie siècle)    
ENCPOS_1951_01	0.68668956	 Bernard Bagneris date: 1951 title: La vie économique de l’abbaye de Montier-la-Celle du xive au xviie siècle    
ENCPOS_1997_08	0.68384457	 Stéphanie Billot date: 1997 title: Trois-Fontaine, fille aînée de Clairvaux : étude et édition du cha

## Doc2Vec

In [None]:
#similarity using Doc2Vec model
# Prints the 40 nearest documents to `test_doc_id` with their score rounded to
# 5 decimals and the text that follows "creator:" in their metadata.
# NOTE(review): this cell reads `d2v_model`, while the Doc2Vec matrix cells
# further down read `model_d2v` — confirm which name the training cell defines.
similar_doc = d2v_model.docvecs.most_similar(test_doc_id, topn=40)
for x in similar_doc:
  try:
    print(x[0], round(x[1], 5), docs_structured[x[0]]["metadata"].split("creator:")[1])
  except (KeyError, IndexError):
    # unknown id, no 'metadata' entry, or no "creator:" marker:
    # fall back to printing the raw result
    print(x)

ENCPOS_1933_13 0.44863  Régine Pernoud date: 1933 title: Essai sur le port de Marseille des origines à la fin du xiiie siècle    
ENCPOS_1994_04 0.40778  Michelle Bubenicek date: 1994 title: Le pouvoir au féminin. Une princesse en politique et son entourage : Yolande de Flandre, comtesse de Bar et dame de Cassel (1326-1395)    
ENCPOS_1971_12 0.37944  Alain Guerreau date: 1971 title: Une ville et ses finances : Mâcon    
ENCPOS_1971_25 0.37358  Jean-Claude Schmitt date: 1971 title: L’Église et les clercs face aux béguines et aux beghards du Rhin supérieur du xive siècle au xve siècle    
ENCPOS_1971_15 0.34851  Danielle Jacquart date: 1971 title: Un médecin parisien du xve siècle, Jacques Despars (1380-1458)    
ENCPOS_2003_19 0.32172  Sébastien Nadiras date: 2003 title: Guillaume de Nogaret et la pratique du pouvoir    
ENCPOS_1972_18 0.31963  Michel Pastoureau date: 1972 title: Le bestiaire héraldique au Moyen Âge    
ENCPOS_1999_35 0.30514  Elsa Marguin date: 1999 title: L’Ars lecto

# Evaluation

## Keywords similarities matrix

In [45]:
# Calculate keywords similarities matrix 3000 X 3000
# For each document id, store a list of [other_id, similarity] pairs computed
# from the keywords vectors (the document itself is left out of its own row).

matrix={}
liste_encpos=list(docs_structured.keys())

for i, pos in enumerate(liste_encpos):
  matrix[pos] = [
      [pos_b, cos_similarity(keywords_vectors[pos], keywords_vectors[pos_b])]
      for pos_b in liste_encpos
      if pos_b != pos
  ]
  # progress marker every 500 rows
  if not i % 500:
    print(i)


0
500
1000
1500
2000
2500


In [46]:
# Export keywords_similarities_matrix
# Serialises `matrix` as indented JSON, keeping non-ASCII characters as-is.
import json
with open('keywords_similarities_matrix.json', mode='w', encoding='utf8') as out_file:
    json.dump(
        matrix,
        out_file,
        ensure_ascii=False,
        indent=2,
    )

## Document similarities matrix

In [47]:
# Calculate document similarities matrix 3000x3000
# For each document id, store a list of [other_id, similarity] pairs computed
# from the document vectors (the document itself is left out of its own row).

matrix_docs={}
liste_encpos=list(docs_structured.keys())

for i, pos in enumerate(liste_encpos):
  matrix_docs[pos] = [
      [pos_b, cos_similarity(document_vectors[pos], document_vectors[pos_b])]
      for pos_b in liste_encpos
      if pos_b != pos
  ]
  # progress marker every 500 rows
  if not i % 500:
    print(i)

0
500
1000
1500
2000
2500


In [48]:
# Export document_similarities_matrix
# Serialises `matrix_docs` (the document-vector matrix) as indented JSON.
# The previous version dumped `matrix`, i.e. the keywords matrix, so this file
# was a duplicate of keywords_similarities_matrix.json.
import json
with open('document_similarities_matrix.json', 'w', encoding='utf8') as f:
    json.dump(matrix_docs, f, indent=2, ensure_ascii=False)

## Doc2Vec similarities matrix

In [None]:
# Calculate Doc2Vec similarities matrix 3000x3000
# For each document id, store a list of [other_id, similarity] pairs computed
# from the Doc2Vec document vectors, using the same layout as the keywords and
# document matrices.
# The previous version read `matrix_d2v[pos]` from a dict that was still
# empty, which raises KeyError on the very first pair.
import json

matrix_d2v={}
liste_encpos=list(docs_structured.keys())

for i, pos in enumerate(liste_encpos):
  matrix_d2v[pos]=[]
  # row vector of the reference document, reshaped once per row
  vec_pos = model_d2v.docvecs[pos].reshape(1, -1)
  for pos_b in liste_encpos:
    if pos!=pos_b:
      coeff=cos_similarity(vec_pos, model_d2v.docvecs[pos_b].reshape(1, -1))
      matrix_d2v[pos].append([pos_b, coeff])
  # progress marker every 500 rows
  if i%500==0:
    print(i)

with open('doc2vec_similarities_matrix.json', 'w', encoding='utf8') as fp:
    json.dump(matrix_d2v, fp)

In [None]:
# Spot check: Doc2Vec similarity between two specific documents.
cos_similarity(*(
    model_d2v.docvecs[encpos_id].reshape(1, -1)
    for encpos_id in ("ENCPOS_2000_10", "ENCPOS_2000_09")
))

## Mesure de la déviation entre les scores de similarité

TODO: expliquer

In [None]:

# Descriptive statistics of the number of whitespace-separated tokens in each
# entry of `full_text_keys`.
# NOTE(review): `full_text` and `full_text_keys` are defined in another cell;
# confirm that `full_text_keys` (rather than `full_text`) holds the strings
# whose length is meant to be measured.
from statistics import mean, median,variance,stdev

data=[len(x.split()) for x in full_text_keys]

# Results are stored under their own names so that the imported functions
# `median`, `variance` and `stdev` are not overwritten by their results.
m = mean(data)
data_median = median(data)
data_variance = variance(data)  # sample variance; needs at least two values
data_stdev = stdev(data)        # sample standard deviation; same requirement
print('average: {0:.2f}'.format(m))
print('Median: {0:.2f}'.format(data_median))
# label fixed: this line reports the variance (it used to read 'Distributed')
print('Variance: {0:.2f}'.format(data_variance))
print('standard deviation: {0:.2f}'.format(data_stdev))

# A insérer ?

### entity linking wikipedia

In [None]:
# SKIP
# Entity linking: look each node up on the French Wikipedia and fall back to
# the English one only when the French page does not exist.
#   - page found            -> print the node and the page URL
#   - ambiguous title       -> print the disambiguation options
#   - no page in fr nor en  -> print nothing
# The fr/en branches used to be duplicated, with bare `except:` clauses that
# silently absorbed the missing `.options` attribute on PageError.
# Idea kept from the original cell: the options could be filtered by keeping
# those whose summary mentions another node.
for node in nodes:
  for lang in ("fr", "en"):
    wikipedia.set_lang(lang)
    try:
      x=wikipedia.page(node, auto_suggest=False)
    except wikipedia.exceptions.DisambiguationError as e:
      print(e.options)
      break
    except wikipedia.PageError:
      # no page in this language: try the next one, if any
      continue
    print(node, x.url)
    break

In [None]:
# Free-text query: encode one term and list every document whose vector has a
# cosine similarity above 0.1 with it.
cherche=model.encode(["Auxerrois"])

for k,v in document_vectors.items():
  cos=cosine_similarity(cherche, v)
  if not (cos>0.1):
    continue
  print(
      k, "\t",
      cos[0], "\t",
      docs_structured[k]["metadata"].split("creator:")[1],
  )

In [None]:
# List the documents sharing more than 3 entities with ENCPOS_1972_18,
# together with the text following "creator:" in their metadata and the
# shared entities themselves.
cherche=entities_by_doc["ENCPOS_1972_18"]

for item, item_entities in entities_by_doc.items():
  if not item_entities:
    # nothing to intersect with
    continue
  a=[x for x in cherche if x in item_entities]
  if len(a)<=3:
    continue
  # Documents without a 'metadata' entry or without a "creator:" marker are
  # skipped explicitly; a bare `except:` used to hide them along with any
  # other error.
  metadata = docs_structured.get(item, {}).get("metadata", "")
  if "creator:" not in metadata:
    continue
  print(item, metadata.split("creator:")[1], a)

In [55]:
# Spot check: document-vector similarity between two specific documents.
cos_similarity(*(
    document_vectors[encpos_id]
    for encpos_id in ("ENCPOS_2011_08", "ENCPOS_1988_10")
))

0.14194782078266144