In [63]:
test_doc_id = 'ENCPOS_2002_29' # ENCPOS_1972_18

# Loading the dataset (`docs_structured`)

In [None]:
#download, unzip and reading encpos dataset
import glob

!wget https://github.com/chartes/encpos_similarities/raw/master/data/encpos_txt.zip
!unzip encpos_txt.zip -d /content/

docs = [] # list of documents (a single line string for each doc)
docs=[open(filename, "r").readlines() for filename in glob.glob("/content/encpos_txt/*.txt")]
docs=[" ".join(x).replace("\n", "") for x in docs]

In [27]:
# Extracting chapters structuration from texts

import re

docs_structured={} # dict of docs to process ('metadata' + one item for each text div)
for index, doc in enumerate(docs):
  chapters=re.findall(r"==.+?\==" , doc) #match chapters (and subchaters) titles as the pattern is "===" and "=="
  regexPattern = '|'.join(map(re.escape, chapters))
  a=re.split(regexPattern, doc)

  text_structured={x:y for x,y in list(zip(["metadata"]+chapters, a))}
  try:
    identifier=text_structured["metadata"].split("identifier: ")[1].split(" ", 1)[0].replace(" ", "")
    docs_structured[identifier]=text_structured
  except:
    #print(text_structured["metadata"])
    identifier=doc.split("identifier: ")[1].split(" ",1)[0]
    resume=doc.split("title: ")[1]
    docs_structured[identifier]={"title":resume}
    #print(texts[index], "\n")

In [None]:
# test
docs_structured[test_doc_id]['metadata']
docs_structured[test_doc_id]

# Features extraction

## Sentences segmentation (`docs_structured_sents`)

#### 1. Télécharger le corpus segmenté en phrases

In [None]:
import json
!wget https://github.com/chartes/encpos_similarities/raw/master/data_structured/encpos_structured_sents.json

docs_structured_sents = {}  # dict of docs (with their 'metadata' + one item for each text div with a list of its sentences)

with open('/content/encpos_structured_sents.json') as json_file:
  docs_structured_sents = json.load(json_file)

# docs_structured_sents.keys()
# docs_structured_sents['ENCPOS_2002_29']['metadata']

#### 2. (OU) Segmenter le corpus en phrases

In [None]:
#Option 2: extracting chapter structuration with sentences split (it can take a while)

# Spacy required for senteces segmentation
!pip install -U spacy[cuda92,transformers,lookups] # tout est requis ici ?
!python -m spacy download fr_core_news_lg
import spacy
nlp=spacy.load("fr_core_news_lg") #spacy linguistic model for french news large

docs_structured_sents={} # dict of docs ('metadata' + one item for each text div with a list of its sentences)
for index, doc in enumerate(docs):
  chapters=re.findall(r"==.+?\==" , doc) #chapters and subchaters as the pattern is "===" and "=="
  regexPattern = '|'.join(map(re.escape, chapters))
  a=re.split(regexPattern, doc)

  text_structured={x:y for x,y in list(zip(["metadata"]+chapters, a))}
  try:
    identifier=text_structured["metadata"].split("identifier: ")[1].split(" ", 1)[0].replace(" ", "")
    text_structured={k:[sent.text for sent in nlp(v).sents] for k,v in text_structured.items()}
    docs_structured_sents[identifier]=text_structured
  except:
    #print(text_structured["metadata"])
    identifier=doc.split("identifier: ")[1].split(" ",1)[0]
    #resume=doc.split("title: ")[1]
    docs_structured_sents[identifier]={"metadata":[sent.text for sent in nlp(doc).sents]}
  if index%400==0: #there are >2900 positions
    print(index)

In [None]:
# test
docs_structured_sents[test_doc_id]['metadata']
docs_structured_sents[test_doc_id]['== Conclusion ==']
# docs_structured_sents[test_doc_id]

In [None]:
# Export `docs_structured_sents`

#JSON or pickle dump of docs_structured_sents (dict with chapters and sentences)
import json
with open('encpos_structured_sents.json', "w", encoding='utf8') as f:
    json.dump(docs_structured_sents, f, indent=2, ensure_ascii=False)

## Keywords extraction (`keywords_by_doc`)

### Télécharger la liste des keywords par doc

In [None]:
import json
!wget https://github.com/chartes/encpos_similarities/raw/master/data_structured/encpos_keywords_by_doc.json
keywords_by_doc = {}        # dict of docs with their discriminant keywords
with open('/content/encpos_keywords_by_doc.json') as json_file:
  keywords_by_doc = json.load(json_file) 

In [None]:
keywords_by_doc[test_doc_id]

### (OU) Calculer les keywords

In [None]:
#Loading extraction functions

from sklearn.feature_extraction.text import CountVectorizer

# Bert
!pip install sentence-transformers
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('distiluse-base-multilingual-cased-v1') #multilingual model for keywork extraction, text summarization and sentence transformation (0.7 PR)

# nltk for stopwords and punct? Only HERE ?
import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords
stop_words = stopwords.words('french')

!pip install fuzzywuzzy
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

from sklearn.metrics.pairwise import cosine_similarity

# test, régler fuzz
# fuzz.ratio('bibliothécaires', 'bibliothèques')

"""
The idea behind is to compare the full paragraph embbeding against all possible 8-words combination embeddings (candidates)
Then, we select the 8-words closest (by cosine similarity) to the full paragraph embeddings, as they are our more representative keywords.  
"""
def key_extractor(doc, top_n=8, n_gram_range = (1, 1)): #function to extract keywords from a text, n_gram_range indicates the matrice range of candidates
  vectorizer = CountVectorizer(ngram_range=n_gram_range, stop_words=stop_words).fit([doc])

  candidates = vectorizer.get_feature_names_out() 
  candidate_embeddings = model.encode(candidates) # fréquence des mots plus important que leur ordre (embedding de la fréquence du lexique dans chaque bloc) ? 

  doc_embedding = model.encode([doc])

  distances = cosine_similarity(doc_embedding, candidate_embeddings) # expliquer
  keywords = [candidates[index] for index in distances.argsort()[0][-top_n:]]

  #optional to filter similar keywords inside the same group (vg. Bibliothèque, bibliothècaire)
  keywords_fuzzy=[]
  for i, x in enumerate(keywords):
    if i>0:
      if any(fuzz.ratio(y,x)>85 for y in keywords_fuzzy):
        continue
      else:
        keywords_fuzzy.append(x)
    else:
      keywords_fuzzy.append(x)

  return keywords_fuzzy



"""
TODO: filtrer les keywords trop proches – cf fuzzy method de key_extractor(), en mieux
"""
def max_sum_sim(doc_embedding, word_embeddings, words, top_n, nr_candidates):#extraction by cosine embeddings similarity
    # Calculate distances and extract keywords
    distances = cosine_similarity(doc_embedding, candidate_embeddings)
    distances_candidates = cosine_similarity(candidate_embeddings, 
                                            candidate_embeddings)

    # Get top_n words as candidates based on cosine similarity
    words_idx = list(distances.argsort()[0][-nr_candidates:])
    words_vals = [candidates[index] for index in words_idx]
    distances_candidates = distances_candidates[np.ix_(words_idx, words_idx)]

    # Calculate the combination of words that are the least similar to each other
    min_sim = np.inf
    candidate = None
    for combination in itertools.combinations(range(len(words_idx)), top_n):
        sim = sum([distances_candidates[i][j] for i in combination for j in combination if i != j])
        if sim < min_sim:
            candidate = combination
            min_sim = sim

    return [words_vals[idx] for idx in candidate]

In [None]:
# DEBUG VJ pour keywords
# store keywords vector for each doc

from collections import Counter

count=0
keywords_by_doc = {} # stocker les mots clé de chaque doc

for doc_id in docs_structured.keys():
  # print(doc_id)
  keyword_bloc=""
  for k,v in docs_structured[doc_id].items():
    if k!="metadata":
      if len(v)>10: # des chapitres avec uniquement des sauts de lignes…
        try:
          keyword_bloc+=" ".join(key_extractor(v))+" "
        except:
          continue
  keywords_by_doc[doc_id] = keyword_bloc

  count+=1
  if count%900==0:
    print(count)

In [None]:
# Export
#JSON dump of keywords_by_doc (dict with chapters and sentences)
import json
with open('encpos_keywords_by_doc.json', "w", encoding='utf8') as f:
    json.dump(keywords_by_doc, f, indent=2, ensure_ascii=False)

## Entities extraction (`entities_by_doc`)

TODO. Expliquer pourquoi on travaille au niveau de la phrase (`docs_structured_sents`)

Long à calculer. On peut :

1. Télécharger la liste déjà calculée
1. Calculer la *liste*

#### 1. Télécharger la liste des entités par doc

In [None]:
import json
!wget https://github.com/chartes/encpos_similarities/raw/master/data_structured/encpos_entities_by_doc.json
entities_by_doc = {}        # dict of docs with their discriminant entities
with open('/content/encpos_entities_by_doc.json') as json_file:
  entities_by_doc = json.load(json_file) 


In [None]:
entities_by_doc['test_doc_id']

#### 2. (OU) Calculer les entités discriminantes

In [None]:
#installing spacy
!pip install -U pip setuptools wheel # ?
!pip install -U spacy[cuda92,transformers,lookups]
import spacy

#Download language model for modern french. If you get a downloading error you must restart the runtime
!python -m spacy download fr_core_news_lg
nlp=spacy.load("fr_core_news_lg") #spacy linguistic model for french news large

In [None]:
"""
function to extract meaning relationships between entities based on (Spacy) dependencies
TODO
text = doc ? ou sentence ?
"""
def rel_extraction(text):
  keys=["ROOT", "nsubj"]
  keys_2=["LOC", "PER", "ORG"]

  doc_dep = nlp(text)
  doc_dep=[[tok.text, tok.dep_] for tok in doc_dep]

  doc_ents= spacy_large_ner(text, nlp)
  doct_ents=list(doc_ents)
  doc_rel=["O"]*len(doc_dep)
  for ent in list(doc_ents): doc_rel[ent[-2]:ent[-1]]=[ent[-3]]*(ent[-1]-ent[-2])

  #doc_merged=["\t".join(x+[y]) for x,y in list(zip(doc_dep, doc_rel))]
  doc_merged=[x+[y] for x,y in list(zip(doc_dep, doc_rel))]

  doc_merged=[x[0] for x in doc_merged if x[1] in keys or x[2] in keys_2 ]

  return doc_merged


"""
extract entities and character indexes
TODO
"""
def spacy_large_ner(document, model):
  return {(ent.text.strip(), ent.label_, ent.start, ent.end) for ent in model(document).ents}

"""
extract entities
TODO
"""
def spacy_short_ner(document, model): #extract just entities text
  return [ent.text.strip() for ent in model(document).ents]


In [None]:
# Save most representative entities of each entire doc
# TODO ST redocumenter
import itertools

entities_by_doc={} #
count=0
from collections import Counter

# voir avec ST : boucler sur doc_structured plutôt que docs_structured_sents
# permet de réduire le nombre d'itération et de le réduire au nombre de docs chargés
#for doc_id in docs_structured_sents.keys():

for doc_id in docs_structured.keys():
  #print(doc_id)
  macro_entidades=[]
  for k,v in docs_structured_sents[doc_id].items():
    if k!="metadata":
      for sent in v:
        if len(sent)>10:
          entidades=spacy_short_ner(str(sent), nlp)
          # ?? sentence level
          entidades=list(itertools.combinations(entidades, 2))
          macro_entidades.extend(entidades)

  nodes=[x[0] for x in Counter(list(sum(macro_entidades, ()))).most_common(25)]
  nodes=[x for x in nodes if len(x)>4]
  entities_by_doc[doc_id]=nodes
  #print('\t', entities_by_doc[doc_id])
  count+=1
  if count%500==0:
    print(count)

In [None]:
# TEST SKIP utile pour tester/redocumenter
#extract contextual entites to build a Knownledge graph

from collections import Counter
macro_entidades=[]
for k,v in docs_structured_sents[test_doc_id].items():
  if k!="metadata":
    for sent in v:
      if len(sent)>10:
        entidades=spacy_short_ner(str(sent), nlp)
        entidades=list(itertools.combinations(entidades, 2))
        macro_entidades.extend(entidades)

nodes=[x[0] for x in Counter(list(sum(macro_entidades, ()))).most_common(25)]
nodes=[x for x in nodes if len(x)>4]
nodes

In [None]:
# Export
#JSON dump of entities_by_doc (dict with chapters and sentences)
import json
with open('encpos_entities_by_doc.json', "w", encoding='utf8') as f:
    json.dump(entities_by_doc, f, indent=2, ensure_ascii=False)


In [None]:
#Loading libraries and packages

# nltk for stopwords and punct?
import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords
stop_words = stopwords.words('french')

from sklearn.metrics.pairwise import cosine_similarity

import numpy as np
import pandas as pd
import itertools
#import wikipedia
import json



In [None]:
#installing Flair, spacy, model languages and sentence transformers. This can take a while
#remember delete the displayed information after installing
#don't forget to switch to a GPU environment

# Bert
!pip install sentence-transformers
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('distiluse-base-multilingual-cased-v1')#multilingual model for keywork extraction, text summarization and sentence transformation (0.7 PR)

#!pip install Flair
#!pip install wikipedia

In [None]:
!nvidia-smi

# Vectors dicts and Models

In [None]:
# Bert
!pip install sentence-transformers
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('distiluse-base-multilingual-cased-v1')#multilingual model for keywork extraction, text summarization and sentence transformation (0.7 PR)

### Keywords vectors (`keywords_vectors`)

---



#### Download

In [None]:
import numpy as np
!wget https://github.com/chartes/encpos_similarities/raw/master/models/encpos_keywords_vectors.npz
keywords_vectors_dump = np.load('encpos_keywords_vectors.npz')
keywords_vectors = {doc_id:keywords_vectors_dump[doc_id] for doc_id in keywords_vectors_dump.files}

In [None]:
print(keywords_vectors[test_doc_id])

#### Compute

In [None]:
# DEBUG VJ pour keywords
# store keywords vector for each doc // LONG!

import sys
from collections import Counter

count=0
keywords_vectors = {}  # dict pour stoker le vecteur des keywords pour chaque doc

for doc_id in docs_structured.keys():
  keywords_vectors[doc_id]=model.encode([keywords_by_doc[doc_id]])

  count+=1
  if count%900==0:
    print(count)

In [None]:
keywords_vectors[test_doc_id]

In [None]:
# Export
np.savez('encpos_keywords_vectors', **keywords_vectors)


In [None]:
# utile où ???
full_text_keywords = list(keywords_by_doc.values())

## Entities vectors

TODO : expliquer que cette représentation du texte ne sert finalement plus par la suite

### Download

In [None]:
import numpy as np
!wget https://github.com/chartes/encpos_similarities/raw/master/models/encpos_entities_vectors.npz
entities_vectors_dump = np.load('encpos_entities_vectors.npz')
entities_vectors = {doc_id:entities_vectors_dump[doc_id] for doc_id in entities_vectors_dump.files}

In [None]:
entities_vectors[test_doc_id]

### Compute

In [None]:
# DEBUG VJ pour entities – valider avec ST
# store entities vector for each doc

from collections import Counter
entities_vectors={} # dict pour stcoker le vecteur des entités pour chaque doc

count=0

entities_dict={entity:i for i, entity in enumerate(Counter([y for x in entities_by_doc.values() for y in x]).keys())} # expliquer
# print(entities_dict) # des occs à 0 – souhaité ?

for doc_id in docs_structured.keys():
  ents=[entities_dict[x] for x in entities_by_doc[doc_id]] # mv entities entites_by_doc chargé depuis la source json (on ne calcule plus)
  entities_vectors[doc_id]=ents

  count+=1
  if count%900==0:
    print(count)

# entities_vectors

In [None]:
entities_vectors[test_doc_id]

In [None]:
# Export
import numpy as np
np.savez('encpos_entities_vectors', **entities_vectors)

## Documents vectors

### Download

In [None]:
import numpy as np
!wget https://github.com/chartes/encpos_similarities/raw/master/models/encpos_document_vectors.npz
document_vectors_dump = np.load('encpos_document_vectors.npz')
document_vectors = {doc_id:document_vectors_dump[doc_id] for doc_id in document_vectors_dump.files}

In [None]:
document_vectors[test_doc_id]

### Compute

In [None]:
# DEBUG VJ pour document_vectors – valider avec ST

from collections import Counter

count=0
document_vectors={}

for doc_id in docs_structured.keys():
  bloc = ""
  for k,v in docs_structured[doc_id].items():
    if k!="metadata":
      bloc+=v
    # print(v)
  doc_embedding = model.encode([bloc])
  document_vectors[doc_id]=doc_embedding

  count+=1
  if count%900==0:
    print(count)

In [None]:
document_vectors.keys()

In [None]:
# Export
np.savez('encpos_document_vectors', **document_vectors)

## Doc2Vec (Optional)

### Download (`d2v_model`)

In [None]:
from gensim.models.doc2vec import Doc2Vec
d2v_model= Doc2Vec.load("https://github.com/chartes/encpos_similarities/raw/master/models/encpos_doc2vec.model")

### Train

In [None]:
# Doc2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')
nltk.download("stopwords")
from nltk.corpus import stopwords
stop_words = stopwords.words('french')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
#transforming data
data={k:"".join(list(v.values())).replace("=  ", "") for k,v in docs_structured.items()}
data
tagged_data = [TaggedDocument(words=[x for x in word_tokenize(v.lower()) if x not in stop_words], tags=[str(k)]) for k,v in data.items()]

In [None]:
#new method to calculate TaggedDocuments
data={k:"".join([" ".join(z) for y,z in v.items() if y!="metadata" and len(z)>1]) for k,v in docs_structured_sents.items()}
#data=[[k,"-"] if len(v)<50 else [k,v]]
data={k:"-" if len(v)<50 else v for k,v in data.items()}
tagged_data = [TaggedDocument(words=[x for x in word_tokenize(v.lower()) if x not in stop_words], tags=[str(k)]) for k,v in data.items()]

In [None]:
#modeling vectors, this can take a while
max_epochs = 20
vec_size = 30
alpha = 0.025

model_d2v = Doc2Vec(vector_size=vec_size,
                alpha=alpha, 
                min_alpha=0.00025,
                min_count=1,
                dm =1, window=10, workers=4)
  
model_d2v.build_vocab(tagged_data)

for epoch in range(max_epochs):
    print('iteration {0}'.format(epoch))
    model_d2v.train(tagged_data,
                total_examples=model_d2v.corpus_count,
                epochs=model_d2v.iter)
    # decrease the learning rate
    model_d2v.alpha -= 0.0002
    # fix the learning rate, no decay
    model_d2v.min_alpha = model_d2v.alpha

print("end modelization")

In [None]:
# Export du modèle
model_d2v.save("encpos_doc2vec.model")

## Memo Sergio

In [None]:
#transforming each entities list and each text position into a vector
#THIS IS A MANDATORY STEP AND CAN TAKE SEVERAL MINUTES

from collections import Counter
entities_vectors={}
document_vectors={}
keyword_vectors={}


count=0

entities_dict={entity:i for i, entity in enumerate(Counter([y for x in entities.values() for y in x]).keys())}

full_text=[]
full_text_keys=[]

for item in entities.keys():
  bloc=""
  keyword_bloc=""
  #ents=np.array([entities_dict[x] for x in entities[item]], ndmin=2)#2D array
  ents=[entities_dict[x] for x in entities[item]]
  for k,v in docs_structured[item].items(): # pourquoi pas sur docs_structured_sents ???
    if k!="metadata":
      bloc+=v #join all chapters content without the title
      
      if len(v)>1:
        #print(key_extractor(v))
        try:
          keyword_bloc+=" ".join(key_extractor(v))+" "
          
        except:
          continue
  full_text.append(bloc) # on en fait quoi ? Pourquoi ?
  full_text_keys.append(keyword_bloc)

  doc_embedding = model.encode([bloc])
  document_vectors[item]=doc_embedding
  entities_vectors[item]=ents
  keyword_vectors[item]=model.encode([keyword_bloc])
  count+=1
  if count%900==0:
    print(count)

  #print(doc_embedding)

# Similarities

In [None]:
#cosine function to compute vectors similarity
from sklearn.metrics.pairwise import cosine_similarity

def cos_similarity(a,b):
  
  cos_sim=cosine_similarity(a, b, dense_output=False).tolist()[0][0]
  return cos_sim

## Keywords vectors

In [66]:
#similarity using KEYWORDS vectors
search_term=keywords_vectors[test_doc_id]

similar_terms=[[k, cosine_similarity(search_term, v)[0][0], docs_structured[k]["metadata"].split("creator:")[1] ] for k,v in keywords_vectors.items() if len(docs_structured[k])>1]
similar_terms=sorted(similar_terms, key = lambda x: x[1], reverse=True)[:20]
for x in similar_terms:
  if x[0]!=test_doc_id:
    shared_keywords = [y for y in set(keywords_by_doc[test_doc_id].split()) if y in set(keywords_by_doc[x[0]].split()) ]
    #x.append(sorted(shared_keywords)) # ajout des keywords à la liste
    print(*x, sep='\t')
    print('\t\t', shared_keywords, '\n')

ENCPOS_1990_17	0.84728587	 Françoise Simeray date: 1990 title: Le scriptorium et la bibliothèque de l’abbaye Saint-Amand    
		 ['bibliothécaire', 'abbé', 'auteur', 'manuscrits', 'manuscrit', 'écriture', 'bibliothèques', 'libris', 'liturgiques', 'ouvrages', 'saint', 'abbaye', 'emplacement', 'église', 'bibliothèque', 'bibliques', 'siècle', 'documents'] 

ENCPOS_2015_12	0.8383621	 Hélène Jacquemard date: 2015 title: L’abbaye cistercienne de Vauclair et sa bibliothèque. Lire et écrire dans une abbaye cistercienne du Moyen Âge au xviiie siècle    
		 ['abbé', 'manuscrits', 'manuscrit', 'médiéval', 'libris', 'liturgiques', 'ouvrages', 'abbaye', 'livres', 'église', 'bibliothèque', 'clairvaux', 'liste', 'fonds', 'abbés', 'documents'] 

ENCPOS_1969_09	0.794045	 Marie-Pierre Laffitte-Pochat date: 1969 title: La bibliothèque et le scriptorium de Saint-Thierry de Reims    
		 ['vatican', 'textes', 'abbé', 'médiévales', 'manuscrits', 'manuscrit', 'écriture', 'inventaires', 'lettres', 'libris', 'li

## Entities lists

In [64]:
# Similarity using ENTITIES vectors
cherche=entities_by_doc[test_doc_id]
similar_docs_number = 10
similar_docs = []

for doc_id in docs_structured.keys():
  try:
    coeff=(2*len(list(set(cherche) & set(entities_by_doc[doc_id]))))/(len(cherche)+len(entities_by_doc[doc_id]))
    if test_doc_id == doc_id:
      continue
    if coeff>0.05:
      similar_docs.append([round(coeff,7), doc_id, entities_by_doc[doc_id]])
  except:
    continue

similar_docs.sort(key=lambda x:x[0], reverse=True)
#similar_docs[0:similar_docs_number]
for l in similar_docs[0:similar_docs_number]:
  print(l[1], l[0], docs_structured[l[1]]["metadata"].split("creator:")[1], '\n\t\t', set(l[2]), '\n')

ENCPOS_1922_01 0.1428571  André Barroux date: 1922 title: Essai sur le guet ordinaire à Paris. Son organisation de 1364 à 1559. Son évocation depuis la réforme de 1559 jusqu’à l’établissement des « cavaliers de l’ordonnance » en 1666     
		 {'Colbert', 'Édit du mois', 'Bureau de la ville', 'Delamare', 'Paris', 'roi Jean', 'Traité de la Police'} 

ENCPOS_1986_15 0.1395349  Caroline Obert-Piketty date: 1986 title: Les maîtres et étudiants du collège Saint-Bernard de Paris de 1224 à 1494     
		 {'abbé de Clairvaux', 'Jean de Cirey', 'Moyen Age', 'Benoît XII', 'Paris', 'Pontigny', 'Innocent IV', 'Cisterciens', 'collège Saint-Bernard', 'Toulouse', 'Cîteaux', 'Estella', 'Occident', 'Montpellier', 'Jean Tolet', 'Oxford', 'Morimond', 'Clairvaux', 'Étienne de Lexington', 'Belgique', 'Alphonse de Poitiers', 'ordre de Cîteaux'} 

ENCPOS_1970_11 0.1363636  Denis Escudier date: 1970 title: Le scriptorium de Saint-Vaast d’Arras des origines au xiie siècle. Contribution à l’étude des notations neum

## Documents

In [None]:
#similarity using DOCUMENT vectors
searched_doc_vec=document_vectors[test_doc_id]
similar_docs = []
'''
for k,v in document_vectors.items():
  cos=cosine_similarity(searched_doc_vec, v)
  if cos>0.45:
    print(k, "\t", cos[0], "\t", docs_structured[k]["metadata"].split("creator:")[1])
'''
similar_docs=[[k, cosine_similarity(searched_doc_vec, v)[0][0], docs_structured[k]["metadata"].split("creator:")[1] ] for k,v in document_vectors.items() if len(docs_structured[k])>1]
similar_docs=sorted(similar_docs, key = lambda x: x[1], reverse=True)[:10]
for x in similar_docs:
  if x[0]!=test_doc_id: # todo
    print(*x, sep='\t')



ENCPOS_1986_07	0.58974755	 Thibaut Girard date: 1986 title: Le Traité du droit et du comportement des armes de Du Cange. Édition critique et commentaire    
ENCPOS_2008_12	0.54115784	 Stéphanie Deprouw date: 2008 title: Un héritage des Bonaparte : Le prix du galvanisme (1802-1815) et le prix Volta (1852-1888). L’État et l’encouragement à la recherche sur l’électricité    
ENCPOS_1975_13	0.5291034	 Anita Guerreau-Jalabert date: 1975 title: Grammaire et culture profane à Fleury au xe siècle. Les Quaestiones grammaticales d’Abbon de Fleury    
ENCPOS_1981_18	0.50414956	 Annick Notter date: 1981 title: Le culte des saints en Sologne aux xixe et xxe siècles    
ENCPOS_1998_18	0.49594107	 Isabelle Homer date: 1998 title: Médecins et chirurgiens à Saint-Domingue au xviiie siècle    
ENCPOS_1997_37	0.4866734	 Nicolas Roche date: 1997 title: Les armoiries imaginaires des personnages de l’Antiquité, de l’Orient et de la Bible (xiie-xviie siècle)    
ENCPOS_2011_01	0.47874796	 Justine Ancelin dat

## Doc2Vec

In [None]:
#similarity using Doc2Vec model
searched_doc = test_doc_id
similar_docs = d2v_model.docvecs.most_similar(searched_doc, topn=15)
for x in similar_docs:
  try:
    print(x[0], round(x[1], 5), docs_structured[x[0]]["metadata"].split("creator:")[1])
  except:
    print(x)

ENCPOS_1933_13 0.46514  Régine Pernoud date: 1933 title: Essai sur le port de Marseille des origines à la fin du xiiie siècle    
ENCPOS_1971_12 0.46075  Alain Guerreau date: 1971 title: Une ville et ses finances : Mâcon    
ENCPOS_1994_04 0.37603  Michelle Bubenicek date: 1994 title: Le pouvoir au féminin. Une princesse en politique et son entourage : Yolande de Flandre, comtesse de Bar et dame de Cassel (1326-1395)    
ENCPOS_1999_35 0.37088  Elsa Marguin date: 1999 title: L’Ars lectoria Ecclesie de Jean de Garlande. Étude, édition et traduction    
ENCPOS_1971_15 0.3624  Danielle Jacquart date: 1971 title: Un médecin parisien du xve siècle, Jacques Despars (1380-1458)    
ENCPOS_1971_25 0.35464  Jean-Claude Schmitt date: 1971 title: L’Église et les clercs face aux béguines et aux beghards du Rhin supérieur du xive siècle au xve siècle    
ENCPOS_2000_08 0.33392  Olivier Canteaut date: 2000 title: Philippe V et son Conseil : le gouvernement royal de 1316 à 1322    
ENCPOS_2003_19 0.3

## Search by term

In [74]:
cherche=model.encode(["couleur"]) # musique, amour, mort, charte, animal, sport, couleur
related_docs = []

related_docs=[[k, cosine_similarity(cherche, v)[0], docs_structured[k]["metadata"].split("creator:")[1] ] for k,v in document_vectors.items() if len(docs_structured[k])>1]
related_docs=sorted(related_docs, key = lambda x: x[1], reverse=True)[:20]
for x in related_docs:
  print(*x, sep='\t')


ENCPOS_1991_08	[0.3652228]	 Sylvie Fayet date: 1991 title: L’expression de la couleur dans les textes littéraires latins du xiie siècle. Contribution au lexique et éléments d’un imaginaire    
ENCPOS_1995_35	[0.25018558]	 Inès Villela-Petit date: 1995 title: La peinture médiévale vers 1400, autour d’un manuscrit de Jean Lebègue. Édition des Libri colorum    
ENCPOS_2017_07	[0.17699459]	 Julie Duprat date: 2017 title: Présences noires à Bordeaux : passage et intégration des gens de couleur à la fin du xviiie siècle    
ENCPOS_2015_17	[0.10069939]	 Thomas Morel date: 2015 title: Quand un prélat monte à la capitale. Tristan de Salazar et l’hôtel parisien des archevêques de Sens    
ENCPOS_1922_11	[0.09992971]	 Henry Joly date: 1922 title: L’expédition de la Corse (1553-1559). Épisode de la rivalité franco-espagnole dans la Méditerranée occidentale    
ENCPOS_1964_08	[0.09765615]	 Lise Dupouy date: 1964 title: Les protestants de Florac de la révocation de l’Édit de Nantes à l’Édit de tolér

## Search by number of common entities




In [77]:
# Search by docs, using shared entities
cherche=entities_by_doc[test_doc_id]
related_docs = []

for doc_id in entities_by_doc.keys():
  try:
    a=[x for x in cherche if x in entities_by_doc[doc_id]]
    related_docs.append([doc_id, len(a), a, docs_structured[doc_id]["metadata"].split("creator:")[1]])
  except:
    continue

related_docs=sorted(related_docs, key = lambda x: x[1], reverse=True)[:20]   
for x in related_docs:
  if test_doc_id!=x[0]:
    print(*x, sep='\t')

ENCPOS_1901_02	3	['Paris', 'Baluze', 'Colbert']	 René Bonnat date: 1901 title: Nicolas de La Reynie, premier lieutenant de police    
ENCPOS_1965_11	3	['Paris', 'Cîteaux', 'Baluze']	 Odile Tran-Thuan-Biausse date: 1965 title: Recueil des plus anciens actes de l’abbaye d’Yerres (1132-1265)    
ENCPOS_1970_11	3	['Paris', 'Cîteaux', 'saint Jérôme']	 Denis Escudier date: 1970 title: Le scriptorium de Saint-Vaast d’Arras des origines au xiie siècle. Contribution à l’étude des notations neumatiques du nord de la France    
ENCPOS_1986_15	3	['Clairvaux', 'Paris', 'Cîteaux']	 Caroline Obert-Piketty date: 1986 title: Les maîtres et étudiants du collège Saint-Bernard de Paris de 1224 à 1494    
ENCPOS_1953_16	3	['Paris', 'Cîteaux', 'saint Bernard']	 Jean-François Maurel date: 1953 title: Jean Beleth et la Summa de ecclesiasticis officiis    
ENCPOS_1973_12	3	['saint Ambroise', 'saint Jérôme', 'Hugues de Saint-Victor']	 Cécile Eymard date: 1973 title: Les Marguerites Hystorialles de Jehan Massue.

## Search by number of common keywords

In [76]:
# Search by docs, using shared keywords
cherche=set(keywords_by_doc[test_doc_id].split()) # On dédoublonne ou pas ?
keywords_by_doc_tokenized = {}
related_docs = []

for doc_id in keywords_by_doc.keys():
  keywords_by_doc_tokenized[doc_id] = set(keywords_by_doc[doc_id].split())
  try:
    a=[x for x in cherche if x in keywords_by_doc_tokenized[doc_id]]
    related_docs.append([doc_id, len(a), docs_structured[doc_id]["metadata"].split("creator:")[1], a])
  except:
    continue

related_docs=sorted(related_docs, key = lambda x: x[1], reverse=True)[:20]   
for x in related_docs:
  if test_doc_id!=x[0]:
    print(*x, sep='\t')

ENCPOS_1969_09	22	 Marie-Pierre Laffitte-Pochat date: 1969 title: La bibliothèque et le scriptorium de Saint-Thierry de Reims    	['vatican', 'textes', 'abbé', 'médiévales', 'manuscrits', 'manuscrit', 'écriture', 'inventaires', 'lettres', 'libris', 'liturgiques', 'ouvrages', 'catalogue', 'saint', 'abbaye', 'église', 'bibliothèque', 'liste', 'fonds', 'siècle', 'abbés', 'bible']
ENCPOS_1975_14	18	 Catherine Gutowski date: 1975 title: Le traité De Avaricia extrait de la Summa de Viciis de Guillaume Peyraut. Édition critique partielle et commentaire    	['médiévale', 'biblique', 'auteur', 'manuscrits', 'médiéval', 'bibliothèques', 'littéraire', 'œuvres', 'littérature', 'auteurs', 'ouvrages', 'catalogue', 'église', 'bibliothèque', 'bibliophile', 'bibliques', 'siècle', 'bible']
ENCPOS_2001_18	18	 Karine Rebmeister date: 2001 title: La bibliothèque médiévale du collège des Cholets    	['médiévale', 'manuscrits', 'sources', 'edition', 'médiéval', 'paris', 'bibliothèques', 'libris', 'liturgique

In [None]:
# Search by docs, using shared keywords // IDEM SANS DÉDOUBLONNER (DISCUTER AVEC ST)
cherche=keywords_by_doc[test_doc_id].split()
keywords_by_doc_tokenized = {}
related_docs = []

for doc_id in keywords_by_doc.keys():
  keywords_by_doc_tokenized[doc_id] = keywords_by_doc[doc_id].split()
  try:
    a=[x for x in cherche if x in keywords_by_doc_tokenized[doc_id]]
    related_docs.append([doc_id, len(a), docs_structured[doc_id]["metadata"].split("creator:")[1], a])
  except:
    continue

related_docs=sorted(related_docs, key = lambda x: x[1], reverse=True)[:20]   
for x in related_docs:
  if test_doc_id!=x[0]:
    print(*x, sep='\t')

ENCPOS_2013_16	25	 Guy Mayaud date: 2013 title: L’érudition héraldique au xviie siècle : la question des origines des armoiries    	['étude', 'héraldique', 'bibliothèque', 'armoiries', 'armoiries', 'objets', 'monuments', 'armoiries', 'armoiries', 'médiévale', 'armoiries', 'armoiries', 'armoiries', 'armes', 'armoiries', 'armes', 'armoiries', 'armes', 'armoiries', 'héraldiques', 'armes', 'armoiries', 'armoiries', 'étude', 'héraldique']
ENCPOS_1997_37	25	 Nicolas Roche date: 1997 title: Les armoiries imaginaires des personnages de l’Antiquité, de l’Orient et de la Bible (xiie-xviie siècle)    	['héraldique', 'armoiries', 'armoriaux', 'armoiries', 'armoriaux', 'armoiries', 'lion', 'armoiries', 'armoiries', 'armoiries', 'lion', 'armoiries', 'armes', 'armoiries', 'armes', 'armoiries', 'lion', 'armes', 'armoiries', 'symboliques', 'armes', 'armoiries', 'armoiries', 'héraldique', 'historien']
ENCPOS_2020_06	23	 Élisabeth Charron date: 2020 title: Le lion et la couronne. Les Estouteville, le roi

# Evaluation

## Keywords similarities matrix

In [None]:
# Calculate keywords similarities matrix 3000 X 3000

matrix_keywords={}
liste_encpos=sorted(list(docs_structured.keys())) # trier pour que les matrices aient la même structure

for i, pos in enumerate(liste_encpos):
  matrix_keywords[pos]=[]
  for pos_b in liste_encpos:
    if pos!=pos_b:
      matrix_keywords[pos].append([pos_b, cos_similarity(keywords_vectors[pos], keywords_vectors[pos_b])])
  if i%500==0:
    print(i)

0
500
1000
1500
2000
2500


In [None]:
# Export keywords_similarities_matrix
import json
with open('keywords_similarities_matrix.json', 'w', encoding='utf8') as f:
    json.dump(matrix_keywords, f, indent=2, ensure_ascii=False)

## Document similarities matrix

In [None]:
# Calculate document similarities matrix 3000x3000

matrix_docs={}
liste_encpos=sorted(list(docs_structured.keys()))

for i, pos in enumerate(liste_encpos):
  matrix_docs[pos]=[]
  for pos_b in liste_encpos:
    if pos!=pos_b:
          matrix_docs[pos].append([pos_b, cos_similarity(document_vectors[pos], document_vectors[pos_b])])
  if i%500==0:
    print(i)

0
500
1000
1500
2000
2500


In [None]:
# Export document_similarities_matrix
import json
with open('document_similarities_matrix.json', 'w', encoding='utf8') as f:
    json.dump(matrix_docs, f, indent=2, ensure_ascii=False)

## Doc2Vec similarities matrix

In [None]:
# Calculate Doc2Vec similarities matrix 3000x3000 // TEST

matrix_d2v={}
liste_encpos=sorted(list(docs_structured.keys()))

for i, pos in enumerate(liste_encpos):
  matrix_d2v[pos]=[]
  for pos_b in liste_encpos:
    if pos!=pos_b:
      coeff=cos_similarity(d2v_model.docvecs[pos].reshape(1, -1), d2v_model.docvecs[pos_b].reshape(1, -1))
      matrix_d2v[pos].append([pos_b, coeff])
  if i%500==0:
    print(i)

0
500
1000
1500
2000
2500


In [None]:
import json
with open('doc2vec_similarities_matrix.json', 'w', encoding='utf8') as f:
    json.dump(matrix_d2v, f, indent=2, ensure_ascii=False)

In [None]:
d2v_model.docvecs

In [None]:
# Calculate Doc2Vec similarities matrix 3000x3000 // REJET ne fonctionne pas à discuter (debug ci-dessus) / valider avec ST

matrix_d2v={}
liste_encpos=list(docs_structured.keys())

for i, pos in enumerate(liste_encpos):
  for pos_b in liste_encpos:
    if pos!=pos_b:
      coeff=cos_similarity(d2v_model.docvecs[pos].reshape(1, -1), d2v_model.docvecs[pos_b].reshape(1, -1))
      for ii, x in enumerate(matrix_d2v[pos]): # ICI, POURQUOI ?
        if pos_b in x[0]:
          matrix_d2v[pos][ii]=matrix_d2v[pos][ii]+[coeff]
  if i%500==0:
    print(i)

In [None]:
# test
cos_similarity(model_d2v.docvecs["ENCPOS_2000_10"].reshape(1, -1), model_d2v.docvecs["ENCPOS_2000_09"].reshape(1, -1))

0.1885162740945816

## Mesure de la déviation entre les scores de similarité

all vs all scheme

In [21]:
#all vs all comparison, we get a list with the absolute difference between each two similarity scores (all vs all) calculated for each method (keywords, documents and doc2vec)
from statistics import mean, median, variance, stdev

def similarities_methods_comparison(matrix_a, matrix_b):
  all2all = lambda a, b : [abs(item[1]-b[k][i][1]) for k,v in a.items() for i,item in enumerate(v)] 
  all2all = all2all(matrix_a, matrix_b) #pass two matrix dictionaries as args
  print('mean: {0:.3f}'.format(mean(all2all)))
  print('median: {0:.3f}'.format(median(all2all)))
  print('var: {0:.3f}'.format(variance(all2all)))
  print('stdev: {0:.3f}'.format(stdev(all2all)))


In [None]:
!wget https://github.com/chartes/encpos_similarities/raw/master/evaluation/keywords_similarities_matrix.json
keywords_similarities_matrix = {}
with open('/content/keywords_similarities_matrix.json', 'r') as json_file:
  keywords_similarities_matrix = json.load(json_file)

In [None]:
!wget https://github.com/chartes/encpos_similarities/raw/master/evaluation/document_similarities_matrix.json
document_similarities_matrix = {}
with open('/content/document_similarities_matrix.json', 'r') as json_file:
  document_similarities_matrix = json.load(json_file)

In [None]:
!wget https://github.com/chartes/encpos_similarities/raw/master/evaluation/doc2vec_similarities_matrix.json
doc2vec_similarities_matrix = {}
with open('/content/doc2vec_similarities_matrix.json', 'r') as json_file:
  doc2vec_similarities_matrix = json.load(json_file)

In [22]:
# keywords vs doc2vec
print('keywords vs doc2vec')
similarities_methods_comparison(keywords_similarities_matrix,
                                doc2vec_similarities_matrix)

keywords vs doc2vec
mean: 0.177
median: 0.154
var: 0.017
stdev: 0.129


In [23]:
# document (distilbert) vs doc2vec
print('document (distilbert) vs doc2vec')
similarities_methods_comparison(document_similarities_matrix,
                                doc2vec_similarities_matrix)

document (distilbert) vs doc2vec
mean: 0.163
median: 0.141
var: 0.014
stdev: 0.119


In [24]:
# keywords vs document (distilbert)
print('keywords vs document (distilbert)')
similarities_methods_comparison(keywords_similarities_matrix,
                                document_similarities_matrix)

keywords vs document (distilbert)
mean: 0.091
median: 0.076
var: 0.005
stdev: 0.070


# Rejets / mémo

### entity linking wikipedia

In [None]:
# SKIP
#entity linking by using french and english wikipedia 
for node in nodes:
  try:
    wikipedia.set_lang("fr")
    x=wikipedia.page(node, auto_suggest=False)
    print(node, x.url)
  except (wikipedia.exceptions.DisambiguationError) as e:
    try:
      options=e.options
      #x=[y for y in x if any(z in wikipedia.summary(y, auto_suggest=False) for z in nodes)]
      print(options)
    except:
      continue
  except wikipedia.PageError:
    wikipedia.set_lang("en")
    try:
      x=wikipedia.page(node, auto_suggest=False)
      print(node, x.url)
    except (wikipedia.exceptions.DisambiguationError, wikipedia.PageError) as e:
      try:
        options=e.options
        #x=[y for y in x if any(z in wikipedia.summary(y, auto_suggest=False) for z in nodes)]
        print(options)
      except:
        continue

In [None]:
docs_structured[test_doc_id]['metadata']

'identifier: ENCPOS_1972_18 creator: Michel Pastoureau date: 1972 title: Le bestiaire héraldique au Moyen Âge    '

In [None]:
cos_similarity(document_vectors["ENCPOS_2011_08"], document_vectors["ENCPOS_1988_10"])

0.14194782078266144

In [46]:
# TEST (SUPPRIMER) /// Similarity using keywords lists
cherche=keywords_by_doc[test_doc_id]
similar_docs_number = 10
similar_docs = []

for doc_id in docs_structured.keys():
  try:
    coeff=(2*len(list(set(cherche) & set(keywords_by_doc[doc_id]))))/(len(cherche)+len(keywords_by_doc[doc_id]))
    if test_doc_id == doc_id:
      continue
    if coeff>0.05:
      similar_docs.append([coeff, doc_id, keywords_by_doc[doc_id]])
  except:
    continue

similar_docs.sort(key=lambda x:x[0], reverse=True)
#similar_docs[0:similar_docs_number]
for l in similar_docs[0:similar_docs_number]:
  print(*l[0:3])

In [None]:
# SUPPRIMER ? Sergio
cherche=model.encode(["animal"])
related_docs = []

for k,v in document_vectors.items():
  cos=cosine_similarity(cherche, v)
  if cos>0.1:
    # issues to fix with docs metadata.
    try:
      print(k, "\t", cos[0], "\t", docs_structured[k]["metadata"].split("creator:")[1])
    except:
      continue