In this notebook, we use the [document API](https://api.bnf.fr/fr/api-document-de-gallica#scroll-nav__10__2) from Gallica to get the text surrounding our illustrations. To do this, we first need to retrieve the Gallica IDs of the documents that have at least one illustration; we then query the API for each of them and convert the documents into the format needed for later topic modelling.

### Retrieving the Gallica documents

In [1]:
# Some imports
import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
from spacy.lang.fr.stop_words import STOP_WORDS as fr_stop
from spacy.lang.de.stop_words import STOP_WORDS as de_stop
from collections import Counter
import json
import langid

In [2]:
# Metadata table (one row per illustration), the same file used for Vikus Viewer
data = pd.read_csv(filepath_or_buffer="data/data.csv")

In [3]:
# A function to query the Gallica API
# And parse the response to only get the text
def from_soup_to_string(request_url, timeout=30):
    """Download a Gallica page and return the text of its <p> elements.

    The response is parsed and pretty-printed; only the markup located
    between the first two ``<hr/>`` tags is kept, and the text of every
    ``<p>`` element found there is concatenated (newlines and backslashes
    removed).

    Parameters
    ----------
    request_url : str
        URL to request (the notebook passes an IIIF link + ".texteBrut").
    timeout : float, optional
        Seconds to wait for the server before giving up, so that a single
        stalled connection cannot block a long batch of requests forever.

    Returns
    -------
    str
        The stripped, concatenated text, or "" when the page does not
        contain two ``<hr/>`` delimiters.

    Raises
    ------
    requests.exceptions.RequestException
        On connection problems or when the timeout expires.
    """
    response = requests.get(request_url, timeout=timeout) # Request API
    soup_str = BeautifulSoup(response.content, 'html.parser').prettify() # Parse response

    # The text is between <hr/> tags; str.find (unlike str.index) lets us
    # return "" instead of raising when a delimiter is missing.
    delimiter = "<hr/>"
    start = soup_str.find(delimiter)
    if start == -1:
        return ""
    start += len(delimiter)
    end = soup_str.find(delimiter, start)
    if end == -1:
        return ""
    html_text = soup_str[start:end]

    # Extract and concatenate all the texts in <p> elements
    p_texts = BeautifulSoup(html_text, 'html.parser').find_all('p')
    whole_text = "".join(p.text.replace("\n", "").replace("\\", "") for p in p_texts)
    return whole_text.strip()

In [13]:
# Select Gallica subset
# na=False: rows without an IIIF link count as "not Gallica" instead of
# producing a NaN mask that cannot be used for boolean indexing
data_gallica = data[data["_iiif-link"].str.contains("gallica", na=False)]
dict_text = dict()
dict_text_per_page = dict()
# total= lets tqdm show a real progress bar (iterrows() is a generator with no length)
for _, row in tqdm(data_gallica.iterrows(), total=len(data_gallica)):
    # Add entry in dictionary with the data entry ID and page as key, and the text as value
    # (second and third "_"-separated fields of the id, e.g. ILLU_16301_167_0 -> "16301_167")
    id_parts = row["id"].split("_")
    page_key = id_parts[1] + "_" + id_parts[2]
    dict_text_per_page[page_key] = from_soup_to_string(row["_iiif-link"] + ".texteBrut")

3499it [11:16,  5.17it/s]


In [27]:
# If there are multiple pages in the same data entry, concatenate them all
# The result is rebuilt from scratch so that re-running this cell does not
# append every page a second time.
pages_per_entry = dict()
for k, v in dict_text_per_page.items():
    pages_per_entry.setdefault(k.split("_")[0], []).append(v)

# Page texts are stripped, so they are joined with a space: otherwise the last
# word of a page would be fused with the first word of the next one.
# Entries whose pages are all empty keep an empty string.
dict_text = {
    entry_id: " ".join(page for page in pages if page)
    for entry_id, pages in pages_per_entry.items()
}

In [29]:
# Display one random row of the Gallica subset
data_gallica.sample(n=1)

Unnamed: 0,id,_description,_artist,_source,_material,_dimensions,_journal-id,_date-artwork,year,keywords,_link-dfkv,_iiif-link,_journal-name,_link-dfkv-md,_link-iiif-md
9287,ILLU_16301_167_0,,,,,,1302,,1949,"Reproduction, Large Illustration",https://dfkv.dfkg.org/ng/index.html#/records/1...,https://gallica.bnf.fr/ark:/12148/bpt6k4226334...,Cahiers d'art,[HERE](https://dfkv.dfkg.org/ng/index.html#/re...,[HERE](https://gallica.bnf.fr/ark:/12148/bpt6k...


In [40]:
# Save texts to json: first the per-entry texts, then the per-page texts
for json_path, texts in (
    ('data/complete_texts_gallica.json', dict_text),
    ('data/complete_texts_page_gallica.json', dict_text_per_page),
):
    with open(json_path, 'w') as fp:
        json.dump(texts, fp)

## Create Bag-of-Words files

Now that we have all the texts surrounding the illustrations, we can prepare the data for further use. We first clean the texts by removing the special characters listed below.

In [40]:
# Read the per-entry texts back from the JSON file
with open("./data/complete_texts_gallica.json", "r") as json_file:
    dict_text = json.loads(json_file.read())

In [20]:
# Classify between French and German texts
german = dict()
french = dict()

# Iterate over the texts loaded in dict_text: `data` is the metadata DataFrame,
# whose .items() yields columns, not documents.
for k, t in tqdm(dict_text.items()):
    flat_text = t.replace("\n", " ")
    # langid.classify returns a tuple whose first element is the language code
    language = langid.classify(flat_text)[0]
    if language == "de":
        german[k] = flat_text
    else:
        # Everything that is not detected as German is treated as French
        french[k] = flat_text

100%|██████████| 444/444 [00:03<00:00, 123.24it/s]


In [41]:
# Characters that are replaced by a space before tokenizing
to_remove = [
    "\n", "'", ",", ";", ":", ".", "!", "?", "’",
    "(", ")", "\"", "%", "#", "$", "&", "*", "+", '-',
    "[", ">", "]", "_", "”", "“", "=",
]

In [42]:
def tokenize(doc):
    """Turn a raw text into a bag of words.

    Every character listed in ``to_remove`` is replaced by a space, the text
    is lowercased and split on whitespace, and tokens are dropped when they
    are French or German stop words, are shorter than three characters, or
    contain a digit or one of ``^ • < /``.

    Parameters
    ----------
    doc : str
        The text to tokenize.

    Returns
    -------
    collections.Counter
        Mapping token -> number of occurrences in ``doc``.
    """
    # Remove the special characters
    for p in to_remove:
        doc = doc.replace(p, " ")
    # Put everything in lowercase.
    # split() without argument splits on any run of whitespace (tabs,
    # non-breaking spaces, ...) and never yields empty strings, whereas
    # split(" ") would leave such characters glued to the tokens.
    tokens = doc.lower().split()
    # Exclude stopwords and words that have a digit or weird character in them
    weird_chars = ("^", "•", "<", "/")
    tokens = [
        t for t in tokens
        if len(t) > 2
        and t not in fr_stop
        and t not in de_stop
        and not any(c in t for c in weird_chars)
        and not any(map(str.isdigit, t))
    ]
    return Counter(tokens)

In [43]:
# One token Counter per text, under the same key as in dict_text
dict_token_docs = {doc_id: tokenize(text) for doc_id, text in dict_text.items()}

In [44]:
# Vocabulary: every distinct token that appears in at least one document
vocab = set().union(*dict_token_docs.values())

In [45]:
# Turn the vocabulary into an alphabetically sorted list
vocab = sorted(vocab)

In [47]:
# Words that actually get written to the vocab document are collected here
new_vocab = list()

In [48]:
# Create vocabulary document
# Contains the list of words
# One word per line
# new_vocab is rebuilt here so that re-running the cell cannot duplicate entries
new_vocab = []
# 'w' truncates the file on open; an explicit encoding makes the output
# independent of the platform default
with open('data/vocab.dfkv.txt', 'w', encoding='utf-8') as f:
    for v in vocab:
        word = str(v)
        # Check that the word can be encoded *before* writing anything, so a
        # failure can never leave a partial word in the file
        try:
            word.encode('utf-8')
        except UnicodeEncodeError:
            continue
        f.write(word + "\n")
        new_vocab.append(v)

# Drop the words that could not be written. This is done after the loop:
# removing items from a list while iterating over it skips elements.
vocab = list(new_vocab)

In [49]:
# Create the docword document
# Word ids are the 1-based positions of the words in new_vocab.
# A dict lookup replaces new_vocab.index(w), which scanned the whole list for
# every written line; setdefault keeps the first position of a word, exactly
# like list.index would.
word_ids = dict()
for position, word in enumerate(new_vocab, start=1):
    word_ids.setdefault(word, position)

# 'w' truncates the file on open
with open('data/docword.dfkv.txt', 'w', encoding='utf-8') as f:
    total_tokens = sum(sum(c.values()) for c in dict_token_docs.values())
    # Header lines :
    # Number of documents
    # Number of unique words
    # Number of words in total
    # NOTE(review): the third line is the sum of all counts; confirm that the
    # consumer of this file does not expect the number of data lines instead.
    f.writelines([str(len(dict_token_docs.keys())) + "\n",  str(len(new_vocab))+ "\n", str(total_tokens) + "\n"])
    for k, v in tqdm(sorted(dict_token_docs.items())):
        for w, n in v.items():
            word_id = word_ids.get(w)
            if word_id is None:
                # Word absent from the vocabulary file: skip this word only,
                # not the remainder of the document
                continue
            # Each line consists of :
            # DOC_ID WORD_ID WORD_COUNT
            f.write(str(k) + " " + str(word_id) + " " + str(n) + "\n")

100%|██████████| 444/444 [02:11<00:00,  3.38it/s]
