In [431]:
import json
import pandas as pd
from lxml import etree as etree
import re

#  SQL queries

```sql
-- Expressions in Hefte

SELECT expr.pk_entity, expr.entity_label
FROM war.entity_preview expr
WHERE expr.fk_class = 218
AND   expr.fk_project = 924033
AND  expr.entity_label ~'Heft'
ORDER BY expr.pk_entity;


-- tous les textes avec carnets et sections, sans les textes - seulement metadonnées
SELECT DISTINCT expr.pk_entity id_manuscript,
       expr.entity_label title_manuscript,
       st.fk_subject_info AS id_section,
       sect.entity_label title_section,
       sect.fk_type,
      max(sect.full_text),
       st1.fk_subject_data id_digital,
       dita.entity_version text_version,
       dita.pk_text id_text
FROM war.entity_preview expr
  JOIN war.statement st ON st.fk_object_info = expr.pk_entity
  JOIN war.entity_preview sect ON st.fk_subject_info = sect.pk_entity
  JOIN information.statement st1 ON st1.fk_object_info = sect.pk_entity
  JOIN data.digital dita ON dita.pk_entity = st1.fk_subject_data
  JOIN projects.info_proj_rel t3 ON t3.fk_entity = st1.pk_entity
WHERE expr.fk_class = 218
AND   expr.fk_project = 924033
AND   expr.entity_label ~ 'Heft'
AND   st.fk_project = 924033
AND   st.fk_property = 1317
AND   t3.fk_project = 924033
AND   t3.is_in_project IS TRUE
AND   st1.fk_property = 1216
GROUP BY  expr.pk_entity,
       expr.entity_label,
       st.fk_subject_info,
       sect.entity_label,
       sect.fk_type,
       st1.fk_subject_data,
       dita.entity_version,
       dita.pk_text,
       dita.quill_doc
       ORDER BY expr.pk_entity,
         st.fk_subject_info
--LIMIT 10
;

-- tous les textes, avec carnets et sections, avec les textes 
SELECT DISTINCT expr.pk_entity id_manuscript,
       expr.entity_label title_manuscript,
       st.fk_subject_info AS id_section,
       sect.entity_label title_section,
       sect.fk_type,
      max(sect.full_text),
       st1.fk_subject_data id_digital,
       dita.entity_version text_version,
       dita.pk_text id_text,
       dita.quill_doc
FROM war.entity_preview expr
  JOIN war.statement st ON st.fk_object_info = expr.pk_entity
  JOIN war.entity_preview sect ON st.fk_subject_info = sect.pk_entity
  JOIN information.statement st1 ON st1.fk_object_info = sect.pk_entity
  JOIN data.digital dita ON dita.pk_entity = st1.fk_subject_data
  JOIN projects.info_proj_rel t3 ON t3.fk_entity = st1.pk_entity
WHERE expr.fk_class = 218
AND   expr.fk_project = 924033
AND   expr.entity_label ~ 'Heft'
AND   st.fk_project = 924033
AND   st.fk_property = 1317
AND   t3.fk_project = 924033
AND   t3.is_in_project IS TRUE
AND   st1.fk_property = 1216
GROUP BY  expr.pk_entity,
       expr.entity_label,
       st.fk_subject_info,
       sect.entity_label,
       sect.fk_type,
       st1.fk_subject_data,
       dita.entity_version,
       dita.pk_text,
       dita.quill_doc
       ORDER BY expr.pk_entity,
         st.fk_subject_info 
--LIMIT 10
;      
```

# Texts

In [698]:
texts_f = 'texte/all_texts_20220115.csv'

In [699]:
texts = pd.read_csv(texts_f, sep='#')
print(len(texts))
texts.head()

781


Unnamed: 0,id_manuscript,title_manuscript,id_section,title_section,fk_type,max,id_digital,text_version,id_text,quill_doc
0,835550,Heft 1 (1795),835573,23.11.1795,,"Section – Name: '23.11.1795', is part of: '(no...",24651936,10,23381914,"{""ops"": [{""insert"": ""["", ""attributes"": {""chari..."
1,835550,Heft 1 (1795),835641,27.11.1795,,"Section – Name: '27.11.1795', is part of: '(no...",24651950,5,23381935,"{""ops"": [{""insert"": ""["", ""attributes"": {""chari..."
2,835550,Heft 1 (1795),835651,28.11.1795,,"Section – Name: '28.11.1795', is part of: '(no...",24651951,8,23381937,"{""ops"": [{""insert"": ""["", ""attributes"": {""chari..."
3,835550,Heft 1 (1795),835662,10.12.1795,,"Section – Name: '10.12.1795', is part of: '(no...",24651952,6,23381939,"{""ops"": [{""insert"": ""["", ""attributes"": {""chari..."
4,835618,Heft 2 (1796),835671,XX.XX.1796,,"Section – Name: 'XX.XX.1796', is part of: '(no...",24651953,6,23381941,"{""ops"": [{""insert"": ""["", ""attributes"": {""chari..."


In [700]:
def f_split (a):    
    try:
        out = [e.strip() for e in a.split(',') if 'has type' in e][0].split(':')[1]
    except:
        out = ''
    return out
    

In [701]:
texts['section_type'] = texts['max'].apply(lambda x : f_split(x))

In [702]:
texts.head()

Unnamed: 0,id_manuscript,title_manuscript,id_section,title_section,fk_type,max,id_digital,text_version,id_text,quill_doc,section_type
0,835550,Heft 1 (1795),835573,23.11.1795,,"Section – Name: '23.11.1795', is part of: '(no...",24651936,10,23381914,"{""ops"": [{""insert"": ""["", ""attributes"": {""chari...",'Tagebucheintrag'
1,835550,Heft 1 (1795),835641,27.11.1795,,"Section – Name: '27.11.1795', is part of: '(no...",24651950,5,23381935,"{""ops"": [{""insert"": ""["", ""attributes"": {""chari...",'Tagebucheintrag'
2,835550,Heft 1 (1795),835651,28.11.1795,,"Section – Name: '28.11.1795', is part of: '(no...",24651951,8,23381937,"{""ops"": [{""insert"": ""["", ""attributes"": {""chari...",'Tagebucheintrag'
3,835550,Heft 1 (1795),835662,10.12.1795,,"Section – Name: '10.12.1795', is part of: '(no...",24651952,6,23381939,"{""ops"": [{""insert"": ""["", ""attributes"": {""chari...",'Tagebucheintrag'
4,835618,Heft 2 (1796),835671,XX.XX.1796,,"Section – Name: 'XX.XX.1796', is part of: '(no...",24651953,6,23381941,"{""ops"": [{""insert"": ""["", ""attributes"": {""chari...",'Tagebucheintrag'


## Analyse textes

In [703]:
### Effectifs sections-textes par cahier
ts = texts.groupby('id_manuscript').size()   ##.count()
print(len(ts))
ts

32


id_manuscript
835550     4
835618     7
870956     1
871001     9
871145    19
872613     1
872634     6
872738     4
872829     5
872914    19
874182    65
884918    18
891345    27
891868    23
892262    14
897087    28
897353    30
897614    30
897895     3
897928    31
898223    28
898485    25
906998    45
907422    30
907703    26
907947    33
908256    36
908597    49
908609    49
908620    44
908630    35
908642    37
dtype: int64

In [704]:
### Liste des manuscrits (= cahiers, 'Hefte')
list(ts.index)

[835550,
 835618,
 870956,
 871001,
 871145,
 872613,
 872634,
 872738,
 872829,
 872914,
 874182,
 884918,
 891345,
 891868,
 892262,
 897087,
 897353,
 897614,
 897895,
 897928,
 898223,
 898485,
 906998,
 907422,
 907703,
 907947,
 908256,
 908597,
 908609,
 908620,
 908630,
 908642]

### Dictionnaire des textes : texts_li

In [705]:
texts_li = texts.to_dict(orient='records')
texts_li[0]

{'id_manuscript': 835550,
 'title_manuscript': 'Heft 1 (1795)',
 'id_section': 835573,
 'title_section': '23.11.1795',
 'fk_type': nan,
 'max': "Section – Name: '23.11.1795', is part of: '(no label)', has type: 'Tagebucheintrag', was created by: '23.11.1795'",
 'id_digital': 24651936,
 'text_version': 10,
 'id_text': 23381914,
 'quill_doc': '{"ops": [{"insert": "[", "attributes": {"charid": "2767"}}, {"insert": "2", "attributes": {"charid": "20"}}, {"insert": "3", "attributes": {"charid": "21"}}, {"insert": ".", "attributes": {"charid": "22"}}, {"insert": " ", "attributes": {"charid": "23"}}, {"insert": "N", "attributes": {"charid": "24"}}, {"insert": "o", "attributes": {"charid": "25"}}, {"insert": "v", "attributes": {"charid": "26"}}, {"insert": "e", "attributes": {"charid": "27"}}, {"insert": "m", "attributes": {"charid": "28"}}, {"insert": "b", "attributes": {"charid": "29"}}, {"insert": "e", "attributes": {"charid": "30"}}, {"insert": "r", "attributes": {"charid": "31"}}, {"insert

### Dictionnaire des cahiers : hefte_li

In [706]:
hefte_li = texts[['id_manuscript','title_manuscript']].drop_duplicates().to_dict(orient='records')
hefte_li

[{'id_manuscript': 835550, 'title_manuscript': 'Heft 1 (1795)'},
 {'id_manuscript': 835618, 'title_manuscript': 'Heft 2 (1796)'},
 {'id_manuscript': 870956, 'title_manuscript': 'Heft 3 (1796)'},
 {'id_manuscript': 871001, 'title_manuscript': 'Heft 4 (1796)'},
 {'id_manuscript': 871145, 'title_manuscript': 'Heft 5 (1797)'},
 {'id_manuscript': 872613, 'title_manuscript': 'Heft 6 (1797)'},
 {'id_manuscript': 872634, 'title_manuscript': 'Heft 7 (1798)'},
 {'id_manuscript': 872738, 'title_manuscript': 'Heft 8 (1798)'},
 {'id_manuscript': 872829, 'title_manuscript': 'Heft 9 (1801)'},
 {'id_manuscript': 872914, 'title_manuscript': 'Heft 10 (1807-1810)'},
 {'id_manuscript': 874182, 'title_manuscript': 'Heft 11 (1810-1815)'},
 {'id_manuscript': 884918, 'title_manuscript': 'Heft 12 (1815-1816)'},
 {'id_manuscript': 891345, 'title_manuscript': 'Heft 13 (1816-1817)'},
 {'id_manuscript': 891868, 'title_manuscript': 'Heft 14 (1816-1817)'},
 {'id_manuscript': 892262, 'title_manuscript': 'Heft 15 (181

### Doublons

Parfois il y a deux textes pour la même section

In [707]:
texts[texts['id_section'] == 871027]

Unnamed: 0,id_manuscript,title_manuscript,id_section,title_section,fk_type,max,id_digital,text_version,id_text,quill_doc,section_type
13,871001,Heft 4 (1796),871027,15.02.1797,,"Section – Name: '15.02.1797', is part of: '(no...",24657454,3,23387970,"{""ops"": [{""insert"": ""["", ""attributes"": {""chari...",'Tagebucheintrag'
14,871001,Heft 4 (1796),871027,15.02.1797,,"Section – Name: '15.02.1797', is part of: '(no...",24657457,3,23387973,"{""ops"": [{""insert"": ""["", ""attributes"": {""chari...",'Tagebucheintrag'


In [708]:
### Effectifs sections-textes par cahier
ts = texts.groupby('id_section').size()   ##.count()
print(len(ts))
s = ts[ts > 1]
s

775


id_section
871027    2
884961    3
891460    2
891508    2
892343    2
dtype: int64

## Exemple de texte

In [114]:
data = json.loads(texts_li[0]['quill_doc'])
print(type(data), data)

<class 'dict'> {'ops': [{'insert': '[', 'attributes': {'charid': '2767'}}, {'insert': '2', 'attributes': {'charid': '20'}}, {'insert': '3', 'attributes': {'charid': '21'}}, {'insert': '.', 'attributes': {'charid': '22'}}, {'insert': ' ', 'attributes': {'charid': '23'}}, {'insert': 'N', 'attributes': {'charid': '24'}}, {'insert': 'o', 'attributes': {'charid': '25'}}, {'insert': 'v', 'attributes': {'charid': '26'}}, {'insert': 'e', 'attributes': {'charid': '27'}}, {'insert': 'm', 'attributes': {'charid': '28'}}, {'insert': 'b', 'attributes': {'charid': '29'}}, {'insert': 'e', 'attributes': {'charid': '30'}}, {'insert': 'r', 'attributes': {'charid': '31'}}, {'insert': ' ', 'attributes': {'charid': '32'}}, {'insert': '1', 'attributes': {'charid': '33'}}, {'insert': '7', 'attributes': {'charid': '34'}}, {'insert': '9', 'attributes': {'charid': '35'}}, {'insert': '5', 'attributes': {'charid': '36'}}, {'insert': ']', 'attributes': {'charid': '2765'}}, {'insert': '\n', 'attributes': {'blockid'

In [115]:
insert_l = data['ops']
insert_l[:10]

[{'insert': '[', 'attributes': {'charid': '2767'}},
 {'insert': '2', 'attributes': {'charid': '20'}},
 {'insert': '3', 'attributes': {'charid': '21'}},
 {'insert': '.', 'attributes': {'charid': '22'}},
 {'insert': ' ', 'attributes': {'charid': '23'}},
 {'insert': 'N', 'attributes': {'charid': '24'}},
 {'insert': 'o', 'attributes': {'charid': '25'}},
 {'insert': 'v', 'attributes': {'charid': '26'}},
 {'insert': 'e', 'attributes': {'charid': '27'}},
 {'insert': 'm', 'attributes': {'charid': '28'}}]

In [116]:
type(insert_l)

list

In [117]:
print(''.join([l['insert'] for l in insert_l]))

[23. November 1795]

[3]

95 11 23
Wer mir die für mich bittere Wahrheit 
sagt, ohne den geringsten Eigenuz dabei zu 
haben, der muß Achtung und Freundschaft für mich 
haben, und ich kan ihm die meinige, ohne Un-
gerechtigkeit nicht versagen. Ich wäre der 
Achtung nicht werth wen ich diese Wahrheiten 
nicht scharf zu Herzen nehme und nicht trachtete 
die mir vorgeworfenen Fehler an mir zu 
bessern. Es wird schwer halten, obschon ich 
den besten Willen dazu habe.
Ich hätte einen allzu strengen sittlichen Stolz 
der mich, bei der geringsten Bemerkung eines 
Fehlers an Andern, hindere, das Gute zu bemerken, 
das an ihnen ist, und mich von den 
Menschen zurückzöge, und in Fall setze ungerecht gegen sie
zu sein. Es mag vieles daran wahr sein, 
doch glaube ich daß auch vieles Misverstand dabei 
einlaufe.
Der Anlas war gewiß mein Betragen gegen S. aber 
gewiß ich schätz diesen Menschen finde viel edles 
an ihm. Die Ursache warum ich mich <und die Meinigen> von ihm so viel möglich entferne, si

# Chunks

In [719]:
chunks_f = 'texte/all_chunks_20220116.csv'

In [720]:
chunks = pd.read_csv(chunks_f, sep='#')

print(len(chunks))
chunks.head()

4318


Unnamed: 0,id_manuscript,id_section,id_text,id_entity,entity_label,fk_class,class_label,pk_entity,schema_name,table_name,...,metadata,pk_text,quill_doc,string,fk_text,fk_entity_version,id_for_import,id_for_import_txt,fk_publication_status,fk_license
0,835550,835573,23381914,873964,Selbstkritik,635,Tag,24659470,data,chunk,...,,23390137,"{""ops"": [{""insert"": ""F"", ""attributes"": {""chari...",Fehler an mir zu \nbessern,23381914,6,,,,
1,835550,835573,23381914,873957,Freundschaft,635,Tag,24659471,data,chunk,...,,23390138,"{""ops"": [{""insert"": ""F"", ""attributes"": {""chari...",Freundschaft,23381914,6,,,,
2,835550,835573,23381914,873838,Unbekannte Person,635,Tag,24679027,data,chunk,...,,23411801,"{""ops"": [{""insert"": ""S"", ""attributes"": {""chari...",S.,23381914,8,,,,
3,835550,835573,23381914,874090,Personenbeschreibung,635,Tag,24679028,data,chunk,...,,23411802,"{""ops"": [{""insert"": ""s"", ""attributes"": {""chari...",seine Herkunft,23381914,9,,,,
4,835550,835573,23381914,874226,Gedanken zur Gesellschaft,635,Tag,24679029,data,chunk,...,,23411803,"{""ops"": [{""insert"": ""S"", ""attributes"": {""chari...",Stand,23381914,9,,,,


## Analyse des fréquences

In [149]:
### Fréquence par manuscrit

chunks.groupby('id_manuscript').size()

id_manuscript
835550     22
835618     45
870956      3
871001     75
871145     79
872613      5
872634     30
872738     13
872829     21
872914     98
874182    347
884918    119
891345    107
891868     90
892262     72
897087    202
897353    180
897614    200
897895     10
897928    243
898485    190
906998    299
907422    199
907703    143
907947    194
908256    246
908597    281
908609    222
908620    261
908630    183
908642    139
dtype: int64

In [96]:
### Liste des entités (et non des mentions)
distinct_chunks = chunks.drop_duplicates(['id_entity','entity_label','fk_class','class_label'])[['id_entity','entity_label','fk_class','class_label']]
distinct_chunks[:3]

Unnamed: 0,id_entity,entity_label,fk_class,class_label
0,873964,Selbstkritik,635,Tag
1,873957,Freundschaft,635,Tag
2,873838,Unbekannte Person,635,Tag


### Dictionnaire des entités : distinct_chunks_li



In [721]:
distinct_chunks_li = distinct_chunks.to_dict(orient='records')
distinct_chunks_li[0]

{'id_entity': 884071,
 'entity_label': 'Wetter',
 'fk_class': 635,
 'class_label': 'Tag'}

In [68]:
### Effectifs des classes présentes dans les entités
classes = distinct_chunks.groupby('class_label').size()   ##.count()
print(len(classes))
classes

4


class_label
Geographical Place            3
Person                      284
Serially Produced Source    176
Tag                          98
dtype: int64

In [72]:
### Effectif des individus mentionnées par classe
mention_classes = chunks.groupby('class_label').size()   ##.count()
mention_classes

class_label
Geographical Place            12
Person                      1169
Serially Produced Source     261
Tag                         2876
dtype: int64

In [89]:
### Moyenne mention par individu de classe

cla_men = pd.DataFrame(mention_classes).merge(pd.DataFrame(classes), on = 'class_label')
cla_men.columns =['mentions', 'freq']

def func(x):
    return x[0] / x[1]

cla_men['mean'] = cla_men.apply(func, axis = 1)
cla_men

Unnamed: 0_level_0,mentions,freq,mean
class_label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Geographical Place,12,3,4.0
Person,1169,284,4.116197
Serially Produced Source,261,176,1.482955
Tag,2876,98,29.346939


In [90]:
mention_entities = chunks[chunks['class_label'].str.contains("Pla")].groupby('entity_label').size()
print(len(mention_entities))
mention_entities.sort_values(ascending = False).to_dict()

3


{'Basel CH': 10, 'Binningen CH': 1, 'Neuenburg CH': 1}

In [91]:
mention_entities = chunks[chunks['class_label'].str.contains("Tag")].groupby('entity_label').size()
print(len(mention_entities))
mention_entities.sort_values(ascending = False).to_dict()

98


{'Religion': 167,
 'Unbekannte Person': 159,
 'Altersgebrechen': 123,
 'Gedanken über den Tod': 120,
 'Besuch': 120,
 'Alter(n)': 111,
 'Gedanken zur Gesellschaft': 92,
 'Ortserwähnung': 91,
 'Schmerz/Krankheit Familie': 91,
 'Selbstreflexion': 89,
 'Ereignis': 79,
 'Gedanken zur Erziehung': 77,
 'Rückblick': 69,
 'Personenbeschreibung': 67,
 'Mädchenschule?': 63,
 'Kantonstrennung': 63,
 'Geschäfte': 63,
 'Trübe Stimmung': 61,
 'Selbstkritik': 53,
 'Heiterkeit': 53,
 'Selbstzeugnis schreiben': 49,
 'Über das weibliche Geschlecht': 49,
 'Brief': 45,
 'Philosophie': 44,
 'Trennung': 40,
 'Ausbildung': 38,
 'Stadt-Land': 37,
 'Tätigkeiten': 34,
 'Gut St. Apollinaris bei Michelbach': 34,
 'Krankheit': 34,
 'Wetter': 33,
 'Sorgen': 30,
 'Bauvorhaben': 29,
 'Umzug': 28,
 'Trauer': 28,
 'Krankheit anderer Personen': 25,
 'Gedanke zur Politik': 25,
 'Reisen': 25,
 'Gedanken zur Ehe': 24,
 'Unnütz/Untätig': 24,
 'Tod': 22,
 'Freundschaft': 22,
 'Ratschlag an die Kinder': 20,
 'Alter(n) anderer

In [93]:
[r['id_entity'] for i,r in distinct_chunks.iterrows()][10:15]

[874019, 910645, 874029, 835869, 835771]

# TEI document production

## Header

In [709]:
for h in hefte_li[:1]:
    print(h['id_manuscript'], h['title_manuscript'])

835550 Heft 1 (1795)


In [710]:
### Liste des entités (et non des mentions)
distinct_chunks = chunks[chunks['id_manuscript'] == tei_xml_id].drop_duplicates(['id_entity','entity_label','fk_class','class_label'])[['id_entity','entity_label','fk_class','class_label']]
len(distinct_chunks), distinct_chunks[-3:]

(59,
       id_entity                entity_label  fk_class class_label
 4310     922744   Friedrich Hoffmann-Merian        21      Person
 4313     868665  Emanuel Hoffmann-Preiswerk        21      Person
 4317     868701  Juliane Paravicini-Vischer        21      Person)

### Fonction qui prépare la partie listes du header



In [722]:
distinct_chunks_li = distinct_chunks.to_dict(orient='records')
distinct_chunks_li[0]

{'id_entity': 884071,
 'entity_label': 'Wetter',
 'fk_class': 635,
 'class_label': 'Tag'}

In [712]:
def tei_header_lists(chunks, tei_xml_id):
    
    header_result = []
    persons = ''
    objects = ''
    places = ''

    distinct_chunks = chunks[chunks['id_manuscript'] == tei_xml_id]\
        .drop_duplicates(['id_entity','entity_label','fk_class','class_label'])\
        [['id_entity','entity_label','fk_class','class_label']]
        
    
    
    for i,r in distinct_chunks.iterrows():
        if r['fk_class'] == 21:
            hr = '<person xml:id="i' + str(r['id_entity']) + '"><persName>' + \
            r['entity_label'] + '</persName></person>'
            persons += hr
            
        elif r['fk_class'] == 363 :
            hr = '<place xml:id="i' + str(r['id_entity']) + '"><placeName>' + \
            r['entity_label'] + '</placeName></place>'
            places += hr

        else:
            if r['fk_class'] == 635:
                ty = 'tag'
            elif r['fk_class'] == 219:
                ty = 'book_etc'
            else :
                pass    
            hr = '<object xml:id="i' + str(r['id_entity']) + '" type="' + ty + '"><objectIdentifier><objectName>' + \
            r['entity_label'] + '</objectName></objectIdentifier></object>'
            objects += hr
            

    lper = ''
    if len(persons) > 0:
        lper = '<listPerson>' + persons + '</listPerson>'

    lob = ''
    if len(objects) > 0:
        lob = '<listObject>' + objects + '</listObject>'
        
    lpla = ''    
    if len(places) > 0:
        lpla = '<listPlace>' + places + '</listPlace>'
        
    header_result = lper + lob + lpla
    
    return header_result

## Principe

* Pour chaque caractère on vérifie s'il est dans une balise sémantique, puis on applique un traitement spécifique
* Les balises sont fermées correctement mais si l'emboitement n'est pas correct selon l'arboréscence XML le résultat sera XML valide mais non correct par rapport au sens du codage original

In [713]:
### Fonction qui retourne l'intersection entre deux listes, 
#  i.e. les éléments communs
def intersection(lst1, lst2):
    lst3 = [value for value in lst1 if value in lst2]
    return lst3

### Fonction qui retourne la différence entre deux listes, 
#  i.e. les éléments qui sont dans l'une mais pas dans l'autre, et réciproquement
def difference(lst1, lst2):
    lst3 = list(set(lst1) ^ set(lst2))
    return lst3

In [714]:
difference([5,1,7,9,6], [3,6,9,8,23]),intersection([5,1,7,9,6], [3,6,9,8,23])

([1, 3, 7, 5, 23, 8], [9, 6])

### Fonction qui prépare les divs

In [715]:
p = re.compile('(\[)(.*)(\])')

In [716]:
p1 = re.compile('\[.*\]')

In [541]:
p2 = re.compile('\[\d+\]')

In [542]:
r = p1.findall(d)
r

['[23. November 1795]', '[3]']

In [655]:
d = """
<p>[23. November 1795]</p>
<p>[3]</p>
alpha beta
"""

In [657]:
p2 = re.compile('(<p>\[)(\d+)(\]</p>)')

r = p2.findall(d)

for i in r:
    rep = '<pb n="' + r[0][1]+  '"/>'
    d = d.replace(''.join(i), rep)
    
print(d)    



<pb n="3"/>
alpha beta



In [658]:
p1 = re.compile('<p>\[.{5,25}\]</p>')

r = p1.findall(d)

for i in r:
    print(i)
    d = d.replace(i, '')
    
print(d)    



<pb n="3"/>
alpha beta



In [730]:
def divs_text(tei_xml_id, texts, chunks):
    
    divs = ''
    
    ### Sélection des sections de ce manuscrit
    div_texts_li = texts[texts['id_manuscript'] == tei_xml_id].to_dict(orient='records')
    print(len(div_texts_li))

    
    ### Boucle de traitement de chaque section, un texte = 't'
    for t in div_texts_li:
        
        output_text = ''
        
        sect_title = '<head resp="#editor">'+ t['section_type'].strip(' \'') +'<date> ' + t['title_section'] + ' </date></head>'
        

        
        section_id = t['id_section']
        text_id = t['id_text']
        
        data = json.loads(t['quill_doc'])
        insert_chars = data['ops']
        
        chun_li = chunks[chunks['id_text'] == t['id_text']].to_dict(orient='records')
        new_chunks = [[i['id_entity'], i['entity_label'], i['fk_class'], i['class_label'], \
                       [i[0] for i in[  (list(e['attributes'].values())) \
                                    for e in json.loads(i['quill_doc'])['ops']]]] \
                                   for i in chun_li]


        
        output = []
        tags_l = []

        len_string = 0

        output.append('<p>')

        for l in insert_chars:   #[206:400]:

            pre_char = ''
            post_char =''
            local_tags_l = []

            try:
                a = l['attributes']['charid']
                # print('–––\n', a)

                if a in ['\xa0']:
                    pass
                else:
                    for c in new_chunks:

                        if c[2] == 219:
                            ty = 'book_etc'
                        elif c[2] == 21:
                            ty = 'person'
                        elif c[2] == 363:
                            ty = 'place'    
                        else :
                            ty = 'object'
                            
                            
                        ### si un charactère est dans un chunk et pas encore dans la liste des chuncks
                        # ajouter à la liste et ajouter ouverture élém. 'rs'
                        if a in c[4] and c[0] not in tags_l: 
                            #  print(a + ' : ' + str(c[0])) 
                            pre_char = pre_char + '<rs ref="#i' + str(c[0]) + '" type="'+ ty +'">'
                            tags_l.append(c[0])
                            local_tags_l.append(c[0])
                            #  print('new:', tags_l)
                        elif a in c[4] and c[0] in tags_l and c[0] not in local_tags_l:
                            local_tags_l.append(c[0])
                            pass
                        else:
                            pass

                    ## list difference   
                    diff_l = list(set(tags_l) ^ set(local_tags_l))
                    #  print(diff_l)
                    post_char = '</rs>' * len(diff_l)

                    ## list intersection
                    tags_l = intersection(tags_l, local_tags_l)
                    #  print(tags_l)


                    if l['insert'] == '<' :
                        char = '&lt;'
                    elif l['insert'] == '>' : 
                        char = '&gt;'
                    elif l['insert'] == '&' : 
                        char = '&amp;'    
                    else:
                        char = l['insert']

                    output.append(pre_char + post_char + char)
                    len_string += 1



            except:
                # print('b', l['attributes']['blockid'])
                if len_string > 4 :
                    output.append('<lb/>')  #  l['insert']
                    len_string = 0
                else:
                    if len(tags_l) > 0:
                        output.append('</rs>' * len(tags_l))
                        tags_l = []
                    output.append('</p><p>')
                    len_string = 0


        if len(tags_l) > 0:
            output.append('</rs>' * len(tags_l))
            tags_l = []
        output.append('</p>')    
        
        output_text = '<div xml:id="d' + str(text_id) + '" corresp="d' + str(section_id) + '" >' \
                    + sect_title + (''.join(output).replace('<lb/></p>', '</p>')) + '</div>'
        
       

        divs += output_text.replace('<p></p>','')\
                    .replace('{', '<supplied>').replace('}', '</supplied>')\
                    .replace('&lt;', '<del>').replace('&gt;', '</del>')
        
        
        p2 = re.compile('(<p>\[)(\d+)(\]</p>)')
        r = p2.findall(divs)
        for i in r:
            rep = '<pb n="' + i[1]+  '"/>'
            divs = divs.replace(''.join(i), rep)    

            
        p1 = re.compile('<p>\[.{5,35}\]</p>')
        r = p1.findall(divs)
        for i in r:
            divs = divs.replace(i, '')            
            
        
        try:
            x_doc = etree.fromstring(output_text)
            result_list.append('[' + str(tei_xml_id) + '] ' + str(text_id) + ' : ok !')
            doc_list.append(tei_xml_id)
        except Exception as e:
            result_list.append(str(tei_xml_id) + '–' + str(section_id) + t['title_section'] + " : error !")
            result_list.append(e)
                                              
    return divs
                                              

In [732]:
doc_list = []
result_list = []
for h in hefte_li:  #[4:5]:   [:1]
    
    tei_document = ''
    tei_xml_id = h['id_manuscript']

    
    ##  <?xml version="1.0" encoding="UTF-8"?>
    
    tei_document = """<?xml version="1.0"?>
    <?xml-model href="http://www.tei-c.org/release/xml/tei/custom/schema/relaxng/tei_all.rng" type="application/xml" schematypens="http://relaxng.org/ns/structure/1.0"?>    
    <TEI xmlns="http://www.tei-c.org/ns/1.0" xml:id="d""" + str(tei_xml_id) + """">
       <teiHeader>
          <fileDesc>
             <titleStmt>
                <title>""" + h['title_manuscript'] +  """
                </title>
                <author>Anna Maria Preiswerk-Iselin</author>
             </titleStmt>
             <publicationStmt>
                <publisher xml:id="editor">Tagebücher Anna Maria Preiswerk-Iselin Projekt</publisher>
                <pubPlace>Basel</pubPlace>
                <date when="2022-03">March 2021</date>
                <availability>
                   <licence notBefore="2022-03-01" target="http://creativecommons.org/licenses/by-sa/4.0/">
                      <p>[...]</p>
                      <p>This work is licensed under a Creative Commons Attribution-ShareAlike 4.0 International License.</p>
                      <p> You are free to: <list>
                            <item> Share — copy and redistribute the material in any medium or format</item>
                            <item> Adapt — remix, transform, and build upon the material</item>
                         </list>
                      </p>
                      <p> Under the following terms: <list>
                            <item> Attribution — You must give appropriate credit, provide a link to the license, and indicate if changes were made. You
                               may do so in any reasonable manner, but not in any way that suggests the licensor endorses you or your use.</item>
                            <item> ShareAlike — If you remix, transform, or build upon the material, you must distribute your contributions under the same
                               license as the original.</item>
                            <item> No additional restrictions — You may not apply legal terms or technological measures that legally restrict others from
                               doing anything the license permits.</item>
                         </list>
                      </p>
                      <p>The licence was added on March first, 2022.</p>
                   </licence>
                </availability>
             </publicationStmt>
             <sourceDesc>
                <msDesc>
                   <msIdentifier>
                      <settlement>Basel</settlement>
                      <repository>Staatsarchiv</repository>
                      <idno>StABS PA 511 304-03-04</idno>
                      <msName>Heft 12 (1815-1816) - Anna Maria Preiswerk-Iselin </msName>
                   </msIdentifier>
                </msDesc>
    """  + tei_header_lists(chunks, tei_xml_id) + """
    </sourceDesc>
      </fileDesc>
      <profileDesc>
         <textClass>
            <!-- vérifier !!! -->
            <catRef scheme="#genre" target="#journal"/>
         </textClass>
         <langUsage>
            <language ident="de">Deutsch</language>
         </langUsage>
      </profileDesc>
   </teiHeader>
       <text><body>""" + divs_text(tei_xml_id, texts, chunks) + """
       </body></text>
   </TEI>
    """
    
    try:
        x_doc = etree.fromstring(tei_document)
        result_list.append(str(tei_xml_id) + ' : -> file ok !')
        doc_list.append(tei_xml_id)
    except Exception as e:
        result_list.append(str(tei_xml_id) + " : -> file error !")
        result_list.append(e)
        
        
    file_name = 'tei_files/text_i' + str(tei_xml_id) + '.xml'
    
    with open(file_name, 'w') as f:
        f.write(tei_document)

        
        
logs = 'texte/logs.txt'
logs_content = '\n'.join(result_list)
with open(logs, 'w') as f:
    f.write(logs_content)

4
7
1
9
19
1
6
4
5
19
65
18
27
23
14
28
30
30
3
31
28
25
45
30
26
33
36
49
49
44
35
37


In [196]:
len(doc_list)

32

# Text

## Chunks to list : chun_li

In [308]:
chun_li = chunks.to_dict(orient='records')
type(chun_li), len(chun_li)

(list, 4318)

In [309]:
chun_li[0]

{'id_manuscript': 835550,
 'id_section': 835573,
 'id_text': 23381914,
 'id_entity': 873964,
 'entity_label': 'Selbstkritik',
 'fk_class': 635,
 'class_label': 'Tag',
 'pk_entity': 24659470,
 'schema_name': 'data',
 'table_name': 'chunk',
 'entity_version': 1,
 'notes': nan,
 'fk_namespace': 24650661,
 'fk_creator': nan,
 'fk_last_modifier': nan,
 'tmsp_creation': '2020-05-05 13:29:02',
 'tmsp_last_modification': '2020-05-05 13:29:02',
 'sys_period': '["2020-05-05 13:29:02.376641+02",)',
 'metadata': nan,
 'pk_text': 23390137,
 'quill_doc': '{"ops": [{"insert": "F", "attributes": {"charid": "809"}}, {"insert": "e", "attributes": {"charid": "810"}}, {"insert": "h", "attributes": {"charid": "811"}}, {"insert": "l", "attributes": {"charid": "812"}}, {"insert": "e", "attributes": {"charid": "813"}}, {"insert": "r", "attributes": {"charid": "814"}}, {"insert": " ", "attributes": {"charid": "815"}}, {"insert": "a", "attributes": {"charid": "816"}}, {"insert": "n", "attributes": {"charid": "8

In [326]:
j = json.loads(chun_li[0]['quill_doc'])
type(j)

dict

In [311]:
j['ops']

[{'insert': 'F', 'attributes': {'charid': '809'}},
 {'insert': 'e', 'attributes': {'charid': '810'}},
 {'insert': 'h', 'attributes': {'charid': '811'}},
 {'insert': 'l', 'attributes': {'charid': '812'}},
 {'insert': 'e', 'attributes': {'charid': '813'}},
 {'insert': 'r', 'attributes': {'charid': '814'}},
 {'insert': ' ', 'attributes': {'charid': '815'}},
 {'insert': 'a', 'attributes': {'charid': '816'}},
 {'insert': 'n', 'attributes': {'charid': '817'}},
 {'insert': ' ', 'attributes': {'charid': '818'}},
 {'insert': 'm', 'attributes': {'charid': '819'}},
 {'insert': 'i', 'attributes': {'charid': '820'}},
 {'insert': 'r', 'attributes': {'charid': '821'}},
 {'insert': ' ', 'attributes': {'charid': '822'}},
 {'insert': 'z', 'attributes': {'charid': '823'}},
 {'insert': 'u', 'attributes': {'charid': '824'}},
 {'insert': '\xa0', 'attributes': {'charid': '825'}},
 {'insert': '\n', 'attributes': {'blockid': '826'}},
 {'insert': 'b', 'attributes': {'charid': '827'}},
 {'insert': 'e', 'attribut

In [386]:
new_chunks = [[i['id_entity'], i['entity_label'], i['fk_class'], i['class_label'], \
                                    [i[0] for i in[  (list(e['attributes'].values())) \
                                    for e in json.loads(i['quill_doc'])['ops']]]] for i in chun_li]
print(new_chunks)

[[873964, 'Selbstkritik', 635, 'Tag', ['809', '810', '811', '812', '813', '814', '815', '816', '817', '818', '819', '820', '821', '822', '823', '824', '825', '826', '827', '828', '829', '830', '831', '832', '833']], [873957, 'Freundschaft', 635, 'Tag', ['579', '580', '581', '582', '583', '584', '585', '586', '587', '588', '589', '590']], [873838, 'Unbekannte Person', 635, 'Tag', ['1296', '1297']], [874090, 'Personenbeschreibung', 635, 'Tag', ['1996', '1997', '1998', '1999', '2000', '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010']], [874226, 'Gedanken zur Gesellschaft', 635, 'Tag', ['2540', '2541', '2542', '2543', '2544']], [873957, 'Freundschaft', 635, 'Tag', ['153', '154', '155', '156', '157', '158', '159', '160', '161', '162', '163', '164']], [873936, 'Philosophie', 635, 'Tag', ['1274', '1275', '1276', '1277', '1278', '1279', '1280', '1281', '1282', '1283', '1284', '1285', '1286', '1287', '1288', '1289']], [873990, 'Selbstreflexion', 635, 'Tag', ['577',

In [383]:
new_chunks[:1]

[[873964,
  'Selbstkritik',
  635,
  'Tag',
  [['809'],
   ['810'],
   ['811'],
   ['812'],
   ['813'],
   ['814'],
   ['815'],
   ['816'],
   ['817'],
   ['818'],
   ['819'],
   ['820'],
   ['821'],
   ['822'],
   ['823'],
   ['824'],
   ['825'],
   ['826'],
   ['827'],
   ['828'],
   ['829'],
   ['830'],
   ['831'],
   ['832'],
   ['833']]]]

In [67]:
insert_l[:300]

[{'insert': '[', 'attributes': {'charid': '11620'}},
 {'insert': '1', 'attributes': {'charid': '11621'}},
 {'insert': '4', 'attributes': {'charid': '11622'}},
 {'insert': '.', 'attributes': {'charid': '11623'}},
 {'insert': ' ', 'attributes': {'charid': '11624'}},
 {'insert': 'M', 'attributes': {'charid': '11625'}},
 {'insert': 'a', 'attributes': {'charid': '11626'}},
 {'insert': 'i', 'attributes': {'charid': '11627'}},
 {'insert': ' ', 'attributes': {'charid': '11628'}},
 {'insert': '1', 'attributes': {'charid': '11629'}},
 {'insert': '8', 'attributes': {'charid': '11630'}},
 {'insert': '1', 'attributes': {'charid': '11631'}},
 {'insert': '5', 'attributes': {'charid': '11632'}},
 {'insert': ']', 'attributes': {'charid': '11633'}},
 {'insert': '\n', 'attributes': {'blockid': '11618'}},
 {'insert': '\n', 'attributes': {'blockid': '11619'}},
 {'insert': '[', 'attributes': {'charid': '9'}},
 {'insert': '1', 'attributes': {'charid': '10'}},
 {'insert': ']', 'attributes': {'charid': '11'}},

In [73]:
output = []
tags_l = []

len_string = 0

output.append('<p>')

for l in insert_l:   #[206:400]:
    
    pre_char = ''
    post_char =''
    local_tags_l = []
    
    try:
        a = l['attributes']['charid']
        print('–––\n', a)
        
        if a in ['\xa0']:
            pass
        else:
            for c in new_chunks:
                
                if c[2] == 219:
                    ty = 'book_etc'
                elif c[2] == 21:
                    ty = 'person'
                elif c[2] == 363:
                    ty = 'place'    
                else :
                    ty = 'object'

                if a in c[4] and c[0] not in tags_l: # and stop == 0:
                    print(a + ' : ' + str(c[0])) 
                    pre_char = pre_char + '<rs ref="#i' + str(c[0]) + '" type="'+ ty +'">'
                    tags_l.append(c[0])
                    local_tags_l.append(c[0])
                    print('new:', tags_l)
                elif a in c[4] and c[0] in tags_l and c[0] not in local_tags_l:
                    local_tags_l.append(c[0])
                    pass
                else:
                    pass

            ## list difference   
            diff_l = list(set(tags_l) ^ set(local_tags_l))
            print(diff_l)
            post_char = '</rs>' * len(diff_l)

            ## list intersection
            tags_l = intersection(tags_l, local_tags_l)
            print(tags_l)


            if l['insert'] == '<' :
                char = '&lt;'
            elif l['insert'] == '>' : 
                char = '&lt;'
            else:
                char = l['insert']

            output.append(pre_char + post_char + char)
            len_string += 1

    
                
    except:
        print('b', l['attributes']['blockid'])
        if len_string > 4 :
            output.append('<lb/>')  #  l['insert']
            len_string = 0
        else:
            output.append('</p><p>')
            len_string = 0

    
output.append('</p>')    
    
    

–––
 11620
[]
[]
–––
 11621
[]
[]
–––
 11622
[]
[]
–––
 11623
[]
[]
–––
 11624
[]
[]
–––
 11625
[]
[]
–––
 11626
[]
[]
–––
 11627
[]
[]
–––
 11628
[]
[]
–––
 11629
[]
[]
–––
 11630
[]
[]
–––
 11631
[]
[]
–––
 11632
[]
[]
–––
 11633
[]
[]
b 11618
b 11619
–––
 9
[]
[]
–––
 10
[]
[]
–––
 11
[]
[]
b 12
–––
 15
[]
[]
b 16
–––
 1968
[]
[]
–––
 11599
[]
[]
–––
 11600
[]
[]
–––
 11601
[]
[]
–––
 11602
[]
[]
–––
 11603
[]
[]
–––
 11604
[]
[]
–––
 11605
[]
[]
–––
 11606
[]
[]
–––
 11607
[]
[]
–––
 11608
[]
[]
–––
 11609
[]
[]
–––
 11610
[]
[]
–––
 11611
[]
[]
b 11613
–––
 11614
[]
[]
–––
 1969
[]
[]
–––
 1970
[]
[]
–––
 1971
[]
[]
–––
 1972
[]
[]
–––
 1973
[]
[]
–––
 1974
[]
[]
–––
 1975
[]
[]
–––
 1976
[]
[]
–––
 1977
[]
[]
–––
 1978
[]
[]
–––
 1979
[]
[]
–––
 1980
[]
[]
–––
 1981
[]
[]
–––
 1982
[]
[]
–––
 1983
[]
[]
–––
 1984
[]
[]
–––
 1985
[]
[]
–––
 1986
[]
[]
–––
 1987
[]
[]
–––
 1988
[]
[]
–––
 1989
[]
[]
–––
 1990
[]
[]
–––
 1991
[]
[]
–––
 1992
[]
[]
–––
 1993
[]
[]
–––
 1994
[]
[]
–––

In [74]:
print(''.join(output).replace('<lb/></p>', '</p>'))

<p>[14. Mai 1815]</p><p>[1]</p><p> </p><p>1815 Mai d 14 <lb/>Ich wiedersprach letzthin Jemand über eine Sache die<lb/>er glaubte im Krieg erlaubt zu seyn. Nachher<lb/>überdachte ichs nächer u ich sagt zu mir, sollte es dan<lb/>unter gesitteten <rs ref="#i884031" type="object"><rs ref="#i1966372" type="object">Nationen</rs> auch im Krieg</rs> erlaubt seyn<lb/>wehe zu thun ohne einigen Nutzen? Kaum hatte ich diesen<lb/>Gedanken heraus als ich etwas so Mistönendes<lb/>Anstössiges darin fand; ich forschte nach u fand daß<lb/>Krieg – gesittet, nicht könnten zusamen passen<lb/>es schien mir es wäre als ob ich sagte, 2 gesittete<lb/>Herren hätten sich auf der Gaße herumgerauft.<lb/>Ist der Krieg im Grund was Andres als das Rauffen<lb/>einer Menge gegen eine Menge? Sollte eine<lb/>Sache gut u recht werden weil nicht einzelne<lb/>sondern Viele sie thun? Wenn man recht nachdächte<lb/>wie viele Dinge gäben es noch welche durch alte<lb/>Gewohnheit elende Vorurtheile ausgerüstet, die<lb/>gedankenlo

## Question importante

Implémenter cette fonction en PL/pgSQL ou Python dans PostgreSQL pour produire le texte de sortie directement sur le serveur, dans une fonction PL/pgSQL par exemple, renvoyant directement le texte XML/TEI