# Animacy in German Folktales

This notebook contains the reproducible code examples and analyses for the paper *"Animacy in German Folktales"* submitted in proceedings of CHR 2024: Computational Humanities Research Conference, 2024, Aarhus, Denmark.

**Authors:** Julian Häußler, Janis von Keitz, Evelyn Gius

**Institution:** *fortext lab, Technical University of Darmstadt, Germany*

**Reference:** Häußler, J., von Keitz, J., Gius, E. (2024). *Animacy in German Folktales*. CHR 2024: Computational Humanities Research Conference, December 4 – 6, 2024, Aarhus, Denmark. https://ceur-ws.org/Vol-3834/paper90.pdf.

**GitHub Repository:** https://github.com/forTEXT/Animacy_in_German_Folktales

## Notebook 06: Grimm NER and Evaluation

In [None]:
# Import libraries

import pandas as pd
import os 
import json
import re

In [2]:
import stanza

nlp = stanza.Pipeline(lang='de', processors='tokenize,ner', tokenize_pretokenized=True)

  from .autonotebook import tqdm as notebook_tqdm
2024-07-10 22:25:18 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json: 384kB [00:00, ?B/s]                        
2024-07-10 22:25:19 INFO: Downloaded file to C:\Users\haeus\stanza_resources\resources.json
2024-07-10 22:25:20 INFO: Loading these models for language: de (German):
| Processor | Package      |
----------------------------
| tokenize  | gsd          |
| ner       | germeval2014 |

2024-07-10 22:25:20 INFO: Using device: cpu
2024-07-10 22:25:20 INFO: Loading: tokenize
2024-07-10 22:25:20 INFO: Loading: ner
2024-07-10 22:25:23 INFO: Done loading processors!


In [3]:
from nltk.tokenize import word_tokenize

In [4]:
from nltk.tokenize import sent_tokenize

In [None]:
# Import data

input_folder = '../Data/grimm_corpus'
texts = []
text_names = []
for filename in os.listdir(input_folder):
    file_path = os.path.join(input_folder, filename)
    with open(file_path, 'r', encoding='utf-8') as f:
        text = f.read()
        sents = sent_tokenize(text)
        tokens = [word_tokenize(sent) for sent in sents]
        texts.append(tokens)
        text_names.append(filename.replace('.txt', '').strip())

In [6]:
len(texts)

211

In [8]:
texts[0][:3]

[['Allerleirauh', '.'],
 ['Es',
  'war',
  'einmal',
  'ein',
  'König',
  ',',
  'der',
  'hatte',
  'eine',
  'Frau',
  'mit',
  'goldenen',
  'Haaren',
  ',',
  'und',
  'sie',
  'war',
  'so',
  'schön',
  ',',
  'daß',
  'sich',
  'ihres',
  'Gleichen',
  'nicht',
  'mehr',
  'auf',
  'Erden',
  'fand',
  '.'],
 ['Es',
  'geschah',
  ',',
  'daß',
  'sie',
  'krank',
  'lag',
  ',',
  'und',
  'als',
  'sie',
  'fühlte',
  'daß',
  'sie',
  'bald',
  'sterben',
  'würde',
  ',',
  'rief',
  'sie',
  'den',
  'König',
  'und',
  'sprach',
  '„',
  'wenn',
  'du',
  'nach',
  'meinem',
  'Tode',
  'dich',
  'wieder',
  'vermählen',
  'willst',
  ',',
  'so',
  'nimm',
  'keine',
  ',',
  'die',
  'nicht',
  'eben',
  'so',
  'schön',
  'ist',
  ',',
  'als',
  'ich',
  'bin',
  ',',
  'und',
  'die',
  'nicht',
  'solche',
  'goldene',
  'Haare',
  'hat',
  ',',
  'wie',
  'ich',
  'habe',
  ';',
  'das',
  'mußt',
  'du',
  'mir',
  'versprechen.',
  '“',
  'Nachdem',
  'es',
  'ih

In [9]:
len(texts[0])

57

In [None]:
# Perform NER

texts_ner_tags = []

for text in texts:
    doc = nlp(text)
    text_ner_tags = []

    for i, sentence in enumerate(doc.sentences):
        sent = [token.ner for token in sentence.tokens]
        text_ner_tags.append(sent)
    
    texts_ner_tags.append(text_ner_tags)

In [11]:
len(texts_ner_tags)

211

In [12]:
texts_ner_tags[0][:3]

[['S-PER', 'O'],
 ['O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O'],
 ['O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O']]

In [13]:
len(texts_ner_tags[0])

57

In [14]:
len(texts_ner_tags[1])

17

In [15]:
len(texts[0])

57

In [16]:
len(texts[1])

17

In [None]:
# Create dataframe

ner_pers_tags = ['B-PER','I-PER','O-PER','E-PER','S-PER']

In [31]:
df_grimm_ner = pd.DataFrame(columns=['title', 'absolute_PER_frequency', 'relative_PER_frequency'])

In [32]:
len(text_names)

211

In [33]:
df_grimm_ner.head()

Unnamed: 0,title,absolute_PER_frequency,relative_PER_frequency


In [46]:
for i in range(0,len(texts_ner_tags)):
    df_grimm_ner.at[i,'title'] = text_names[i]
    
    tags = [tag for sent in texts_ner_tags[i] for tag in sent]

    abs_freq = 0

    for tag in ner_pers_tags:
        abs_freq += tags.count(tag)

    df_grimm_ner.at[i,'absolute_PER_frequency'] = abs_freq

    df_grimm_ner.at[i,'relative_PER_frequency'] = abs_freq/len(texts_ner_tags[i])

In [47]:
df_grimm_ner.head()

Unnamed: 0,title,absolute_PER_frequency,relative_PER_frequency
0,Allerleirauh (1857),17,0.298246
1,Armuth und Demuth führen zum Himmel (1857),4,0.235294
2,Aschenputtel (1857),9,0.12
3,Bruder Lustig (1857),106,0.726027
4,Brüderchen und Schwesterchen (1857),7,0.111111


In [48]:
df_grimm_ner.tail()

Unnamed: 0,title,absolute_PER_frequency,relative_PER_frequency
206,Vom klugen Schneiderlein (1857),4,0.125
207,Von dem Fischer un syner Fru (1857),49,0.653333
208,Von dem Machandelboom (1857),54,0.830769
209,"Von dem Mäuschen, Vögelchen und der Bratwurst ...",3,0.130435
210,Von dem Tode des Hühnchens (1857),1,0.066667


In [None]:
# Load data

input_folder = '../Data'

In [50]:
df_grimm_animacy = pd.read_csv(os.path.join(input_folder,'grimm_corpus_animacy.csv'),index_col=0)

In [51]:
df_grimm_animacy.head()

Unnamed: 0,title,absolute_animacy_frequency,relative_animacy_frequency
0,Allerleirauh (1857),392,0.167808
1,Armuth und Demuth führen zum Himmel (1857),84,0.161228
2,Aschenputtel (1857),472,0.166902
3,Bruder Lustig (1857),769,0.162786
4,Brüderchen und Schwesterchen (1857),565,0.214096


In [52]:
df_grimm_animacy.tail()

Unnamed: 0,title,absolute_animacy_frequency,relative_animacy_frequency
206,Vom klugen Schneiderlein (1857),206,0.15969
207,Von dem Fischer un syner Fru (1857),210,0.055263
208,Von dem Machandelboom (1857),173,0.046406
209,"Von dem Mäuschen, Vögelchen und der Bratwurst ...",67,0.101979
210,Von dem Tode des Hühnchens (1857),149,0.209859


In [None]:
# Measure correllation

lst_rel_PER = df_grimm_ner['relative_PER_frequency'].to_list()

lst_rel_animacy = df_grimm_animacy['relative_animacy_frequency'].to_list()


In [55]:
len(lst_rel_PER)

211

In [56]:
len(lst_rel_animacy)

211

In [57]:
from scipy.stats import pearsonr

In [58]:
corr, _ = pearsonr(lst_rel_PER, lst_rel_animacy)
print('Pearsons correlation: %.3f' % corr)

Pearsons correlation: -0.132


In [59]:
from scipy.stats import spearmanr

In [60]:
corr, _ = spearmanr(lst_rel_PER, lst_rel_animacy)
print('Spearmans correlation: %.3f' % corr)

Spearmans correlation: -0.195


In [None]:
# Save data

import json

In [None]:
texts_token_ner = []

for i in range(0,len(texts)):
    text_tokens = texts[i]
    text_ner_tags = texts_ner_tags[i]

    text_tokens_ner = []

    for j in range(0,len(text_tokens)):
        s
        for k in r

In [62]:
zipped_texts = [
    [
        [[token, tag] for token, tag in zip(sentence, tag_sentence)]
        for sentence, tag_sentence in zip(text, tag_text)
    ]
    for text, tag_text in zip(texts, texts_ner_tags)
]

In [63]:
len(zipped_texts)

211

In [65]:
zipped_texts[0][:3]

[[['Allerleirauh', 'S-PER'], ['.', 'O']],
 [['Es', 'O'],
  ['war', 'O'],
  ['einmal', 'O'],
  ['ein', 'O'],
  ['König', 'O'],
  [',', 'O'],
  ['der', 'O'],
  ['hatte', 'O'],
  ['eine', 'O'],
  ['Frau', 'O'],
  ['mit', 'O'],
  ['goldenen', 'O'],
  ['Haaren', 'O'],
  [',', 'O'],
  ['und', 'O'],
  ['sie', 'O'],
  ['war', 'O'],
  ['so', 'O'],
  ['schön', 'O'],
  [',', 'O'],
  ['daß', 'O'],
  ['sich', 'O'],
  ['ihres', 'O'],
  ['Gleichen', 'O'],
  ['nicht', 'O'],
  ['mehr', 'O'],
  ['auf', 'O'],
  ['Erden', 'O'],
  ['fand', 'O'],
  ['.', 'O']],
 [['Es', 'O'],
  ['geschah', 'O'],
  [',', 'O'],
  ['daß', 'O'],
  ['sie', 'O'],
  ['krank', 'O'],
  ['lag', 'O'],
  [',', 'O'],
  ['und', 'O'],
  ['als', 'O'],
  ['sie', 'O'],
  ['fühlte', 'O'],
  ['daß', 'O'],
  ['sie', 'O'],
  ['bald', 'O'],
  ['sterben', 'O'],
  ['würde', 'O'],
  [',', 'O'],
  ['rief', 'O'],
  ['sie', 'O'],
  ['den', 'O'],
  ['König', 'O'],
  ['und', 'O'],
  ['sprach', 'O'],
  ['„', 'O'],
  ['wenn', 'O'],
  ['du', 'O'],
  ['nach'

In [66]:
len(zipped_texts[0])

57

In [68]:
for i in range(0,len(text_names)):

    filepath = f'../Data/grimm_corpus_annotations_ner/{text_names[i]}_annotated_ner.json'

    with open(filepath, 'w', encoding='utf-8') as f:
        json.dump(zipped_texts[i], f, ensure_ascii=False)

In [69]:
df_grimm_ner.to_csv('../Data/grimm_corpus_ner.csv', encoding='utf-8-sig')