# Animacy in German Folktales

This notebook contains the reproducible code examples and analyses for the paper *"Animacy in German Folktales"*, submitted to the proceedings of CHR 2024: Computational Humanities Research Conference, 2024, Aarhus, Denmark.

**Authors:** Julian Häußler, Janis von Keitz, Evelyn Gius

**Institution:** *fortext lab, Technical University of Darmstadt, Germany*

**Reference:** Häußler, J., von Keitz, J., Gius, E. (2024). *Animacy in German Folktales*. CHR 2024: Computational Humanities Research Conference, December 4 – 6, 2024, Aarhus, Denmark. https://ceur-ws.org/Vol-3834/paper90.pdf.

**GitHub Repository:** https://github.com/forTEXT/Animacy_in_German_Folktales

## Notebook 05: Comparison NER and POS

### NER

In [None]:
# Import libraries

import json
import os
import pandas as pd

In [None]:
# Load data
# Folder that is scanned for the annotation files; the loading loop further
# down derives the folktale id from names of the form '<id>_annotations.json'.

input_folder = '../Data/annotations'

In [3]:
# Two parallel containers filled by the loading loop: position k of both
# lists refers to the same folktale (its annotation table and its integer id).
texts_as_dfs, lst_khm_ids = [], []

In [4]:
# Read every annotation file into its own DataFrame.
#
# os.listdir() returns entries in arbitrary order, so the names are sorted to
# keep the row order of all later tables identical across machines and runs.
# Entries that are not annotation files (for example hidden files or
# checkpoint folders) are skipped because their names cannot be parsed as ids.
ANNOTATION_SUFFIX = '_annotations.json'

# Start from empty lists so that re-running this cell does not append every
# folktale a second time.
texts_as_dfs = []
lst_khm_ids = []

for filename in sorted(os.listdir(input_folder)):
    if not filename.endswith(ANNOTATION_SUFFIX):
        continue

    file_path = os.path.join(input_folder, filename)

    # The part of the file name in front of the suffix is the folktale id.
    lst_khm_ids.append(int(filename[:-len(ANNOTATION_SUFFIX)]))

    with open(file_path, 'r', encoding='utf-8') as json_file:
        text_file = json.load(json_file)

    # The file handle is no longer needed once the JSON has been parsed.
    df_text = pd.DataFrame(text_file)
    texts_as_dfs.append(df_text)

In [5]:
lst_khm_ids

[103, 10, 11, 142, 171, 188, 18, 24, 28, 30, 36, 41, 42, 49, 56, 6, 80, 88, 89]

In [6]:
texts_as_dfs[0].head(10)

Unnamed: 0,tokens,animate_tags,lemmas,pos_tags,dep_tags,ner_tags,start_chars,end_chars
0,Der,inanimate,der,DET,det,O,0,3
1,süße,inanimate,süß,ADJ,amod,O,4,8
2,Brei,inanimate,Brei,NOUN,root,O,9,13
3,Brüder,inanimate,Bruder,NOUN,root,O,21,27
4,Grimm,inanimate,Grimm,PROPN,appos,S-PER,28,33
5,Dieterich,inanimate,Dieterich,PROPN,root,S-PER,39,48
6,",",inanimate,",",PUNCT,punct,O,48,49
7,Göttingen,inanimate,Göttingen,PROPN,conj,S-LOC,50,59
8,",",inanimate,",",PUNCT,punct,O,59,60
9,1857,inanimate,1857,NUM,nmod,O,61,65


In [7]:
# Person labels of the NER tagger: one '<prefix>-PER' tag per position prefix.
# NOTE(review): 'O-PER' does not occur among the tags listed further down in
# this notebook and may be unintended - it is kept here to preserve behaviour.
ner_pers_tags = [f'{prefix}-PER' for prefix in 'BIOES']

In [None]:
# Get share of person entities and animate entities per folktale.
#
# For every folktale this cell counts the tokens whose NER tag is one of
# ner_pers_tags and the tokens labelled 'animate', and relates both counts to
# the total number of tokens of that text.

entity_count_columns = ['khm_id','abs_freq_pers','rel_freq_pers','abs_freq_animate','rel_freq_animate']
entity_count_rows = []

for khm_id, df_tale in zip(lst_khm_ids, texts_as_dfs):
    n_tokens = len(df_tale)

    # Boolean masks instead of value_counts() lookups: a text without any
    # 'animate' (or person) token yields 0 rather than raising a KeyError.
    ner_abs = int(df_tale['ner_tags'].isin(ner_pers_tags).sum())
    animate_abs = int(df_tale['animate_tags'].eq('animate').sum())

    entity_count_rows.append({
        'khm_id': khm_id,
        'abs_freq_pers': ner_abs,
        # An empty annotation table gets a share of 0.0 instead of a division by zero.
        'rel_freq_pers': ner_abs / n_tokens if n_tokens else 0.0,
        'abs_freq_animate': animate_abs,
        'rel_freq_animate': animate_abs / n_tokens if n_tokens else 0.0,
    })

# Building the frame once from the collected rows gives numeric column dtypes
# and avoids enlarging the DataFrame one cell at a time.
df_entity_counts = pd.DataFrame(entity_count_rows, columns=entity_count_columns)

In [9]:
df_entity_counts

Unnamed: 0,khm_id,abs_freq_pers,rel_freq_pers,abs_freq_animate,rel_freq_animate
0,103,2,0.00409,49,0.100204
1,10,6,0.00553,144,0.132719
2,11,8,0.002745,570,0.195607
3,142,6,0.005396,208,0.18705
4,171,11,0.007309,224,0.148837
5,188,4,0.002884,196,0.141312
6,18,4,0.005019,97,0.121706
7,24,18,0.011842,236,0.155263
8,28,4,0.003895,203,0.197663
9,30,17,0.024638,129,0.186957


In [10]:
# Difference between the person-entity share and the animate share of each
# folktale (negative: fewer person-entity tokens than animate tokens).
# Computed column-wise instead of writing into the frame while iterating over
# it; the casts keep the result numeric even if the share columns are stored
# with object dtype.
df_entity_counts['difference'] = (
    df_entity_counts['rel_freq_pers'].astype(float)
    - df_entity_counts['rel_freq_animate'].astype(float)
)

In [11]:
df_entity_counts

Unnamed: 0,khm_id,abs_freq_pers,rel_freq_pers,abs_freq_animate,rel_freq_animate,difference
0,103,2,0.00409,49,0.100204,-0.096115
1,10,6,0.00553,144,0.132719,-0.127189
2,11,8,0.002745,570,0.195607,-0.192862
3,142,6,0.005396,208,0.18705,-0.181655
4,171,11,0.007309,224,0.148837,-0.141528
5,188,4,0.002884,196,0.141312,-0.138428
6,18,4,0.005019,97,0.121706,-0.116688
7,24,18,0.011842,236,0.155263,-0.143421
8,28,4,0.003895,203,0.197663,-0.193768
9,30,17,0.024638,129,0.186957,-0.162319


In [12]:
df_entity_counts.sort_values('difference')

Unnamed: 0,khm_id,abs_freq_pers,rel_freq_pers,abs_freq_animate,rel_freq_animate,difference
13,49,6,0.002521,476,0.2,-0.197479
8,28,4,0.003895,203,0.197663,-0.193768
2,11,8,0.002745,570,0.195607,-0.192862
18,89,22,0.008658,505,0.198741,-0.190083
17,88,5,0.001783,536,0.191155,-0.189372
3,142,6,0.005396,208,0.18705,-0.181655
14,56,25,0.013528,347,0.187771,-0.174242
11,41,11,0.017323,116,0.182677,-0.165354
10,36,71,0.01555,821,0.179807,-0.164258
9,30,17,0.024638,129,0.186957,-0.162319


In [13]:
# Average over all folktales of the 'difference' column
# (person-entity share minus animate share).
mean = df_entity_counts['difference'].mean()
mean

-0.15990811764411736

In [None]:
# Save df

# to_csv() does not create missing directories, so make sure the output
# folder exists before writing.
os.makedirs('../Data/csv', exist_ok=True)
df_entity_counts.to_csv('../Data/csv/df_frequency_counts_person_animate_entities.csv')


In [None]:
# Get the tokens that are a person entity, an animate entity, or both,
# together with a window of surrounding tokens.

CONTEXT_SIZE = 5          # number of context tokens kept on each side
CONTEXT_PADDING = 'None'  # placeholder string for slots outside the text

entity_token_columns = ['khm_id','left_context','token','right_context','pers_tag','animate_tag']
entity_token_rows = []

# lst_khm_ids and texts_as_dfs are parallel lists, so zipping them pairs every
# annotation table with the id of its folktale.
for khm_id, df_tale in zip(lst_khm_ids, texts_as_dfs):
    # Plain lists make the context lookups simple slices by position instead
    # of one DataFrame row access per context token.
    tale_tokens = df_tale['tokens'].to_list()
    tale_ner_tags = df_tale['ner_tags'].to_list()
    tale_animate_tags = df_tale['animate_tags'].to_list()
    n_tokens = len(tale_tokens)

    for pos in range(n_tokens):
        is_person = tale_ner_tags[pos].endswith('PER')
        is_animate = tale_animate_tags[pos] == 'animate'
        if not (is_person or is_animate):
            continue

        # Context windows always have CONTEXT_SIZE entries; positions before
        # the first or after the last token are filled with the placeholder.
        left_context = [
            tale_tokens[k] if k >= 0 else CONTEXT_PADDING
            for k in range(pos - CONTEXT_SIZE, pos)
        ]
        right_context = [
            tale_tokens[k] if k < n_tokens else CONTEXT_PADDING
            for k in range(pos + 1, pos + 1 + CONTEXT_SIZE)
        ]

        entity_token_rows.append({
            'khm_id': khm_id,
            'left_context': left_context,
            'token': tale_tokens[pos],
            'right_context': right_context,
            'pers_tag': tale_ner_tags[pos],
            'animate_tag': tale_animate_tags[pos],
        })

# Build the frame once from the collected rows; adding rows one by one to an
# existing DataFrame gets slower with every row.
df_entity_tokens = pd.DataFrame(entity_token_rows, columns=entity_token_columns)

In [15]:
df_entity_tokens.head()

Unnamed: 0,khm_id,left_context,token,right_context,pers_tag,animate_tag
0,103,"[None, Der, süße, Brei, Brüder]",Grimm,"[Dieterich, ,, Göttingen, ,, 1857]",S-PER,inanimate
1,103,"[Der, süße, Brei, Brüder, Grimm]",Dieterich,"[,, Göttingen, ,, 1857, Exportiert]",S-PER,inanimate
2,103,"[Brei, ., Es, war, einmal]",ein,"[armes, frommes, Mädchen, ,, das]",O,animate
3,103,"[., Es, war, einmal, ein]",armes,"[frommes, Mädchen, ,, das, lebte]",O,animate
4,103,"[Es, war, einmal, ein, armes]",frommes,"[Mädchen, ,, das, lebte, mit]",O,animate


In [20]:
df_entity_tokens.head(50)

Unnamed: 0,khm_id,left_context,token,right_context,pers_tag,animate_tag
0,103,"[None, Der, süße, Brei, Brüder]",Grimm,"[Dieterich, ,, Göttingen, ,, 1857]",S-PER,inanimate
1,103,"[Der, süße, Brei, Brüder, Grimm]",Dieterich,"[,, Göttingen, ,, 1857, Exportiert]",S-PER,inanimate
2,103,"[Brei, ., Es, war, einmal]",ein,"[armes, frommes, Mädchen, ,, das]",O,animate
3,103,"[., Es, war, einmal, ein]",armes,"[frommes, Mädchen, ,, das, lebte]",O,animate
4,103,"[Es, war, einmal, ein, armes]",frommes,"[Mädchen, ,, das, lebte, mit]",O,animate
5,103,"[war, einmal, ein, armes, frommes]",Mädchen,"[,, das, lebte, mit, seiner]",O,animate
6,103,"[ein, armes, frommes, Mädchen, ,]",das,"[lebte, mit, seiner, Mutter, allein]",O,animate
7,103,"[Mädchen, ,, das, lebte, mit]",seiner,"[Mutter, allein, ,, und, sie]",O,animate
8,103,"[,, das, lebte, mit, seiner]",Mutter,"[allein, ,, und, sie, hatten]",O,animate
9,103,"[seiner, Mutter, allein, ,, und]",sie,"[hatten, nichts, mehr, zu, essen]",O,animate


In [21]:
df_entity_tokens.tail()

Unnamed: 0,khm_id,left_context,token,right_context,pers_tag,animate_tag
5882,89,"[sich, der, junge, König, mit]",seiner,"[rechten, Gemahlin, ,, und, beide]",O,animate
5883,89,"[der, junge, König, mit, seiner]",rechten,"[Gemahlin, ,, und, beide, beherrschten]",O,animate
5884,89,"[junge, König, mit, seiner, rechten]",Gemahlin,"[,, und, beide, beherrschten, ihr]",O,animate
5885,89,"[seiner, rechten, Gemahlin, ,, und]",beide,"[beherrschten, ihr, Reich, in, Frieden]",O,animate
5886,89,"[Gemahlin, ,, und, beide, beherrschten]",ihr,"[Reich, in, Frieden, und, Seligkeit]",O,animate


In [None]:
# Save df

# to_csv() does not create missing directories, so make sure the output
# folder exists before writing. The file is tab-separated and written with a
# UTF-8 byte order mark ('utf-8-sig').
os.makedirs('../Data/csv', exist_ok=True)
df_entity_tokens.to_csv('../Data/csv/df_person_animate_tokens_all.csv', encoding='utf-8-sig', sep ='\t')

In [24]:
ner_pers_tags

['B-PER', 'I-PER', 'O-PER', 'E-PER', 'S-PER']

In [None]:
# Measure overlap: tokens that carry a person tag and are labelled animate

person_mask = df_entity_tokens['pers_tag'].isin(ner_pers_tags)
animate_mask = df_entity_tokens['animate_tag'] == 'animate'
i = int((person_mask & animate_mask).sum())

print('person entity and animate entity: ',i)

person entity and animate entity:  173


In [29]:
# Tokens that carry a person tag but are labelled inanimate
person_rows = df_entity_tokens[df_entity_tokens['pers_tag'].isin(ner_pers_tags)]
i = int(person_rows['animate_tag'].eq('inanimate').sum())

print('person entity and inanimate entity: ',i)

person entity and inanimate entity:  106


In [30]:

# Tokens without a person tag (other entity or no entity) that are labelled animate
non_person_rows = df_entity_tokens[~df_entity_tokens['pers_tag'].isin(ner_pers_tags)]
i = int(non_person_rows['animate_tag'].eq('animate').sum())

print('other entity/no entity and animate entity: ',i)

other entity/no entity and animate entity:  5608


In [31]:

# Tokens without a person tag (other entity or no entity) that are labelled inanimate
non_person_rows = df_entity_tokens[~df_entity_tokens['pers_tag'].isin(ner_pers_tags)]
i = int(non_person_rows['animate_tag'].eq('inanimate').sum())

print('other entity/no entity and inanimate entity: ',i)

other entity/no entity and inanimate entity:  0


In [32]:
# NER tag of every extracted token, collected so that the set of labels that
# actually occur can be inspected.
lst_entities = df_entity_tokens['pers_tag'].to_list()

In [33]:
set(lst_entities)

{'B-LOC', 'B-PER', 'E-LOC', 'E-PER', 'O', 'S-LOC', 'S-PER'}

In [34]:
# Entity labels other than person: the '<prefix>-LOC' tags found in the data.
ner_other_tags = [f'{prefix}-LOC' for prefix in 'BES']

In [35]:

# Tokens with a non-person entity tag that are labelled animate
other_mask = df_entity_tokens['pers_tag'].isin(ner_other_tags)
animate_mask = df_entity_tokens['animate_tag'] == 'animate'
i = int((other_mask & animate_mask).sum())

print('other entity and animate entity: ',i)

other entity and animate entity:  20


In [36]:

# Tokens with a non-person entity tag that are labelled inanimate
other_mask = df_entity_tokens['pers_tag'].isin(ner_other_tags)
inanimate_mask = df_entity_tokens['animate_tag'] == 'inanimate'
i = int((other_mask & inanimate_mask).sum())

print('other entity and inanimate entity: ',i)

other entity and inanimate entity:  0


In [37]:
# Every entity label used in this notebook: the person tags first, followed by
# the other tags. A new list is built, so the two source lists stay unchanged.
lst_ner_all = [*ner_pers_tags, *ner_other_tags]
lst_ner_all

['B-PER', 'I-PER', 'O-PER', 'E-PER', 'S-PER', 'B-LOC', 'E-LOC', 'S-LOC']

In [38]:
# Tokens with any entity tag (person or other) that are labelled animate.
# The printed label is corrected: the condition covers all entity tags, not
# only the "other" ones.
entity_mask = df_entity_tokens['pers_tag'].isin(lst_ner_all)
animate_mask = df_entity_tokens['animate_tag'] == 'animate'
i = int((entity_mask & animate_mask).sum())

print('any entity and animate entity: ',i)

other entity and animate entity:  193


In [39]:
# Tokens with any entity tag (person or other) that are labelled inanimate.
# The printed label is corrected: this cell counts inanimate tokens across all
# entity tags, which the previous label did not say.
entity_mask = df_entity_tokens['pers_tag'].isin(lst_ner_all)
inanimate_mask = df_entity_tokens['animate_tag'] == 'inanimate'
i = int((entity_mask & inanimate_mask).sum())

print('any entity and inanimate entity: ',i)

other entity and animate entity:  106


In [40]:
# Tokens without any entity tag that are labelled animate.
# The printed label is corrected: the condition selects tokens whose tag is
# NOT in lst_ner_all.
no_entity_mask = ~df_entity_tokens['pers_tag'].isin(lst_ner_all)
animate_mask = df_entity_tokens['animate_tag'] == 'animate'
i = int((no_entity_mask & animate_mask).sum())

print('no entity and animate entity: ',i)

other entity and animate entity:  5588


In [41]:
# Cross-check of the previous count: tokens tagged exactly 'O' (no entity)
# that are labelled animate. The printed label is corrected to describe this
# condition.
outside_mask = df_entity_tokens['pers_tag'] == 'O'
animate_mask = df_entity_tokens['animate_tag'] == 'animate'
i = int((outside_mask & animate_mask).sum())

print('no entity (tag O) and animate entity: ',i)

other entity and animate entity:  5588


In [42]:
# Tokens without any entity tag that are labelled inanimate.
# The printed label is corrected: the condition selects tokens whose tag is
# NOT in lst_ner_all and whose label is 'inanimate'.
no_entity_mask = ~df_entity_tokens['pers_tag'].isin(lst_ner_all)
inanimate_mask = df_entity_tokens['animate_tag'] == 'inanimate'
i = int((no_entity_mask & inanimate_mask).sum())

print('no entity and inanimate entity: ',i)

other entity and animate entity:  0


### POS

In [None]:
## Load data, linguistic features and labels

# Read text data from files
# Same folder as in the NER section; set again so that the POS section has
# its own definition of the path.
input_folder = '../Data/annotations'

In [None]:
# One list per annotation layer; position k of every list belongs to the same
# folktale. Each name is bound to its own, independent empty list.
(
    texts_khm_id,
    texts_tokens,
    texts_lemmas,
    texts_pos_tags,
    texts_dep_tags,
    texts_ner_tags,
    texts_animate_labels,
) = ([] for _ in range(7))

In [None]:
# Read every annotation file and collect each annotation layer in its list.
#
# os.listdir() returns entries in arbitrary order, so the names are sorted to
# get the same token order on every run. Entries that are not annotation
# files are skipped.
ANNOTATION_SUFFIX = '_annotations.json'

# Empty the lists first so that re-running this cell does not load every
# folktale a second time.
for collected in (texts_khm_id, texts_tokens, texts_lemmas, texts_pos_tags,
                  texts_dep_tags, texts_ner_tags, texts_animate_labels):
    collected.clear()

for filename in sorted(os.listdir(input_folder)):
    if not filename.endswith(ANNOTATION_SUFFIX):
        continue

    file_path = os.path.join(input_folder, filename)
    with open(file_path, 'r', encoding='utf-8') as json_file:
        text_file = json.load(json_file)

    df_text = pd.DataFrame(text_file)

    # The id is kept as a string here (file name without the suffix).
    texts_khm_id.append(filename[:-len(ANNOTATION_SUFFIX)])
    texts_tokens.append(df_text['tokens'].to_list())
    texts_lemmas.append(df_text['lemmas'].to_list())
    texts_pos_tags.append(df_text['pos_tags'].to_list())
    texts_dep_tags.append(df_text['dep_tags'].to_list())
    texts_ner_tags.append(df_text['ner_tags'].to_list())
    texts_animate_labels.append(df_text['animate_tags'].to_list())


In [None]:
# Total number of tokens over all folktales
i = sum(len(text) for text in texts_tokens)
print(i)

In [None]:
# Total number of POS tags over all folktales (should equal the token count)
i = sum(len(text) for text in texts_pos_tags)
print(i)

In [None]:
# Empty token-level table; its three columns are filled further down.
pos_table_columns = ['token','pos_tag','animate_label']
df = pd.DataFrame(columns=pos_table_columns)

In [None]:
# Flatten the per-folktale token lists into one list, keeping the order.
tokens = []
for text in texts_tokens:
    tokens.extend(text)

In [None]:
len(tokens)

In [None]:
# Flatten the per-folktale POS tag lists into one list, keeping the order.
pos_tags = []
for text in texts_pos_tags:
    pos_tags.extend(text)

In [None]:
# Flatten the per-folktale animacy labels into one list, keeping the order.
animate_labels = []
for text in texts_animate_labels:
    animate_labels.extend(text)

In [None]:
# Number of tokens labelled 'animate' over all folktales
count_animate = sum(1 for label in animate_labels if label == 'animate')
count_animate

In [None]:
# Number of tokens labelled 'inanimate' over all folktales
count_inanimate = sum(1 for label in animate_labels if label == 'inanimate')
count_inanimate

In [None]:
# Fill the token-level table column by column from the flattened lists.
for column_name, column_values in (
    ('token', tokens),
    ('pos_tag', pos_tags),
    ('animate_label', animate_labels),
):
    df[column_name] = column_values

In [None]:
len(df)

In [None]:
df.head()

In [None]:
df.head(20)

In [None]:
set(df['pos_tag'].to_list())

In [None]:
# POS tags that are counted as nouns in the cells below.
noun_tags = ['NOUN','PROPN']

In [None]:
# Animate tokens that are nouns, absolute and as a percentage of all animate tokens
noun_mask = df['pos_tag'].isin(noun_tags)
animate_mask = df['animate_label'] == 'animate'
i = int((noun_mask & animate_mask).sum())

print('absolute frequency: ',i,', percentage: ',i/count_animate*100)

In [None]:
# Inanimate tokens that are nouns, absolute and as a percentage of all
# inanimate tokens.
# The filter has to select 'inanimate' rows: the count is divided by
# count_inanimate, so selecting 'animate' here would relate the animate noun
# count of the previous cell to the wrong total.
noun_mask = df['pos_tag'].isin(noun_tags)
inanimate_mask = df['animate_label'] == 'inanimate'
i = int((noun_mask & inanimate_mask).sum())

print('absolute frequency: ',i,', percentage: ',i/count_inanimate*100)

In [None]:
# Inspect the tokens whose POS tag is 'X': print each of them and show how
# many there are. (This cell does not filter punctuation; punctuation is
# counted in the following cell.)

x_tokens = df.loc[df['pos_tag'] == 'X', 'token']
for token in x_tokens:
    print(token)

i = len(x_tokens)
i

In [None]:
# Number of tokens tagged as punctuation
freq_punct = int((df['pos_tag'] == 'PUNCT').sum())
freq_punct

In [None]:
freq_punct/len(df)*100