# Animacy in German Folktales
    
This notebook contains the reproducible code examples and analyses for the paper *"Animacy in German Folktales"*, submitted to the proceedings of CHR 2024: Computational Humanities Research Conference, 2024, Aarhus, Denmark.
    
**Authors:** Julian Häußler, Janis von Keitz, Evelyn Gius
    
**Institution:** *fortext lab, Technical University of Darmstadt, Germany*
    
**Reference:** Häußler, J., von Keitz, J., Gius, E. (2024). *Animacy in German Folktales*. CHR 2024: Computational Humanities Research Conference, December 4 – 6, 2024, Aarhus, Denmark. https://ceur-ws.org/Vol-3834/paper90.pdf.
    
**GitHub Repository:** https://github.com/forTEXT/Animacy_in_German_Folktales

## Notebook 02: Word Embedding Model

In [1]:
# imports

import glob
import os
import pickle
import re
from gensim.models import Word2Vec

In [2]:
# read in corpora

# Collect every pickle in the "Romantik Kernkorpus" folder; only files whose
# name ends in "_final.pkl" are loaded into the lemmatized list below.
# NOTE: pickle.load can execute arbitrary code - only load trusted files.
lst_files = glob.glob(
    os.path.join(
        os.getcwd(),
        r'../../Projekte/Konflikte/Analyseergebnisse/pickled/sentences/Romantik Kernkorpus',
        "*.pkl",
    )
)

lst_novels_Romantik_Kernkorpus_lemmatized = []

for pkl_path in lst_files:
    if not pkl_path.endswith("_final.pkl"):
        continue
    with open(pkl_path, 'rb') as handle:
        lst_novels_Romantik_Kernkorpus_lemmatized.append(pickle.load(handle))

In [3]:
# Same procedure for the "Romantik Ergänzung I" folder: load every pickle
# whose name ends in "_final.pkl".
lst_files = glob.glob(
    os.path.join(
        os.getcwd(),
        r'../../Projekte/Konflikte/Analyseergebnisse/pickled/sentences/Romantik Ergänzung I',
        "*.pkl",
    )
)

lst_novels_Romantik_I_lemmatized = []

for pkl_path in lst_files:
    if not pkl_path.endswith("_final.pkl"):
        continue
    with open(pkl_path, 'rb') as handle:
        lst_novels_Romantik_I_lemmatized.append(pickle.load(handle))

In [4]:
# Same procedure for the "Romantik Ergänzung II" folder: load every pickle
# whose name ends in "_final.pkl".
lst_files = glob.glob(
    os.path.join(
        os.getcwd(),
        r'../../Projekte/Konflikte/Analyseergebnisse/pickled/sentences/Romantik Ergänzung II',
        "*.pkl",
    )
)

lst_novels_Romantik_II_lemmatized = []

for pkl_path in lst_files:
    if not pkl_path.endswith("_final.pkl"):
        continue
    with open(pkl_path, 'rb') as handle:
        lst_novels_Romantik_II_lemmatized.append(pickle.load(handle))

In [5]:
# Second pass over the "Romantik Kernkorpus" folder, this time keeping the
# pickles whose name ends in "_sents.pkl".
lst_files = glob.glob(
    os.path.join(
        os.getcwd(),
        r'../../Projekte/Konflikte/Analyseergebnisse/pickled/sentences/Romantik Kernkorpus',
        "*.pkl",
    )
)

lst_novels_Romantik_Kernkorpus_tokenized = []

for pkl_path in lst_files:
    if not pkl_path.endswith("_sents.pkl"):
        continue
    with open(pkl_path, 'rb') as handle:
        lst_novels_Romantik_Kernkorpus_tokenized.append(pickle.load(handle))

In [6]:
# "Romantik Ergänzung I" folder: keep the pickles whose name ends in
# "_sents.pkl".
lst_files = glob.glob(
    os.path.join(
        os.getcwd(),
        r'../../Projekte/Konflikte/Analyseergebnisse/pickled/sentences/Romantik Ergänzung I',
        "*.pkl",
    )
)

lst_novels_Romantik_I_tokenized = []

for pkl_path in lst_files:
    if not pkl_path.endswith("_sents.pkl"):
        continue
    with open(pkl_path, 'rb') as handle:
        lst_novels_Romantik_I_tokenized.append(pickle.load(handle))

In [7]:
# "Romantik Ergänzung II" folder: keep the pickles whose name ends in
# "_sents.pkl".
lst_files = glob.glob(
    os.path.join(
        os.getcwd(),
        r'../../Projekte/Konflikte/Analyseergebnisse/pickled/sentences/Romantik Ergänzung II',
        "*.pkl",
    )
)

lst_novels_Romantik_II_tokenized = []

for pkl_path in lst_files:
    if not pkl_path.endswith("_sents.pkl"):
        continue
    with open(pkl_path, 'rb') as handle:
        lst_novels_Romantik_II_tokenized.append(pickle.load(handle))

In [8]:
# create lists of sentences

# Flatten the per-novel containers into one list holding every sentence.
lst_sents_Romantik_Kernkorpus_lemmatized = []
for novel_sents in lst_novels_Romantik_Kernkorpus_lemmatized:
    lst_sents_Romantik_Kernkorpus_lemmatized.extend(novel_sents)

In [9]:
# Flatten the per-novel containers into one list holding every sentence.
lst_sents_Romantik_I_lemmatized = []
for novel_sents in lst_novels_Romantik_I_lemmatized:
    lst_sents_Romantik_I_lemmatized.extend(novel_sents)

In [10]:
# Flatten the per-novel containers into one list holding every sentence.
lst_sents_Romantik_II_lemmatized = []
for novel_sents in lst_novels_Romantik_II_lemmatized:
    lst_sents_Romantik_II_lemmatized.extend(novel_sents)

In [11]:
# Number of sentences in the flattened lemmatized Kernkorpus list.
len(lst_sents_Romantik_Kernkorpus_lemmatized)

105993

In [12]:
# Number of sentences in the flattened lemmatized "Ergänzung I" list.
len(lst_sents_Romantik_I_lemmatized)

206064

In [13]:
# Number of sentences in the flattened lemmatized "Ergänzung II" list.
len(lst_sents_Romantik_II_lemmatized)

201170

In [14]:
# Flatten the per-novel containers into one list holding every sentence.
lst_sents_Romantik_Kernkorpus_tokenized = []
for novel_sents in lst_novels_Romantik_Kernkorpus_tokenized:
    lst_sents_Romantik_Kernkorpus_tokenized.extend(novel_sents)

In [15]:
# Flatten the per-novel containers into one list holding every sentence.
lst_sents_Romantik_I_tokenized = []
for novel_sents in lst_novels_Romantik_I_tokenized:
    lst_sents_Romantik_I_tokenized.extend(novel_sents)

In [16]:
# Flatten the per-novel containers into one list holding every sentence.
lst_sents_Romantik_II_tokenized = []
for novel_sents in lst_novels_Romantik_II_tokenized:
    lst_sents_Romantik_II_tokenized.extend(novel_sents)

In [17]:
# Sentence count of the "_sents" Kernkorpus list; the recorded value matches
# the lemmatized Kernkorpus count (105993).
len(lst_sents_Romantik_Kernkorpus_tokenized)

105993

In [18]:
# Sentence count of the "_sents" "Ergänzung I" list; the recorded value
# matches the lemmatized count (206064).
len(lst_sents_Romantik_I_tokenized)

206064

In [19]:
# Sentence count of the "_sents" "Ergänzung II" list; the recorded value
# matches the lemmatized count (201170).
len(lst_sents_Romantik_II_tokenized)

201170

In [20]:
# merge corpus Romantik

# Start the merged list from a shallow copy so that the Kernkorpus list
# itself is not modified when further sentences are appended.
lst_sents_Romantik_lemmatized = list(lst_sents_Romantik_Kernkorpus_lemmatized)

In [21]:
# Append the "Ergänzung I" sentences in place.
# NOTE: re-running this cell appends the same sentences again.
lst_sents_Romantik_lemmatized += lst_sents_Romantik_I_lemmatized

In [22]:
# Append the "Ergänzung II" sentences in place.
# NOTE: re-running this cell appends the same sentences again.
lst_sents_Romantik_lemmatized += lst_sents_Romantik_II_lemmatized

In [23]:
# Size of the merged lemmatized corpus; the recorded value equals the sum of
# the three sub-corpus counts (105993 + 206064 + 201170 = 513227).
len(lst_sents_Romantik_lemmatized)

513227

In [24]:
# Inspect the first lemmatized sentence; per the recorded output it is a list
# of lower-cased lemma strings.
lst_sents_Romantik_lemmatized[0]

['ludwig',
 'achim',
 'von',
 'arnim',
 'armut',
 'reichtum',
 'schuld',
 'und',
 'buße',
 'der',
 'gräfin',
 'dolores',
 'ein',
 'wahr',
 'geschichte',
 'zu',
 'der',
 'lehrreich',
 'unterhaltung',
 'armer',
 'fräulein',
 'zueignung',
 'zueignung',
 'an',
 'der',
 'fürst',
 'radzivil',
 'durchlaucht']

In [25]:
# Start the merged list from a shallow copy so that the Kernkorpus list
# itself is not modified when further sentences are appended.
lst_sents_Romantik_tokenized = list(lst_sents_Romantik_Kernkorpus_tokenized)

In [26]:
# Append the "Ergänzung I" sentences in place.
# NOTE: re-running this cell appends the same sentences again.
lst_sents_Romantik_tokenized += lst_sents_Romantik_I_tokenized

In [27]:
# Append the "Ergänzung II" sentences in place.
# NOTE: re-running this cell appends the same sentences again.
lst_sents_Romantik_tokenized += lst_sents_Romantik_II_tokenized

In [28]:
# Size of the merged "_sents" corpus; the recorded value matches the merged
# lemmatized corpus (513227).
len(lst_sents_Romantik_tokenized)

513227

In [29]:
# Inspect the first entry; per the recorded output each entry is a plain
# sentence string (original casing and punctuation), not a list of tokens.
lst_sents_Romantik_tokenized[0]

'Ludwig Achim von Arnim Armut, Reichtum, Schuld und Buße der Gräfin Dolores Eine wahre Geschichte zur lehrreichen Unterhaltung armer Fräulein Zueignung Zueignung an des Fürsten Radzivil Durchlaucht'

In [30]:
# create models

# Train a Word2Vec model on the merged lemmatized sentences.
# NOTE(review): with workers > 1 gensim training is not bit-for-bit
# reproducible, so similarity scores can differ slightly between runs -
# confirm whether that matters for the reported results.
model_Romantik_lemmatized = Word2Vec(
    sentences=lst_sents_Romantik_lemmatized,
    vector_size=300,  # dimensionality of the word vectors
    window=5,         # max distance between current and predicted word
    min_count=5,      # ignore words with a total frequency below 5
    workers=3,        # number of training threads
    sg=1,             # 1 = skip-gram (0 would be CBOW)
    epochs=5,         # number of passes over the corpus
)

In [32]:
# Sanity check: nearest neighbours of 'mensch' in the trained vector space
# (the recorded output lists 10 (word, similarity) pairs).
model_Romantik_lemmatized.wv.most_similar('mensch')

[('menschenkind', 0.6130830645561218),
 ('geschopf', 0.6063209176063538),
 ('kreatur', 0.5917930006980896),
 ('wüstling', 0.5732919573783875),
 ('rasse', 0.5711509585380554),
 ('menschlich', 0.5699660778045654),
 ('mittelstand', 0.5680449604988098),
 ('vernünftige', 0.564473569393158),
 ('seinesgleichen', 0.5536277890205383),
 ('subjekte', 0.5528882741928101)]

In [33]:
# Sanity check: nearest neighbours of 'tier'. The recorded output shows that
# historical spellings ('thier', 'thiere') are separate vocabulary entries.
model_Romantik_lemmatized.wv.most_similar('tier')

[('thier', 0.6578449010848999),
 ('gewürm', 0.6217717528343201),
 ('wilde', 0.6214273571968079),
 ('thiere', 0.6194289922714233),
 ('vieh', 0.6182639598846436),
 ('aff', 0.6065509915351868),
 ('tiger', 0.5968114733695984),
 ('raubthier', 0.5846810936927795),
 ('bestium', 0.5677855014801025),
 ('hungrigen', 0.5629072785377502)]

In [34]:
# save models

# Only the word vectors (model.wv) are written, not the full Word2Vec model.
# NOTE(review): presumably '../Data/models/' must already exist - confirm.
model_Romantik_lemmatized.wv.save('../Data/models/240702_model_Romantik_lemmatized.kv')

In [35]:
# get word count

# Flatten all sentences into one long list of lemmas, in corpus order.
lst_words_Romantik_lemmatized = []
for sent_lemmas in lst_sents_Romantik_lemmatized:
    lst_words_Romantik_lemmatized.extend(sent_lemmas)

In [37]:
# Peek at the first ten lemmas to verify the flattening.
lst_words_Romantik_lemmatized[:10]

['ludwig',
 'achim',
 'von',
 'arnim',
 'armut',
 'reichtum',
 'schuld',
 'und',
 'buße',
 'der']

In [38]:
# Total number of lemma tokens across all sentences of the merged corpus.
len(lst_words_Romantik_lemmatized)

10702163

In [39]:
# Print the model summary; the recorded output reports vocabulary size,
# vector size and alpha.
print(model_Romantik_lemmatized)

Word2Vec<vocab=44621, vector_size=300, alpha=0.025>
