# Towards a Conflict Heuristic (DH 2023)

## 03. Create Models

Last updated: 16.01.2023

julian.haeussler[at]tu-darmstadt.de

In [1]:
# imports

import os
import pickle

from gensim.models import Word2Vec

In [2]:
# paths

# Input and output directories share one project root.
_path_root = r'C:\Users\Public\Data\conflict_heuristics'

# directory the pickled corpora are read from
path_data = _path_root + '\\pickled'

# directory the trained word vectors are written to
path_results = _path_root + '\\models'

In [3]:
# read in corpora

# Romantik corpus: iterated further down as novels -> phrases.
# NOTE: pickle.load can execute arbitrary code while unpickling; only load files you trust.
# os.path.join is used instead of a hard-coded '\\' separator.
with open(os.path.join(path_data, '230116_lst_lists_phrases_Romantik_LEMMATIZED.pkl'), 'rb') as f:
    lst_lists_phrases_Romantik = pickle.load(f)

In [4]:
# Realismus corpus: iterated further down as novels -> phrases.
# NOTE: pickle.load can execute arbitrary code while unpickling; only load files you trust.
# os.path.join is used instead of a hard-coded '\\' separator.
with open(os.path.join(path_data, '230116_lst_lists_phrases_Realismus_LEMMATIZED.pkl'), 'rb') as f:
    lst_lists_phrases_Realismus = pickle.load(f)

In [5]:
# Naturalismus corpus: iterated further down as novels -> phrases.
# NOTE: pickle.load can execute arbitrary code while unpickling; only load files you trust.
# os.path.join is used instead of a hard-coded '\\' separator.
with open(os.path.join(path_data, '230116_lst_lists_phrases_Naturalismus_LEMMATIZED.pkl'), 'rb') as f:
    lst_lists_phrases_Naturalismus = pickle.load(f)

In [6]:
# create lists of phrases

# Flatten novels -> phrases into one flat list of phrases (novel boundaries are dropped).
lst_phrases_Romantik = [
    phrase_tokens
    for novel_phrases in lst_lists_phrases_Romantik
    for phrase_tokens in novel_phrases
]

In [7]:
# number of phrases in the flattened Romantik corpus
len(lst_phrases_Romantik)

1104199

In [8]:
# peek at the first five phrases to check the structure of the flattened list
lst_phrases_Romantik[:5]

[['ein',
  'lieb',
  'mädchen',
  'sollen',
  'du',
  'heute',
  'kennen',
  'lernen',
  'alexander'],
 ['alexander',
  'sagen',
  'der',
  'verwitwete',
  'generalin',
  'von',
  'zwenkau',
  'zu',
  'mein',
  'neffe',
  'einer',
  'zwanzigjährigen',
  'husarenoffizier'],
 ['der',
  'aus',
  'der',
  'residenz',
  'kommen',
  'sein',
  'ich',
  'in',
  'der',
  'landstadt',
  'zu',
  'besuchen'],
 ['in', 'der', 'ich', 'wohnen'],
 ['der', 'sein', 'einen', 'parthie', 'für', 'sich']]

In [9]:
# Flatten novels -> phrases into one flat list of phrases (novel boundaries are dropped).
lst_phrases_Realismus = [
    phrase_tokens
    for novel_phrases in lst_lists_phrases_Realismus
    for phrase_tokens in novel_phrases
]

In [10]:
# Flatten novels -> phrases into one flat list of phrases (novel boundaries are dropped).
lst_phrases_Naturalismus = [
    phrase_tokens
    for novel_phrases in lst_lists_phrases_Naturalismus
    for phrase_tokens in novel_phrases
]

In [11]:
# number of phrases in the flattened Realismus corpus
len(lst_phrases_Realismus)

452321

In [12]:
# number of phrases in the flattened Naturalismus corpus
len(lst_phrases_Naturalismus)

459557

In [15]:
# combine Realismus and Naturalismus

# Start from a shallow copy so that lst_phrases_Realismus itself is never modified.
lst_phrases_RealismusNaturalismus = list(lst_phrases_Realismus)

In [16]:
# Build the combined corpus as a fresh list (Realismus phrases first, then Naturalismus).
# Rebinding instead of extending in place keeps this cell idempotent: running it
# again cannot append the Naturalismus phrases a second time.
lst_phrases_RealismusNaturalismus = lst_phrases_Realismus + lst_phrases_Naturalismus

In [17]:
# The combined corpus should be exactly as long as its two parts together;
# a larger value means phrases were appended more than once.
assert len(lst_phrases_RealismusNaturalismus) == len(lst_phrases_Realismus) + len(lst_phrases_Naturalismus), \
    'combined corpus size differs from Realismus + Naturalismus; was the combining cell run more than once?'

# number of phrases in the combined corpus
len(lst_phrases_RealismusNaturalismus)

911878

In [18]:
# create models

# parameters: following the best practice of Brottrager et al. 2022
#             (architecture = Skip-gram, vector size = 300, epochs = 10)
#             + min_count = 1 (!)
# NOTE(review): no seed is passed and workers=6 is used; per the gensim docs,
# multi-threaded training is not exactly reproducible between runs - confirm this is acceptable.
model_Romantik = Word2Vec(
    lst_phrases_Romantik,
    sg=1,             # Skip-gram
    vector_size=300,
    epochs=10,
    min_count=1,
    window=5,
    workers=6,
)

In [19]:
# Word2Vec model for the combined Realismus + Naturalismus corpus
# (Skip-gram, vector size 300, 10 epochs, min_count 1).
model_RealismusNaturalismus = Word2Vec(
    lst_phrases_RealismusNaturalismus,
    sg=1,             # Skip-gram
    vector_size=300,
    epochs=10,
    min_count=1,
    window=5,
    workers=6,
)

In [20]:
# save

# Only the word vectors (model_Romantik.wv) are written to disk.
# Create the output directory first so a long training run is not lost to a missing folder.
os.makedirs(path_results, exist_ok=True)
model_Romantik.wv.save(os.path.join(path_results, '230116_model_Romantik.kv'))

In [21]:
# Only the word vectors (model_RealismusNaturalismus.wv) are written to disk.
# Create the output directory first so a long training run is not lost to a missing folder.
os.makedirs(path_results, exist_ok=True)
model_RealismusNaturalismus.wv.save(os.path.join(path_results, '230116_model_RealismusNaturalismus.kv'))

In [22]:
# model info

# sanity check: entries the Romantik model ranks as most similar to "natur"
model_Romantik.wv.most_similar("natur")

[('schaffend', 0.507726788520813),
 ('schöpferische', 0.5025715231895447),
 ('blütenwelt', 0.4898774027824402),
 ('eingeboren', 0.4865909218788147),
 ('allbeseelten', 0.48657113313674927),
 ('eigentlichste', 0.4839968979358673),
 ('armillarsphäre', 0.4829738736152649),
 ('modulation', 0.4807109236717224),
 ('weltleben', 0.4790152311325073),
 ('pflanzenwelt', 0.47742193937301636)]

In [23]:
# same query against the Realismus + Naturalismus model, for comparison with the Romantik result
model_RealismusNaturalismus.wv.most_similar("natur")

[('schwankung', 0.508905291557312),
 ('bedingt', 0.5060728788375854),
 ('vorurteilsfreiere', 0.5013084411621094),
 ('unnatur', 0.5012270212173462),
 ('einbildungskraft', 0.4987388551235199),
 ('gewöhnung', 0.4922521710395813),
 ('lebensauffassung', 0.49219468235969543),
 ('heilkraft', 0.48951807618141174),
 ('anreiz', 0.48244231939315796),
 ('selbstverleugnung', 0.48183292150497437)]

In [24]:
# model summary line (vocabulary size, vector size, alpha)
print(model_Romantik)

Word2Vec(vocab=198134, vector_size=300, alpha=0.025)


In [25]:
# model summary line (vocabulary size, vector size, alpha)
print(model_RealismusNaturalismus)

Word2Vec(vocab=195097, vector_size=300, alpha=0.025)
