# Towards a Conflict Heuristic (DH 2023)

## 01. Preprocessing

Last updated: 11.01.2023
julian.haeussler[at]tu-darmstadt.de

In [None]:
# imports

import glob
import json
import os
import pickle
import re
import spacy
import string

!python -m spacy download de_core_news_lg
# Load the large German spaCy pipeline once for the whole notebook.
# 'ner' is excluded (not loaded), 'tagger' and 'parser' are loaded but disabled.
# NOTE(review): whether lemmas are unaffected by disabling these components
# depends on the installed spaCy/model version — confirm.
nlp = spacy.load('de_core_news_lg',exclude=['ner'],disable=['tagger','parser'])

In [2]:
# Root directory of the input data; the per-corpus JSON files are read from
# <path_data>\Output_<corpus>\<corpus>\ (see extract_vp and the glob calls).
path_data = r'C:\Users\Public\Data\conflict_heuristics'

In [3]:
# Output directory for the pickled results written by save().
# The directory must already exist; open(..., 'wb') does not create it.
path_results = r'C:\Users\Public\Data\conflict_heuristics\pickled\all'

In [4]:
# define functions

def extract_vp(title,corpus):
    """Return the annotated text spans of one corpus document.

    Opens the JSON file for ``title`` inside the ``Output_<corpus>/<corpus>``
    folder below ``path_data``, takes the first record of the JSON array and
    slices its ``'text'`` at the ``'start'``/``'end'`` offsets of every entry
    in ``'annotations'`` (presumably the annotated verb phrases).

    Args:
        title: File name of the document without the ``.json`` suffix.
        corpus: Corpus name used to build the directory path.

    Returns:
        A list with one substring of the text per annotation, in the order
        the annotations appear in the file.

    Raises:
        FileNotFoundError: If the JSON file does not exist.
    """
    path = path_data+'\\Output_'+corpus+'\\'+corpus

    # JSON is UTF-8 by specification. Without an explicit encoding, open()
    # falls back to the locale code page (e.g. cp1252 on Windows), which
    # corrupts or rejects German umlauts and the typographic quotes.
    with open(path + "\\" + title + ".json", 'r', encoding='utf-8') as f:
        json_data = json.load(f)

    record = json_data[0]
    text = record['text']

    # one slice of the text per annotation span
    return [text[span['start']:span['end']] for span in record['annotations']]

In [5]:
def cleaning(phrases):
    """Return a copy of ``phrases`` in which every newline is a single space.

    Only the newline character is replaced; all other characters are kept.
    """
    return [entry.replace('\n', ' ') for entry in phrases]

In [6]:
def remove_punctuation(phrases_clean):
    """Strip punctuation characters from every phrase.

    Removed are all characters of ``string.punctuation`` plus the en dash
    and the guillemets used as German quotation marks. Characters are deleted,
    not replaced, so surrounding whitespace is left as it is.

    Args:
        phrases_clean: Iterable of strings.

    Returns:
        A new list with the stripped strings, in input order.
    """
    # The deletion table is identical for every phrase, so build it once
    # instead of once per loop iteration.
    table = str.maketrans('', '', string.punctuation + '–' +'»' + '«')
    return [phrase.translate(table) for phrase in phrases_clean]

In [7]:
def run_spacy(phrases_nopunct):
    """Run the module-level ``nlp`` pipeline over every phrase.

    Returns a pair ``(lemmas, tokens)``. Both are lists with one inner list
    per input phrase, holding each token's ``lemma_`` and ``text``
    respectively, in token order.
    """
    lemma_lists = []
    token_lists = []
    for phrase in phrases_nopunct:
        doc = nlp(phrase)
        lemma_lists.append([tok.lemma_ for tok in doc])
        token_lists.append([tok.text for tok in doc])

    return lemma_lists, token_lists

In [8]:
def filter_len(phrases):
    """Keep only the phrases that contain more than two elements.

    The relative order of the surviving phrases is unchanged.
    """
    return [candidate for candidate in phrases if len(candidate) > 2]

In [9]:
def lowercasing(phrases):
    """Return the phrases with every word converted to lower case.

    The nesting (list of word lists) and the order are preserved.
    """
    lowered = []
    for phrase in phrases:
        lowered.append([token.lower() for token in phrase])
    return lowered

In [10]:
def list_of_words(phrases):
    """Flatten a list of word lists into one flat list of words.

    Words keep their order: first by phrase, then by position in the phrase.
    """
    flat = []
    for phrase in phrases:
        flat.extend(phrase)
    return flat

In [11]:
def save(file,name):
    """Pickle ``file`` into the ``path_results`` directory.

    The target file is named ``name`` with the suffix ``.pkl``; an existing
    file of the same name is overwritten.
    """
    target = path_results + '\\' + name + '.pkl'
    with open(target, 'wb') as handle:
        pickle.dump(file, handle)

In [16]:
def preprocessing(title,corpus):
    """Run the whole preprocessing pipeline for one document and pickle it.

    Steps: extract the annotated spans, replace newlines, strip punctuation,
    run spaCy, drop phrases with two or fewer tokens and lowercase. Four
    results are saved under ``title`` plus a suffix: the lemmatized and the
    tokenized phrases, and the flattened word list of each.
    """
    # text-level normalisation before spaCy sees the phrases
    prepared = remove_punctuation(cleaning(extract_vp(title,corpus)))
    lemmas, tokens = run_spacy(prepared)

    # identical post-processing for both views of the phrases
    lemma_phrases = lowercasing(filter_len(lemmas))
    token_phrases = lowercasing(filter_len(tokens))

    outputs = [
        ("_phrases_lemmatized_final", lemma_phrases),
        ("_phrases_tokenized_final", token_phrases),
        ("_words_lemmatized_final", list_of_words(lemma_phrases)),
        ("_words_tokenized_final", list_of_words(token_phrases)),
    ]
    for suffix, payload in outputs:
        save(payload,title+suffix)

In [14]:
# process Romantik 

# Collect all JSON files of the Romantik corpus.
lst_files = glob.glob(os.path.join(os.getcwd(), path_data+'\\Output_Romantik\\Romantik', "*.json"))

# Title = file name without directory and extension. Derived with os.path
# rather than a regex: the former pattern left the dot in ".json" unescaped
# and was case-sensitive, so a "*.JSON" file (matched by glob on Windows)
# made re.search return None and raised AttributeError.
titles = [os.path.splitext(os.path.basename(entry))[0] for entry in lst_files]

In [17]:
# Preprocess every collected title of the Romantik corpus.
for title in titles:
    preprocessing(title,"Romantik")

In [24]:
# process Realismus

# Collect all JSON files of the Realismus corpus.
lst_files = glob.glob(os.path.join(os.getcwd(), path_data+'\\Output_Realismus\\Realismus', "*.json"))

# Title = file name without directory and extension. Derived with os.path
# rather than a regex: the former pattern left the dot in ".json" unescaped
# and was case-sensitive, so a "*.JSON" file (matched by glob on Windows)
# made re.search return None and raised AttributeError.
titles = [os.path.splitext(os.path.basename(entry))[0] for entry in lst_files]

In [25]:
# Preprocess every collected title of the Realismus corpus.
for title in titles:
    preprocessing(title,"Realismus")

In [26]:
# process Naturalismus

# Collect all JSON files of the Naturalismus corpus.
lst_files = glob.glob(os.path.join(os.getcwd(), path_data+'\\Output_Naturalismus\\Naturalismus', "*.json"))

# Title = file name without directory and extension. Derived with os.path
# rather than a regex: the former pattern left the dot in ".json" unescaped
# and was case-sensitive, so a "*.JSON" file (matched by glob on Windows)
# made re.search return None and raised AttributeError.
titles = [os.path.splitext(os.path.basename(entry))[0] for entry in lst_files]

In [27]:
# Preprocess every collected title of the Naturalismus corpus.
for title in titles:
    preprocessing(title,"Naturalismus")