# German relatio for narrative extraction

### 1. Prerequisites

#### Loading libraries 

In [1]:
import numpy as np
from numpy.linalg import norm
import pandas as pd
import spacy
from tqdm import tqdm

from utils import split_into_sentences

In [2]:
# Load the spaCy pipeline "de_core_news_lg"; its default stop-word list is read further below.
nlp = spacy.load("de_core_news_lg")

#### Loading data

In [3]:
# Read the speech data; its "speechContent" column is the text split into sentences below.
df = pd.read_feather("../data/bundestag_data.feather")


In [4]:
# Keep only the first rows (DataFrame.head() defaults to 5) — presumably to keep the example small.
# NOTE(review): this overwrites the full frame; re-run the loading cell to get it back.
df = df.head()


### 2. Some data wrangling

In [None]:
# TBD for example data

### 3. Apply relatio

In [5]:
# Split every speech in `df` into sentences with the project's splitter (NLTK method).
# The result is indexed positionally below: [0] holds document ids, [1] the sentences.
split_sentences = split_into_sentences(
    df,
    text_col="speechContent",
    progress_bar=True,
    method="nltk",
)


Splitting into sentences...


100%|██████████| 5/5 [00:00<00:00, 354.11it/s]


In [6]:
# Preview the first few (document id, sentence) pairs.
# The loop is bounded by the number of sentences actually available, so the cell
# no longer raises IndexError on a corpus with fewer than N_PREVIEW sentences.
N_PREVIEW = 10
for i in range(min(N_PREVIEW, len(split_sentences[1]))):
    print('Document id: %s' %split_sentences[0][i])
    print('Sentence: %s \n' %split_sentences[1][i])



Document id: 604683
Sentence: Frau Präsidentin! 

Document id: 604683
Sentence: Meine werten Kolleginnen und Kollegen! 

Document id: 604683
Sentence: Liebe Kolleginnen und Kollegen von der Union, Sie müssen unter einer
gigantischen Verdrängung leiden, wenn Sie hier von
„Willkür“, „Geldbeschaffung“ und „Ökozockerei“ reden. 

Document id: 604683
Sentence: ({0})

Ich möchte Sie nur an ein paar Fakten erinnern. 

Document id: 604683
Sentence: Ich
weiß, es tut manchmal weh, an Fakten erinnert zu werden, aber ich kann es Ihnen nicht ganz ersparen. 

Document id: 604683
Sentence: Ich beziehe mich einmal auf die Jahre seit 1989. Januar 1989:
Erhöhung der Mineralölsteuer um 9 Pfennig; 

({1})

Januar 1991: Erhöhung der Mineralölsteuer um
3 Pfennig; Juli 1991: Erhöhung der Mineralölsteuer um
22 Pfennig;

({2})

Januar 1994: Erhöhung der Mineralölsteuer um
16 Pfennig. 

Document id: 604683
Sentence: ({3})

Das alles war in Ihrer Regierungsverantwortung. 

Document id: 604683
Sentence: Das hat au

In [7]:
# Inspect spaCy's default stop words: the sorted contents first, then the container's type.
stopwords = nlp.Defaults.stop_words
print(sorted(stopwords), type(stopwords), sep="\n")

['a', 'ab', 'aber', 'ach', 'acht', 'achte', 'achten', 'achter', 'achtes', 'ag', 'alle', 'allein', 'allem', 'allen', 'aller', 'allerdings', 'alles', 'allgemeinen', 'als', 'also', 'am', 'an', 'andere', 'anderem', 'anderen', 'andern', 'anders', 'auch', 'auf', 'aus', 'ausser', 'ausserdem', 'außer', 'außerdem', 'bald', 'bei', 'beide', 'beiden', 'beim', 'beispiel', 'bekannt', 'bereits', 'besonders', 'besser', 'besten', 'bin', 'bis', 'bisher', 'bist', 'da', 'dabei', 'dadurch', 'dafür', 'dagegen', 'daher', 'dahin', 'dahinter', 'damals', 'damit', 'danach', 'daneben', 'dank', 'dann', 'daran', 'darauf', 'daraus', 'darf', 'darfst', 'darin', 'darum', 'darunter', 'darüber', 'das', 'dasein', 'daselbst', 'dass', 'dasselbe', 'davon', 'davor', 'dazu', 'dazwischen', 'daß', 'dein', 'deine', 'deinem', 'deiner', 'dem', 'dementsprechend', 'demgegenüber', 'demgemäss', 'demgemäß', 'demselben', 'demzufolge', 'den', 'denen', 'denn', 'denselben', 'der', 'deren', 'derjenige', 'derjenigen', 'dermassen', 'dermaßen',

In [10]:
from utils import sentence_processing
# Run the project's sentence-level processing over the split sentences, passing the spaCy stop words.
# NOTE(review): judging by the printed result and the indexing in the following cells, each entry
# looks like [doc_id, sentence_index, sentence_text, tokens] — confirm against utils.sentence_processing.
sent_res = sentence_processing(split_sentences, stopwords)

50 of 134


In [11]:
# Show only the first few entries; printing the whole list floods the notebook output.
print(sent_res[:5])

[[604683, 1, 'Meine werten Kolleginnen und Kollegen!', [Meine, werten, Kolleginnen]], [604683, 4, 'Ich\nweiß, es tut manchmal weh, an Fakten erinnert zu werden, aber ich kann es Ihnen nicht ganz ersparen.', [Ich, weiß, tut]], [604683, 5, 'Ich beziehe mich einmal auf die Jahre seit 1989. Januar 1989:\nErhöhung der Mineralölsteuer um 9 Pfennig; \n\n({1})\n\nJanuar 1991: Erhöhung der Mineralölsteuer um\n3 Pfennig; Juli 1991: Erhöhung der Mineralölsteuer um\n22 Pfennig;\n\n({2})\n\nJanuar 1994: Erhöhung der Mineralölsteuer um\n16 Pfennig.', [Ich, beziehe, mich]], [604683, 11, 'Sie sagen, die Wettbewerbsfähigkeit sei gefährdet, wir seien nicht mehr konkurrenzfähig mit unseren Nachbarn.', [Sie, sagen, sei, seien]], [604683, 13, 'Der ADAC zum\nBeispiel - ich gebe Ihnen einen Anstoß - hat eine Liste\nder Preise für einen Liter Bleifrei Super herausgegeben,\nStand 7. Januar dieses Jahres: Niederlande: 2,10 DM;\nDänemark: 2,10 DM; Frankreich: 2,03 DM; Belgien:\n1,95 DM; Deutschland: 1,94 DM.', [

In [13]:
# Processing after extraction: unpack the first three fields of every result row
# into parallel lists (document id, sentence id, full sentence text).
doc_ids = [row[0] for row in sent_res]
sent_ids = [row[1] for row in sent_res]
sent_full = [row[2] for row in sent_res]

In [None]:
from utils import mine_entities
# Mine named entities with the labels LOC / ORG / PER from the processed subject and object roles.
# NOTE(review): `dat_sent` is only created two cells further down, and the columns
# "subs_processed" / "obs_processed" are not created anywhere in the visible notebook —
# on a fresh kernel this cell fails unless that state comes from elsewhere.
ents_subs = mine_entities(dat_sent["subs_processed"], ent_labels = ["LOC", "ORG", "PER"])
ents_obs = mine_entities(dat_sent["obs_processed"], ent_labels = ["LOC", "ORG", "PER"])

In [None]:
# The 30 most frequent mined entities among subjects and among objects.
# NOTE(review): the same two assignments are repeated in the "Keep top n Named Entities" cell below.
common_subs = ents_subs.most_common(30)
common_obs = ents_obs.most_common(30)

In [None]:
# Assemble the sentence-level narrative table, one column per extracted field.
# NOTE(review): narr_id, narr_subs, narr_negs, narr_verbs and narr_obs are not defined
# in the visible part of the notebook — they must already exist in the kernel.
dat_sent = pd.DataFrame(
    {
        "doc_ids": doc_ids,
        "sent_ids": sent_ids,
        "sent_full": sent_full,
        "narr_id": narr_id,
        "subs": narr_subs,
        "negs": narr_negs,
        "verbs": narr_verbs,
        "objects": narr_obs,
    }
)


In [None]:
# # Keep top n Named Entities
# Take the 30 most frequent entities among subjects and objects; the subject list is displayed.

common_subs = ents_subs.most_common(30)
common_obs = ents_obs.most_common(30)

common_subs

In [None]:
# Check which container type mine_entities returned.
type(ents_subs)

In [None]:
# Put the subject and object counts together.
# NOTE(review): `+`, .most_common() and .items() are all used on these objects, which
# suggests collections.Counter — confirm against utils.mine_entities.
all_ents = ents_subs + ents_obs

# Show all three sizes at once; as separate bare expressions only the last one was displayed.
len(ents_subs), len(ents_obs), len(all_ents)

In [None]:
# Keep entities with at least MIN_ENTITY_MENTIONS mentions (arbitrary number for now).
MIN_ENTITY_MENTIONS = 20

keep_ents = {ent: count for ent, count in all_ents.items() if count >= MIN_ENTITY_MENTIONS}
# Printed explicitly: a bare expression in the middle of a cell is never displayed.
print(len(keep_ents))

# Flag (1/0) whether the subject / object role is one of the retained entities.
dat_sent["top_entity_subs"] = dat_sent['subs_processed'].apply(lambda role: int(role in keep_ents))
dat_sent["top_entity_obs"] = dat_sent['obs_processed'].apply(lambda role: int(role in keep_ents))

# 0 = neither role is a top entity, 1 = exactly one is, 2 = both are.
dat_sent["top_entity"] = dat_sent["top_entity_subs"] + dat_sent["top_entity_obs"]

# .head() has to be called; the bare attribute only shows the bound-method repr.
dat_sent.head()

In [None]:
# ## Save checkpoint: mined entities

#import pickle

#with open('ents_subs.pickle', 'wb') as outputfile:
#    pickle.dump(ents_subs, outputfile)

#with open('ents_obs.pickle', 'wb') as outputfile:
#    pickle.dump(ents_obs, outputfile)

#### Process roles without named entities

In [None]:
# Select the rows whose roles are NOT among the top n entities:
# neither role at all, the subject only, and the object only.
non_ner_dat = dat_sent[dat_sent['top_entity'].lt(1)]
non_ner_subs = dat_sent[dat_sent['top_entity_subs'].lt(1)]
non_ner_obs = dat_sent[dat_sent['top_entity_obs'].lt(1)]

In [None]:
# Rows where at least one role (subject or object) is a top entity.
dat_top_n = dat_sent.loc[dat_sent['top_entity'] > 0]
# Call .head(); the bare attribute only displays the bound-method repr.
dat_top_n.head()

In [None]:
# (rows, columns) of the subset in which neither role is a top entity.
non_ner_dat.shape

In [None]:
# (rows, columns) of the full sentence-level frame, for comparison.
dat_sent.shape

#### Fine-tuning using FastText embeddings


In [None]:
# Advantages: even out-of-vocab words have a representation, and the character-level
# embedding helps with misspelled words.

from pathlib import Path

from gensim.models import FastText
from gensim.models.fasttext import load_facebook_model
from gensim.test.utils import datapath
from gensim import utils
from nltk import word_tokenize
from nltk.tokenize import RegexpTokenizer


# Load the existing model. The path is assembled with pathlib so it resolves on every OS;
# the previous hard-coded backslash string only worked on Windows.
FASTTEXT_MODEL_PATH = Path("..") / "03_data" / "fasttext_model.bin"
model = load_facebook_model(str(FASTTEXT_MODEL_PATH))


# Preprocessing for fasttext embeddings: a second pipeline with the listed components disabled.
# NOTE(review): some spaCy model versions ship a lemmatizer that relies on tok2vec /
# morphologizer output — confirm the lemmas are still sensible with these disabled.
lem_nlp = spacy.load("de_core_news_lg", disable=['tok2vec', 'morphologizer', 'parser', 'attribute_ruler', 'ner'])


# Lemmatize: replace every token by its lemma and re-join with single spaces.
sentences = [' '.join(tok.lemma_ for tok in lem_nlp(text)) for text in split_sentences[1]]

# Lowercase each sentence string.
sentences = [sent.lower() for sent in sentences]
sentences[0]

In [None]:
# `string` was never imported in this notebook, so this cell raised NameError on a fresh kernel.
import string

# remove numbers
numbs = str.maketrans("", "", string.digits)
sentences = [sent.translate(numbs) for sent in sentences]

# remove punctuation
# (string.punctuation is ASCII-only; typographic quotes such as „ “ survive this step
# and are only dropped by the \w+ tokenizer in the next cell)
puncts = str.maketrans("", "", string.punctuation)
sentences = [sent.translate(puncts) for sent in sentences]

# strip leading/trailing whitespace
sentences = [sent.strip() for sent in sentences]

In [None]:
# Tokenize: keep runs of word characters and drop everything in between.
tokenizer = RegexpTokenizer(r'\w+')
sentences_tokenized = list(map(tokenizer.tokenize, sentences))
sentences_tokenized[1]

In [None]:
# Fine-tune training, step 1: update the loaded model's vocabulary with this corpus' tokens.
model.build_vocab(sentences_tokenized, update=True)

# Display both values; as separate bare expressions only the last one was shown.
model.epochs, type(model)

In [None]:
# Logging helper for the fine-tuning run
from gensim.models.callbacks import CallbackAny2Vec

class callback(CallbackAny2Vec):
    """Announce the start of every training epoch and count the finished ones."""

    def __init__(self):
        # Number of the epoch that runs next, starting at zero.
        self.epoch = 0

    def on_epoch_begin(self, model):
        message = "Epoch #{} start".format(self.epoch)
        print(message)

    def on_epoch_end(self, model):
        # gensim's FastText implementation does not currently expose the training loss,
        # so nothing is reported here; only the epoch counter is advanced.
        self.epoch = self.epoch + 1

In [None]:
# Training: continue fitting the loaded model on the tokenized sentences for its
# configured number of epochs, with the epoch logger attached.
model.train(
    sentences_tokenized,
    total_examples=len(sentences_tokenized),
    epochs=model.epochs,
    callbacks=[callback()],
)

In [None]:
# ## Get relevant vectors
# Look up one embedding per processed subject / object, in row order of dat_sent.
subs_roles = dat_sent["subs_processed"]
obs_roles = dat_sent["obs_processed"]
subs_vecs = model.wv[subs_roles]
obs_vecs = model.wv[obs_roles]

In [None]:
# Shape of the stacked subject vectors.
subs_vecs.shape

In [None]:
# Map each processed subject to its own embedding row.
# dict.fromkeys(keys, subs_vecs) attached the ENTIRE matrix to every key; zipping pairs
# each key with the row at the same position instead.
test_dic = dict(zip(dat_sent["subs_processed"], subs_vecs))

subs_vecs[0]

#### KMeans clustering

In [16]:
# load from utils for clustering
from utils import get_vector
from utils import get_vectors
from utils import train_cluster_model
from utils import get_clusters
from utils import label_clusters_most_freq
from utils import label_clusters_most_similar

In [None]:
# Fit the project's cluster model on the subject vectors with 200 clusters.
# NOTE(review): 200 is hard-coded here, while a later cell introduces `n_clust` for the same purpose.
test_km_training = train_cluster_model(subs_vecs, n_clusters = 200, verbose = 1)

In [None]:
# Inspect the fitted cluster centres.
test_km_training.cluster_centers_

In [None]:
# label clusters
from utils import label_clusters_most_similar

In [None]:
# Predict a cluster for every subject vector with the fitted model.
clusts = test_km_training.predict(subs_vecs)

In [None]:
# Label the clusters via the project helper, given the fitted cluster model and the FastText model.
test = label_clusters_most_similar(test_km_training, model)

In [None]:
# Display the cluster labels produced above.
test

### 4. Final Narratives


In [None]:
# all top n narratives:
# NOTE(review): `> 1` keeps only rows where BOTH subject and object are top entities
# (top_entity is the sum of the two 0/1 flags), whereas the earlier dat_top_n cell
# used `> 0`. Confirm which threshold is intended.
dat_top_n = dat_sent.loc[dat_sent['top_entity'] > 1]
# Call .head(); the bare attribute only displays the bound-method repr.
dat_top_n.head()

In [None]:
# Collect, as plain lists, the subject and object roles that are not top entities.
not_top_subs = dat_sent['top_entity_subs'] < 1
not_top_obs = dat_sent['top_entity_obs'] < 1
non_ner_subs = dat_sent.loc[not_top_subs, "subs_processed"].tolist()
non_ner_obs = dat_sent.loc[not_top_obs, "obs_processed"].tolist()

In [None]:
# Pool the non-entity subjects and objects into one list (subjects first, duplicates kept)
# and look up a FastText vector for every entry.
non_ner_roles = non_ner_subs + non_ner_obs
non_ner_vecs = model.wv[non_ner_roles]

In [None]:
# Shape of the stacked non-entity role vectors.
non_ner_vecs.shape

#### KMeans clustering on the non-entity role vectors

In [19]:
# Number of clusters passed to train_cluster_model for the non-entity role vectors.
n_clust = 200

In [None]:
# Fit the cluster model on the non-entity role vectors.
# This rebinds `test_km_training`, replacing the model fitted on the subject vectors earlier.
test_km_training = train_cluster_model(non_ner_vecs, n_clusters = n_clust, verbose = 1)

In [None]:
from utils import label_clusters_most_similar
# Predict a cluster number for every non-entity role vector.
clusts = test_km_training.predict(non_ner_vecs)

In [None]:
# Number of cluster assignments; to be compared with the number of vectors in the next cell.
len(clusts)

In [None]:
# Number of non-entity role vectors.
len(non_ner_vecs)

In [None]:
# One row per non-entity role together with the cluster number predicted for it.
non_ner_df = pd.DataFrame(
    {
        "role": non_ner_roles,
        "clust_nr": clusts.tolist(),
    },
    index=range(len(non_ner_vecs)),
)

In [None]:
# Inspect 20 randomly drawn rows (no random_state is set, so the sample differs between runs).
non_ner_df.sample(20)

In [None]:
# (rows, columns) of the role-to-cluster frame.
non_ner_df.shape

In [None]:
# Label the clusters of the model fitted on the non-entity vectors.
# NOTE(review): the following cells treat the result as a dict keyed by cluster number
# with two values per key (named "role" and "vec_value" there) — confirm against utils.
clust_labs = label_clusters_most_similar(test_km_training, model)

In [None]:
# Turn the label dict into a frame with one row per key (orient="index").
clust_labs_df = pd.DataFrame.from_dict(clust_labs, orient = "index")

In [None]:
# Name the two value columns and expose the index (cluster number) as a regular column.
# Only the first two columns are kept before renaming, so re-running this cell no longer
# fails with a length mismatch once "clust_nr" has been added.
clust_labs_df = clust_labs_df.iloc[:, :2].copy()
clust_labs_df.columns = ["role", "vec_value"]
clust_labs_df["clust_nr"] = clust_labs_df.index
clust_labs_df.head()

In [None]:
# Attach each cluster's label to every role that falls into that cluster.
# Both "role" columns are renamed by name *before* merging, instead of overwriting
# `.columns` positionally afterwards, so the result no longer depends on column order
# or on pandas' automatic _x/_y suffixes.
non_ner_df = non_ner_df.rename(columns={"role": "role_unclust"}).merge(
    clust_labs_df.rename(columns={"role": "role_clust"}),
    on="clust_nr",
    how="left",
)
non_ner_df.head()

In [None]:
# (rows, columns) after attaching the cluster labels, before de-duplication.
non_ner_df.shape

In [None]:
# drop duplicates: keep one row per distinct role, so the left merges on "role_unclust"
# further below cannot multiply the rows of dat_sent
non_ner_df = non_ner_df.drop_duplicates(subset = ["role_unclust"])

In [None]:
# (rows, columns) after de-duplication.
non_ner_df.shape

In [None]:
# Roles that ARE top entities. Subjects are read from the subject column; the previous
# version read "obs_processed" for both, which looks like a copy-paste slip.
top_subs = dat_sent.loc[dat_sent['top_entity_subs'] == 1, "subs_processed"]
top_obs = dat_sent.loc[dat_sent['top_entity_obs'] == 1, "obs_processed"]


In [None]:
# merge into full dat_sent dataframe
dat_sent.shape

In [None]:
# ### merge clustered subs
# Left-join the role-to-cluster lookup onto the subject column, then give the
# joined role columns subject-specific names.
fin_dat = dat_sent.merge(
    non_ner_df,
    how="left",
    left_on="subs_processed",
    right_on="role_unclust",
).rename(columns={"role_clust" : "subs_clust", "role_unclust" : "subs_unclust"})

In [None]:
# ### merge clustered obs
# Second left-join of the same lookup, this time on the object column.
# "clust_nr" and "vec_value" already exist from the subject merge; explicit suffixes label
# the two copies as *_subs / *_obs instead of pandas' default, uninformative *_x / *_y.
fin_dat = fin_dat.merge(
    non_ner_df,
    left_on="obs_processed",
    right_on="role_unclust",
    how="left",
    suffixes=("_subs", "_obs"),
)


fin_dat = fin_dat.rename(columns = {"role_clust" : "obs_clust", "role_unclust" : "obs_unclust"})

In [None]:
# (rows, columns) of the final frame after both merges.
print(fin_dat.shape)

In [None]:
# Display the first 100 rows of the final frame.
fin_dat.head(100)

In [None]:
# save output
# fin_dat.to_csv("final_python_processed.csv")