In [26]:
# Paths to the trained Word Embedding Models
META_PATH_WEM = {
    "1990": "model/1990.model",
    "2000": "model/2000.model",
    "2010": "model/2010.model",
    "CDU": "model/CDU.model",
    "SPD": "model/SPD.model",
    "FDP": "model/FDP.model",
    "GRUENE": "model/GRUENE.model",
    "LINKE": "model/LINKE.model",
    "compass": "model/compass.model",
}
# Provide paths to trained k-Means models or set "train_new_models" to True.
# Set k to an integer of your choice to specify the number of clustered to be generated. Alternatively, choose a decimal number to orient k according to the number of types/embeddings (k=0.024 corresponds to 2.4 % of the types)
KMEANS_PAR = {
    "train_new_models": True,
    "k": 0.024,
    "meta_path": {
        "1990": "PATH/model/1990.model.kmeans.pkl",
        "2000": "PATH/model/2000.model.kmeans.pkl",
        "2010": "PATH/model/2010.model.kmeans.pkl",
        "CDU": "PATH/model/CDU.model.kmeans.pkl",
        "SPD": "PATH/model/SPD.model.kmeans.pkl",
        "FDP": "PATH/model/FDP.model.kmeans.pkl",
        "GRUENE": "PATH/model/GRUENE.model.kmeans.pkl",
        "LINKE": "PATH/model/LINKE.model.kmeans.pkl",
        "compass": "PATH/model/compass.model.kmeans.pkl",
    },
}
# Specify a list of thematic seed words that will be used (and significantly expanded) to determin a thematic centroid as a reference point in the models
T_CENTROID_PAR = {
    "seed_words": [
        "Arbeit",
        "Arbeiten",
        "arbeiten",
        "gearbeitet",
        "arbeitet",
        "Tätigkeit",
        "Tätigkeiten",
        "Lohn",
        "Gehalt",
        "schuften",
        "geschuftet",
    ],
    "t_sim": 0.29,
    "neighbors": 500,
    "cutoff": 0.05,
    "write_to_disc": True,
}

# Path to the vrt file of your CWB corpus
CORPUS_VRT_PATH = "germaparl_decades.vrt"

# in s_attribute_meta you need to map the s_attributes (as specified when importing the vrt file into the CWB) to the metadata used above
# you can, optionally, use shorthands for the metadata when labeling the clusters
# column specifies which column of the vrt file is the token base layer
# if you set keep_layers to True, your CWB corpus will contain both, its old annotations and the cluster annotations
# in ignore_meta you can specify values of s_attributes in your CWB corpus that you don't want to add cluster annotations for (or where you don't have clusters)
# using replace_meta, you can map s_attribute values to other s_attribute values to treat them as if they were the same when annotating the corpus. For example, {"a": "b"} would annotate every word of a text with the s_attribute "a" with the cluster trained on the "b" model.
ANNOTATION_PAR = {
    "s_attribute_meta": {
        "party": [
            "CDU",
            "SPD",
            "FDP",
            "GRUENE",
            "LINKE",
        ],
        "decade": ["1990", "2000", "2010"],
    },
    "shorthand": {"1990": "90", "2000": "00", "2010": "10"},
    "column": 3,
    "keep_layers": False,
    "ignore_meta": {"parteilos", ""},
    "replace_meta": {"CSU": "CDU", "PDS": "LINKE"},
}

As we want to train our models by decade (and by party), we enrich the corpus with this information.

In [5]:
with open("PATH/germaparl.vrt", "r", encoding="latin1") as f:
    vrt = f.readlines()

vrt_out = []
for line in vrt:
    if line.startswith("<year"):
        year = int(line.strip("<>\n").split(" ")[1])
        if year < 2000:
            vrt_out.append("<decade 1990>\n")
        elif year < 2010:
            vrt_out.append("<decade 2000>\n")
        elif year >= 2010:
            vrt_out.append("<decade 2010>\n")
    vrt_out.append(line)
    if line.startswith("</year>"):
        vrt_out.append("</decade>\n")

with open("germaparl_decades.vrt", "w") as f:
    f.write("".join(vrt_out))

We can now train our models, if we don't have any yet.

In [6]:
with open("germaparl_decades.vrt", "r") as f:
    vrt = f.readlines()

meta_sents = {
    "1990": [],
    "2000": [],
    "2010": [],
    "CDU": [],
    "SPD": [],
    "FDP": [],
    "GRUENE": [],
    "LINKE": [],
    "": [],
    "parteilos": [],
    "all": [],
}
# We treat CDU and CSU, PDS and LINKE for the training as if they were the same party
for line in vrt:
    if line.startswith("<party"):
        party = line.strip("<>\n").split(" ")[1]
        if party == "CSU":
            party = "CDU"
        elif party == "fraktionslos":
            party = "parteilos"
        elif party == "PDS":
            party = "LINKE"
        sent = []
    if line.startswith("<decade"):
        decade = line.strip("<>\n").split(" ")[1]
    if "\t" in line:
        sent.append(line.strip().split("\t")[-1])
    if "</year" in line and (sent[0] != "(" and sent[-1] != ")") and len(sent) > 3:
        meta_sents[party].append(sent)
        meta_sents[decade].append(sent)
        meta_sents["all"].append(sent)

A quick and dirty helper function in order to get frequency dictionaries for every subcorpus (decades and parties)

In [7]:
from collections import Counter


def get_freq_dicts():

    meta_dict = {}

    for decade in ["1990", "2000", "2010"]:
        words = [word for sent in meta_sents[decade] for word in sent]
        meta_dict[decade] = Counter(words)

    for party in ["CDU", "SPD", "FDP", "GRUENE", "LINKE", "", "parteilos"]:
        words = [word for sent in meta_sents[party] for word in sent]
        meta_dict[party] = Counter(words)

    words = [word for sent in meta_sents["all"] for word in sent]
    meta_dict["all"] = Counter(words)

    return meta_dict


meta_freqs = get_freq_dicts()

Now, we can train the TWEC models

In [8]:
from twec.twec import TWEC

aligner = TWEC(size=200, min_count=1, window=6, workers=10)

min_occ = 15
path = "model/compass.txt"
with open(path, "w") as f:
    for sentence in meta_sents["all"]:
        sentence = [word for word in sentence if meta_freqs["all"][word] >= min_occ]
        f.write(" ".join(sentence) + "\n")
aligner.train_compass(path, overwrite=True)

for meta, sents in meta_sents.items():
    if meta == "all":
        continue
    else:
        path = f"model/{meta}.txt"
        with open(path, "w") as f:
            for sentence in meta_sents["all"]:
                sentence = [
                    word for word in sentence if meta_freqs[meta][word] >= min_occ
                ]
                f.write(" ".join(sentence) + "\n")
    aligner.train_slice(path, save=True)

Training the compass.
Training temporal embeddings: slice model/1990.txt.
Initializing temporal embeddings from the atemporal compass.
Training temporal embeddings: slice model/2000.txt.
Initializing temporal embeddings from the atemporal compass.
Training temporal embeddings: slice model/2010.txt.
Initializing temporal embeddings from the atemporal compass.
Training temporal embeddings: slice model/CDU.txt.
Initializing temporal embeddings from the atemporal compass.
Training temporal embeddings: slice model/SPD.txt.
Initializing temporal embeddings from the atemporal compass.
Training temporal embeddings: slice model/FDP.txt.
Initializing temporal embeddings from the atemporal compass.
Training temporal embeddings: slice model/GRUENE.txt.
Initializing temporal embeddings from the atemporal compass.
Training temporal embeddings: slice model/LINKE.txt.
Initializing temporal embeddings from the atemporal compass.
Training temporal embeddings: slice model/.txt.
Initializing temporal embe

Now, enter the paths to the trained models in the parameter section above.
[If you would want to use your own models, you would start here.]

In [10]:
from gensim.models import Word2Vec

meta_model = {}
for meta, path in META_PATH_WEM.items():
    meta_model[meta] = Word2Vec.load(path).wv

Let's check if the models have  been loaded correctly:

In [11]:
meta = "SPD"
meta_model[meta].most_similar("Arbeit")

[('Tätigkeit', 0.6047515869140625),
 ('Vorarbeit', 0.5334701538085938),
 ('Arbeiten', 0.49987655878067017),
 ('Ausbildung', 0.4991936981678009),
 ('Gesetzgebungsarbeit', 0.4801711440086365),
 ('Aufklärungsarbeit', 0.4769136309623718),
 ('Beschäftigung', 0.47630828619003296),
 ('Zuarbeit', 0.4755493402481079),
 ('Hilfe', 0.4712493419647217),
 ('Immobilienaufgaben', 0.464999258518219)]

In [12]:
meta = "FDP"
meta_model[meta].most_similar("arbeiten")

[('weiterarbeiten', 0.6369683742523193),
 ('leben', 0.5952897071838379),
 ('gearbeitet', 0.5807672142982483),
 ('mitarbeiten', 0.5708843469619751),
 ('arbeitet', 0.5700968503952026),
 ('mitwirken', 0.5669143795967102),
 ('forschen', 0.5174206495285034),
 ('zusammenarbeiten', 0.5101714730262756),
 ('kämpfen', 0.49115467071533203),
 ('denken', 0.48813557624816895)]

In [13]:
meta = "GRUENE"
meta_model[meta].most_similar("Gehalt")

[('Taschengeld', 0.6789494752883911),
 ('Lohn', 0.6732443571090698),
 ('Erwerbseinkommen', 0.6674096584320068),
 ('Honorar', 0.6258935928344727),
 ('Nettoeinkommen', 0.6141149401664734),
 ('Wohnort', 0.6094042062759399),
 ('Lebensalter', 0.6080386638641357),
 ('Entgelt', 0.6050905585289001),
 ('Fahrzeug', 0.6004776954650879),
 ('Einkommen', 0.597220778465271)]

Your k-Means clusters will be trained or loaded:

In [14]:
import pickle
from lib import train_kMeans

if KMEANS_PAR["train_new_models"]:
    meta_km = train_kMeans(meta_model, META_PATH_WEM, KMEANS_PAR["k"])
else:
    meta_km = {}
    for meta, path in KMEANS_PAR["meta_path"].items():
        with open(path, "rb") as f:
            meta_km[meta] = pickle.load(f)

Starting to cluster model 1990: 14:40:34
Finished clustering model 1990: 14:40:51
Starting to cluster model 2000: 14:40:51
Finished clustering model 2000: 14:41:45
Starting to cluster model 2010: 14:41:45
Finished clustering model 2010: 14:42:29
Starting to cluster model CDU: 14:42:29
Finished clustering model CDU: 14:43:03
Starting to cluster model SPD: 14:43:03
Finished clustering model SPD: 14:43:31
Starting to cluster model FDP: 14:43:31
Finished clustering model FDP: 14:43:40
Starting to cluster model GRUENE: 14:43:40
Finished clustering model GRUENE: 14:43:52
Starting to cluster model LINKE: 14:43:52
Finished clustering model LINKE: 14:44:01
Starting to cluster model compass: 14:44:01
Finished clustering model compass: 14:46:07


Every cluster will be sorted by closeness to the centroid and named after the three most central words:

In [15]:
import pandas as pd
from scipy.spatial import distance

meta_word_cluster = {}
meta_cluster_words = {}
meta_cluster_centroid = {}
meta_cluster_label = {}

for meta, km in meta_km.items():
    model = meta_model[meta]
    try:
        word_cluster = dict(zip(model.key_to_index.keys(), km.labels_))
    except AttributeError:
        word_cluster = dict(zip(model.index2word, km.labels_))
    cluster_centroid = dict(zip(range(len(km.cluster_centers_)), km.cluster_centers_))
    df = pd.DataFrame(word_cluster.items(), columns=["word", "cluster"])
    df["sim"] = [
        1 - distance.cosine(model[word], cluster_centroid[word_cluster[word]])
        for word in df["word"]
    ]
    df = df.sort_values(by=["cluster", "sim"], ascending=[True, False])

    cluster_words = {}

    for i, row in df.iterrows():
        if row["cluster"] in cluster_words:
            cluster_words[row["cluster"]].append(row["word"])
        else:
            cluster_words[row["cluster"]] = [row["word"]]

    cluster_label = {}

    for cluster, words in cluster_words.items():
        cluster_label[cluster] = "|".join(words[:3])

    meta_word_cluster[meta] = word_cluster
    meta_cluster_words[meta] = cluster_words
    meta_cluster_centroid[meta] = cluster_centroid
    meta_cluster_label[meta] = cluster_label

A couple of example clusters:

In [16]:
meta = "CDU"
meta_cluster_words[meta][meta_word_cluster[meta]["arbeiten"]]

['mitwirken',
 'teilzunehmen',
 'mitzuwirken',
 'teilnehmen',
 'gehindert',
 'hindern',
 'mitarbeiten',
 'mitgewirkt',
 'mitgearbeitet',
 'teilhaben',
 'teilgenommen',
 'hindert',
 'gelegen',
 'scheitern',
 'Teilnahme',
 'arbeiten']

In [17]:
meta = "LINKE"
meta_cluster_words[meta][meta_word_cluster[meta]["Reichtum"]]

['Ungleichheit',
 'Ungerechtigkeit',
 'Ungleichheiten',
 'Kälte',
 'Spaltung',
 'Ausgrenzung',
 'Schieflage',
 'Härten',
 'Spannungen',
 'Verwerfungen',
 'Wohnraumförderung',
 'Sicherungssysteme',
 'Ungerechtigkeiten',
 'Sicherungssystemen',
 'Gerechtigkeit',
 'Armut',
 'Massenarbeitslosigkeit',
 'Kluft',
 'Marktwirtschaft',
 'Kinderarmut',
 'Komponente',
 'Altersarmut',
 'Reichtum',
 'Schere',
 'Minderausgabe']

In [18]:
meta = "FDP"
meta_cluster_words[meta][meta_word_cluster[meta]["Geld"]]

['Eigenkapital',
 'Geld',
 'Kapital',
 'Gelder',
 'Investitionen',
 'Kredite',
 'Mittel',
 'Personal',
 'Einkommen',
 'Aufträge',
 'Vermögen',
 'Wachstum']

Here, we calculate the thematic centroid that helps us to identify clusters that are relevant to our discourse:

In [19]:
from lib import get_t_centroid_clusters

meta_t_clusters = {}
meta_t_centroid = {}
meta_t_df = {}

for meta in META_PATH_WEM.keys():
    meta_t_clusters[meta], meta_t_centroid[meta], meta_t_df[meta] = (
        get_t_centroid_clusters(
            meta,
            meta_model,
            meta_word_cluster,
            meta_cluster_words,
            meta_cluster_centroid,
            meta_cluster_label,
            T_CENTROID_PAR["seed_words"],
            T_CENTROID_PAR["t_sim"],
            T_CENTROID_PAR["neighbors"],
            T_CENTROID_PAR["cutoff"],
            ANNOTATION_PAR["shorthand"],
            T_CENTROID_PAR["write_to_disc"],
        )
    )

The seed word schuften is not contained in model 1990!
The seed word geschuftet is not contained in model 1990!
The seed word schuften is not contained in model 2000!
The seed word geschuftet is not contained in model 2000!
The seed word geschuftet is not contained in model 2010!
The seed word schuften is not contained in model CDU!
The seed word geschuftet is not contained in model CDU!
The seed word geschuftet is not contained in model SPD!
The seed word schuften is not contained in model FDP!
The seed word geschuftet is not contained in model FDP!
The seed word schuften is not contained in model GRUENE!
The seed word geschuftet is not contained in model GRUENE!
The seed word geschuftet is not contained in model LINKE!


Now we can take a look at the clusters of the SPD subcorpus that are closest and most distant to the thematic centroid:

In [20]:
meta_t_df["SPD"]

Unnamed: 0,label,cluster,n_words,thematic_similarity
320,SPD_Lebenschancen|Zukunftschancen|Bildungschancen,320,20,0.638956
102,SPD_Finanzmittel|Mittel|Haushaltsmittel,102,15,0.548024
438,SPD_Chance|Möglichkeit|Chancen,438,4,0.547225
613,SPD_gewährleisten|garantieren|sichern,613,12,0.544840
54,SPD_Kompetenzen|Kompetenz|Aufgaben,54,8,0.518926
...,...,...,...,...
191,SPD_Jetzt|Nun|Dann,191,5,-0.233925
284,SPD_Diesen|Den,284,2,-0.236208
512,SPD_Als,512,1,-0.241145
243,SPD_Jedenfalls|Zumindest|Letztendlich,243,144,-0.264882


And in comparison, the clusters of the FDP subcorpus:

In [21]:
meta_t_df["FDP"]

Unnamed: 0,label,cluster,n_words,thematic_similarity
273,FDP_investieren|sparen|ausgeben,273,19,0.597316
98,FDP_realisieren|verwirklichen|organisieren,98,55,0.588040
76,FDP_mitreden|aushalten|überleben,76,96,0.551479
209,FDP_eingeleitet|durchgeführt|geleistet,209,29,0.548262
57,FDP_stärken|sichern|fördern,57,10,0.547577
...,...,...,...,...
21,FDP_worden|wurde,21,2,-0.228741
85,FDP_hatte|hat|habe,85,4,-0.263587
292,FDP_Dieser|Der|Diesen,292,3,-0.263767
64,FDP_Immerhin|Jedenfalls|Letztendlich,64,145,-0.278808


Now, we can annotate every token of our CWB corpus with the generated clusters:

In [27]:
with open(CORPUS_VRT_PATH, "r") as f:
    with open(CORPUS_VRT_PATH.replace(".vrt", "_clusters.vrt"), "w") as f2:
        s_attributes = {}
        while line := f.readline():
            if line.startswith("<") and " " in line:
                attribute = line.strip("<>\n").split(" ")[0]
                value = " ".join(line.strip("<>\n").split(" ")[1:])
                s_attributes[attribute] = (
                    value
                    if value not in ANNOTATION_PAR["replace_meta"]
                    else ANNOTATION_PAR["replace_meta"][value]
                )

            if "\t" in line:
                word = line.strip().split("\t")[ANNOTATION_PAR["column"] - 1]
                annotations = []
                for s_attribute, metadata in ANNOTATION_PAR["s_attribute_meta"].items():
                    for meta in metadata:
                        if meta == s_attributes[s_attribute]:
                            if meta in ANNOTATION_PAR["ignore_meta"]:
                                annotations.append("-")
                                continue
                            label = (
                                meta_cluster_label[meta][meta_word_cluster[meta][word]]
                                if word in meta_word_cluster[meta]
                                else "-"
                            )
                            if label == "-":
                                annotations.append(label)
                                continue
                            annotation = (
                                ANNOTATION_PAR["shorthand"][meta] + "_" + label
                                if meta in ANNOTATION_PAR["shorthand"]
                                else meta + "_" + label
                            )
                            annotations.append(annotation)
                        else:
                            continue
                if ANNOTATION_PAR["keep_layers"]:
                    f2.write(line.rstrip() + "\t" + "\t".join(annotations) + "\n")
                else:
                    f2.write(word + "\t" + "\t".join(annotations) + "\n")
            else:
                f2.write(line)

Finally, we save a JSON file containing a mapping of the metadata/subcorpus to the contained clusters to the words contained in the clusters

In [28]:
import json

with open("output/meta_cluster_words.json", "w") as f:
    json.dump(meta_cluster_words, f)