In [144]:
# Adjust the input paths below to the local setup (absolute, machine-specific).
# datadir is the directory whose files are listed and parsed further down.
datadir = "/home/chris/data/dlina/prose/fontane_output_norm/"
# NOTE(review): metadatadir and excludedir are not referenced anywhere else in
# the visible notebook; presumably reserved for load_aliases / load_excludes.
metadatadir = "/home/chris/data/dlina/prose/entitynames/"
excludedir = "/home/chris/data/dlina/prose/exclude/"

In [113]:
import os
from collections import Counter
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt

In [114]:
# Collect the names of all entries in datadir (one input file per novel).
# os.listdir returns the names in arbitrary order and does not filter by
# extension, so every entry of the directory ends up in the processing loop.
rawfiles = os.listdir(datadir)
rawfiles

['fontane_stine_ohne-paratexte_absaetze-normalisiert.txt.jtf',
 'fontane_effi-briest_ohne-paratexte_absaetze-normalisiert.txt.jtf',
 'fontane_ellernklipp_ohne-paratexte_absaetze-normalisiert.txt.jtf',
 'fontane_der-stechlin_ohne-paratexte_absaetze-normalisiert.txt.jtf',
 'fontane_ladultera_ohne-paratexte_absaetze-normalisiert.txt.jtf',
 'fontane_die-poggenpuhls_ohne-paratexte_absaetze-normalisiert.txt.jtf',
 'fontane_mathilde-moehring_ohne-paratexte_absaetze-normalisiert.txt.jtf',
 'fontane_frau-jenny-treibl_ohne-paratexte_absaetze-normalisiert.txt.jtf']

### 1) Einlesen von Dateien im CoNLL tab-Format, wie es die Kallimachos-Engine ausgibt (https://gitlab2.informatik.uni-wuerzburg.de/kallimachos/KallimachosEngines)

In [163]:
def load_raw(path, n_columns=15, encoding="utf-8"):
    """Read a tab-separated token table into a DataFrame indexed by position ids.

    The first non-blank line is the header. Every line is cut to its first
    ``n_columns`` tab-separated fields (header included, so header and rows
    always agree). The first four columns become the index, the outermost
    index level is then dropped, and the frame is sorted by
    ParagraphId / SentenceId / TokenId.

    Parameters
    ----------
    path : str
        File to read.
    n_columns : int, optional
        Number of leading columns to keep (default 15, the original value).
    encoding : str, optional
        Text encoding used to open the file (default "utf-8").

    Returns
    -------
    pandas.DataFrame
        All values are strings; missing cells are filled with "". Because the
        ids are strings, the index is sorted lexicographically.

    Raises
    ------
    ValueError
        If the file contains no non-blank line (i.e. no header).
    """
    with open(path, "r", encoding=encoding) as infile:
        lines = infile.read().split("\n")
    # Drop blank lines *before* splitting: "".split("\t") is [""], not [],
    # so filtering afterwards would keep them as bogus all-empty rows
    # (a trailing newline alone is enough to create one).
    rows = [line.split("\t")[:n_columns] for line in lines if line.strip()]
    if not rows:
        raise ValueError("no header line found in " + str(path))
    header, records = rows[0], rows[1:]
    df = pd.DataFrame(records, columns=header)
    # fillna only touches the data columns; the index levels are set before.
    df = df.set_index(df.columns[:4].tolist()).fillna("")
    # the outermost of the four index levels is not used downstream
    df.index = df.index.droplevel(0)
    return df.sort_index(level=['ParagraphId', 'SentenceId', 'TokenId'],
                         ascending=[True, True, True])

def load_excludes(path):
    """Placeholder, not implemented yet: ignores ``path`` and returns None."""
    pass

def load_aliases(path):
    """Placeholder, not implemented yet: ignores ``path`` and returns None."""
    pass

### 2) Ausgabe aller Named Entities (aufsummiert nach Häufigkeit) im CSV-Format. Diese Datei dient der manuellen Korrektur der Netzwerkdaten.

### Hier wird die entsprechende Funktion definiert, die später auf jeden Roman angewendet wird. Übergeben werden der DataFrame sowie der Name des Romans, um das Ergebnis zu speichern.

In [192]:
def get_entity_counts(df, name):
    """Write two frequency tables of the entity tokens in ``df`` to ./results.

    ``results/<name>_entitycounts.csv``
        One row per distinct Token with its number of occurrences, most
        frequent first (columns: unnamed row number, NE_name, count).
    ``results/<name>_coref_entitycounts.csv``
        One row per (CorefId, Token) pair that occurs in ``df`` with its
        number of occurrences (columns: CorefId, NE_name, counts), ordered by
        CorefId and, within a CorefId, by descending count.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns "Token" and "CorefId".
    name : str
        Prefix of the output file names.

    Returns
    -------
    None
    """
    # the output directory is not created anywhere else
    os.makedirs("results", exist_ok=True)

    # overall token frequencies, most common first
    ne_counts = pd.DataFrame(Counter(df["Token"].tolist()).most_common(),
                             columns=["NE_name", "count"])
    ne_counts.to_csv(os.path.join("results", name+"_entitycounts.csv"))

    # Frequencies per coreference chain. Counting with an explicit groupby
    # over both keys yields integer counts for exactly the pairs that occur,
    # instead of depending on how pandas expands a Counter returned from
    # groupby.apply.
    coref_counts = (df.groupby(["CorefId", "Token"])
                      .size()
                      .reset_index(name="counts")
                      .rename(columns={"Token": "NE_name"})
                      .sort_values(["CorefId", "counts", "NE_name"],
                                   ascending=[True, False, True]))
    coref_counts.to_csv(os.path.join("results", "_".join([name, "coref_entitycounts.csv"])),
                        index=False)
    
def get_entity_mapper(df):
    """Map every CorefId to a representative token and a display label.

    For each CorefId the most frequent Token is chosen. The returned frame is
    indexed by CorefId and has the columns "Token" (that token) and
    "nodename" ("<CorefId>-<Token>").
    """
    def most_frequent(tokens):
        # most_common(1) -> [(token, count)]
        return Counter(tokens).most_common(1)[0][0]

    mapper = (df.groupby('CorefId')['Token']
                .apply(most_frequent)
                .to_frame()
                .reset_index())
    mapper['nodename'] = mapper.apply(
        lambda row: row['CorefId'] + "-" + row['Token'], axis=1)
    return mapper.set_index("CorefId")

### 3) Ausgabe von Relationsliste (d.i. Netzwerkdaten resp. Netzwerke) der Form „Akteur 1 – Akteur 2 – Häufigkeit“ als tsv-Datei, wobei für diese Relationslisten verschiedene Parametrisierungen vorgenommen werden können: 
* A) Akteure (also NEs) in einem Satz (abfragbar über sentenceID)
* B) Akteure (also NEs) in einem Absatz (abfragbar über paragraphID)
* C) Akteure (also NEs) in einem Absatz advanced, d.h. es werden nur Absätze mit mindestens einer bestimmten Anzahl von Wörtern (z.B. 500 oder 1000) berücksichtigt; wenn ein Absatz kürzer ist, wird der folgende dazugezählt. Die Anzahl der Wörter sollte einstellbar sein.
* D) Akteure (also NEs) in einem Kapitel (abfragbar über sectionID)

In [202]:
def create_network(df, name, aggregation, mapper, parasize, paralimit):
    """Build the entity co-occurrence graph for the chosen aggregation level.

    Parameters
    ----------
    df : pandas.DataFrame
        Entity tokens; passed through to the builder function.
    name, mapper
        Passed through to the builder function.
    aggregation : str
        "ParagraphId" or "SentenceId". "SectionId" is recognised but not
        implemented.
    parasize, paralimit
        Only used for "ParagraphId"; passed to create_network_by_paragraph.

    Returns
    -------
    The graph returned by the selected builder.

    Raises
    ------
    NotImplementedError
        For aggregation == "SectionId".
    ValueError
        For any other unknown aggregation level.
    """
    if aggregation == "ParagraphId":
        return create_network_by_paragraph(df, name, aggregation, mapper, parasize, paralimit)
    if aggregation == "SentenceId":
        return create_network_by_sentence(df, name, aggregation, mapper)
    # Fail with a clear message instead of an UnboundLocalError on the result.
    if aggregation == "SectionId":
        raise NotImplementedError("aggregation by SectionId is not implemented")
    raise ValueError(
        "aggregation must be one of 'SectionId', 'ParagraphId', 'SentenceId', got %r"
        % (aggregation,))
    
    
def create_network_by_paragraph(df, name, aggregation, mapper, parasize, paralimit):
    """Build an entity co-occurrence graph over windows of merged paragraphs.

    Segments (groups of rows sharing the ``aggregation`` index level) that
    contain more than one entity mention are visited in order and merged into
    a window until the window's accumulated size reaches ``paralimit``. Every
    completed window links all entities (CorefId values) it contains. A final
    window that stays below ``paralimit`` is still emitted so that no
    mentions are silently dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Entity tokens with a "CorefId" column and an index level named
        ``aggregation``.
    name, mapper
        Unused; kept for a uniform signature.
    aggregation : str
        Name of the index level to group by.
    parasize : pandas.Series
        Size of each segment, looked up by segment id.
    paralimit : number
        Minimum accumulated size of a window.

    Returns
    -------
    networkx.Graph
        Nodes are CorefId values; the edge attribute "weight" is the number
        of windows two entities share.
    """
    # count the entity mentions per segment and keep the segments with more
    # than one mention
    ne_occurrences = df.groupby(aggregation)["CorefId"].count()
    cooc_ids = ne_occurrences[ne_occurrences > 1].index
    # Select those segments by index level *name*; a positional slice breaks
    # as soon as the number or order of index levels changes.
    in_cooc_segment = df.index.get_level_values(aggregation).isin(cooc_ids)
    segments = (df.loc[in_cooc_segment, "CorefId"]
                  .groupby(level=aggregation)
                  .apply(list))

    # merge consecutive segments into windows of at least `paralimit` size
    windows = []  # (id of the segment closing the window, entity mentions)
    pending_size = 0
    pending_entities = []
    for seg_id, entities in segments.items():
        pending_size += parasize.loc[seg_id]
        pending_entities.extend(entities)
        if pending_size >= paralimit:
            windows.append((seg_id, pending_entities))
            pending_size = 0
            pending_entities = []
    if pending_entities:
        # leftover shorter than `paralimit`
        windows.append((seg_id, pending_entities))

    # bipartite graph: window nodes on one side, entity nodes on the other
    B = nx.Graph()
    for seg_id, entities in windows:
        # Tuples keep window nodes apart from entity nodes even when a
        # segment id and a CorefId are the same string.
        window_node = ("segment", str(seg_id))
        B.add_node(window_node, bipartite=0)
        for entity in entities:
            B.add_node(entity, bipartite=1)
            B.add_edge(window_node, entity)

    entity_nodes = set(n for n, d in B.nodes(data=True) if d['bipartite'] == 1)
    # project the bipartite graph onto the entities: weight = shared windows
    G = nx.bipartite.weighted_projected_graph(B, entity_nodes)
    return G
    
def create_network_by_sentence(df, name, aggregation, mapper):
    """Build an entity co-occurrence graph with one segment per group.

    Rows are grouped by the ``aggregation`` index level; every group with more
    than one entity mention links all entities (CorefId values) it contains.

    Parameters
    ----------
    df : pandas.DataFrame
        Entity tokens with a "CorefId" column and an index level named
        ``aggregation``.
    name, mapper
        Unused; kept for a uniform signature.
    aggregation : str
        Name of the index level to group by.

    Returns
    -------
    networkx.Graph
        Nodes are CorefId values; the edge attribute "weight" is the number
        of segments two entities share.
    """
    # count the entity mentions per segment and keep the segments with more
    # than one mention
    ne_occurrences = df.groupby(aggregation)["CorefId"].count()
    cooc_ids = ne_occurrences[ne_occurrences > 1].index
    # Select those segments by index level *name*; a positional slice breaks
    # as soon as the number or order of index levels changes.
    in_cooc_segment = df.index.get_level_values(aggregation).isin(cooc_ids)
    segments = (df.loc[in_cooc_segment, "CorefId"]
                  .groupby(level=aggregation)
                  .apply(list))

    # bipartite graph: segment nodes on one side, entity nodes on the other
    B = nx.Graph()
    for seg_id, entities in segments.items():
        # Tuples keep segment nodes apart from entity nodes even when a
        # segment id and a CorefId are the same string.
        segment_node = ("segment", str(seg_id))
        B.add_node(segment_node, bipartite=0)
        for entity in entities:
            B.add_node(entity, bipartite=1)
            B.add_edge(segment_node, entity)

    entity_nodes = set(n for n, d in B.nodes(data=True) if d['bipartite'] == 1)
    # project the bipartite graph onto the entities: weight = shared segments
    G = nx.bipartite.weighted_projected_graph(B, entity_nodes)
    return G
    
def get_index_slice(ne_cooc_inds, aggregation):
    """Return a three-part ``pd.IndexSlice`` selecting ``ne_cooc_inds``.

    The ids are placed at the position belonging to ``aggregation``
    (SectionId: first, ParagraphId: second, SentenceId: third); the other two
    positions select everything.

    NOTE(review): the positions presume an index whose first three levels are
    SectionId, ParagraphId, SentenceId. load_raw drops the outermost index
    level, which would shift these positions by one - verify against the
    frame this slice is applied to.

    Raises
    ------
    ValueError
        If ``aggregation`` is none of the three supported level names.
    """
    if aggregation == "SectionId":
        return pd.IndexSlice[ne_cooc_inds, :, :]
    elif aggregation == "ParagraphId":
        return pd.IndexSlice[:, ne_cooc_inds, :]
    elif aggregation == "SentenceId":
        return pd.IndexSlice[:, :, ne_cooc_inds]
    # previously fell through to an UnboundLocalError on the return value
    raise ValueError(
        "aggregation must be one of 'SectionId', 'ParagraphId', 'SentenceId', got %r"
        % (aggregation,))

### 4) Ausgabe von Netzwerkmetriken, wie sie z.B. vom Tool dramavis ausgegeben werden, für die unter 3) genannten Netzwerke. 

In [156]:
def get_character_metrics(G):
    """Compute per-node centrality measures of ``G``.

    Returns a DataFrame indexed by node (index name "name") with the columns
    betweenness, degree, closeness, closeness_corrected (closeness computed
    inside the node's own connected component), strength (weighted degree,
    edge attribute "weight"), eigenvector_centrality, avg_distance
    (1/closeness) and avg_distance_corrected (1/closeness_corrected).

    eigenvector_centrality is best effort: if networkx cannot compute it
    (e.g. no convergence within 500 iterations) the column is left at 0.
    """
    nodes = list(G.nodes())

    # closeness restricted to each connected component; built from
    # connected_components + subgraph because connected_component_subgraphs
    # is not available in current networkx releases
    closeness_corrected = {}
    for component in nx.connected_components(G):
        closeness_corrected.update(nx.closeness_centrality(G.subgraph(component)))

    try:
        eigenvector = nx.eigenvector_centrality(G, max_iter=500)
    except nx.NetworkXException:
        eigenvector = {}

    # dict(...) accepts both the dict and the view returned by nx.degree,
    # depending on the networkx version
    metrics = [
        ('betweenness', nx.betweenness_centrality(G)),
        ('degree', dict(nx.degree(G))),
        ('closeness', nx.closeness_centrality(G)),
        ('closeness_corrected', closeness_corrected),
        ('strength', dict(nx.degree(G, weight="weight"))),
        ('eigenvector_centrality', eigenvector),
    ]
    centralities = pd.DataFrame(index=pd.Index(nodes, name="name"))
    for column, values in metrics:
        # whole columns at once; nodes without a value keep the default 0
        centralities[column] = [values.get(node, 0) for node in nodes]

    centralities['avg_distance'] = 1/centralities['closeness']
    centralities['avg_distance_corrected'] = 1/centralities['closeness_corrected']
    return centralities

In [157]:
def get_graph_metrics(G):
    """Compute graph-level metrics of ``G``.

    Returns a DataFrame with the columns charcount, edgecount, maxdegree,
    avgdegree, density, avgpathlength, clustering_coefficient,
    connected_components and component_sizes. component_sizes holds one value
    per connected component, so the frame has one row per component and the
    scalar metrics are repeated on every row (a graph without nodes therefore
    yields an empty frame). Metrics that cannot be computed are reported as
    the string "NaN".
    """
    # dict(...) accepts both the dict and the view returned by G.degree(),
    # depending on the networkx version
    degrees = dict(G.degree())

    values = {}
    values["charcount"] = len(G.nodes())
    values["edgecount"] = len(G.edges())
    if degrees:
        values["maxdegree"] = max(degrees.values())
        values["avgdegree"] = sum(degrees.values())/len(degrees)
    else:
        # graph without nodes
        values["maxdegree"] = "NaN"
        values["avgdegree"] = "NaN"

    try:
        values["density"] = nx.density(G)
    except (ZeroDivisionError, nx.NetworkXException):
        values["density"] = "NaN"

    try:
        values["avgpathlength"] = nx.average_shortest_path_length(G)
    except nx.NetworkXException:
        # disconnected graph, or graph without nodes
        values["avgpathlength"] = "NaN"

    try:
        values["clustering_coefficient"] = nx.average_clustering(G)
    except (ZeroDivisionError, nx.NetworkXException):
        values["clustering_coefficient"] = "NaN"

    values["connected_components"] = nx.number_connected_components(G)
    values["component_sizes"] = [len(c) for c in nx.connected_components(G)]
    graph_metrics = pd.DataFrame.from_dict(values)
    return graph_metrics

In [None]:
def plotGraph(G, mapper, figsize=(16, 16), filename=None):
    """
    Plots an individual graph, node size by degree centrality,
    edge size by edge weight.

    Node labels are taken from the "nodename" column of ``mapper`` and are
    only drawn for graphs with fewer than 1000 nodes. The figure is saved to
    ``filename`` if one is given and shown otherwise; all open figures are
    closed afterwards.
    """
    labels = mapper.to_dict()['nodename']

    try:
        centrality = nx.degree_centrality(G)
        nodesize = [v * 250 for v in centrality.values()]
    except ZeroDivisionError:
        # for networks with only one node
        nodesize = [250 for _ in G.nodes()]

    pos = nx.spring_layout(G)

    plt.figure(figsize=figsize)
    try:
        plt.subplots_adjust(left=0, right=1, bottom=0, top=0.95,
                            wspace=0.01, hspace=0.01)

        # nodes
        nx.draw_networkx_nodes(G, pos,
                               nodelist=list(G.nodes()),
                               node_color="steelblue",
                               node_size=nodesize,
                               alpha=0.8)

        # edges: fall back to a uniform width if an edge has no weight
        try:
            weights = [G[u][v]['weight'] for u, v in G.edges()]
        except KeyError:
            weights = [1 for _ in G.edges()]
        # with_labels is an option of nx.draw_networkx, not of
        # draw_networkx_edges, so it is not passed here
        nx.draw_networkx_edges(G, pos,
                               edge_color="grey",
                               width=weights)

        # labels, restricted to nodes that are part of this graph
        labels = {k: v for k, v in labels.items() if k in G}
        if G.order() < 1000:
            nx.draw_networkx_labels(G, pos, labels)

        if filename is not None:
            plt.savefig(filename)
        else:
            plt.show()
    finally:
        # release the figure even if drawing or saving failed
        plt.close("all")

In [213]:
######
# set aggregation level here, SectionId, ParagraphId, SentenceId
# SectionId currently not implemented due to data not available
aggregation = "SentenceId"
paralimit = 1 # minimum of token count for paragraph concatenation
selector = "CorefId" # either NamedEntity or CorefId
######
if selector not in ("NamedEntity", "CorefId"):
    # otherwise every token would silently be treated as an entity
    raise ValueError("selector must be 'NamedEntity' or 'CorefId', got %r" % (selector,))
# every output below is written to ./results
os.makedirs("results", exist_ok=True)

for f in rawfiles:
    # "<author>_<title>" taken from the file name
    name = "_".join(f.split("_")[:2])
    try:
        df = load_raw(os.path.join(datadir, f))
    except Exception as e:
        print(": ".join([name, str(e)]))
        # skip this file; going on would reuse the previous file's df
        # (or fail with a NameError on the first file)
        continue
    # token count per paragraph, taken before the entity filter
    parasize = df.groupby("ParagraphId").count()["Token"]
    # we filter the token dataframe to include only named entity tokens
    if selector == "NamedEntity":
        df = df[df["NamedEntity"].map(lambda x: x.endswith("PER_CORE"))]
    if selector == "CorefId":
        df = df[df["CorefId"].map(lambda x: x not in ["-0", "-"])]
    # apply step 2): entity frequency tables
    get_entity_counts(df, name)
    mapper = get_entity_mapper(df)
    # apply step 3): co-occurrence network
    G = create_network(df, name, aggregation, mapper, parasize, paralimit=paralimit)
    # export graph for re-use
    nx.write_gml(G, os.path.join("results", name+".gml"))
    nx.write_edgelist(G, os.path.join("results", name+"_edgelist.csv"),
                      delimiter=";", data=["weight"])
    # compute and store step 4): node-level and graph-level metrics
    char_metrics = get_character_metrics(G)
    graph_metrics = get_graph_metrics(G)
    char_metrics.to_csv(os.path.join("results",
                                     "_".join([name,
                                               aggregation,
                                               "char_metrics.csv"])))
    graph_metrics.to_csv(os.path.join("results",
                                     "_".join([name,
                                               aggregation,
                                               "graph_metrics.csv"])))
    plotGraph(G, mapper, filename=os.path.join("results",
                                       "_".join([name,
                                                 aggregation+".svg"])))