# Results Preparation

In [1]:
import copy
import os
from collections import Counter

import pandas as pd
from tqdm import tqdm

from parts import collect, cleaning, oie, domain_relevance

## Collecting
Posts are collected either from stored txt files or from a topic link in the motor forum.

In [2]:
# load post files
def load_files(limit, path="resources/txt/car/"):
    """Read the numbered post files ``0.txt`` .. ``<limit - 1>.txt`` from a folder.

    Parameters
    ----------
    limit : int
        Exclusive upper bound of the file numbers that are tried.
    path : str, optional
        Folder holding the ``<number>.txt`` files. Defaults to the folder that
        was previously hard-coded.

    Returns
    -------
    list of str
        Contents of every file that exists, in ascending file-number order.
        Numbers without a file are skipped, so the list may be shorter than
        ``limit``.
    """
    corpus = []
    for number in range(limit):
        # os.path.join copes with ``path`` given with or without a trailing slash
        file_name = os.path.join(path, str(number) + ".txt")
        try:
            with open(file_name, "r", encoding="utf-8") as file:
                corpus.append(file.read())
        except FileNotFoundError:
            # gaps in the numbering are expected; just move on to the next number
            continue
    return corpus

In [3]:
# Try post files 0.txt .. 1499.txt; numbers without a file are skipped, so the
# corpus can hold fewer than 1500 posts (the progress bars below show 1190).
corpus = load_files(1500)

## OIE
extract terms and root words from each sentence

In [4]:
# Pre-cleaning pass over every post. Per the original note this drops questions
# and short sentences; what the second argument (1) controls is defined in
# parts.cleaning and is not visible in this notebook.
corpus_no_questions = [cleaning.sentences(post, 1) for post in tqdm(corpus)]

100%|██████████████████████████████████████████████████████████████████████████████| 1190/1190 [40:44<00:00,  2.05s/it]


In [5]:
# Run the OIE step on each cleaned post and pool the per-post roots and terms
# into two flat lists (later cells zip these two lists together).
roots = []
terms = []

for cleaned_post in tqdm(corpus_no_questions):
    extracted_roots, extracted_terms, _unused = oie.get_oie(cleaned_post)
    roots += extracted_roots
    terms += extracted_terms

100%|██████████████████████████████████████████████████████████████████████████████| 1190/1190 [04:26<00:00,  4.47it/s]


In [6]:
# Keep independent snapshots of the raw OIE output. A plain assignment would
# only create a second name for the same list objects, so any in-place change
# to `roots` / `terms` would silently alter the "backup" as well.
backup_roots = copy.deepcopy(roots)
backup_terms = copy.deepcopy(terms)

In [7]:
# post cleaning of terms
# The summary shown below this cell (counts of removed dates, links,
# abbreviations, ...) presumably comes from cleaning.terms itself — TODO confirm.
clean_terms = cleaning.terms(terms)

deleted time references: 0
deleted date references: 250
deleted links: 764
deleted quotes: 0
deleted ireg expressions: 0
deleted abbreviations: 1347


## Domain Relevancy
narrow down terms to domain relevant terms

In [8]:
# Read the domain concept list: one entry per line, surrounding whitespace removed.
# NOTE(review): no encoding is passed, so the platform default applies — confirm
# that it matches how audi_concepts.txt was written.
with open("audi_concepts.txt", "r") as concept_file:
    concepts = [line.strip() for line in concept_file]

In [9]:
# Read the German stop-word list: one entry per line, surrounding whitespace removed.
# NOTE(review): no encoding is passed, so the platform default applies — confirm
# that it matches how stopwords_ger.txt was written.
with open("resources/stopwords_ger.txt", "r") as stop_word_file:
    stop_words = [line.strip() for line in stop_word_file]

In [10]:
# get concepts from list and ignoring stopwords
# `concepts` and `stop_words` are lists, so testing each term against them
# directly means a full list scan per term. Build lookup sets once instead; the
# resulting per-sentence sets are the same.
concept_lookup = set(concepts)
stop_word_lookup = set(stop_words)

# one set of domain-relevant terms per entry of clean_terms (may be empty)
domain_concepts = [
    {term for term in sent if term in concept_lookup and term not in stop_word_lookup}
    for sent in clean_terms
]

In [11]:
# Preview table: one row per entry of `roots`, next to the comma-separated
# domain concepts found at the same position.
terms2 = [", ".join(concept_set) for concept_set in domain_concepts]

df = pd.DataFrame(roots, columns=["roots"])
df["terms"] = pd.DataFrame(terms2)

df.head()

Unnamed: 0,roots,terms
0,lg,
1,hab,teile
2,rückleuchtenband,"heckklappengriff, avant, c4, rückleuchtenband"
3,können,
4,können,rückleuchtenband


In [12]:
#delete empty sets
# Filter from `backup_roots` (the unfiltered roots saved earlier) rather than
# from `roots`: this cell rebinds `roots` at the end, so zipping the already
# filtered `roots` with the unfiltered `domain_concepts` on a second run would
# pair roots with the wrong term sets.
roots_2 = []
terms_2 = []
for root, sent in zip(backup_roots, domain_concepts):
    if sent:
        roots_2.append(root)
        terms_2.append(sent)
# from here on `roots` and `terms` only contain entries with at least one domain concept
roots = roots_2
terms = terms_2

# Association Rule Mining

## Search all relations for most common words

In [None]:
def get_term_relations(terms, roots, window_size=1):
    """Collect, for every pair of distinct terms sharing a sentence, the roots seen with it.

    Parameters
    ----------
    terms : iterable of iterables of str
        Terms per sentence.
    roots : iterable of str
        Root per sentence, paired positionally with ``terms``.
    window_size : int, optional
        Accepted but not used by this function.

    Returns
    -------
    dict
        Maps the key ``"term_x,term_y"`` to the list of roots of the sentences
        in which both terms occur. A pair is stored under a single key only:
        the orientation in which it was encountered first.
    """
    roots_by_pair = {}

    for sentence_root, sentence_terms in zip(roots, terms):
        for first in sentence_terms:
            for second in sentence_terms:
                forward_key = first + "," + second
                backward_key = second + "," + first
                if first == second:
                    continue
                if backward_key in roots_by_pair:
                    # pair is already tracked under the opposite orientation
                    continue
                roots_by_pair.setdefault(forward_key, []).append(sentence_root)

    return roots_by_pair

In [None]:
# Map every co-occurring term pair ("term_x,term_y") to the list of roots it appeared with.
cooccurrence = get_term_relations(terms, roots)

In [None]:
# For every term pair keep only the root that was recorded most often for it.
from collections import Counter

relations = {
    pair_key: Counter(pair_roots).most_common(1)[0][0]
    for pair_key, pair_roots in cooccurrence.items()
}

In [None]:
# Flatten the per-sentence concept sets into one list and count each concept.
flat_terms = []
for concept_set in domain_concepts:
    flat_terms.extend(concept_set)
df = pd.Series(Counter(flat_terms))

In [None]:
# Show the ten largest values of df, highest first.
df.sort_values(ascending= False).head(10)

In [None]:
# (term, count) pairs for the ten most frequent entries of flat_terms.
most_common_terms = Counter(flat_terms).most_common(10)

In [None]:
# Pair up the most frequent terms. Where a relation is stored for the pair, in
# either key orientation, record both terms and the relation at the same
# position of the three lists below.
common_term_x = []
common_term_y = []
common_relation = []

for entry_a in most_common_terms:
    for entry_b in most_common_terms:
        name_a, name_b = entry_a[0], entry_b[0]
        forward_key = name_a + "," + name_b
        backward_key = name_b + "," + name_a
        if forward_key in relations:
            found_relation = relations[forward_key]
        elif backward_key in relations:
            found_relation = relations[backward_key]
        else:
            continue
        common_term_x.append(name_a)
        common_term_y.append(name_b)
        common_relation.append(found_relation)

## Search most common relation for seed words

In [35]:
# Sliding-window search for a single seed word. Each step adds the current
# sentence to the window; whenever the seed word occurs anywhere in the window,
# a "root,term" string is recorded for every other term of every sentence in
# the window. A sentence is therefore recorded again on each step for which the
# seed stays in view.
# Note: this cell rebinds the notebook-level name flat_terms.
seed_word = "vergleich"
window_size = 2

window_terms, window_roots, seed_relations = [], [], []

for current_root, current_sent in zip(roots, terms):
    window_terms.append(current_sent)
    window_roots.append(current_root)
    flat_terms = [candidate for members in window_terms for candidate in members]
    if seed_word in flat_terms:
        for win_root, win_sent in zip(window_roots, window_terms):
            for candidate in win_sent:
                # skip the seed itself and terms identical to the sentence root
                if candidate != seed_word and candidate != win_root:
                    seed_relations.append(win_root + "," + candidate)
    if len(window_roots) > window_size:
        # window is full: forget the oldest sentence
        del window_terms[0]
        del window_roots[0]

In [36]:
# Ten most frequent "root,term" strings collected for the seed word.
Counter(seed_relations).most_common(10)

[]

In [13]:
def get_seed_relation(terms, roots, seed_word, window_size = 1, limit = 3):
    """Count "root,term" relations that occur in a sliding window around a seed word.

    The sentences are walked in order. The window holds the current sentence
    plus up to ``window_size`` preceding ones. Whenever ``seed_word`` is among
    the terms in the window, every term of every window sentence is counted
    together with that sentence's root, except the seed word itself and terms
    equal to the root. A sentence is counted again on each step for which the
    seed stays in view.

    Parameters
    ----------
    terms : iterable of iterables of str
        Terms per sentence.
    roots : iterable of str
        Root per sentence, paired positionally with ``terms``.
    seed_word : str
        Term whose surroundings are searched.
    window_size : int, optional
        Number of preceding sentences kept next to the current one.
    limit : int, optional
        Maximum number of relations returned.

    Returns
    -------
    list of (str, int)
        The ``limit`` most frequent ``"root,term"`` keys with their counts,
        as produced by ``Counter.most_common``.
    """
    window = []        # (root, sentence terms) pairs currently in view
    tally = Counter()

    for pair in zip(roots, terms):
        window.append(pair)
        terms_in_view = [candidate for _, members in window for candidate in members]
        if seed_word in terms_in_view:
            for win_root, members in window:
                for candidate in members:
                    if candidate != seed_word and candidate != win_root:
                        tally[win_root + "," + candidate] += 1
        if len(window) > window_size:
            # window is full: forget the oldest sentence
            del window[0]

    return tally.most_common(limit)

In [14]:
# Rebuild the flat concept list from domain_concepts; the seed-word cell above
# reuses the name flat_terms for the contents of its sliding window.
flat_terms = [item for sublist in domain_concepts for item in sublist]

In [15]:
# (term, count) pairs for the 100 most frequent terms; the next cell uses each term as a seed word.
most_common_terms = Counter(flat_terms).most_common(100)

In [16]:
# Settings for the seed-word search. Window size and limit appear to correspond
# to the "w3-l5" part of the exported CSV file name.
SEED_WINDOW_SIZE = 3           # window_size passed to get_seed_relation
SEED_RELATION_LIMIT = 5        # at most this many relations per seed term
MIN_RELATION_COUNT = 3         # a relation must occur more often than this
IGNORED_ROOTS = {"schreiben"}  # relations with these roots are dropped

# Three parallel lists: seed term, related term and the root connecting them.
seed_term_x = []
seed_term_y = []
seed_relation = []

for seed_term, _seed_count in most_common_terms:
    seed_relations = get_seed_relation(
        terms, roots, seed_term, SEED_WINDOW_SIZE, SEED_RELATION_LIMIT
    )
    for relation_key, relation_count in seed_relations:
        if relation_count > MIN_RELATION_COUNT:
            # Keys have the form "root,term". Split on the first comma only, so
            # that a term which itself contains a comma (e.g. a decimal such as
            # "1,8") does not break the two-value unpacking.
            relation_root, relation_term_y = relation_key.split(",", 1)
            if relation_root not in IGNORED_ROOTS:
                seed_term_x.append(seed_term)
                seed_term_y.append(relation_term_y)
                seed_relation.append(relation_root)
        else:
            print("min relation number not reached")

## Add GermaNet Information

In [2]:
from germanetpy.germanet import Germanet

# Root folder of the local GermaNet installation. It can be overridden with the
# GERMANET_PATH environment variable so the notebook is not tied to a single
# machine; the previously hard-coded location remains the default.
path = os.environ.get("GERMANET_PATH", "D:/Users/STUD_Mann.AIS/GermaNet")

data_path = path + "/GN_V150/GN_V150_XML"
# not referenced by any of the cells visible in this notebook
frequencylist_nouns = path + "/GN_V150/FreqLists/noun_freqs_decow14_16.txt"
germanet = Germanet(data_path)

Load GermaNet data...: 100%|██████████████████████████████████████████▉| 99.99999999999996/100 [00:24<00:00,  4.13it/s]
Load Wictionary data...: 100%|████████████████████████████████████████████████████| 100.0/100 [00:00<00:00, 413.24it/s]
Load Ili records...: 100%|███████████████████████████████████████████████████████| 100.0/100 [00:00<00:00, 9966.74it/s]


In [4]:
# Sanity check: look up the synsets for the orthographic form "Auto".
germanet.get_synsets_by_orthform("Auto")

[Synset(id=s8813, lexunits=Automobil, Auto, Kraftfahrzeug, Wagen, Kraftwagen, Motorwagen, Motorfahrzeug, Kfz)]

In [18]:
def get_germanetinfo(termlist):
    """Look up each term in GermaNet and collect its paths to the root synset.

    Before the lookup the case of the first character of every term is swapped
    (a lower-case term is therefore looked up in capitalised form).

    Parameters
    ----------
    termlist : iterable of str
        Terms to look up.

    Returns
    -------
    list
        One entry per input term, in input order: a list holding the result of
        ``synset.shortest_path(root)`` for every synset found, or ``None`` when
        the term is empty or GermaNet returns no synset for it.
    """
    # root of the hierarchy (Entität); fetched once instead of once per synset
    root_synset = germanet.get_synset_by_id("s51001")

    germanetinfo = []
    for term in termlist:
        if not term:
            # term[0] below would raise IndexError on an empty string
            germanetinfo.append(None)
            continue
        orthform = term[0].swapcase() + term[1:]
        # a single lookup per term; the result is reused for the check and the loop
        synsets = germanet.get_synsets_by_orthform(orthform)
        if synsets:
            # shortest hierarchical path from each synset to the root
            germanetinfo.append([synset.shortest_path(root_synset) for synset in synsets])
        else:
            germanetinfo.append(None)

    return germanetinfo

In [19]:
# GermaNet paths for both ends of every relation; None marks a term without a synset.
seed_term_x_germanetinfo = get_germanetinfo(seed_term_x)
seed_term_y_germanetinfo = get_germanetinfo(seed_term_y)

In [20]:
# Number of x-side terms with GermaNet information, then the total number of x-side terms.
print(sum(x is not None for x in seed_term_x_germanetinfo))
print(len(seed_term_x_germanetinfo))

303
443


In [21]:
# Coverage summary over both sides of the relations: how many terms received
# GermaNet information, out of how many terms in total.
len_x = len(seed_term_x_germanetinfo)
len_y = len(seed_term_y_germanetinfo)
sum_x = len([info for info in seed_term_x_germanetinfo if info is not None])
sum_y = len([info for info in seed_term_y_germanetinfo if info is not None])

print("found ", sum_x+sum_y, "out of ", len_x+len_y, "terms")

found  645 out of  886 terms


# Visualisation

In [None]:
import networkx as nx 
import matplotlib.pyplot as plt 
%matplotlib inline

G = nx.Graph()
nodeA = seed_term_x
nodeB = seed_term_y
relation = seed_relation

for a, b in zip(nodeA, nodeB):
    G.add_edge(a, b)

pos = nx.spring_layout(G, k=0.15, iterations=20)
plt.figure(figsize=(12,12))
nx.draw(G,pos,width=1,linewidths=1,node_size=500,alpha=0.9,labels={node:node for node in G.nodes()})
labels = dict(zip(list(zip(nodeA, nodeB)),relation))
nx.draw_networkx_edge_labels(G,pos,edge_labels=labels,font_color='red')

plt.axis('off')
plt.show()

## Export

In [22]:
# Export table: one row per relation. All five lists are filled in lock-step
# (one entry per accepted relation), so they are passed to a single constructor
# call instead of being attached column by column through index alignment.
df = pd.DataFrame(
    {
        "Node A": seed_term_x,
        "Node B": seed_term_y,
        "Root": seed_relation,
        "GermaNet A": seed_term_x_germanetinfo,
        "GermaNet B": seed_term_y_germanetinfo,
    }
)

In [23]:
# Preview the first 20 rows of the export table.
df.head(20)

Unnamed: 0,Node A,Node B,Root,GermaNet A,GermaNet B
0,audi,b4,habe,"[[[Synset(id=s27145, lexunits=Audi), Synset(id...",
1,audi,b4,fahren,"[[[Synset(id=s27145, lexunits=Audi), Synset(id...",
2,audi,km,haben,"[[[Synset(id=s27145, lexunits=Audi), Synset(id...",
3,audi,km,fahren,"[[[Synset(id=s27145, lexunits=Audi), Synset(id...",
4,audi,motor,haben,"[[[Synset(id=s27145, lexunits=Audi), Synset(id...","[[[Synset(id=s139757, lexunits=Motor), Synset(..."
5,motor,audi,habe,"[[[Synset(id=s139757, lexunits=Motor), Synset(...","[[[Synset(id=s27145, lexunits=Audi), Synset(id..."
6,motor,audi,haben,"[[[Synset(id=s139757, lexunits=Motor), Synset(...","[[[Synset(id=s27145, lexunits=Audi), Synset(id..."
7,motor,audi,fahren,"[[[Synset(id=s139757, lexunits=Motor), Synset(...","[[[Synset(id=s27145, lexunits=Audi), Synset(id..."
8,motor,gas,geben,"[[[Synset(id=s139757, lexunits=Motor), Synset(...","[[[Synset(id=s8920, lexunits=Gaspedal, Gas), S..."
9,motor,probleme,haben,"[[[Synset(id=s139757, lexunits=Motor), Synset(...",


In [24]:
# Write the export table to CSV. to_csv's `index` argument is left at its
# default, so the DataFrame index is written as an extra first column.
df.to_csv("audi_auswertung_top100-w3-l5.csv")