# Forum Extraction

This notebook extracts terms from a motor-talk.de forum thread, starting with https://www.motor-talk.de/forum/start-probleme-audi-a6-c4-2-6-abc-t6820263.html?page=1

In [None]:
import spacy
import pandas as pd
from spacy.matcher import Matcher 
from spacy.pipeline import Sentencizer
from parts import collect, preprocessing, oie, domain_relevance

In [None]:
# Load spaCy's German "de_core_news_md" pipeline. The resulting `nlp` object is
# extended with a sentence splitter below and queried for stop words later on.
nlp = spacy.load("de_core_news_md")

In [None]:
### General SpaCy setup

# Rule-based sentence splitter. Besides ".", "?" and "!" it also breaks on ","
# and ";", so clauses are segmented as separate "sentences".
sentencizer = Sentencizer(punct_chars=[".", "?", "!", ",", ";"])
# Register the splitter as "sentence_segmenter", placed before the parser.
# NOTE(review): passing a component instance to add_pipe is the spaCy 2.x call
# style - confirm the spaCy version this notebook is pinned to.
nlp.add_pipe(sentencizer, name="sentence_segmenter", before="parser")

## Collecting

In [None]:
# Thread to analyse; passed again to domain_relevance.main further down.
link = "https://www.motor-talk.de/forum/start-probleme-audi-a6-c4-2-6-abc-t6820263.html"

# Fetch the thread text through the project-local `collect` helper.
# NOTE(review): presumably a list with one string per post - confirm in
# parts/collect.py. It must be an iterable of strings for the join below.
p = collect.get_text_car(link)
print(len(p))  # number of items returned by the collector

# Merge all items into one string, separated by " ; ".
forum_text = " ; ".join(p)

## Preprocessing

In [None]:
# Normalise the raw text. According to the original author's note, the two
# positional flags mean lowercase = TRUE (1) and remove_stopwords = FALSE (0).
# NOTE(review): flag meaning is taken from that note only - verify against
# parts/preprocessing.py.
normalized = preprocessing.normalize(forum_text, 1, 0)

# Clean the normalised text. According to the original author's note, the flag
# means no_questions = TRUE (1).
cleaned = preprocessing.clean(normalized, 1)

In [None]:
# Dump the cleaned text to sentences.txt, starting a new line after every "; ".
# The encoding is pinned to UTF-8 so that umlauts and "ß" are written the same
# way on every platform instead of depending on the locale's default codec.
with open("sentences.txt", "w", encoding="utf-8") as output:
    output.write(cleaned.replace("; ", "\n"))

## OIE

In [None]:
# Run the project-local open-information-extraction step on the cleaned text.
# It returns three sequences, which are tabulated side by side below.
roots, terms, sents = oie.get_oie(cleaned)

# One comma-separated display string per entry of `terms`.
terms2 = [", ".join(sent) for sent in terms]

# Table with one row per root, plus the matching term string and sentence.
df = pd.DataFrame(roots, columns=["roots"])
df["terms"] = pd.DataFrame(terms2)
df["sents"] = pd.DataFrame(sents)

# Preview the first 15 rows in the notebook.
df.head(15)

## Domain Relevancy

In [None]:
# Score the extracted terms with the project-local domain_relevance module.
# The result is later indexed by term (domain_relevancy[candidate]) and the
# looked-up value is compared against a numeric threshold.
domain_relevancy = domain_relevance.main(link, terms)

In [None]:
def get_concepts(candidates, threshold):
    """Return the candidates whose relevance score exceeds ``threshold``.

    Args:
        candidates: Either a mapping of term -> score, or a plain iterable of
            terms. For a mapping the scores are read from ``candidates``
            itself; for any other iterable they are looked up in the
            notebook-level ``domain_relevancy`` table.
        threshold: Exclusive lower bound; a term is kept only when its score
            is strictly greater than this value.

    Returns:
        A set with the terms that passed the threshold.

    Raises:
        KeyError: If a term from a plain iterable has no entry in
            ``domain_relevancy``.
    """
    from collections.abc import Mapping

    if isinstance(candidates, Mapping):
        # Use the scores that were actually passed in, so the function does
        # not silently depend on a global that happens to be the same object.
        return {term for term, score in candidates.items() if score > threshold}

    # Plain iterable of terms: keep the original lookup in the global table.
    return {term for term in candidates if domain_relevancy[term] > threshold}

# Accept every term whose relevance score is strictly greater than 0.5.
concepts = get_concepts(domain_relevancy, 0.5)

In [None]:
# Blank out every term that is neither an accepted concept nor a stop word of
# the loaded spaCy language. Blanked terms are kept as "" so each sentence
# keeps its position and length; the evaluation below relies on that.
# `terms` is rebuilt rather than edited through the loop variable, because
# assigning to the loop variable would leave the lists unchanged.
# NOTE(review): stop words are deliberately kept, mirroring the original
# condition "not in concepts and not in stop words" - confirm this is wanted.
stop_words = nlp.Defaults.stop_words
terms = [
    [term if term in concepts or term in stop_words else "" for term in sent]
    for sent in terms
]

# One comma-separated display string per sentence (blanked terms show up as
# empty slots).
terms2 = [", ".join(sent) for sent in terms]

# Rebuild the overview table from the filtered terms.
df = pd.DataFrame(roots, columns=["roots"])
df["terms"] = pd.DataFrame(terms2)
df["sents"] = pd.DataFrame(sents)

# Preview the first 20 rows in the notebook.
df.head(20)

## Evaluation

In [None]:
# Export the term table as a ";"-separated CSV with a header row and without
# the DataFrame index.
df.to_csv ("20200528_term-extraction.csv", index = False, header=True, sep = ';')

In [None]:
# Binary prediction per sentence: 1 when the sentence holds at least one
# non-empty term, otherwise 0.
predicted = [1 if any(sent) else 0 for sent in terms]

In [None]:
# Load the hand-labelled gold standard (";"-separated). Column names are
# supplied explicitly, so pandas treats the first line of the file as data.
# NOTE(review): this rebinds `df`, replacing the term table built earlier.
df = pd.read_csv("resources/20200528_gold-standard-forumpost.csv", delimiter=";", names=["label","sent"])
# Attach the predictions, aligned on the row index.
# NOTE(review): assumes the gold file has exactly one row per entry of
# `predicted`, in the same order - rows without a counterpart become NaN.
df["predicted"] = pd.DataFrame(predicted)

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
# Compare gold labels with predictions; the notebook displays the matrix.
# In scikit-learn's convention rows are true labels and columns are predictions.
confusion_matrix(df["label"], df["predicted"])

In [None]:
import networkx as nx 
import matplotlib.pyplot as plt 
%matplotlib inline

# Build an undirected graph that links the terms of each sentence, and label
# every edge with that sentence's root.
G = nx.Graph()
nodeA = []     # first endpoint of each recorded edge
nodeB = []     # second endpoint of each recorded edge
relation = []  # root used as the label of each recorded edge

for sent, root in zip(terms, roots):
    prev_term = None
    for term in sent:
        if not term:
            # Blanked ("") terms carry no information; skipping them keeps an
            # empty-string node out of the graph.
            continue
        if prev_term is None:
            # The first non-empty term becomes the anchor of this sentence.
            prev_term = term
            continue
        # NOTE(review): the anchor is never advanced, so every edge starts at
        # the sentence's first term. If a chain of consecutive terms was
        # intended, prev_term has to be updated here - confirm.
        G.add_edge(prev_term, term)
        nodeA.append(prev_term)
        nodeB.append(term)
        relation.append(root)

pos = nx.spring_layout(G, k=0.15, iterations=20)
plt.figure(figsize=(12, 12))
nx.draw(
    G,
    pos,
    width=1,
    linewidths=1,
    node_size=500,
    alpha=0.9,
    labels={node: node for node in G.nodes()},
)
# One label per (anchor, term) pair; when a pair occurs in several sentences
# the root of the last one wins.
labels = dict(zip(zip(nodeA, nodeB), relation))
nx.draw_networkx_edge_labels(G, pos, edge_labels=labels, font_color='red')

plt.axis('off')
plt.show()