# Forum Post Extraction

Analyze one forum post at a time, using the domain-relevant concepts found in the domain relevance analysis.

Samples: 
- https://www.motor-talk.de/forum/start-probleme-audi-a6-c4-2-6-abc-t6820263.html?page=1
- https://www.motor-talk.de/forum/bmw-e87-abs-sensor-t6949995.html
- https://www.motor-talk.de/forum/bmw-123d-motor-dreht-startet-aber-nicht-t6516445.html
- https://www.motor-talk.de/forum/bmw-1er-e81-n43-springt-nicht-mehr-an-t6772871.html

In [1]:
import pandas as pd
from tqdm import tqdm
from collections import Counter

from parts import collect, cleaning, oie, domain_relevance

## Collecting

Insert a link to a thread on the car forum motor-talk.de.

In [2]:
# Thread to analyse (a motor-talk.de forum URL).
link = "https://www.motor-talk.de/forum/start-probleme-audi-a6-c4-2-6-abc-t6820263.html"

# Fetch the thread text. The result is a sequence of strings (it is measured
# with len() and joined below).
# NOTE(review): the meaning of the "car" and 1 arguments is defined in
# collect.py — presumably a forum category and a page count; confirm there.
p = collect.get_text(link, "car", 1)
print(len(p))

# Merge all collected pieces into one string, separated by " ; ".
forum_text = " ; ".join(p)


92


## Preprocessing

For more detail, see `cleaning.py` in the `parts` folder.

In [3]:
# Clean the joined thread text. The second argument (1) switches on the
# `no_questions` option of cleaning.sentences.
# NOTE(review): presumably this drops question sentences — confirm in cleaning.py.
clean_doc = cleaning.sentences(forum_text, 1)

In [4]:
# Sanity check: show the first 200 characters of the cleaned text.
print(clean_doc[:200])

Hallo zusammen; mein dicker macht seit kurzem Probleme ; Folgende Probleme ; Fahrzeug startet nicht mehr; Anlasser dreht; Nach einiger Zeit wieder probiert; Fahrzeug startet normal; Am nächsten Tag gl


## Open Information Extraction (OIE)

For more detail, see `oie.py` in the `parts` folder.

The DataFrame gives an overview of the extracted terms before the domain relevance analysis.

In [5]:
# Run open information extraction on the cleaned text. get_oie returns three
# sequences that are used in parallel below.
# NOTE(review): presumably one entry per sentence (root, extracted terms,
# sentence text) — confirm in oie.py.
roots, terms, sents = oie.get_oie(clean_doc)

# Collapse each sentence's terms into one comma-separated cell.
terms2 = [", ".join(sent) for sent in terms]

df = pd.DataFrame(roots, columns=["roots"])
# Add the remaining columns as index-aligned Series. Assigning a whole
# DataFrame to a single column only works while that frame has exactly one
# column, which is not the case when the list is empty.
df["terms"] = pd.Series(terms2)
df["sents"] = pd.Series(sents)

# Export the first 15 rows as a semicolon-separated sample.
df.head(15).to_csv("20201001_term-extraction.csv", index = False, header=True, sep = ';')

## Domain Relevancy

Compare the extracted terms to the list of concepts generated in the domain relevancy Jupyter notebook.

In [6]:
# Read the concept list from disk, one entry per line.
# NOTE(review): no encoding is given, so the platform default is used; if the
# file contains umlauts, confirm it is decoded correctly on every machine.
with open("concepts.txt", "r") as f:
    content = list(f)
# Strip surrounding whitespace, including the trailing newline of each line.
concepts = list(map(str.strip, content))

In [7]:
# Keep, per sentence, only the extracted terms that are known domain concepts.
# Membership is checked against a set so each lookup is constant time instead
# of a scan over the whole concept list.
concept_lookup = set(concepts)
domain_concepts = [
    {term for term in sent if term in concept_lookup}
    for sent in terms
]

# Sets iterate in a hash-dependent order, so sort before joining to keep the
# displayed table identical from one kernel session to the next.
terms2 = [", ".join(sorted(sent)) for sent in domain_concepts]

df = pd.DataFrame(roots, columns=["roots"])
# Add the remaining columns as index-aligned Series. Assigning a whole
# DataFrame to a single column only works while that frame has exactly one
# column, which is not the case when the list is empty.
df["terms"] = pd.Series(terms2)
df["sents"] = pd.Series(sents)

df.head(50)

Unnamed: 0,roots,terms,sents
0,hallo,,hallo zusammen;
1,machen,probleme,mein dicker macht seit kurzem probleme ;
2,probleme,"probleme, folgende probleme",folgende probleme ;
3,nicht starten,fahrzeug,fahrzeug startet nicht mehr;
4,drehen,anlasser,anlasser dreht;
5,probieren,,nach einiger zeit wieder probiert;
6,starten,fahrzeug,fahrzeug startet normal;
7,problem,,am nächsten tag gleiches problem wieder;
8,fehlversuche,,also fehlversuche ;
9,können,,bisher konnte geprüft werden ;


##  Most common relation test on post

`get_seed_relation` is taken from the results preparation Jupyter notebook.

In [None]:
def get_seed_relation(terms, roots, seed_word, window_size = 1, limit = 3):
    """Rank the "root,term" pairs that occur in the neighbourhood of a seed word.

    ``roots`` and ``terms`` are walked in parallel (one root and one
    collection of terms per sentence). A sliding window holds the most recent
    sentences. After each new sentence is added, if ``seed_word`` occurs
    anywhere in the window, every term in the window that is neither the seed
    word nor its own sentence's root is recorded as the string
    ``root + "," + term``.

    The window is trimmed only after this check, so it briefly holds
    ``window_size + 1`` sentences. A pair that stays inside the window over
    several steps is recorded once per step.

    Args:
        terms: per-sentence collections of term strings.
        roots: per-sentence root strings, parallel to ``terms``.
        seed_word: the term whose neighbourhood is examined.
        window_size: number of sentences kept after each step.
        limit: maximum number of relations returned.

    Returns:
        A list of ``(relation_string, count)`` tuples, most frequent first,
        at most ``limit`` long.
    """
    window = []        # (root, sentence terms) pairs currently in view
    tally = Counter()

    for entry in zip(roots, terms):
        window.append(entry)

        # list(...) so membership is tested element by element, whatever
        # container type a sentence's terms arrive in.
        if any(seed_word in list(sent_terms) for _, sent_terms in window):
            for sent_root, sent_terms in window:
                for candidate in sent_terms:
                    if candidate != seed_word and candidate != sent_root:
                        tally[sent_root + "," + candidate] += 1

        # Drop the oldest sentence once the window has outgrown its size.
        if len(window) > window_size:
            del window[0]

    return tally.most_common(limit)

In [None]:
# Flatten the per-sentence concept collections into one list for counting.
flat_terms = [
    concept
    for sentence_concepts in domain_concepts
    for concept in sentence_concepts
]

In [None]:
# The 30 most frequent domain concepts as (concept, count) pairs; the relation
# search below uses each concept as a seed word.
most_common_terms = Counter(flat_terms).most_common(30)

In [None]:
# Show the ten most frequent concepts with their counts.
Counter(flat_terms).most_common(10)

In [None]:
# Parallel lists describing one relation each: seed concept, related term,
# and the root that connects them.
seed_term_x = []
seed_term_y = []
seed_relation = []

for term in most_common_terms:
    # term is a (concept, count) pair; search with a window of 3 sentences and
    # keep the 3 most frequent relations per seed concept.
    seed_relations = get_seed_relation(terms, roots, term[0], 3, 3)
    for relation in seed_relations:
        # relation is ("root,term", count); keep relations seen more than once.
        if relation[1] > 1:
            # Split on the first comma only, so a term that itself contains a
            # comma does not break the two-name unpacking.
            relation_root, relation_term_y = relation[0].split(",", 1)
            # Relations whose root is "schreiben" are left out.
            if relation_root != "schreiben":
                seed_term_x.append(term[0])
                seed_term_y.append(relation_term_y)
                seed_relation.append(relation_root)
        else:
            print("min relation number not reached")

In [None]:
# Build the relation table in one step from the three lists, which are filled
# in lockstep and therefore always have equal length. Unlike assigning a
# DataFrame to a column, this also works when no relation was found.
df = pd.DataFrame(
    {
        "Node A": seed_term_x,
        "Node B": seed_term_y,
        "Root": seed_relation,
    }
)

In [None]:
# Preview the first 15 rows of the seed-relation table.
df.head(15)

## Chronological visualization

In [None]:
# Parallel lists, one entry per sentence that mentions at least two domain
# concepts, in the order the sentences appear in the post.
nodes = []
relation = []
sentence = []

# The loop variable is deliberately not called `terms`: that name holds the
# OIE output for the whole notebook and must not be rebound here, otherwise
# re-running an earlier cell would silently work on the wrong data.
for sent, sent_concepts, root in zip(sents, domain_concepts, roots):
    if len(sent_concepts) > 1:
        # Sorted so the joined string does not depend on set iteration order.
        nodes.append(",".join(sorted(sent_concepts)))
        relation.append(root)
        sentence.append(sent)

In [None]:
# Build the chronological table in one step from the three lists, which are
# filled in lockstep and therefore always have equal length. Unlike assigning
# a DataFrame to a column, this also works when the lists are empty.
df2 = pd.DataFrame(
    {
        "Nodes": nodes,
        "Root": relation,
        "Sent": sentence,
    }
)

In [None]:
# Preview the first 15 rows of the chronological table.
df2.head(15)

In [None]:
# Export the chronological table for further analysis (semicolon-separated,
# header row, no index column).
# NOTE(review): the file name refers to the BMW 1er thread; confirm it matches
# the thread that was actually loaded before relying on this export.
df2.to_csv(
    "20200928_bmw-1er-e81-n43-springt-nicht-mehr-an.csv",
    sep=";",
    index=False,
    header=True,
)

## Visualization

In [None]:
import networkx as nx 
import matplotlib.pyplot as plt 
%matplotlib inline

# Undirected concept graph: nodes are domain concepts, edges connect concepts
# that share a sentence and are labelled with that sentence's root.
G = nx.Graph()
nodeA = []
nodeB = []
relation = []

for sent, root in zip(domain_concepts, roots):
    # Set iteration order depends on string hashing and changes between kernel
    # sessions; sorting makes the choice of hub (and so the figure)
    # reproducible. Empty terms are skipped.
    ordered = sorted(term for term in sent if term)
    if len(ordered) < 2:
        continue
    hub = ordered[0]
    # NOTE(review): every further concept is linked to the first one (a star),
    # not to its immediate predecessor — confirm that this is the intended shape.
    for term in ordered[1:]:
        G.add_edge(hub, term)
        nodeA.append(hub)
        nodeB.append(term)
        relation.append(root)

# A fixed seed keeps the force-directed layout stable between runs.
pos = nx.spring_layout(G, k=0.15, iterations=20, seed=42)
plt.figure(figsize=(12,12))
nx.draw(G,pos,width=1,linewidths=1,node_size=500,alpha=0.9,labels={node:node for node in G.nodes()})
# One label per (hub, term) pair; when a pair occurs in several sentences the
# root of the last one wins.
labels = dict(zip(zip(nodeA, nodeB), relation))
nx.draw_networkx_edge_labels(G,pos,edge_labels=labels,font_color='red')

plt.axis('off')
plt.show()