# Autoforum term extraction


In [1]:
import spacy
import requests as req
from tqdm import tqdm 
from bs4 import BeautifulSoup
from parts import collect, preprocessing, oie, domain_relevance

nlp = spacy.load("de_core_news_sm")

In [2]:
# Forum overview page that serves as the starting point for the crawl.
topic = "https://www.motor-talk.de/forum/audi-80-90-100-200-v8-b158.html"

# NOTE(review): presumably the thread links found on the overview page, as paths
# relative to the site root (get_car_domain prefixes them with the host) —
# confirm against parts.collect.
car_links = collect.get_links_car(topic)

In [None]:
# this link is not relevant and breaks the program
#car_links.remove("https://www.motor-talk.de/forum/faq-linksammlung-audi-80-90-100-t593977.html")

In [25]:
from tqdm import tqdm

def get_car_domain(links):
    """Fetch the text of every linked thread and keep the usable ones.

    Each entry of ``links`` is appended to the motor-talk.de host and handed
    to ``collect.get_text_car``; the pieces it returns are joined with
    " ; ". Threads whose joined text is empty or has 100000 characters or
    more are skipped.

    Returns the kept texts as a list, in the order of ``links``.
    """
    base_url = "https://www.motor-talk.de"
    kept = []

    for link in tqdm(links):
        pieces = collect.get_text_car(base_url + link)
        thread_text = " ; ".join(pieces)
        # Skip empty threads and very long ones.
        if not thread_text or len(thread_text) >= 100000:
            continue
        kept.append(thread_text)

    return kept

In [None]:
#car_domain = get_car_domain(car_links)

In [205]:
### export car_domain to files
# Every post goes into its own file named after its position: 0.txt, 1.txt, ...
# NOTE(review): the import cell of this notebook reads 10.txt down to 1.txt,
# so the 0.txt written here is never read back — confirm the numbering.
path = "resources/car_domain/"
counter = 0

for post in car_domain:
    out_name = "{}{}.txt".format(path, counter)
    with open(out_name, "w") as file:
        file.write(post)
    counter += 1

In [2]:
### import car_domain from files
# Reads <path>10.txt, <path>9.txt, ... down to <path>1.txt and stops at the
# first file that cannot be opened.
# NOTE(review): the export cell numbers its files from 0, so 0.txt is never
# read here and the start index 10 is hard-coded — confirm this is intended.
path = "resources/car_domain/"
counter = 10
car_domain = []

while counter:
    try:
        with open(path+str(counter)+".txt", "r") as file:
            car_domain.append(file.read())
    except OSError:
        # A missing or unreadable file ends the import. Anything else
        # (keyboard interrupt, programming errors) is no longer swallowed.
        break
    counter -= 1

In [3]:
# Number of documents currently held in car_domain.
len(car_domain)

10

In [88]:
def get_oie(corpus):
    """Extract a predicate string, a term set and the text of every sentence.

    The lower-cased ``corpus`` is parsed with the module-level spaCy pipeline
    ``nlp``. For each sentence three parallel entries are produced:

    * roots: the space-joined lemmas picked by the decision logic below
      (token with dependency ``ng`` attached to the ROOT, tokens with
      dependency ``pd`` / ``oc`` when the root is tagged ``AUX``, and the
      root lemma itself).
    * terms: a set holding the text of every NOUN / PROPN token plus every
      noun chunk that still has more than one token after stop words and
      determiners are removed.
    * sents: the sentence text, passed along to ease later analysis.

    NOTE(review): ``ng`` / ``pd`` / ``oc`` are presumably the negation,
    predicate and clausal-object labels of the German dependency scheme —
    confirm against the model's label documentation.

    Returns the tuple ``(roots, terms, sents)``.
    """
    all_roots, all_terms, all_sents = [], [], []

    for sentence in nlp(corpus.lower()).sents:
        all_sents.append(sentence.text)

        # One pass over the tokens: remember the last lemma seen for each
        # dependency label of interest and collect noun-like tokens.
        pred_lemma = clause_lemma = neg_lemma = ""
        sentence_terms = set()
        for tok in sentence:
            dep_label = tok.dep_
            if dep_label == "pd":
                pred_lemma = tok.lemma_
            if dep_label == "oc":
                clause_lemma = tok.lemma_
            if dep_label == "ng" and tok.head.dep_ == "ROOT":
                neg_lemma = tok.lemma_
            if tok.pos_ in ("NOUN", "PROPN"):
                sentence_terms.add(tok.text)

        # Multi-word terms: noun chunks without stop words and determiners.
        for chunk in sentence.noun_chunks:
            kept = [
                tok.text
                for tok in chunk
                if not (tok.is_stop or tok.pos_ == "DET")
            ]
            if len(kept) > 1:
                sentence_terms.add(" ".join(kept))

        # Assemble the predicate depending on the sentence structure.
        root = sentence.root
        aux_root = root.pos_ == "AUX"
        predicate = []
        if neg_lemma:
            predicate.append(neg_lemma)
        if aux_root and pred_lemma:
            predicate.append(pred_lemma)
        # The root lemma is the fallback of this second check only, so it is
        # also appended when just the "pd" lemma was added above.
        if aux_root and clause_lemma:
            predicate.append(clause_lemma)
        else:
            predicate.append(root.lemma_)

        all_roots.append(" ".join(predicate))
        all_terms.append(sentence_terms)

    return all_roots, all_terms, all_sents

In [123]:
# Collect, per document, one flat list with the terms of all its sentences.
test_terms = []

for doc in tqdm(car_domain):
    roots, terms, sents = get_oie(doc)
    doc_terms = []
    for t in terms:
        # t is the term set of one sentence
        doc_terms.extend(t)
    test_terms.append(doc_terms)


  0%|          | 0/10 [00:00<?, ?it/s][A
 50%|█████     | 5/10 [00:00<00:00, 31.46it/s][A
 70%|███████   | 7/10 [00:00<00:00, 12.14it/s][A
100%|██████████| 10/10 [00:00<00:00, 13.96it/s][A


In [140]:
### create label list of terms
# Three parallel lists: one row per extracted term, together with the
# predicate string and the sentence the term came from.

roots_tolabel = []
terms_tolabel = []
sents_tolabel = []

for doc in tqdm(car_domain):
    roots, terms, sents = get_oie(doc)
    for r, t, s in zip(roots, terms, sents):
        # Repeat root and sentence once for every term of that sentence.
        n_terms = len(t)
        roots_tolabel.extend([r] * n_terms)
        terms_tolabel.extend(t)
        sents_tolabel.extend([s] * n_terms)



  0%|          | 0/10 [00:00<?, ?it/s][A
 40%|████      | 4/10 [00:00<00:00, 37.84it/s][A
 60%|██████    | 6/10 [00:00<00:00, 26.99it/s][A
 80%|████████  | 8/10 [00:00<00:00, 11.64it/s][A
100%|██████████| 10/10 [00:00<00:00, 13.18it/s][A


In [141]:
### export label list

import pandas as pd

# Build the frame from one dict so the three parallel lists become columns in
# a single step. The previous column-by-column assignment of whole DataFrames
# relied on pandas unwrapping a one-column frame and fails when the lists are
# empty (no column to unwrap).
df = pd.DataFrame({
    "roots": roots_tolabel,
    "terms": terms_tolabel,
    "sents": sents_tolabel,
})

# reset_index() turns the row number into a leading "index" column of the CSV.
df.reset_index().to_csv("terms_tolabel.csv", index = False, header=True, sep = ';')

In [5]:
from collections import Counter

def get_tf(terms):
    """Return the term frequency of every term, scaled by the top count.

    Args:
        terms: iterable of term collections (e.g. one list per document).

    Returns:
        A ``Counter`` mapping each term to ``count / max_count`` over all
        collections, so the most frequent term gets 1.0. An empty
        ``Counter`` is returned when there are no terms at all (this used
        to raise ``IndexError``).
    """
    tf = Counter(term for doc in terms for term in doc)
    if not tf:
        # Nothing to normalise; avoids indexing into an empty ranking.
        return tf

    max_freq = max(tf.values())
    for term in tf:
        tf[term] /= max_freq

    return tf

In [6]:
import numpy as np

def get_idf(terms):
    """Return the inverse document frequency of every term.

    ``terms`` holds one term collection per document. A term counts once
    per document it occurs in; its score is
    ``log2(number_of_documents / documents_containing_term)``.

    Returns a ``Counter`` mapping term -> idf score.
    """
    n_docs = len(terms)

    doc_freq = Counter()
    for document in terms:
        # set() makes repeated occurrences inside one document count once.
        doc_freq.update(set(document))

    for term, n_containing in doc_freq.items():
        doc_freq[term] = np.log2(n_docs / n_containing)

    return doc_freq

In [7]:
import numpy as np

def get_tdf(terms):
    """Return, for every term, the share of documents that contain it.

    ``terms`` holds one term collection per document. Repeated occurrences
    inside one document count once.

    Returns a ``Counter`` mapping term -> documents_containing_term /
    number_of_documents.
    """
    share = Counter()
    for document in terms:
        for term in set(document):
            share[term] += 1

    total_docs = len(terms)
    for term in share:
        share[term] /= total_docs

    return share

In [161]:
# NOTE(review): `terms` is not assigned in this cell; presumably it is the
# value left over from the last get_oie() call of an earlier cell (one term
# set per sentence of the last document) — confirm this is the intended input.
tf = get_tf(terms)
idf = get_idf(terms)

In [43]:
# tf-idf score for every distinct term: term frequency times inverse
# document frequency.
vocabulary = {item for sublist in terms for item in sublist}
tf_idf = {term: tf[term] * idf[term] for term in vocabulary}

In [49]:
import pandas as pd

# Wrap the tf scores in a Series (terms become the index).
# NOTE(review): this uses `tf`, not the `tf_idf` dict built in the cell above —
# confirm which score is meant to be exported.
df = pd.Series(tf)

In [50]:
# Show the 15 highest-scoring terms.
df.sort_values(ascending = False).head(15)

hallo       1.00
motor       0.80
audi        0.74
zitat       0.65
mfg         0.64
getriebe    0.50
problem     0.50
auto        0.47
stecker     0.38
fragen      0.35
probleme    0.32
gruß        0.28
danke       0.28
b4          0.27
mai         0.27
dtype: float64

In [47]:
# Write the scores to a semicolon-separated CSV; reset_index() moves the terms
# from the index into a regular column. A later cell writes to the same file name.
df.reset_index().to_csv("car_terms.csv", index = False, header=True, sep = ';')

In [None]:
### import the labeled term list to generate a prelabeled list
# Reads car_terms_labeled.csv (semicolon-separated, four fields per row) and
# stores the third field under the second field as key: as float when it
# parses as a number, otherwise as the raw string.
import csv

counter = 0
terms = []
# NOTE(review): `labeled` was never initialised in this notebook, so the cell
# raised NameError; it is created here — confirm no earlier content is expected.
labeled = {}

# newline='' is what the csv module expects for files handed to csv.reader.
with open('car_terms_labeled.csv', 'r', newline='') as labeled_file:
    reader = csv.reader(labeled_file, delimiter=';')
    for row in reader:
        # Only the two middle fields are used; a row with a different number
        # of fields raises ValueError here.
        _, k, v, _ = row
        counter += 1
        try:
            labeled[k] = float(v)
        except ValueError:
            # Non-numeric value (e.g. a header cell): keep the text as is.
            labeled[k] = v

In [128]:
import re

def post_term_cleaning(terms, label=0):
    """Filter noise terms out of every document's term list.

    A term is dropped when it

    * is shorter than two characters,
    * contains a clock time of the form ``dd:dd:dd``,
    * contains a date of the form ``dd.dd.dddd`` (``/`` and ``-`` are also
      accepted as separators),
    * contains ``www`` (links) or ``@`` (quotes),
    * is found in the collection returned by ``preprocessing.get_abbr()``, or
    * contains no word character at all.

    Every other term is kept.

    Args:
        terms: iterable of documents, each an iterable of term strings.
        label: if truthy, return the per-document label dicts instead of the
            cleaned term lists. Defaults to 0, so the function can be called
            with the terms only.

    Returns:
        If ``label`` is falsy: a list with one list of kept terms per
        document. If ``label`` is truthy: a list with one dict per document
        mapping every term to 1 (kept) or 0 (dropped).

    Side effects:
        Prints how many terms were dropped per category. Terms shorter than
        two characters are dropped without being counted.
    """
    # The date separators are an explicit class: a bare "." would match any
    # character and flag every 10-digit run as a date.
    time_pattern = re.compile(r"\d\d:\d\d:\d\d")
    date_pattern = re.compile(r"\d\d[./-]\d\d[./-]\d{4}")
    word_pattern = re.compile(r"\w")

    clean_terms = []
    labels = []
    time, date, link, zitat, ireg, abbr = 0, 0, 0, 0, 0, 0
    abbreviations = preprocessing.get_abbr()

    for doc in terms:
        clean_doc = []
        doc_labels = {}
        for term in doc:
            # Every term starts as "dropped"; only kept terms are set to 1.
            doc_labels[term] = 0
            if len(term) < 2:
                continue
            if time_pattern.search(term):
                time += 1
            elif date_pattern.search(term):
                date += 1
            elif "www" in term:
                link += 1
            elif "@" in term:
                zitat += 1
            elif term in abbreviations:
                abbr += 1
            elif word_pattern.search(term):
                clean_doc.append(term)
                doc_labels[term] = 1
            else:
                # No word character at all: symbols/punctuation only. (A
                # further fallback branch could never be reached from here.)
                ireg += 1

        clean_terms.append(clean_doc)
        labels.append(doc_labels)

    print("deleted time references:", time)
    print("deleted date references:", date)
    print("deleted links:", link)
    print("deleted quotes:", zitat)
    print("deleted ireg expressions:", ireg)
    print("deleted abbreviations:", abbr)

    if label:
        return labels
    return clean_terms

In [134]:
# label=0: get the cleaned term lists (one list per document).
clean_terms = post_term_cleaning(test_terms, 0)

deleted time references: 3
deleted date references: 1
deleted links: 6
deleted quotes: 5
deleted ireg expressions: 0
deleted abbreviations: 9


In [133]:
# label=1: get one dict per document mapping each term to 1 (kept) or 0 (dropped).
prelabeled_terms = post_term_cleaning(test_terms, 1)

deleted time references: 3
deleted date references: 1
deleted links: 6
deleted quotes: 5
deleted ireg expressions: 0
deleted abbreviations: 9


In [135]:
# Flatten the per-document label dicts into (term, label) rows and build the
# frame from one dict. The previous assignment of a whole DataFrame to the
# "pred" column relied on pandas unwrapping a one-column frame and fails when
# there are no terms. The first column keeps its label 0, so the CSV header
# stays "0;pred".
label_rows = [
    (term, flag)
    for doc_labels in prelabeled_terms
    for term, flag in doc_labels.items()
]
df = pd.DataFrame({
    0: [term for term, _ in label_rows],
    "pred": [flag for _, flag in label_rows],
})

df.to_csv("prelabeled_terms.csv", index = False, header=True, sep = ';')

In [136]:
# Recompute the three statistics on the cleaned term lists (one list per document).
tf = get_tf(clean_terms)
idf = get_idf(clean_terms)
tdf = get_tdf(clean_terms)

In [137]:
# Combined scores for every distinct cleaned term: tf * idf and tf * tdf.
vocabulary = {item for sublist in clean_terms for item in sublist}

tf_idf = {term: tf[term] * idf[term] for term in vocabulary}
tf_tdf = {term: tf[term] * tdf[term] for term in vocabulary}

In [138]:
import pandas as pd

# Wrap the tf * tdf scores in a Series (terms become the index).
df = pd.Series(tf_tdf)

In [139]:
# Write the tf * tdf scores to car_terms.csv (semicolon-separated); reset_index()
# moves the terms from the index into a regular column.
df.reset_index().to_csv("car_terms.csv", index = False, header=True, sep = ';')

In [197]:
# Quick check of post_term_cleaning. The label flag is passed explicitly
# because the function signature requires it.
# NOTE(review): each element here is a plain string, so the function walks it
# character by character and drops every one-character "term" — which is why
# the result is [[], [], []]. Wrap the strings in a list to test them as terms.
print(post_term_cleaning(["12.12.2020","lol","hallo"], 0))

deleted time references: 0
deleted date references: 0
deleted links: 0
deleted quotes: 0
deleted ireg expressions: 0
deleted abbreviations: 0
[[], [], []]
