# Term Extraction: ADAC Rückrufe
https://www.adac.de/infotestrat/reparatur-pflege-und-wartung/rueckrufe/

In [1]:
import spacy
import requests as req
from bs4 import BeautifulSoup
from parts import collect, preprocessing, oie, domain_relevance

nlp = spacy.load("de_core_news_sm")

In [127]:
# provide link with make given and open option for model
# i.e. "https://www.adac.de/infotestrat/reparatur-pflege-und-wartung/rueckrufe/suchergebnis.aspx?Kategorie=Pkw&Hersteller=Audi&Modelle="

from tqdm import tqdm

def get_adac_links():
    link = "https://www.adac.de/infotestrat/reparatur-pflege-und-wartung/rueckrufe/suchergebnis.aspx?Kategorie=Pkw"
    make_list = []
    links = []
    
    r = req.get(link)
    soup = BeautifulSoup(r.text, "html.parser")
    makes = soup.find("select", {"class":"w190"})
    for option in makes.find_all("option"):
            make_list.append(option.text)
    
    make_list.remove("Alle Hersteller")
    make_list.remove("")
    
    for make in tqdm(make_list):
        make_link = link+"&Hersteller="+make
        r = req.get(make_link)
        soup = BeautifulSoup(r.text, "html.parser")
        models = soup.find("select", {"class":"w190 left"})
        if models:
            for option in models.find_all("option"):
                links.append(make_link+"&Modelle="+option.text)
        
    return links

In [128]:
links = get_adac_links()

100%|██████████| 72/72 [00:21<00:00,  3.34it/s]


In [130]:
def get_text_adac(link):
    meldung = set()
    
    r = req.get(link)
    soup = BeautifulSoup(r.text, "html.parser")
    
    for p in soup.find_all("p", {"class": "pl13"}):
        meldung.add(p.text)
        
    return meldung

In [131]:
from tqdm import tqdm

def get_adac_domain(links):
    meldungen = []
    
    for link in tqdm(links):
        meldungen.extend(get_text_adac(link))
    
    return set(meldungen)

In [132]:
adac_domain = get_adac_domain(links)

100%|██████████| 2670/2670 [11:58<00:00,  3.71it/s]


In [133]:
len(adac_domain)

2719

In [162]:
def get_terms(corpus):
    terms = []
    
    doc = nlp(corpus.lower())
    
    for token in doc:
        if token.pos_ in ["NOUN", "PROPN"] and not token.is_stop:
            terms.append(token.lemma_)
    
    return terms

In [163]:
terms = [get_terms(doc) for doc in adac_domain]

In [164]:
from collections import Counter

def get_tf(terms):
    flat_terms = [item for sublist in terms for item in sublist]
    tf = Counter(flat_terms)
    max_freq = Counter(flat_terms).most_common(1)[0][1]
    for t in tf:
        tf[t] = (tf[t]/max_freq)
    
    return tf

In [201]:
import numpy as np

def get_idf(terms):
    flat_terms = [item for sublist in terms for item in set(sublist)]
    idf = Counter(flat_terms)
    for t in idf:
        idf[t] = np.log2(len(terms)/idf[t])
    
    return idf

In [208]:
import numpy as np

def get_tdf(terms):
    flat_terms = [item for sublist in terms for item in set(sublist)]
    tdf = Counter(flat_terms)
    for t in tdf:
        tdf[t] = tdf[t]/len(terms)
    
    return tdf

In [166]:
tf = get_tf(terms)
idf = get_idf(terms)

In [167]:
tf_idf = {}

for term in set([item for sublist in terms for item in sublist]):
    tf_idf[term] = tf[term]*idf[term]

In [172]:
import pandas as pd

df = pd.Series(tf)

In [176]:
df.sort_values(ascending = False).head(20)

aktion                        1.000000
kunden                        0.936317
stunden                       0.561962
händler                       0.511188
fahrzeugen                    0.343373
abhilfe                       0.272806
hersteller                    0.259897
fahrzeuge                     0.222031
werkstatt                     0.220310
code                          0.216007
hersteller-werkstattsystem    0.192771
folgen                        0.157487
fahrzeug                      0.148021
fall                          0.142857
ausfall                       0.126506
eintrag                       0.104991
maßnahme                      0.097246
austausch                     0.092083
bereich                       0.086919
unfall                        0.086919
dtype: float64

In [170]:
df.reset_index().to_csv("adac_terms.csv", index = False, header=True, sep = ';')

In [202]:
def post_term_cleaning(terms):
    # clean zitate, datum, zeichen/ zahlen only terms, zeit, links
    clean_terms = []
    time, date, link, zitat, ireg, abbr = 0, 0, 0, 0, 0, 0
    abbreviations = preprocessing.get_abbr()

    for doc in terms:
        clean_doc = []
        for term in doc:
            if len(term) < 2:
                pass
            elif re.search(r"\d\d:\d\d:\d\d", term):
                time += 1
            elif re.search(r"\d\d.\d\d.\d\d\d\d", term):
                date += 1
            elif re.search("https://", term):
                link += 1
            elif re.search("@", term):
                zitat += 1
            elif term in abbreviations:
                abbr += 1
            elif re.search(r"\w", term):
                clean_doc.append(term)
            elif re.search(r"^\W", term):
                ireg += 1
            else:
                print(term)
        clean_terms.append(clean_doc)

    print("deleted time references:", time)
    print("deleted date references:", date)
    print("deleted links:", link)
    print("deleted quotes:", zitat)
    print("deleted ireg expressions:", ireg)
    print("deleted abbreviations:", abbr)

    return clean_terms

In [203]:
clean_terms = post_term_cleaning(terms)

deleted time references: 0
deleted date references: 118
deleted links: 1
deleted quotes: 8
deleted ireg expressions: 2
deleted abbreviations: 35


In [209]:
tf = get_tf(clean_terms)
idf = get_idf(clean_terms)
tdf = get_tdf(clean_terms)

In [210]:
tf_idf = {}
tf_tdf = {}

for term in set([item for sublist in clean_terms for item in sublist]):
    tf_idf[term] = tf[term]*idf[term]
    tf_tdf[term] = tf[term]*tdf[term]

In [211]:
import pandas as pd

df = pd.Series(tf_tdf)

In [212]:
df.reset_index().to_csv("adac_terms.csv", index = False, header=True, sep = ';')