# Chefkoch Term Extraction

In [2]:
import re

import requests as req
import spacy
from bs4 import BeautifulSoup

from parts import collect, preprocessing, oie, domain_relevance

# Load the German spaCy pipeline once at module level; get_terms() reads this
# global `nlp` for POS tags, stop-word flags and lemmas.
nlp = spacy.load("de_core_news_sm")

In [43]:
def get_chefkoch_links(topic, max_pages=2):
    """Collect thread URLs from a chefkoch.de listing page and its follow-up pages.

    Starting at ``topic``, every ``<a class="search-result-title">`` on the page
    is turned into an absolute URL. The "next" pagination link is then followed
    until it is missing or ``max_pages`` pages have been fetched.

    Parameters
    ----------
    topic : str
        URL of the first listing page.
    max_pages : int or None, optional
        Maximum number of listing pages to fetch. The default of 2 fetches the
        first page plus one follow-up page. ``None`` follows the pagination
        until no "next" link is left; a value <= 0 fetches nothing.

    Returns
    -------
    list of str
        Absolute thread URLs in page order.
    """
    base_url = "https://www.chefkoch.de"
    links = []
    page_url = topic
    pages_fetched = 0

    while page_url is not None and (max_pages is None or pages_fetched < max_pages):
        r = req.get(page_url)
        soup = BeautifulSoup(r.text, "html.parser")
        pages_fetched += 1

        for link in soup.find_all("a", {"class": "search-result-title"}):
            links.append(base_url + link["href"])

        # Look the pagination link up once and reuse it for the next round.
        next_link = soup.find("a", {"class": "pagination-item pagination-next"})
        page_url = base_url + next_link["href"] if next_link else None

    return links

In [44]:
# Forum listing page that serves as the entry point of the crawl.
topic = "https://www.chefkoch.de/forum/1,27/Haus-Garten.html"

# Absolute thread URLs scraped from the listing by get_chefkoch_links().
links = get_chefkoch_links(topic)

In [47]:
# Dirty fix: the link scraped at this position is broken, so it is replaced by hand.
# NOTE(review): index 82 is hard-coded against one particular crawl result. If a
# re-run returns the links in a different order, this overwrites an unrelated
# entry -- confirm the index before re-running.

links[82] = "https://www.chefkoch.de/forum/2,22,768891/Darf-ein-Coronakranker-Post-verschicken.html"

In [48]:
from tqdm import tqdm

def get_chefkoch_domain(links, max_chars=100000):
    """Build the domain corpus: one text per link, skipping empty and oversized ones.

    For every link the pieces returned by ``collect.get_text_cook`` are joined
    with ``" ; "``. A text is kept only if it is non-empty and shorter than
    ``max_chars`` characters.

    Parameters
    ----------
    links : iterable of str
        URLs passed one by one to ``collect.get_text_cook``.
    max_chars : int, optional
        Exclusive upper bound on the joined text length. Defaults to 100000,
        the limit that was previously hard-coded.

    Returns
    -------
    list of str
        The retained texts, in the order of ``links``.
    """
    domain = []

    for link in tqdm(links):
        chefkoch_text = " ; ".join(collect.get_text_cook(link))
        # Check emptiness first so the result of the condition is always a bool.
        if chefkoch_text and len(chefkoch_text) < max_chars:
            domain.append(chefkoch_text)

    return domain

In [49]:
# Collect the text for every link; get_chefkoch_domain() drops texts that come
# back empty or reach the length limit.
chefkoch_domain = get_chefkoch_domain(links)

100%|██████████| 100/100 [01:54<00:00,  1.14s/it]


In [51]:
# Number of documents that passed the emptiness/length filter.
len(chefkoch_domain)

93

In [52]:
def get_terms(corpus):
    """Return the lemmas of all non-stop-word nouns and proper nouns in ``corpus``.

    The text is lower-cased, run through the module-level spaCy pipeline
    ``nlp``, and every token tagged ``NOUN`` or ``PROPN`` that is not a stop
    word contributes its lemma. Order and duplicates are preserved.
    """
    # NOTE(review): the text is lower-cased before tagging; German nouns are
    # capitalised, so this may affect POS accuracy -- confirm this is intended.
    wanted_pos = ("NOUN", "PROPN")
    parsed = nlp(corpus.lower())

    return [
        tok.lemma_
        for tok in parsed
        if tok.pos_ in wanted_pos and not tok.is_stop
    ]

In [53]:
# One term list per document: the noun / proper-noun lemmas returned by get_terms().
terms = [get_terms(doc) for doc in tqdm(chefkoch_domain)]

100%|██████████| 93/93 [00:26<00:00,  3.49it/s]


In [54]:
from collections import Counter

def get_tf(terms):
    """Compute corpus-wide term frequencies, normalised by the most frequent term.

    Parameters
    ----------
    terms : list of list of str
        One list of terms per document.

    Returns
    -------
    collections.Counter
        Maps each term to ``count / max_count``, so the most frequent term
        scores 1.0. An empty corpus (no documents, or only empty documents)
        yields an empty Counter instead of raising ``IndexError``.
    """
    tf = Counter(term for doc_terms in terms for term in doc_terms)
    if not tf:
        # Nothing to normalise; most_common(1)[0] would fail here.
        return tf

    max_freq = max(tf.values())
    for t in tf:
        tf[t] = tf[t] / max_freq

    return tf

In [94]:
import numpy as np

def get_idf(terms):
    """Compute the inverse document frequency of every term.

    ``terms`` holds one term list per document. Each term is counted once per
    document it occurs in, and its score is ``log2(n_docs / doc_count)``.
    Returns a ``Counter`` mapping term -> IDF.
    """
    n_docs = len(terms)

    # Document frequency: duplicates inside a document are collapsed by set().
    doc_freq = Counter()
    for doc_terms in terms:
        doc_freq.update(set(doc_terms))

    return Counter(
        {term: np.log2(n_docs / n_hits) for term, n_hits in doc_freq.items()}
    )

In [106]:
import numpy as np

def get_tdf(terms):
    """Compute the share of documents in which each term occurs.

    ``terms`` holds one term list per document. A term counts once per
    document, and the count is divided by the number of documents.
    Returns a ``Counter`` mapping term -> document-frequency ratio.
    """
    total_docs = len(terms)
    tdf = Counter()

    for doc_terms in terms:
        # set() ensures repeated terms count only once per document.
        for term in set(doc_terms):
            tdf[term] += 1

    for term in list(tdf):
        tdf[term] /= total_docs

    return tdf

In [95]:
# Scores on the raw (uncleaned) terms: max-normalised term frequency and
# log2 inverse document frequency.
tf = get_tf(terms)
idf = get_idf(terms)

In [96]:
# TF-IDF per distinct term of the raw term lists.
tf_idf = {}

# Iterate over the vocabulary (every distinct term across all documents).
for term in set([item for sublist in terms for item in sublist]):
    tf_idf[term] = tf[term]*idf[term]

In [97]:
import pandas as pd

# Wrap the scores in a Series for sorting and export.
# Note: this uses the plain term frequencies `tf`, not the `tf_idf` dict built above.
df = pd.Series(tf)

In [98]:
# Show the 15 highest-scoring terms.
df.sort_values(ascending = False).head(15)

hallo            1.000000
grüße            0.211635
wasser           0.205617
maschine         0.178536
katir            0.177533
lg               0.169509
problem          0.152457
putz             0.148445
waschmaschine    0.146439
garen            0.142427
vögel            0.137412
farbe            0.134403
pflanzen         0.123370
pflanze          0.119358
fragen           0.118355
dtype: float64

In [99]:
# Export term/score pairs as a semicolon-separated CSV.
# Note: the same file name is written again by the final cell of the notebook.
df.reset_index().to_csv("chefkoch_terms.csv", index = False, header=True, sep = ';')

In [100]:
def post_term_cleaning(terms):
    """Remove noise terms from every document and report what was removed.

    A term is dropped (first matching rule wins) if it
      * is shorter than two characters (not counted),
      * contains a timestamp ``hh:mm:ss``,
      * contains a date ``dd.mm.yyyy``,
      * contains an ``http://`` or ``https://`` link,
      * contains ``@`` (counted as a quote marker),
      * is listed in ``preprocessing.get_abbr()``,
      * contains no word character at all (counted as irregular expression).
    Everything else is kept.

    Parameters
    ----------
    terms : list of list of str
        One list of terms per document.

    Returns
    -------
    list of list of str
        The cleaned term lists, one per input document, in the same order.

    Side effects
    ------------
    Prints one summary line per removal category.
    """
    time_re = re.compile(r"\d\d:\d\d:\d\d")
    # Dots are escaped: an unescaped "." would also match any 10-character
    # digit run and misreport it as a date.
    date_re = re.compile(r"\d\d\.\d\d\.\d\d\d\d")
    # Accept both URL schemes, not only https.
    link_re = re.compile(r"https?://")
    word_re = re.compile(r"\w")

    clean_terms = []
    time, date, link, zitat, ireg, abbr = 0, 0, 0, 0, 0, 0
    abbreviations = preprocessing.get_abbr()

    for doc in terms:
        clean_doc = []
        for term in doc:
            if len(term) < 2:
                continue
            if time_re.search(term):
                time += 1
            elif date_re.search(term):
                date += 1
            elif link_re.search(term):
                link += 1
            elif "@" in term:
                zitat += 1
            elif term in abbreviations:
                abbr += 1
            elif word_re.search(term):
                # NOTE(review): digit-only terms pass this check and are kept;
                # confirm whether number-only terms should be removed as well.
                clean_doc.append(term)
            else:
                # No word character at all: the term is symbols only.
                ireg += 1
        clean_terms.append(clean_doc)

    print("deleted time references:", time)
    print("deleted date references:", date)
    print("deleted links:", link)
    print("deleted quotes:", zitat)
    print("deleted ireg expressions:", ireg)
    print("deleted abbreviations:", abbr)

    return clean_terms

In [101]:
# Filter the per-document term lists; the removal counts are printed below.
clean_terms = post_term_cleaning(terms)

deleted time references: 0
deleted date references: 179
deleted links: 31
deleted quotes: 67
deleted ireg expressions: 60
deleted abbreviations: 110


In [108]:
# Recompute the scores on the cleaned terms. `tf` and `idf` are rebound here and
# replace the values computed from the raw terms earlier in the notebook.
tf = get_tf(clean_terms)
idf = get_idf(clean_terms)
tdf = get_tdf(clean_terms)

In [109]:
# Two combined scores per distinct cleaned term:
#   tf_idf = term frequency * inverse document frequency
#   tf_tdf = term frequency * share of documents containing the term
tf_idf = {}
tf_tdf = {}

for term in set([item for sublist in clean_terms for item in sublist]):
    tf_idf[term] = tf[term]*idf[term]
    tf_tdf[term] = tf[term]*tdf[term]

In [111]:
import pandas as pd

# Rebinds `df`: it now holds the tf*tdf scores of the cleaned terms.
df = pd.Series(tf_tdf)

In [112]:
# Final export: writes to the same chefkoch_terms.csv that the earlier export
# cell used, so that file now holds the tf*tdf scores.
df.reset_index().to_csv("chefkoch_terms.csv", index = False, header=True, sep = ';')