# Domain Relevance Evaluation

Comparing different methods for extracting domain-relevant terms.

In [1]:
import pandas as pd
from tqdm import tqdm
from parts import collect, oie, domain_relevance, cleaning

## Initial Load of Background Domains

**Note:** This step is only needed the first time. Choose to export the data to the resource folder so that later runs load faster.

In [None]:
# Build the "adac" background corpus through the project helper.
# NOTE(review): the meaning of the positional arguments (0, "adac", 0, 0) is
# not visible in this notebook — confirm against parts.collect.get_corpus.
adac_corpus = collect.get_corpus(0,"adac",0,0)

In [None]:
# Build the "chefkoch" background corpus from the Chefkoch "Haus & Garten"
# forum URL.
# NOTE(review): the trailing arguments (5, 0) are presumably crawl limits —
# confirm against parts.collect.get_corpus.
chefkoch_corpus = collect.get_corpus("https://www.chefkoch.de/forum/1,27/Haus-Garten.html","chefkoch",5,0)

In [None]:
# Build the "car" corpus from a motor-talk.de Audi forum URL.
# NOTE(review): the trailing arguments (3, 0) are presumably crawl limits —
# confirm against parts.collect.get_corpus.
car_corpus = collect.get_corpus("https://www.motor-talk.de/forum/audi-80-90-100-200-v8-b158.html","car",3,0)

## Load Background Domains and Extract Terms

In [2]:
# Load the terms of the "adac" background domain.
# NOTE(review): the recorded run of this cell failed with a UnicodeDecodeError
# ('utf-8' codec, byte 0xdf), so the stored resource is probably not UTF-8
# encoded — confirm which encoding the export and load_domain_terms use.
# The meaning of the arguments (10000, 1) is not visible here — TODO confirm.
adac_domain = collect.load_domain_terms("adac", 10000, 1)

0


UnicodeDecodeError: 'utf-8' codec can't decode byte 0xdf in position 42: invalid continuation byte

In [None]:
# Load the terms of the "car" domain, with the same arguments as the other
# domains. The meaning of (10000, 1) is not visible here — TODO confirm.
car_domain = collect.load_domain_terms("car", 10000, 1)

In [None]:
# Load the terms of the "chefkoch" background domain, with the same arguments
# as the other domains. The meaning of (10000, 1) is not visible here — TODO
# confirm.
chefkoch_domain = collect.load_domain_terms("chefkoch", 10000, 1)

## Evaluation of Metrics and Domains

In [None]:
import matplotlib.pyplot as plt

### Distribution of metrics

In [None]:
# Term frequency distribution in chefkoch_domain
from collections import Counter

# Flatten the nested term lists into one list and count each term.
flat_terms = [term for term_list in chefkoch_domain for term in term_list]
tf = Counter(flat_terms)

# Histogram with unit-width bins covering frequencies 0..14 only.
bins = range(15)
plt.hist(tf.values(), bins=bins, edgecolor="k")
plt.xticks(bins)

# Smallest and largest observed term frequency.
print(min(tf.values()), max(tf.values()))

In [None]:
# Distribution of a metric's values. llr is plotted here; substitute dw, lor
# or lor_bg to inspect one of the other metrics.
# NOTE(review): both bin edges are derived from the minimum (min - 1 up to
# min + 10), so only the lowest part of the value range is plotted — confirm
# this zoom is intended rather than spanning up to the maximum.
bins = range(int(min(llr.values())) - 1, int(min(llr.values())) + 10)
plt.hist(llr.values(), bins=bins, edgecolor="k")
plt.xticks(bins)

# Full value range of the metric, for reference.
print(min(llr.values()), max(llr.values()))

In [None]:
# The 15 terms with the highest value of the chosen metric (tf here), sorted
# in descending order. Only the top of the ranking is shown by this cell.
pd.Series(tf).sort_values(ascending=False).head(15)

In [None]:
len(candidates)  # value is discarded: not the last expression of the cell

# Every distinct term occurring anywhere in the chefkoch domain.
chefkoch_terms = {term for term_list in chefkoch_domain for term in term_list}

# Number of candidates that are chefkoch terms and have a tf count above 1.
counter = sum(
    1 for term in candidates if term in chefkoch_terms and tf[term] > 1
)

counter

## Test Concept Export and Import

In [None]:
# `concepts` must already exist; it can be built from a domain, e.g.:
# concepts = list(set([item for sublist in adac_domain for item in sublist]))

# Export one concept per line, with no newline after the last entry.
# NOTE(review): no explicit encoding is given, so the platform default is
# used. The test-set cells read their files as UTF-8 — if this is changed,
# change the matching read of concepts.txt at the same time.
with open("concepts.txt", "w") as fp:
    fp.write('\n'.join(concepts))

In [None]:
# Read the exported concepts back, one entry per line, with surrounding
# whitespace (including the trailing newline) removed from every line.
# NOTE(review): no explicit encoding is given, so the platform default is
# used; it has to match the encoding concepts.txt was written with.
with open("concepts.txt", "r") as f:
    content = [line.strip() for line in f]

## Generate Random Testset and Test Concept generation

In [None]:
#from random import sample
#
#testset = sample(candidates, 1000)

In [None]:
# Load the stored test set (UTF-8, one term per line) and drop surrounding
# whitespace such as the trailing newline from every entry.
with open("testset.txt", "r", encoding="utf-8") as f:
    testset = [line.strip() for line in f]

In [None]:
import csv

# Read the manually labeled test set into a dict mapping term -> label.
# Each row of the ';'-separated file must hold exactly two fields; both stay
# strings and are converted to numbers later, where needed.
labeled = {}
# The context manager closes the file handle (it previously stayed open), and
# newline="" lets the csv module handle line endings itself.
with open('testset_labeled.csv', 'r', encoding="utf-8", newline="") as csv_file:
    reader = csv.reader(csv_file, delimiter=';')
    for row in reader:
        if not row:
            # Blank lines yield an empty row, which cannot be unpacked.
            continue
        # Any other row without exactly two fields still raises ValueError.
        k, v = row
        labeled[k] = v

In [None]:
# Label concepts with the project helper, using the "dw" metric.
# NOTE(review): the role of each positional domain argument (car, adac,
# chefkoch) is presumably target vs. background domains — confirm against
# parts.domain_relevance.label_concepts. The result is indexed by term below.
labels = domain_relevance.label_concepts(car_domain, adac_domain, chefkoch_domain, "dw")

In [None]:
# Predicted label for every term of the test set. A term that is missing
# from `labels` raises KeyError.
predicted = {candidate: labels[candidate] for candidate in testset}

In [None]:
import pandas as pd

# One row per labeled term (the term is the index), gold label in "label".
df = pd.DataFrame.from_dict(labeled, orient='index', columns=["label"])

# Attach the predictions by term instead of by position. Assigning a Series
# aligns on the index, so a different term order in the labeled CSV and in
# the test set can no longer pair a label with another term's prediction.
# Terms without a prediction get NaN in this column.
df["predicted"] = pd.Series(predicted)

In [None]:
from sklearn.metrics import confusion_matrix, classification_report

In [None]:
# Confusion matrix of the gold labels (first argument) against the predicted
# labels (second argument). Both columns are converted to numeric first; the
# labels read from the CSV are strings.
confusion_matrix(pd.to_numeric(df["label"]), pd.to_numeric(df["predicted"]))

In [None]:
# Print the classification report for the gold labels (first argument) against
# the predicted labels (second argument), after converting both to numeric.
print(classification_report(pd.to_numeric(df["label"]), pd.to_numeric(df["predicted"])))