# Domain Relevance Evaluation

Comparing different methods for extracting domain-relevant terms.

In [1]:
import pandas as pd
from tqdm import tqdm
from parts import collect, oie, domain_relevance, cleaning

## Initial Load of Background Domains

**Note:** this step is only needed the first time. Choose to export the data to the resource folder so that future runs are faster.

In [None]:
adac_corpus = collect.get_corpus(0,"adac",0,0)

In [None]:
chefkoch_corpus = collect.get_corpus("https://www.chefkoch.de/forum/1,27/Haus-Garten.html","chefkoch",5,0)

In [None]:
car_corpus = collect.get_corpus("https://www.motor-talk.de/forum/audi-80-90-100-200-v8-b158.html","car",3,0)

## Load Background Domains and Extract Terms

In [2]:
adac_domain = collect.load_domain_terms("adac", 10000, 1)

100%|██████████| 2524/2524 [01:22<00:00, 30.75it/s]


deleted time references: 0
deleted date references: 287
deleted links: 77
deleted quotes: 63
deleted ireg expressions: 38
deleted abbreviations: 206


In [3]:
car_domain = collect.load_domain_terms("car", 10000, 1)

100%|██████████| 174/174 [00:17<00:00,  9.83it/s]


deleted time references: 19
deleted date references: 29
deleted links: 89
deleted quotes: 367
deleted ireg expressions: 122
deleted abbreviations: 241


In [4]:
chefkoch_domain = collect.load_domain_terms("chefkoch", 10000, 1)

100%|██████████| 279/279 [01:21<00:00,  3.41it/s]


deleted time references: 0
deleted date references: 692
deleted links: 156
deleted quotes: 419
deleted ireg expressions: 321
deleted abbreviations: 491


## Calculate Metrics 

In [None]:
metric = "tf"

In [None]:
adac_relevance = domain_relevance.get_relevancy(adac_domain, metric)

In [None]:
car_relevance = domain_relevance.get_relevancy(car_domain, metric)

In [None]:
chefkoch_relevance = domain_relevance.get_relevancy(chefkoch_domain, metric)

In [None]:
# Candidate terms: every distinct term found in the nested term lists of
# car_domain.
candidates = {term for term_list in car_domain for term in term_list}

# Score the candidates with get_dw, car domain against chefkoch domain;
# alpha is passed through unchanged.
alpha = 0.5
dw = domain_relevance.get_dw(car_domain, chefkoch_domain, candidates, alpha)

In [None]:
llr = domain_relevance.get_llr(car_domain, chefkoch_domain, candidates)

In [None]:
lor_bg = domain_relevance.get_lor_bg(car_domain, chefkoch_domain, candidates)

## Evaluation of Metrics and Domains

In [None]:
import matplotlib.pyplot as plt

### Distribution of metrics

In [None]:
# Term frequency distribution in chefkoch_domain
from collections import Counter

# Flatten the nested term lists and count how often each term occurs.
flat_terms = [item for sublist in chefkoch_domain for item in sublist]
tf = Counter(flat_terms)

# Histogram of the per-term counts with bin edges at the integers 0..14.
bins = range(0, 15, 1)
# list(...) hands matplotlib a plain sequence instead of a dict view.
plt.hist(list(tf.values()), bins=bins, edgecolor="k")
plt.xticks(bins)
# Smallest and largest term count, for reference next to the plot.
print(min(tf.values()), max(tf.values()))

In [None]:
# Distribution of a metric's values (llr, dw, lor, lor_bg). Shown here for
# llr; swap in the dictionary of the metric you want to inspect.
lowest = int(min(llr.values()))
# Bin edges run from just below the smallest value to ten above it.
bins = range(lowest - 1, lowest + 10)
plt.hist(llr.values(), bins=bins, edgecolor="k")
plt.xticks(bins)
print(min(llr.values()), max(llr.values()))

In [None]:
# overview of the 15 highest-scoring terms for a metric (here: tf)
pd.Series(tf).sort_values(ascending = False).head(15)

In [None]:
# Count the candidate terms that also occur in chefkoch_domain and whose
# count in `tf` is greater than 1.
chefkoch_terms = {item for sublist in chefkoch_domain for item in sublist}
counter = sum(
    1 for term in candidates if term in chefkoch_terms and tf[term] > 1
)

counter

## Test Concept Export and Import

In [None]:
# `concepts` must already be defined when this cell runs; the commented-out
# line below shows one way to build it from adac_domain.
#concepts = list(set([item for sublist in adac_domain for item in sublist]))
with open("concepts.txt", "w") as fp:
    # The joined text is one string, so write() is the matching call
    # (writelines() would iterate over it character by character).
    fp.write('\n'.join(concepts))

In [None]:
# Read the exported concepts back, one entry per line, stripping surrounding
# whitespace (including the trailing newline) from every line.
with open("concepts.txt") as concepts_file:
    content = [raw_line.strip() for raw_line in concepts_file]

## Generate Random Testset and Test Concept generation

In [None]:
#from random import sample
#
#testset = sample(candidates, 1000)

In [5]:
# Load the stored test set, one entry per line, stripping surrounding
# whitespace (including the trailing newline) from every line.
with open("testset.txt") as testset_file:
    testset = [raw_line.strip() for raw_line in testset_file]

In [6]:
import csv

# Map the first field of every row in the labelled test set to its second
# field. Each row must consist of exactly two ';'-separated fields, otherwise
# the unpacking raises ValueError.
# The file is opened in a with-block so the handle is closed after reading;
# newline='' is the mode the csv module asks for on files given to csv.reader.
labeled = {}
with open('testset_labeled.csv', 'r', newline='') as labeled_file:
    for k, v in csv.reader(labeled_file, delimiter=';'):
        labeled[k] = v

In [31]:
labels = domain_relevance.label_concepts(car_domain, adac_domain, chefkoch_domain, "llr")

100%|██████████| 174/174 [00:00<00:00, 39089.92it/s]
100%|██████████| 279/279 [00:00<00:00, 21276.56it/s]


Chosen via background domain: 974
Chosen via metric: 187
Chosen via tf > 1 limit: 1927


In [32]:
# Look up the label computed by label_concepts for every test-set entry.
predicted = {term: labels[term] for term in testset}

In [33]:
import pandas as pd

# One row per labelled entry, indexed by its key, with the manual label in
# the "label" column.
df = pd.DataFrame.from_dict(labeled, orient='index', columns=["label"])
# Look each prediction up by the row's key instead of relying on `predicted`
# and `labeled` sharing the same iteration order. A key without a prediction
# raises KeyError rather than silently shifting the two columns against
# each other.
df["predicted"] = [predicted[key] for key in df.index]

In [34]:
from sklearn.metrics import confusion_matrix, classification_report

In [35]:
confusion_matrix(pd.to_numeric(df["label"]), pd.to_numeric(df["predicted"]))

array([[ 53, 112],
       [ 22, 313]])

In [36]:
print(classification_report(pd.to_numeric(df["label"]), pd.to_numeric(df["predicted"])))

              precision    recall  f1-score   support

           0       0.71      0.32      0.44       165
           1       0.74      0.93      0.82       335

    accuracy                           0.73       500
   macro avg       0.72      0.63      0.63       500
weighted avg       0.73      0.73      0.70       500



### CRCTL domain relevance measure

In [38]:
shared_target_domain, shared_contrastive_domain = domain_relevance.get_shared_domain(car_domain, chefkoch_domain)

100%|██████████| 174/174 [00:00<00:00, 43219.76it/s]
100%|██████████| 279/279 [00:00<00:00, 23230.45it/s]


In [40]:
# Candidate terms: every distinct term found in the nested term lists of
# car_domain.
candidates = {term for term_list in car_domain for term in term_list}

In [49]:
### NOTE: the results of this measure do not make sense yet

def get_lambda(target_domain, contrastive_domain, candidates):
    """Compute a binomial likelihood ratio (lambda) for every candidate term.

    For each term, two hypotheses about its relative frequency are compared:

    * numerator: both domains share one probability
      ``p = (a + b) / (n1 + n2)``
    * denominator: each domain has its own probability,
      ``p1 = a / n1`` and ``p2 = b / n2``

    ``a`` and ``b`` are the term's entries in the target and contrastive
    term-frequency tables; ``n1`` and ``n2`` are the total number of items
    across all sublists of the respective domain.

    The ratio is evaluated in log space and exponentiated at the end. The
    direct product of powers underflows to zero for realistic corpus sizes,
    which previously ended in a division by zero.

    NOTE(review): assumes ``domain_relevance.get_tf(domain, 1)`` returns
    absolute counts that can be indexed by term, including terms that do not
    occur in the contrastive domain -- TODO confirm.

    Args:
        target_domain: Re-iterable collection of term sequences for the
            domain of interest (it is traversed more than once).
        contrastive_domain: Re-iterable collection of term sequences for the
            domain to contrast against.
        candidates: Iterable of terms to score.

    Returns:
        dict mapping each candidate term to its lambda value. Values near 1
        mean the shared-probability hypothesis explains the counts about as
        well as separate probabilities; values near 0 mean the term's
        frequency differs strongly between the domains.

    Raises:
        ZeroDivisionError: If a domain contains no terms while ``candidates``
            is non-empty.
    """
    import math

    def log_likelihood(k, n, p):
        # log(p ** k * (1 - p) ** (n - k)). Factors whose exponent is 0 are
        # skipped, which mirrors 0 ** 0 == 1 and avoids evaluating log(0).
        result = 0.0
        if k > 0:
            result += k * math.log(p)
        if n - k > 0:
            result += (n - k) * math.log(1 - p)
        return result

    target_tf = domain_relevance.get_tf(target_domain, 1)
    contrast_tf = domain_relevance.get_tf(contrastive_domain, 1)

    # Total number of term occurrences per domain.
    n1 = sum(1 for sublist in target_domain for _ in sublist)
    n2 = sum(1 for sublist in contrastive_domain for _ in sublist)

    lambda_metric = {}
    for term in candidates:
        a = target_tf[term]
        b = contrast_tf[term]

        p = (a + b) / (n1 + n2)
        p1 = a / n1
        # The contrastive probability is built from the contrastive count b
        # (the earlier version mistakenly reused the target count a here).
        p2 = b / n2

        log_lambda = (
            log_likelihood(a, n1, p)
            + log_likelihood(b, n2, p)
            - log_likelihood(a, n1, p1)
            - log_likelihood(b, n2, p2)
        )
        lambda_metric[term] = math.exp(log_lambda)

    return lambda_metric