# Domain Relevancy Sample

In [81]:
from collections import Counter
import numpy as np

## Generation of two sample datasets
Each set consists of a list of documents which each contain a list of terms defining the document. Storing the domains as "list of lists" allows to calculate term-frequency as well as term-document-frequency or inverse-document-frequency more easily in later stages.

In [57]:
doc1 = ["hallo", "auto", "problem", "grüße"]
doc2 = ["hallo", "auto", "sensor", "fehler"]
doc3 = ["hallo", "reifen", "fehler", "dank"]
doc4 = ["hi", "sensor", "kaputt", "grüße"]

target_domain = [doc1, doc2, doc3, doc4]

In [58]:
doc1 = ["hallo", "haus", "garten", "grüße"]
doc2 = ["hallo", "sommer", "haus", "grüße"]
doc3 = ["hi", "regen", "pflanzen", "dank"]
doc4 = ["hi", "garten", "zaun", "grüße"]

contrastive_domain = [doc1, doc2, doc3]

# Calculating Relevancy Measures
Presented are three approaches: 
1. combinations of term-frequency, term-document-frequency, inverse-term-document-frequency
2. domain relevance / domain consensus measure from "Ontolearn" System (link!!!)
3. log-likelihood measure from textbook 

### 1a) normalized term-frequency
tf = term-frequency / max_term-frequency

**idea:** terms that appear most are imporant 

In [70]:
def get_tf(terms):
    flat_terms = [item for sublist in terms for item in sublist]
    tf = Counter(flat_terms)
    max_freq = Counter(flat_terms).most_common(1)[0][1]
    for t in tf:
        tf[t] = (tf[t] / max_freq)

    return tf

In [75]:
get_tf(target_domain)

Counter({'hallo': 1.0,
         'auto': 0.6666666666666666,
         'problem': 0.3333333333333333,
         'grüße': 0.6666666666666666,
         'sensor': 0.6666666666666666,
         'fehler': 0.6666666666666666,
         'reifen': 0.3333333333333333,
         'dank': 0.3333333333333333,
         'hi': 0.3333333333333333,
         'kaputt': 0.3333333333333333})

### 1b) normalized term-document-frequency
tdf = term-document-frequency / max_term-document-frequency

**idea:** terms that appear often in docuemts are important

In [78]:
def get_tdf(terms):
    flat_terms = [item for sublist in terms for item in set(sublist)]
    tdf = Counter(flat_terms)
    max_freq = Counter(flat_terms).most_common(1)[0][1]
    for t in tdf:
        tdf[t] = tdf[t] / max_freq

    return tdf

In [77]:
get_tdf(target_domain)

Counter({'hallo': 1.0,
         'auto': 0.6666666666666666,
         'problem': 0.3333333333333333,
         'grüße': 0.6666666666666666,
         'fehler': 0.6666666666666666,
         'sensor': 0.6666666666666666,
         'reifen': 0.3333333333333333,
         'dank': 0.3333333333333333,
         'kaputt': 0.3333333333333333,
         'hi': 0.3333333333333333})

### 1c) inverse-term-document-frequency
idf = log2( 1 / tdf[])

**idea:** terms that appear too much are irrelevant

(commonly used to filter out words like is, have, i, you)

In [82]:
def get_idf(terms):
    flat_terms = [item for sublist in terms for item in set(sublist)]
    idf = Counter(flat_terms)
    for t in idf:
        idf[t] = np.log2(len(terms) / idf[t])

    return idf

In [83]:
get_idf(contrastive_domain)

Counter({'hallo': 0.5849625007211562,
         'haus': 0.5849625007211562,
         'grüße': 0.5849625007211562,
         'garten': 1.584962500721156,
         'sommer': 1.584962500721156,
         'pflanzen': 1.584962500721156,
         'dank': 1.584962500721156,
         'hi': 1.584962500721156,
         'regen': 1.584962500721156})

### 2) domain relevance / domain consensus measure (Ontolearn)
DR = target-frequency / contrastive-frequency 

DC = target-tdf * log2( 1 / target-tdf )

DW = alpha * DR + (1-alpha) * DC

**idea:** domain relevane (DR) determines whether a term is more important to the target or contrastive domain, domain consensus determines whether that term is generic (i.e. appears a lot in every domain) - final result is a combination of both measures 

In [97]:
def get_dr(target_domain, contrastive_domain, candidates):
    # candidates should be part of the target domain
    dr = {}
    target_tf = get_tf(target_domain)
    contrastive_tf = get_tf(contrastive_domain)

    for term in candidates:
        try:
            dr[term] = target_tf[term] / contrastive_tf[term]
        except ZeroDivisionError:
            dr[term] = 0

    return dr


def get_dc(target_domain, candidates):
    dc = {}
    target_tdf = get_tdf(target_domain)

    for term in candidates:
        dc[term] = target_tdf[term]*np.log2(1/target_tdf[term])

    return dc


def get_dw(target_domain, contrastive_domain, candidates, alpha):
    dw = {}
    dr = get_dr(target_domain, contrastive_domain, candidates)
    dc = get_dc(target_domain, candidates)

    for term in candidates:
        dw[term] = alpha * dr[term] + (1 - alpha) * dc[term]

    return dw

In [98]:
candidates = set([item for sublist in terms for item in sublist])
get_dw(target_domain, contrastive_domain, candidates, 0.5)

{'hi': 0.5974937501201927,
 'kaputt': 0.2641604167868593,
 'hallo': 0.5,
 'dank': 0.5974937501201927,
 'fehler': 0.1949875002403854,
 'grüße': 0.5283208335737187,
 'sensor': 0.1949875002403854,
 'reifen': 0.2641604167868593,
 'problem': 0.2641604167868593,
 'auto': 0.1949875002403854}

### 3) Log-likelihood-ratio
llr = log( target-tf ) - log( contrastive-tf )

In [103]:
def get_llr(target_domain, contrastive_domain, candidates):
    # candidates should be part of the target domain
    llr = {}
    target_tf = get_tf(target_domain)
    contrastive_tf = get_tf(contrastive_domain)
    
    for term in candidates: 
        target_tf[term] = 0.1 if not target_tf[term] else target_tf[term]
        contrastive_tf[term] = 0.1 if not contrastive_tf[term] else contrastive_tf[term]
        
        llr[term] = np.log(target_tf[term]) - np.log(contrastive_tf[term])
 
    return llr

In [104]:
get_llr(target_domain, contrastive_domain, candidates)

{'hi': -0.4054651081081645,
 'kaputt': 1.2039728043259357,
 'hallo': 0.0,
 'dank': -0.4054651081081645,
 'fehler': 1.897119984885881,
 'grüße': -0.40546510810816444,
 'sensor': 1.897119984885881,
 'reifen': 1.2039728043259357,
 'problem': 1.2039728043259357,
 'auto': 1.897119984885881}

In [None]:
def get_concepts(candidates, target_domain, contrastive_domain):
    concepts = set()
    
    for candidate in candidates:
        try:
            contrastive_domain[candidate]
        except KeyError:
            contrastive_domain[candidate] = 0
            
        if adac_terms[candidate] > chefkoch_terms[candidate]:
            concepts.add(candidate)
    return concepts