# Domain Relevance Evaluation

Comparing different methods to get domain relevant terms 

In [1]:
from tqdm import tqdm
from parts import collect, oie, domain_relevance, preprocessing

## Initial Load of Background Domains

In [None]:
# adac domain

def get_corpus(topic, domain, limit, export):
    corpus = []
    path = "resources/txt/"
    counter = 0
    links = collect.get_links(topic, domain, limit)
    for link in tqdm(links):
        if collect.get_text(link, domain):
            text = " ; ".join(collect.get_text(link, domain))
            if len(text) < 100000 and text:
                corpus.append(text)
                if export:
                    with open(path+domain+"/"+str(counter)+".txt", "w") as file:
                        file.write(text)
                    counter += 1
    return corpus

In [None]:
adac_corpus = get_corpus(0,"adac",0,1)

In [None]:
chefkoch_corpus = get_corpus("https://www.chefkoch.de/forum/1,27/Haus-Garten.html","chefkoch",5,1)

In [None]:
car_corpus = get_corpus("https://www.motor-talk.de/forum/audi-80-90-100-200-v8-b158.html","car",3,1)

## Load Background Domains and Extract Terms

In [85]:
from collections import Counter

def clean_uniques(terms):
    clean_terms = []
    counter = 0
    tf = Counter(terms)
    for term in tf:
        if tf[term] > 1:
            clean_terms.append(term)
        else:
            counter += 1
    print(counter,"terms removed due to uniques")
    return clean_terms

In [86]:
### import car_domain from files

def load_domain_terms(domain, limit, clean = 0, del_unique = 0):
    counter = 0
    path = "resources/txt/"
    corpus = []
    terms = []
    while counter < limit:
        try:
            with open(path+domain+"/"+str(counter)+".txt", "r") as file:
                corpus.append(file.read())
            counter += 1
        except:
            break
    
    for doc in tqdm(corpus):
        if del_unique:
            doc_terms = clean_uniques(oie.get_terms(doc))
        else:
            doc_terms = oie.get_terms(doc)
        terms.append(doc_terms)
    if clean:    
        terms = preprocessing.post_term_cleaning(terms)
        
    return terms

In [62]:
adac_domain = load_domain_terms("adac", 10000, 1, 0)

100%|██████████| 2524/2524 [01:20<00:00, 31.19it/s]


deleted time references: 0
deleted date references: 287
deleted links: 77
deleted quotes: 63
deleted ireg expressions: 38
deleted abbreviations: 206


In [87]:
car_domain = load_domain_terms("car", 10000, 1, 1)

  2%|▏         | 3/174 [00:00<00:08, 21.04it/s]

33 terms removed due to uniques
8 terms removed due to uniques
72 terms removed due to uniques
9 terms removed due to uniques
15 terms removed due to uniques


  5%|▍         | 8/174 [00:00<00:08, 19.47it/s]

98 terms removed due to uniques
50 terms removed due to uniques
67 terms removed due to uniques


  6%|▌         | 10/174 [00:00<00:13, 12.31it/s]

219 terms removed due to uniques
43 terms removed due to uniques
13 terms removed due to uniques


  7%|▋         | 12/174 [00:00<00:14, 11.17it/s]

130 terms removed due to uniques
35 terms removed due to uniques


 10%|█         | 18/174 [00:01<00:14, 10.83it/s]

368 terms removed due to uniques
54 terms removed due to uniques
8 terms removed due to uniques
6 terms removed due to uniques
32 terms removed due to uniques


 13%|█▎        | 22/174 [00:01<00:16,  9.21it/s]

247 terms removed due to uniques
57 terms removed due to uniques
73 terms removed due to uniques
44 terms removed due to uniques


 14%|█▍        | 24/174 [00:02<00:14, 10.36it/s]

35 terms removed due to uniques
99 terms removed due to uniques
76 terms removed due to uniques


 15%|█▍        | 26/174 [00:02<00:19,  7.43it/s]

316 terms removed due to uniques
132 terms removed due to uniques


 16%|█▌        | 28/174 [00:02<00:18,  7.71it/s]

57 terms removed due to uniques
155 terms removed due to uniques


 18%|█▊        | 32/174 [00:03<00:15,  9.27it/s]

50 terms removed due to uniques
108 terms removed due to uniques
28 terms removed due to uniques


 20%|██        | 35/174 [00:03<00:13, 10.43it/s]

84 terms removed due to uniques
15 terms removed due to uniques
67 terms removed due to uniques
38 terms removed due to uniques


 22%|██▏       | 38/174 [00:03<00:12, 10.81it/s]

53 terms removed due to uniques
172 terms removed due to uniques
44 terms removed due to uniques


 23%|██▎       | 40/174 [00:03<00:14,  9.51it/s]

176 terms removed due to uniques
125 terms removed due to uniques


 26%|██▌       | 45/174 [00:04<00:11, 10.75it/s]

112 terms removed due to uniques
35 terms removed due to uniques
41 terms removed due to uniques
23 terms removed due to uniques
22 terms removed due to uniques


 27%|██▋       | 47/174 [00:04<00:10, 12.20it/s]

65 terms removed due to uniques
13 terms removed due to uniques


 28%|██▊       | 49/174 [00:04<00:11, 11.18it/s]

135 terms removed due to uniques


 29%|██▉       | 51/174 [00:04<00:13,  9.12it/s]

176 terms removed due to uniques
29 terms removed due to uniques
46 terms removed due to uniques
34 terms removed due to uniques


 33%|███▎      | 57/174 [00:05<00:10, 11.57it/s]

54 terms removed due to uniques
8 terms removed due to uniques
57 terms removed due to uniques
120 terms removed due to uniques


 34%|███▍      | 59/174 [00:05<00:11,  9.83it/s]

64 terms removed due to uniques
175 terms removed due to uniques


 35%|███▌      | 61/174 [00:05<00:09, 11.42it/s]

61 terms removed due to uniques
16 terms removed due to uniques


 36%|███▌      | 63/174 [00:06<00:14,  7.47it/s]

226 terms removed due to uniques
92 terms removed due to uniques
54 terms removed due to uniques


 37%|███▋      | 65/174 [00:06<00:13,  8.31it/s]

71 terms removed due to uniques
16 terms removed due to uniques
15 terms removed due to uniques


 39%|███▉      | 68/174 [00:06<00:11,  9.51it/s]

81 terms removed due to uniques
61 terms removed due to uniques
27 terms removed due to uniques


 41%|████      | 71/174 [00:06<00:11,  8.89it/s]

154 terms removed due to uniques


 42%|████▏     | 73/174 [00:07<00:13,  7.66it/s]

222 terms removed due to uniques
87 terms removed due to uniques
17 terms removed due to uniques
55 terms removed due to uniques


 45%|████▍     | 78/174 [00:07<00:09, 10.58it/s]

26 terms removed due to uniques
84 terms removed due to uniques
18 terms removed due to uniques


 46%|████▌     | 80/174 [00:07<00:09,  9.61it/s]

31 terms removed due to uniques
149 terms removed due to uniques


 49%|████▉     | 86/174 [00:08<00:10,  8.45it/s]

293 terms removed due to uniques
35 terms removed due to uniques
18 terms removed due to uniques
21 terms removed due to uniques
20 terms removed due to uniques
25 terms removed due to uniques
103 terms removed due to uniques


 51%|█████     | 88/174 [00:08<00:13,  6.29it/s]

126 terms removed due to uniques
15 terms removed due to uniques
41 terms removed due to uniques


 52%|█████▏    | 91/174 [00:09<00:11,  7.04it/s]

145 terms removed due to uniques


 53%|█████▎    | 93/174 [00:09<00:15,  5.34it/s]

154 terms removed due to uniques
31 terms removed due to uniques
45 terms removed due to uniques


 56%|█████▌    | 97/174 [00:10<00:10,  7.22it/s]

125 terms removed due to uniques
57 terms removed due to uniques
55 terms removed due to uniques
42 terms removed due to uniques


 58%|█████▊    | 101/174 [00:10<00:08,  9.12it/s]

45 terms removed due to uniques
114 terms removed due to uniques
53 terms removed due to uniques


 60%|█████▉    | 104/174 [00:10<00:06, 10.13it/s]

22 terms removed due to uniques
12 terms removed due to uniques
126 terms removed due to uniques


 61%|██████    | 106/174 [00:11<00:07,  9.29it/s]

63 terms removed due to uniques
137 terms removed due to uniques


 63%|██████▎   | 109/174 [00:11<00:05, 11.07it/s]

47 terms removed due to uniques
37 terms removed due to uniques
66 terms removed due to uniques
18 terms removed due to uniques
20 terms removed due to uniques


 67%|██████▋   | 116/174 [00:11<00:04, 13.11it/s]

216 terms removed due to uniques
9 terms removed due to uniques
22 terms removed due to uniques
41 terms removed due to uniques
102 terms removed due to uniques


 68%|██████▊   | 118/174 [00:11<00:03, 14.41it/s]

19 terms removed due to uniques
78 terms removed due to uniques
63 terms removed due to uniques


 71%|███████   | 123/174 [00:11<00:02, 17.22it/s]

63 terms removed due to uniques
15 terms removed due to uniques
44 terms removed due to uniques
34 terms removed due to uniques


 72%|███████▏  | 125/174 [00:12<00:03, 12.98it/s]

215 terms removed due to uniques
23 terms removed due to uniques
38 terms removed due to uniques


 76%|███████▋  | 133/174 [00:12<00:02, 15.44it/s]

66 terms removed due to uniques
20 terms removed due to uniques
8 terms removed due to uniques
19 terms removed due to uniques
22 terms removed due to uniques
18 terms removed due to uniques
24 terms removed due to uniques
26 terms removed due to uniques


 78%|███████▊  | 136/174 [00:12<00:02, 16.30it/s]

43 terms removed due to uniques
83 terms removed due to uniques


 80%|███████▉  | 139/174 [00:13<00:03,  9.84it/s]

318 terms removed due to uniques
73 terms removed due to uniques
55 terms removed due to uniques
38 terms removed due to uniques
8 terms removed due to uniques
16 terms removed due to uniques


 84%|████████▍ | 146/174 [00:13<00:02, 12.68it/s]

177 terms removed due to uniques
42 terms removed due to uniques
30 terms removed due to uniques
5 terms removed due to uniques
29 terms removed due to uniques


 86%|████████▌ | 150/174 [00:14<00:01, 12.22it/s]

189 terms removed due to uniques
143 terms removed due to uniques
27 terms removed due to uniques


 89%|████████▉ | 155/174 [00:14<00:01, 13.67it/s]

150 terms removed due to uniques
54 terms removed due to uniques
26 terms removed due to uniques
17 terms removed due to uniques
59 terms removed due to uniques


 90%|█████████ | 157/174 [00:14<00:01, 12.76it/s]

90 terms removed due to uniques
42 terms removed due to uniques
42 terms removed due to uniques


 91%|█████████▏| 159/174 [00:14<00:01,  8.88it/s]

212 terms removed due to uniques


 93%|█████████▎| 161/174 [00:15<00:01,  7.54it/s]

205 terms removed due to uniques
81 terms removed due to uniques
78 terms removed due to uniques


 95%|█████████▌| 166/174 [00:15<00:00,  9.37it/s]

183 terms removed due to uniques
16 terms removed due to uniques
59 terms removed due to uniques
71 terms removed due to uniques
39 terms removed due to uniques


 97%|█████████▋| 168/174 [00:15<00:00,  9.05it/s]

147 terms removed due to uniques
87 terms removed due to uniques


 98%|█████████▊| 170/174 [00:16<00:00,  8.95it/s]

153 terms removed due to uniques
211 terms removed due to uniques


 99%|█████████▉| 172/174 [00:16<00:00,  5.88it/s]

343 terms removed due to uniques


100%|██████████| 174/174 [00:17<00:00,  9.93it/s]

210 terms removed due to uniques
166 terms removed due to uniques





deleted time references: 2
deleted date references: 4
deleted links: 12
deleted quotes: 70
deleted ireg expressions: 13
deleted abbreviations: 51


In [88]:
chefkoch_domain = load_domain_terms("chefkoch", 10000, 1, 1)

  1%|          | 2/279 [00:00<00:24, 11.48it/s]

55 terms removed due to uniques
158 terms removed due to uniques


  1%|          | 3/279 [00:01<01:58,  2.34it/s]

0 terms removed due to uniques


  2%|▏         | 6/279 [00:01<01:28,  3.08it/s]

0 terms removed due to uniques
64 terms removed due to uniques
51 terms removed due to uniques


  3%|▎         | 8/279 [00:02<01:09,  3.87it/s]

56 terms removed due to uniques
124 terms removed due to uniques
90 terms removed due to uniques


  4%|▎         | 10/279 [00:03<01:31,  2.95it/s]

0 terms removed due to uniques
32 terms removed due to uniques


  4%|▍         | 12/279 [00:03<01:12,  3.70it/s]

203 terms removed due to uniques


  5%|▌         | 14/279 [00:04<01:14,  3.54it/s]

0 terms removed due to uniques
98 terms removed due to uniques
66 terms removed due to uniques


  6%|▌         | 17/279 [00:04<00:55,  4.68it/s]

0 terms removed due to uniques
98 terms removed due to uniques


  7%|▋         | 19/279 [00:05<01:13,  3.56it/s]

1 terms removed due to uniques
130 terms removed due to uniques


  8%|▊         | 21/279 [00:05<01:00,  4.24it/s]

31 terms removed due to uniques
151 terms removed due to uniques


  8%|▊         | 22/279 [00:06<01:15,  3.39it/s]

0 terms removed due to uniques


  9%|▉         | 25/279 [00:06<01:13,  3.47it/s]

0 terms removed due to uniques
74 terms removed due to uniques
28 terms removed due to uniques
71 terms removed due to uniques


 10%|█         | 29/279 [00:07<00:43,  5.68it/s]

89 terms removed due to uniques
54 terms removed due to uniques
78 terms removed due to uniques
45 terms removed due to uniques


 12%|█▏        | 34/279 [00:07<00:27,  8.82it/s]

48 terms removed due to uniques
59 terms removed due to uniques
62 terms removed due to uniques
22 terms removed due to uniques


 13%|█▎        | 36/279 [00:08<01:13,  3.29it/s]

0 terms removed due to uniques
104 terms removed due to uniques


 14%|█▎        | 38/279 [00:09<01:25,  2.81it/s]

0 terms removed due to uniques
76 terms removed due to uniques


 15%|█▍        | 41/279 [00:10<00:58,  4.05it/s]

205 terms removed due to uniques
31 terms removed due to uniques
104 terms removed due to uniques


 15%|█▌        | 42/279 [00:10<00:49,  4.82it/s]

87 terms removed due to uniques
19 terms removed due to uniques


 16%|█▌        | 44/279 [00:11<01:22,  2.85it/s]

1 terms removed due to uniques


 16%|█▌        | 45/279 [00:12<01:49,  2.14it/s]

1 terms removed due to uniques
12 terms removed due to uniques


 17%|█▋        | 47/279 [00:12<01:33,  2.49it/s]

1 terms removed due to uniques
87 terms removed due to uniques


 18%|█▊        | 49/279 [00:13<01:13,  3.13it/s]

162 terms removed due to uniques
25 terms removed due to uniques


 19%|█▊        | 52/279 [00:14<01:21,  2.80it/s]

0 terms removed due to uniques
154 terms removed due to uniques
50 terms removed due to uniques
11 terms removed due to uniques


 20%|██        | 56/279 [00:15<00:55,  4.00it/s]

0 terms removed due to uniques
139 terms removed due to uniques
80 terms removed due to uniques


 21%|██        | 58/279 [00:16<00:57,  3.85it/s]

0 terms removed due to uniques
40 terms removed due to uniques


 22%|██▏       | 61/279 [00:16<00:47,  4.62it/s]

0 terms removed due to uniques
154 terms removed due to uniques


 23%|██▎       | 65/279 [00:16<00:30,  7.11it/s]

50 terms removed due to uniques
69 terms removed due to uniques
71 terms removed due to uniques
36 terms removed due to uniques
0 terms removed due to uniques


 24%|██▍       | 67/279 [00:19<01:33,  2.26it/s]

0 terms removed due to uniques


 25%|██▌       | 70/279 [00:19<01:05,  3.19it/s]

0 terms removed due to uniques
54 terms removed due to uniques
112 terms removed due to uniques


 25%|██▌       | 71/279 [00:19<00:56,  3.69it/s]

156 terms removed due to uniques


 26%|██▌       | 72/279 [00:20<01:23,  2.48it/s]

0 terms removed due to uniques


 27%|██▋       | 74/279 [00:20<00:58,  3.53it/s]

0 terms removed due to uniques
96 terms removed due to uniques


 27%|██▋       | 75/279 [00:21<00:48,  4.23it/s]

140 terms removed due to uniques
33 terms removed due to uniques


 28%|██▊       | 79/279 [00:21<00:33,  5.95it/s]

1 terms removed due to uniques
67 terms removed due to uniques
32 terms removed due to uniques


 29%|██▉       | 81/279 [00:22<00:57,  3.44it/s]

0 terms removed due to uniques
164 terms removed due to uniques


 30%|██▉       | 83/279 [00:22<00:43,  4.50it/s]

120 terms removed due to uniques
106 terms removed due to uniques


 30%|███       | 85/279 [00:23<00:44,  4.37it/s]

2 terms removed due to uniques
102 terms removed due to uniques
72 terms removed due to uniques


 31%|███       | 87/279 [00:23<00:35,  5.47it/s]

71 terms removed due to uniques


 32%|███▏      | 90/279 [00:24<00:32,  5.84it/s]

0 terms removed due to uniques
66 terms removed due to uniques
86 terms removed due to uniques


 33%|███▎      | 91/279 [00:24<00:42,  4.43it/s]

0 terms removed due to uniques
46 terms removed due to uniques


 33%|███▎      | 93/279 [00:24<00:40,  4.63it/s]

1 terms removed due to uniques
81 terms removed due to uniques


 34%|███▍      | 95/279 [00:25<00:42,  4.30it/s]

0 terms removed due to uniques
58 terms removed due to uniques


 35%|███▍      | 97/279 [00:25<00:35,  5.07it/s]

142 terms removed due to uniques


 35%|███▌      | 98/279 [00:26<01:25,  2.12it/s]

0 terms removed due to uniques


 35%|███▌      | 99/279 [00:26<01:12,  2.47it/s]

1 terms removed due to uniques


 36%|███▌      | 100/279 [00:27<01:16,  2.34it/s]

2 terms removed due to uniques


 36%|███▌      | 101/279 [00:27<01:08,  2.59it/s]

0 terms removed due to uniques


 37%|███▋      | 104/279 [00:28<00:54,  3.19it/s]

0 terms removed due to uniques
67 terms removed due to uniques
52 terms removed due to uniques


 38%|███▊      | 105/279 [00:29<01:15,  2.31it/s]

0 terms removed due to uniques


 38%|███▊      | 106/279 [00:29<01:05,  2.64it/s]

0 terms removed due to uniques


 39%|███▉      | 109/279 [00:30<01:15,  2.25it/s]

0 terms removed due to uniques
7 terms removed due to uniques
86 terms removed due to uniques
82 terms removed due to uniques


 40%|███▉      | 111/279 [00:31<01:10,  2.40it/s]

0 terms removed due to uniques
82 terms removed due to uniques


 41%|████      | 113/279 [00:31<00:59,  2.78it/s]

2 terms removed due to uniques
29 terms removed due to uniques


 42%|████▏     | 117/279 [00:32<00:37,  4.34it/s]

0 terms removed due to uniques
42 terms removed due to uniques
54 terms removed due to uniques


 42%|████▏     | 118/279 [00:32<00:35,  4.50it/s]

0 terms removed due to uniques


 43%|████▎     | 121/279 [00:33<00:47,  3.36it/s]

1 terms removed due to uniques
44 terms removed due to uniques
60 terms removed due to uniques


 44%|████▎     | 122/279 [00:33<00:49,  3.18it/s]

1 terms removed due to uniques


 44%|████▍     | 123/279 [00:33<00:47,  3.27it/s]

2 terms removed due to uniques


 45%|████▌     | 126/279 [00:34<00:40,  3.81it/s]

1 terms removed due to uniques
66 terms removed due to uniques
73 terms removed due to uniques


 46%|████▌     | 127/279 [00:34<00:35,  4.30it/s]

0 terms removed due to uniques


 46%|████▌     | 128/279 [00:35<00:39,  3.82it/s]

1 terms removed due to uniques
37 terms removed due to uniques
56 terms removed due to uniques


 47%|████▋     | 131/279 [00:35<00:34,  4.33it/s]

1 terms removed due to uniques


 47%|████▋     | 132/279 [00:35<00:34,  4.26it/s]

0 terms removed due to uniques
17 terms removed due to uniques
22 terms removed due to uniques


 48%|████▊     | 135/279 [00:36<00:30,  4.78it/s]

0 terms removed due to uniques
43 terms removed due to uniques
28 terms removed due to uniques


 50%|█████     | 140/279 [00:36<00:19,  6.98it/s]

0 terms removed due to uniques
28 terms removed due to uniques
107 terms removed due to uniques


 51%|█████     | 142/279 [00:37<00:35,  3.87it/s]

0 terms removed due to uniques
113 terms removed due to uniques


 52%|█████▏    | 144/279 [00:37<00:31,  4.32it/s]

220 terms removed due to uniques
118 terms removed due to uniques


 52%|█████▏    | 146/279 [00:38<00:26,  4.99it/s]

3 terms removed due to uniques
81 terms removed due to uniques


 53%|█████▎    | 147/279 [00:38<00:44,  2.95it/s]

0 terms removed due to uniques
45 terms removed due to uniques


 53%|█████▎    | 149/279 [00:39<00:44,  2.89it/s]

1 terms removed due to uniques
64 terms removed due to uniques


 54%|█████▍    | 151/279 [00:39<00:37,  3.45it/s]

0 terms removed due to uniques
35 terms removed due to uniques
68 terms removed due to uniques


 55%|█████▌    | 154/279 [00:41<00:49,  2.50it/s]

0 terms removed due to uniques


 56%|█████▌    | 156/279 [00:43<01:16,  1.60it/s]

1 terms removed due to uniques
174 terms removed due to uniques


 56%|█████▋    | 157/279 [00:44<00:58,  2.07it/s]

168 terms removed due to uniques
60 terms removed due to uniques


 57%|█████▋    | 159/279 [00:44<00:49,  2.44it/s]

1 terms removed due to uniques
12 terms removed due to uniques


 58%|█████▊    | 161/279 [00:44<00:39,  3.02it/s]

0 terms removed due to uniques


 58%|█████▊    | 162/279 [00:45<00:35,  3.28it/s]

0 terms removed due to uniques


 59%|█████▉    | 165/279 [00:45<00:32,  3.50it/s]

0 terms removed due to uniques
58 terms removed due to uniques
46 terms removed due to uniques


 59%|█████▉    | 166/279 [00:45<00:27,  4.16it/s]

100 terms removed due to uniques


 60%|█████▉    | 167/279 [00:46<00:34,  3.29it/s]

0 terms removed due to uniques


 61%|██████    | 170/279 [00:48<00:55,  1.95it/s]

1 terms removed due to uniques
78 terms removed due to uniques
72 terms removed due to uniques


 61%|██████▏   | 171/279 [00:48<00:44,  2.43it/s]

147 terms removed due to uniques
34 terms removed due to uniques


 62%|██████▏   | 173/279 [00:48<00:36,  2.91it/s]

3 terms removed due to uniques


 62%|██████▏   | 174/279 [00:49<00:40,  2.60it/s]

1 terms removed due to uniques
108 terms removed due to uniques


 63%|██████▎   | 177/279 [00:51<00:43,  2.35it/s]

1 terms removed due to uniques
137 terms removed due to uniques


 64%|██████▍   | 178/279 [00:51<00:37,  2.72it/s]

1 terms removed due to uniques
69 terms removed due to uniques


 65%|██████▍   | 181/279 [00:52<00:28,  3.47it/s]

2 terms removed due to uniques
112 terms removed due to uniques


 65%|██████▌   | 182/279 [00:52<00:24,  4.00it/s]

109 terms removed due to uniques


 66%|██████▌   | 184/279 [00:52<00:25,  3.73it/s]

0 terms removed due to uniques
100 terms removed due to uniques


 66%|██████▋   | 185/279 [00:54<00:49,  1.89it/s]

0 terms removed due to uniques


 67%|██████▋   | 187/279 [00:54<00:32,  2.85it/s]

2 terms removed due to uniques
95 terms removed due to uniques


 67%|██████▋   | 188/279 [00:54<00:25,  3.61it/s]

84 terms removed due to uniques


 68%|██████▊   | 190/279 [00:55<00:29,  2.97it/s]

0 terms removed due to uniques
120 terms removed due to uniques


 69%|██████▉   | 192/279 [00:55<00:25,  3.48it/s]

0 terms removed due to uniques
101 terms removed due to uniques


 69%|██████▉   | 193/279 [00:57<00:54,  1.57it/s]

0 terms removed due to uniques


 70%|██████▉   | 194/279 [00:57<00:44,  1.89it/s]

235 terms removed due to uniques
74 terms removed due to uniques


 70%|███████   | 196/279 [00:58<00:40,  2.04it/s]

0 terms removed due to uniques


 71%|███████   | 198/279 [00:58<00:26,  3.03it/s]

0 terms removed due to uniques
135 terms removed due to uniques


 72%|███████▏  | 201/279 [00:59<00:17,  4.55it/s]

96 terms removed due to uniques
80 terms removed due to uniques
61 terms removed due to uniques


 73%|███████▎  | 203/279 [00:59<00:13,  5.50it/s]

59 terms removed due to uniques
128 terms removed due to uniques


 74%|███████▍  | 206/279 [01:00<00:23,  3.09it/s]

2 terms removed due to uniques
52 terms removed due to uniques
34 terms removed due to uniques


 75%|███████▍  | 208/279 [01:00<00:18,  3.87it/s]

65 terms removed due to uniques
119 terms removed due to uniques


 75%|███████▍  | 209/279 [01:00<00:14,  4.70it/s]

83 terms removed due to uniques


 75%|███████▌  | 210/279 [01:01<00:22,  3.06it/s]

0 terms removed due to uniques


 76%|███████▌  | 211/279 [01:02<00:29,  2.33it/s]

2 terms removed due to uniques
60 terms removed due to uniques


 76%|███████▋  | 213/279 [01:02<00:24,  2.73it/s]

0 terms removed due to uniques


 77%|███████▋  | 214/279 [01:03<00:36,  1.78it/s]

0 terms removed due to uniques


 77%|███████▋  | 215/279 [01:04<00:36,  1.76it/s]

0 terms removed due to uniques


 77%|███████▋  | 216/279 [01:04<00:30,  2.05it/s]

0 terms removed due to uniques


 78%|███████▊  | 217/279 [01:05<00:33,  1.88it/s]

0 terms removed due to uniques


 78%|███████▊  | 218/279 [01:05<00:26,  2.27it/s]

1 terms removed due to uniques
42 terms removed due to uniques


 79%|███████▉  | 221/279 [01:05<00:17,  3.34it/s]

2 terms removed due to uniques
89 terms removed due to uniques


 80%|███████▉  | 223/279 [01:06<00:13,  4.01it/s]

1 terms removed due to uniques
165 terms removed due to uniques


 81%|████████  | 225/279 [01:06<00:10,  5.30it/s]

83 terms removed due to uniques
124 terms removed due to uniques
42 terms removed due to uniques


 82%|████████▏ | 229/279 [01:06<00:06,  7.86it/s]

89 terms removed due to uniques
64 terms removed due to uniques
66 terms removed due to uniques


 83%|████████▎ | 231/279 [01:07<00:08,  5.94it/s]

0 terms removed due to uniques
36 terms removed due to uniques
49 terms removed due to uniques


 84%|████████▍ | 235/279 [01:09<00:11,  3.68it/s]

0 terms removed due to uniques
75 terms removed due to uniques
64 terms removed due to uniques


 85%|████████▍ | 237/279 [01:09<00:08,  4.81it/s]

68 terms removed due to uniques
46 terms removed due to uniques


 86%|████████▌ | 239/279 [01:09<00:08,  4.56it/s]

2 terms removed due to uniques
129 terms removed due to uniques


 87%|████████▋ | 242/279 [01:10<00:09,  4.00it/s]

2 terms removed due to uniques
58 terms removed due to uniques
43 terms removed due to uniques


 87%|████████▋ | 244/279 [01:10<00:08,  4.12it/s]

0 terms removed due to uniques
96 terms removed due to uniques
57 terms removed due to uniques


 89%|████████▊ | 247/279 [01:11<00:07,  4.55it/s]

0 terms removed due to uniques
59 terms removed due to uniques


 89%|████████▉ | 248/279 [01:12<00:09,  3.28it/s]

1 terms removed due to uniques


 90%|████████▉ | 250/279 [01:12<00:08,  3.47it/s]

0 terms removed due to uniques
162 terms removed due to uniques


 90%|████████▉ | 251/279 [01:13<00:08,  3.30it/s]

0 terms removed due to uniques


 90%|█████████ | 252/279 [01:13<00:10,  2.54it/s]

0 terms removed due to uniques


 91%|█████████ | 253/279 [01:14<00:10,  2.51it/s]

1 terms removed due to uniques
72 terms removed due to uniques


 91%|█████████▏| 255/279 [01:14<00:07,  3.05it/s]

0 terms removed due to uniques


 92%|█████████▏| 257/279 [01:15<00:06,  3.40it/s]

0 terms removed due to uniques
109 terms removed due to uniques
58 terms removed due to uniques


 93%|█████████▎| 260/279 [01:15<00:04,  4.23it/s]

0 terms removed due to uniques
131 terms removed due to uniques


 94%|█████████▎| 261/279 [01:16<00:06,  2.98it/s]

1 terms removed due to uniques


 94%|█████████▍| 263/279 [01:16<00:04,  3.91it/s]

1 terms removed due to uniques
125 terms removed due to uniques
67 terms removed due to uniques


 96%|█████████▌| 268/279 [01:17<00:02,  4.73it/s]

0 terms removed due to uniques
32 terms removed due to uniques
52 terms removed due to uniques
112 terms removed due to uniques
50 terms removed due to uniques


 97%|█████████▋| 270/279 [01:18<00:02,  3.79it/s]

1 terms removed due to uniques
69 terms removed due to uniques


 97%|█████████▋| 272/279 [01:18<00:01,  4.32it/s]

0 terms removed due to uniques


 98%|█████████▊| 273/279 [01:18<00:01,  4.38it/s]

130 terms removed due to uniques


 98%|█████████▊| 274/279 [01:18<00:01,  3.78it/s]

0 terms removed due to uniques


 99%|█████████▊| 275/279 [01:19<00:01,  3.52it/s]

1 terms removed due to uniques


 99%|█████████▉| 276/279 [01:19<00:00,  3.60it/s]

0 terms removed due to uniques


100%|██████████| 279/279 [01:20<00:00,  3.48it/s]

0 terms removed due to uniques
63 terms removed due to uniques
35 terms removed due to uniques





deleted time references: 0
deleted date references: 168
deleted links: 38
deleted quotes: 165
deleted ireg expressions: 109
deleted abbreviations: 94


## Calculate Metrics 

In [89]:
def get_relevancy(terms, metric):
    tf = domain_relevance.get_tf(terms)
    idf = domain_relevance.get_idf(terms)
    tdf = domain_relevance.get_tdf(terms)
    tf_tdf = {}
    
    if metric == "tf":
        return tf
    elif metric == "tf_tdf":
        for term in set([item for sublist in terms for item in sublist]):
            tf_tdf[term] = tf[term]*tdf[term]
        return tf_tdf
    elif metric == "tf_idf":
        for term in set([item for sublist in terms for item in sublist]):
            tf_idf[term] = tf[term]*idf[term]
        return tf_idf

    else:
        return False
    

In [90]:
metric = "tf_tdf"

In [91]:
adac_relevance = get_relevancy(adac_domain, metric)

In [92]:
car_relevance = get_relevancy(car_domain, metric)

In [93]:
chefkoch_relevance = get_relevancy(chefkoch_domain, metric)

## Select Concepts from Background Knowledge

In [123]:
def get_concepts(candidates, car_terms, adac_terms, chefkoch_terms):
    concepts = set()
    counter1 = 0
    counter2 = 0
    
    for candidate in candidates:
        if adac_terms[candidate] and not chefkoch_terms[candidate]:
            concepts.add(candidate)
            counter1 += 1
        if car_terms[candidate] and not chefkoch_terms[candidate]:
            concepts.add(candidate)
            counter2 += 1
        #else:
            #print(candidate)
            
    print(counter1, counter2)
    return concepts

In [124]:
candidates = set([item for sublist in car_domain for item in sublist])

In [125]:
concepts = get_concepts(candidates, car_relevance, adac_relevance, chefkoch_relevance)

322 2005


In [126]:
print(len(candidates), "VS.", len(concepts))

2737 VS. 2005


In [129]:
concepts

{'3000 €',
 'kraftstoffdruckregler',
 'durchführungen',
 'wasserkastenabdeckung',
 'aob',
 'scheuerstelle',
 'zulassungsstelle',
 '19 zöller',
 'bosch classic',
 'jp',
 'zahnstange',
 'audi_90_quattro_20v',
 'inus',
 'dichtringe',
 'classic',
 'weg ziel',
 'kühlmittelanschlüsse',
 'quarantäne',
 'kreislauf',
 '0,5 mm^2',
 'sportlenkrad',
 'moly',
 'regerge',
 'kompression',
 'anfängerauto',
 'ausführlichen anleitung',
 'flitzer',
 'audi90',
 'abschliessen',
 'simple vorderachse',
 'letzten 6 wochen',
 'stellelement',
 'ot markierung',
 'leuchtmittel',
 'mitm',
 '13d lenkrad',
 'ng',
 'kunststoffspachtelmasse',
 'schaumstoffblock',
 'kombieinstrument',
 'warmem motor',
 'ablaufschlauch',
 'pumpenrelais',
 'klimatronik',
 '4 komplette heber',
 'übrigen leitungen',
 'schaufelrad',
 'neueren modellen',
 '2 austauschlenkgetriebe',
 'simmerring',
 'ensetzten',
 'knarren',
 'o-ringe',
 '5er',
 'windschutzscheibenblende',
 'ölfilter adapter',
 'wärmetauscher',
 'zv-pumpe',
 'dranhängen',
 'ot'

In [128]:
#concepts = list(set([item for sublist in adac_domain for item in sublist]))
with open("concepts.txt", "w") as fp:
    fp.writelines('\n'.join(concepts))

In [115]:
with open("concepts.txt", "r") as f:
    content = f.readlines()
# you may also want to remove whitespace characters like `\n` at the end of each line
content = [x.strip() for x in content] 

content

['3000 €',
 'kraftstoffdruckregler',
 'durchführungen',
 'wasserkastenabdeckung',
 'aob',
 'scheuerstelle',
 'zulassungsstelle',
 '19 zöller',
 'bosch classic',
 'jp',
 'zahnstange',
 'audi_90_quattro_20v',
 'inus',
 'dichtringe',
 'drinne',
 'classic',
 'weg ziel',
 'kühlmittelanschlüsse',
 'quarantäne',
 'kreislauf',
 'italien',
 '0,5 mm^2',
 'sportlenkrad',
 'moly',
 'regerge',
 'laufzeit',
 'kompression',
 'anfängerauto',
 'ausführlichen anleitung',
 'flitzer',
 'modelle',
 'audi90',
 'abschliessen',
 'gruss',
 'simple vorderachse',
 'letzten 6 wochen',
 'stellelement',
 'quelle',
 'ot markierung',
 'leuchtmittel',
 'fehler',
 'mitm',
 '13d lenkrad',
 'ng',
 'kunststoffspachtelmasse',
 'schaumstoffblock',
 'mühen',
 'kombieinstrument',
 'warmem motor',
 'ablaufschlauch',
 'pumpenrelais',
 'klimatronik',
 '4 komplette heber',
 'übrigen leitungen',
 'schaufelrad',
 'neueren modellen',
 '2 austauschlenkgetriebe',
 'simmerring',
 'manschette',
 'ensetzten',
 'knarren',
 'o-ringe',
 '5e

In [136]:
from random import sample

testset = sample(candidates, 500)

In [139]:
with open("testset.txt", "w") as fp:
    fp.writelines('\n'.join(testset))