# Load spaCy

In [2]:
import spacy
import pickle
import numpy as np
from tqdm import tqdm
from collections import Counter

# GPU usage is left disabled; uncomment the next line to let spaCy prefer a GPU.
#spacy.prefer_gpu()

# Load the "de_dep_news_trf" spaCy pipeline.
nlp = spacy.load("de_dep_news_trf")
# Raise the pipeline's maximum text length to 17 million.
nlp.max_length = 17_000_000

# Load pre-processed Datasets

In [3]:
# Load the pre-processed training documents.
# The context manager closes the file handle as soon as unpickling is done.
# NOTE: pickle.load can execute arbitrary code; only load files produced by this project.
with open("data/train_doc_trf.p", "rb") as train_docs_file:
    train_doc_trf_loaded = pickle.load(train_docs_file)

In [6]:
# Load the pre-processed validation and test documents.
# Each file is opened in a context manager so its handle is closed right after loading.
# NOTE: pickle.load can execute arbitrary code; only load files produced by this project.
with open("data/val_doc_trf.pickle", "rb") as val_docs_file:
    val_doc_trf_loaded = pickle.load(val_docs_file)

with open("data/test_doc_trf.pickle", "rb") as test_docs_file:
    test_doc_trf_loaded = pickle.load(test_docs_file)

In [7]:
# Report how many documents each split contains.
for split_label, split_docs in (
    ("Train: ", train_doc_trf_loaded),
    ("Val:   ", val_doc_trf_loaded),
    ("Test:  ", test_doc_trf_loaded),
):
    print(split_label, len(split_docs))

Train:  246525
Val:    15588
Test:   15588


## Load tokenized Datasets

In [8]:
# TRAIN

# Load the four pickled training token collections (token text, token lemma,
# noun text, noun lemma -- judging by the file names; confirm against the
# notebook that produced them).
# Each file is opened in a context manager so its handle is closed right after loading.
# NOTE: pickle.load can execute arbitrary code; only load files produced by this project.
with open("data/train_tok_text.p", "rb") as tok_text_file:
    train_tok_text = pickle.load(tok_text_file)

with open("data/train_tok_lemma.p", "rb") as tok_lemma_file:
    train_tok_lemma = pickle.load(tok_lemma_file)

with open("data/train_nouns_text.p", "rb") as nouns_text_file:
    train_nouns_text = pickle.load(nouns_text_file)

with open("data/train_nouns_lemma.p", "rb") as nouns_lemma_file:
    train_nouns_lemma = pickle.load(nouns_lemma_file)

# Analyze train Data

In [9]:
# Count how often each token occurs in the training token list, then report
# the total number of tokens and the number of distinct tokens.
train_words_freq = Counter(train_tok_text)

for label, size in (
    ("train words: ", len(train_tok_text)),
    ("train words dict: ", len(train_words_freq)),
):
    print(label, size)

train words:  1138469
train words dict:  167707


In [10]:
train_words_freq_keys = train_words_freq.keys()

# train_rare_words[k] holds every training word that occurs exactly k times,
# for k = 1..10.
train_rare_words = {}

for freq in range(1, 11):
    train_rare_words[freq] = [
        token for token in train_words_freq_keys if train_words_freq[token] == freq
    ]
    print("Number of rare words " + str(freq) + " : ", len(train_rare_words[freq]))

Number of rare words 1 :  91121
Number of rare words 2 :  27144
Number of rare words 3 :  12429
Number of rare words 4 :  7111
Number of rare words 5 :  4604
Number of rare words 6 :  3234
Number of rare words 7 :  2433
Number of rare words 8 :  1914
Number of rare words 9 :  1544
Number of rare words 10 :  1284


In [11]:
# Persist the rare-word lists. The context manager guarantees the file is
# flushed and closed before any later cell reads it back.
with open("data/train_rare_words.p", "wb") as rare_words_file:
    pickle.dump(train_rare_words, rare_words_file)

In [None]:
# Reload the cached rare-word lists (lets a fresh kernel skip the computation above).
# NOTE: pickle.load can execute arbitrary code; only load files produced by this project.
with open("data/train_rare_words.p", "rb") as rare_words_file:
    train_rare_words = pickle.load(rare_words_file)

# Analyze test data

In [12]:
# Collect the text of every non-stop-word, non-punctuation token of the test documents.
test_tok_text = []
for i in tqdm(range(len(test_doc_trf_loaded))):
    # extend() appends in place; the previous `list = list + [...]` form copied
    # the whole accumulated list on every iteration (quadratic in total tokens).
    test_tok_text.extend(
        token.text
        for token in test_doc_trf_loaded[i]
        if not token.is_stop and not token.is_punct
    )

100%|██████████| 15588/15588 [00:04<00:00, 3574.56it/s]


In [13]:
# Count how often each token occurs in the test token list, then report the
# total number of tokens and the number of distinct tokens.
test_words_freq = Counter(test_tok_text)

for label, size in (
    ("Test words: ", len(test_tok_text)),
    ("Test words dict: ", len(test_words_freq)),
):
    print(label, size)

Test words:  67545
Test words dict:  29553


In [14]:
test_words_freq_keys = test_words_freq.keys()

# Words that occur in the test set but have a training count of zero.
# (Looking up a missing key in a Counter yields 0 without inserting it.)
test_only_words = [
    token for token in test_words_freq_keys if train_words_freq[token] == 0
]

print("Test only words: ", len(test_only_words))

Test only words:  5810


In [15]:
# test_train_set[k] holds the words that occur exactly k times in training
# and at least once in the test set, for k = 1..10.
test_train_set = {}

for freq in range(1, 11):
    test_train_set[freq] = [
        token for token in train_rare_words[freq] if test_words_freq[token] >= 1
    ]
    print("Number of words " + str(freq) + " : ", len(test_train_set[freq]))

Number of words 1 :  3235
Number of words 2 :  2239
Number of words 3 :  1538
Number of words 4 :  1280
Number of words 5 :  1063
Number of words 6 :  912
Number of words 7 :  766
Number of words 8 :  647
Number of words 9 :  638
Number of words 10 :  549


In [16]:
# Persist the shared rare-word lists. The context manager guarantees the file
# is flushed and closed before any later cell reads it back.
with open("data/test_train_set.p", "wb") as test_train_file:
    pickle.dump(test_train_set, test_train_file)

In [10]:
# Reload the cached shared rare-word lists.
# NOTE: pickle.load can execute arbitrary code; only load files produced by this project.
with open("data/test_train_set.p", "rb") as test_train_file:
    test_train_set = pickle.load(test_train_file)

In [17]:
# Report the size of each shared rare-word list (training frequency 1..10).
for freq in range(1, 11):
    shared_words = test_train_set[freq]
    print("Number of words " + str(freq) + " : ", len(shared_words))

Number of words 1 :  3235
Number of words 2 :  2239
Number of words 3 :  1538
Number of words 4 :  1280
Number of words 5 :  1063
Number of words 6 :  912
Number of words 7 :  766
Number of words 8 :  647
Number of words 9 :  638
Number of words 10 :  549


In [18]:
# For every frequency list 1..10, map each of its words to an (initially
# empty) list that will collect training-document indices.
index_test_train = {
    freq: {token: [] for token in test_train_set[freq]}
    for freq in range(1, 11)
}

# For each frequency list, build the index list of training documents for each word

In [19]:
# Fill index_test_train[j][word] with the indices of all training documents
# that contain `word` as a non-stop-word, non-punctuation token.
#
# The candidate words of each list are turned into sets once, up front. Per
# document we then only intersect its (small) token set with each candidate
# set, instead of testing every candidate word against every document.
# The resulting index lists are identical (ascending document order, one
# entry per document), provided each list in test_train_set has no duplicate
# words -- which holds when it is derived from Counter keys as in this notebook.
wanted_words = {j: set(test_train_set[j]) for j in range(1, 11)}

for i in tqdm(range(len(train_doc_trf_loaded))):
    word_tokens = {
        token.text
        for token in train_doc_trf_loaded[i]
        if not token.is_stop and not token.is_punct
    }
    for j in range(1, 11):
        for word in word_tokens & wanted_words[j]:
            index_test_train[j][word].append(i)

100%|██████████| 246525/246525 [06:48<00:00, 603.30it/s]


In [23]:
# Persist the word -> training-document-index mapping. The context manager
# guarantees the file is flushed and closed before any later cell reads it back.
with open("data/index_test_train.p", "wb") as index_file:
    pickle.dump(index_test_train, index_file)

In [5]:
# Reload the cached word -> training-document-index mapping.
# NOTE: pickle.load can execute arbitrary code; only load files produced by this project.
with open("data/index_test_train.p", "rb") as index_file:
    index_test_train = pickle.load(index_file)

In [53]:
# Report how many words each frequency list (1..10) has in the index.
for freq in range(1, 11):
    words_in_list = len(index_test_train[freq])
    print(f'List {freq} Word count: {words_in_list}')

List 1 Word count: 3235
List 2 Word count: 2239
List 3 Word count: 1538
List 4 Word count: 1280
List 5 Word count: 1063
List 6 Word count: 912
List 7 Word count: 766
List 8 Word count: 647
List 9 Word count: 638
List 10 Word count: 549


# Initialize the blacklist from the frequency-1 word list

In [87]:
# Initial blacklist: every training-document index recorded for a word of
# list 1, flattened into a single list.
blacklist = [
    doc_index
    for doc_indices in index_test_train[1].values()
    for doc_index in doc_indices
]

In [100]:
# Number of training-document indices in the initial blacklist.
print(len(blacklist))

3235


In [71]:
# Dump the full initial blacklist of training-document indices for inspection
# (this prints every entry, so the output is long).
print(blacklist)

[41437, 155659, 104391, 61688, 148480, 59990, 137696, 44791, 44981, 11155, 216326, 147975, 27166, 188249, 224758, 49962, 23986, 50051, 159660, 14120, 130359, 22752, 100626, 115285, 83879, 222582, 238845, 162462, 7011, 106070, 6349, 216487, 173455, 54706, 139516, 203279, 185470, 21254, 185972, 90953, 230819, 208667, 191554, 163294, 188007, 3083, 39026, 45871, 5608, 67945, 217186, 116339, 147722, 160246, 197032, 18501, 93199, 22299, 246038, 203113, 80402, 85230, 22498, 17189, 201318, 101390, 144862, 56055, 191968, 130345, 63636, 113927, 93189, 10567, 139497, 33581, 129145, 141072, 227913, 165551, 86440, 126326, 144917, 228967, 11909, 75360, 178716, 74073, 211407, 126670, 209743, 106867, 216454, 17441, 220554, 8545, 173833, 33542, 191838, 42146, 225160, 12766, 78243, 20645, 133801, 123048, 10559, 36977, 9179, 57687, 21996, 169026, 26288, 7284, 209709, 68875, 178137, 79988, 63323, 14968, 68968, 100549, 222840, 100851, 199300, 228857, 81575, 60868, 189851, 6021, 146639, 209259, 67946, 96560

# Create blacklists for each important list while observing the other lists

In [117]:
import copy

# Frequency lists for which additional blacklist entries are selected.
importantLists = [2,3,4,5,10]
# blackListNew[list_number] -> training-document indices newly blacklisted
# while processing that list.
blackListNew = {}

# Working blacklist: starts as a copy of the initial blacklist and grows as
# new indices are selected below.
useBlacklist = copy.deepcopy(blacklist)
# Set mirror of useBlacklist so membership tests are O(1) instead of a linear scan.
blacklisted_lookup = set(useBlacklist)

# How often each training-document index appears across all words of all
# important lists. index_test_train is not modified in this cell, so the
# counts are computed once here rather than re-scanned for every index.
index_occurrences = Counter(
    index
    for checkList in importantLists
    for indices in index_test_train[checkList].values()
    for index in indices
)

for list_number in importantLists:
    print(f'List {list_number} word count: {len(index_test_train[list_number])}')

    # deleted_List[j] -> words of this list that already have exactly j of
    # their documents on the working blacklist.
    deleted_List = {j: [] for j in range(1, list_number + 1)}
    blackListNew[list_number] = []

    for word, indices in index_test_train[list_number].items():
        deleted_counter = sum(1 for index in indices if index in blacklisted_lookup)

        if deleted_counter in deleted_List:
            deleted_List[deleted_counter].append(word)

        # No document of this word is blacklisted yet: pick one to blacklist.
        # Words without any indexed document are skipped; previously they
        # reused a stale index left over from an earlier word (or raised a
        # NameError if they came first).
        if deleted_counter == 0 and indices:
            # Choose the document that occurs least often across the important
            # lists; on ties the later candidate wins (`<=`), matching the
            # original selection order. dict.fromkeys de-duplicates while
            # keeping first-occurrence order.
            indexToDelete = None
            for potentialIndex in dict.fromkeys(indices):
                if (indexToDelete is None
                        or index_occurrences[potentialIndex] <= index_occurrences[indexToDelete]):
                    indexToDelete = potentialIndex

            blackListNew[list_number].append(indexToDelete)
            useBlacklist.append(indexToDelete)
            blacklisted_lookup.add(indexToDelete)

    # Words that already had two or more documents on the blacklist.
    deletedMore = sum(len(deleted_List[j]) for j in range(2, list_number + 1))

    for j in range(1, list_number + 1):
        print(f'{j} count: {len(deleted_List[j])}')

    print(f'New blacklist length: {len(blackListNew[list_number])}')
    print(f'Deleted once        : {len(deleted_List[1])}')
    print(f'Deleted more        : {deletedMore}')
    print(f'Sum                 : {len(blackListNew[list_number]) + len(deleted_List[1]) + deletedMore}')
    print("")
    print("")

List 2 word count: 2239
1 count: 80
2 count: 1
New blacklist length: 2158
Deleted once        : 80
Deleted more        : 1
Sum                 : 2239

List 3 word count: 1538
1 count: 86
2 count: 2
3 count: 0
New blacklist length: 1450
Deleted once        : 86
Deleted more        : 2
Sum                 : 1538

List 4 word count: 1280
1 count: 71
2 count: 4
3 count: 0
4 count: 0
New blacklist length: 1205
Deleted once        : 71
Deleted more        : 4
Sum                 : 1280

List 5 word count: 1063
1 count: 93
2 count: 6
3 count: 0
4 count: 0
5 count: 0
New blacklist length: 964
Deleted once        : 93
Deleted more        : 6
Sum                 : 1063

List 10 word count: 549
1 count: 68
2 count: 5
3 count: 0
4 count: 0
5 count: 0
6 count: 0
7 count: 0
8 count: 0
9 count: 0
10 count: 0
New blacklist length: 476
Deleted once        : 68
Deleted more        : 5
Sum                 : 549



# Merge all blacklists together

In [118]:
# Combined blacklist: the initial blacklist followed by the entries selected
# for each important list, in the order of importantLists.
completeBlacklist = copy.deepcopy(blacklist)

for bList in importantLists:
    extra_indices = blackListNew[bList]
    print(f'Blacklist {bList} length: {len(extra_indices)}')
    completeBlacklist.extend(extra_indices)

print("complete length: ", len(completeBlacklist))

Blacklist 2 length: 2158
Blacklist 3 length: 1450
Blacklist 4 length: 1205
Blacklist 5 length: 964
Blacklist 10 length: 476
complete length:  9488


# Evaluate complete Blacklist

In [121]:
# Evaluate the combined blacklist: for every word of every important list,
# count how many of its training documents are blacklisted.
useBlacklist = copy.deepcopy(completeBlacklist)
# Set mirror of useBlacklist so membership tests are O(1) instead of a linear scan.
blacklisted_lookup = set(useBlacklist)

for list_number in importantLists:
    print(f'List {list_number} word count: {len(index_test_train[list_number])}')

    # deleted_List[j] -> words of this list with exactly j blacklisted documents.
    deleted_List = {j: [] for j in range(1, list_number + 1)}

    for word, indices in index_test_train[list_number].items():
        deleted_counter = sum(1 for index in indices if index in blacklisted_lookup)

        if deleted_counter in deleted_List:
            deleted_List[deleted_counter].append(word)

    # Words that lost two or more documents to the blacklist.
    deletedMore = sum(len(deleted_List[j]) for j in range(2, list_number + 1))

    # Only the first two buckets are printed; the bound is capped so a list
    # number below 2 cannot trigger a KeyError.
    for j in range(1, min(list_number, 2) + 1):
        print(f'{j} count: {len(deleted_List[j])}')

    deletedOnce = len(deleted_List[1])
    # Ratio of "deleted more" to "deleted once" (a fraction, not a percentage);
    # NaN when no word was deleted exactly once, instead of a ZeroDivisionError.
    deletedRatio = deletedMore / deletedOnce if deletedOnce else float("nan")

    print(f'Deleted once : {deletedOnce}')
    print(f'Deleted more : {deletedMore}')
    print(f'%            : {deletedRatio}')
    print(f'Sum          : {deletedOnce + deletedMore}')
    print("")

List 2 word count: 2239
1 count: 2233
2 count: 6
Deleted once : 2233
Deleted more : 6
%            : 0.0026869682042095834
Sum          : 2239

List 3 word count: 1538
1 count: 1536
2 count: 2
Deleted once : 1536
Deleted more : 2
%            : 0.0013020833333333333
Sum          : 1538

List 4 word count: 1280
1 count: 1276
2 count: 4
Deleted once : 1276
Deleted more : 4
%            : 0.003134796238244514
Sum          : 1280

List 5 word count: 1063
1 count: 1057
2 count: 6
Deleted once : 1057
Deleted more : 6
%            : 0.005676442762535478
Sum          : 1063

List 10 word count: 549
1 count: 544
2 count: 5
Deleted once : 544
Deleted more : 5
%            : 0.009191176470588236
Sum          : 549



In [122]:
# Persist the combined blacklist. The context manager guarantees the file is
# flushed and closed once the dump has finished.
with open("data/completeBlacklist.p", "wb") as blacklist_file:
    pickle.dump(completeBlacklist, blacklist_file)

In [123]:
# Final number of entries in the combined blacklist.
print(len(completeBlacklist))

9488
