# Load Spacy

In [1]:
import spacy
import pickle
import copy
import numpy as np
from tqdm import tqdm
from collections import Counter

#spacy.prefer_gpu()
nlp = spacy.load("de_dep_news_trf")
nlp.max_length = 17000000

# Load pre-processed Datasets

In [None]:
train_doc_trf_loaded = pickle.load(open("data_w/train_doc_trf.p", "rb"))

In [6]:
val_doc_trf_loaded = pickle.load(open("data/val_doc_trf.pickle", "rb"))
test_doc_trf_loaded = pickle.load(open("data/test_doc_trf.pickle", "rb"))

In [7]:
print("Train: ", len(train_doc_trf_loaded))
print("Val:   ", len(val_doc_trf_loaded))
print("Test:  ", len(test_doc_trf_loaded))

Train:  246525
Val:    15588
Test:   15588


## Load tokenized Datasets

In [8]:
# TRAIN

train_tok_text = pickle.load(open("data/train_tok_text.p", "rb"))
train_tok_lemma = pickle.load(open("data/train_tok_lemma.p", "rb"))
train_nouns_text = pickle.load(open("data/train_nouns_text.p", "rb"))
train_nouns_lemma = pickle.load(open("data/train_nouns_lemma.p", "rb"))

# Analyze train Data

In [9]:
print("train words: ", len(train_tok_text))
train_words_freq = Counter(train_tok_text)
print("train words dict: ", len(train_words_freq))

train words:  1138469
train words dict:  167707


In [10]:
train_words_freq_keys = train_words_freq.keys()

train_rare_words = {}

for i in range(1,11):
    train_rare_words[i] = []
    for word in train_words_freq_keys:
        
        if train_words_freq[word] == i:
            train_rare_words[i].append(word)
        
    print("Number of rare words " + str(i) + " : ", len(train_rare_words[i]))

Number of rare words 1 :  91121
Number of rare words 2 :  27144
Number of rare words 3 :  12429
Number of rare words 4 :  7111
Number of rare words 5 :  4604
Number of rare words 6 :  3234
Number of rare words 7 :  2433
Number of rare words 8 :  1914
Number of rare words 9 :  1544
Number of rare words 10 :  1284


In [11]:
pickle.dump(train_rare_words, open("data/train_rare_words.p", "wb"))

In [None]:
train_rare_words = pickle.load(open("data/train_rare_words.p", "rb"))

# Analyse Test Data

In [12]:
test_tok_text = []
for i in tqdm(range(len(test_doc_trf_loaded))):
    test_tok_text = test_tok_text + [token.text for token in test_doc_trf_loaded[i] if(not token.is_stop and not token.is_punct)]

100%|██████████| 15588/15588 [00:04<00:00, 3574.56it/s]


In [13]:
print("Test words: ", len(test_tok_text))
test_words_freq = Counter(test_tok_text)
print("Test words dict: ", len(test_words_freq))

Test words:  67545
Test words dict:  29553


In [14]:
test_words_freq_keys = test_words_freq.keys()

test_only_words = []

for word in test_words_freq_keys:
    if train_words_freq[word] == 0:
        test_only_words.append(word)

print("Test only words: ", len(test_only_words))

Test only words:  5810


In [15]:
test_train_set = {}

for i in range(1,11):
    test_train_set[i] = []
    for word in train_rare_words[i]:
        if test_words_freq[word] >= 1:
            test_train_set[i].append(word)
        
    print("Number of words " + str(i) + " : ", len(test_train_set[i]))

Number of words 1 :  3235
Number of words 2 :  2239
Number of words 3 :  1538
Number of words 4 :  1280
Number of words 5 :  1063
Number of words 6 :  912
Number of words 7 :  766
Number of words 8 :  647
Number of words 9 :  638
Number of words 10 :  549


In [16]:
pickle.dump(test_train_set, open("data/test_train_set.p", "wb"))

In [2]:
test_train_set = pickle.load(open("data/test_train_set.p", "rb"))

In [4]:
for i in range(1,11):      
    print("Number of words " + str(i) + " : ", len(test_train_set[i]))

Number of words 1 :  3235
Number of words 2 :  2239
Number of words 3 :  1538
Number of words 4 :  1280
Number of words 5 :  1063
Number of words 6 :  912
Number of words 7 :  766
Number of words 8 :  647
Number of words 9 :  638
Number of words 10 :  549


In [18]:
index_test_train = {}

for j in range(1,11):
    index_test_train[j] = {}
    for word in test_train_set[j]:
        index_test_train[j][word] = []

# For each Important List create Index List for each Word

In [19]:
for i in tqdm(range(len(train_doc_trf_loaded))):
    word_tokens = [token.text for token in train_doc_trf_loaded[i] if(not token.is_stop and not token.is_punct)]
    for j in range(1,11):
        for word in test_train_set[j]:
            if word in word_tokens:
                index_test_train[j][word].append(i)

100%|██████████| 246525/246525 [06:48<00:00, 603.30it/s]


In [23]:
pickle.dump(index_test_train, open("data/index_test_train.p", "wb"))

In [5]:
index_test_train = pickle.load(open("data/index_test_train.p", "rb"))

In [6]:
for i in range(1,11):
    print(f'List {i} Word count: {len(index_test_train[i])}')

List 1 Word count: 3235
List 2 Word count: 2239
List 3 Word count: 1538
List 4 Word count: 1280
List 5 Word count: 1063
List 6 Word count: 912
List 7 Word count: 766
List 8 Word count: 647
List 9 Word count: 638
List 10 Word count: 549


# Init with 1 word list and append directly 2

In [7]:
blacklist = list(index_test_train[1].values())
blacklist = [item for sublist in blacklist for item in sublist]

In [8]:
print(len(blacklist))

3235


In [9]:
# cleaned initial list
cinit = list(set(blacklist))
print("Cleaned complete length: ", len(cinit))

Cleaned complete length:  3193


In [10]:
print(blacklist)

[158, 215, 226, 350, 381, 434, 450, 533, 597, 613, 644, 690, 818, 862, 996, 1043, 1081, 1182, 1238, 1264, 1310, 1358, 1390, 1504, 1521, 1530, 1566, 1587, 1596, 1633, 1822, 1916, 1936, 1954, 1967, 1967, 1995, 2052, 2172, 2204, 2235, 2248, 2258, 2351, 2429, 2429, 2442, 2481, 2585, 2689, 2691, 2861, 2910, 2946, 3287, 3344, 3346, 3431, 3701, 3746, 3777, 3874, 4152, 4198, 4234, 4366, 4409, 4415, 4513, 4603, 4641, 4701, 4767, 4894, 5050, 5053, 5322, 5410, 5456, 5474, 5479, 5498, 5519, 5696, 5767, 5772, 5826, 5831, 5905, 5919, 5957, 6005, 6082, 6247, 6316, 6493, 6502, 6512, 6553, 6640, 6643, 6655, 6694, 6709, 6709, 6773, 6845, 6861, 6926, 6934, 6983, 7000, 7030, 7030, 7102, 7299, 7303, 7361, 7375, 7466, 7625, 7703, 7712, 7763, 7766, 7951, 7987, 8099, 8166, 8221, 8348, 8376, 8378, 8443, 8476, 8483, 8489, 8567, 8676, 8770, 8795, 8816, 8900, 8971, 9048, 9059, 9070, 9072, 9094, 9137, 9235, 9298, 9589, 9596, 9665, 9718, 9784, 9983, 10114, 10146, 10192, 10249, 10249, 10249, 10258, 10311, 10311, 104

In [16]:
blacklist_2 = list(index_test_train[2].values())
blacklist_2 = [item for sublist in blacklist_2 for item in sublist]
#list(index_test_train[2].values())
#print(blacklist_2)
blacklist_1_2 = blacklist + blacklist_2

print(len(blacklist))
print(len(blacklist_2))
print(len(blacklist_1_2))

blacklist_1_2 = list(set(blacklist_1_2))
print("Cleaned complete length: ", len(blacklist_1_2))

3235
4460
7695
Cleaned complete length:  7529


In [20]:
test_counter = {}
test_counter[444] = 0
test_counter[222] = 0
test_counter[777] = 0
test_counter[333] = 0

test_counter[444] = test_counter[444] +1
test_counter[444] = test_counter[444] +1
test_counter[444] = test_counter[444] +1

test_counter[222] = test_counter[222] +1
test_counter[222] = test_counter[222] +1

test_counter[777] = test_counter[777] +1
test_counter[777] = test_counter[777] +1
test_counter[777] = test_counter[777] +1

test_counter[333] = test_counter[333] +1

print(test_counter)

test_counter = dict(sorted(test_counter.items(), key=lambda item: item[1]))

print(test_counter)
print(list(test_counter.keys())[0])
print(list(test_counter.keys())[1])

{444: 3, 222: 2, 777: 3, 333: 1}
{333: 1, 222: 2, 444: 3, 777: 3}
333
222


# Create blacklists for each important list while observing the other lists

In [35]:
importantLists = [3,4,5,10]
blackListNew = {}

useBlacklist = copy.deepcopy(blacklist_1_2)

for list_number in importantLists:
    print(f'List {list_number} word count: {len(index_test_train[list_number])}')
    
    deleted_List = {}
    blackListNew[list_number] = []
    
    for j in range(1, list_number + 1):
        deleted_List[j] = [] 

    for word in index_test_train[list_number]:
        deleted_counter = 0
        
        index_list_counter = {}
        deleted_index_list = []
        
        for index in index_test_train[list_number][word]:
            index_list_counter[index] = 0
            
            if index in useBlacklist:
                deleted_counter = deleted_counter + 1
                deleted_index_list.append(index)
            
            for checkList in importantLists:
                for checkWord in index_test_train[checkList]:
                    for checkIndex in index_test_train[checkList][checkWord]:
                        if(checkIndex == index):
                            index_list_counter[index] = index_list_counter[index] +1
            
        for j in range(1,list_number+1):
            if deleted_counter == j:
                deleted_List[j].append(word)
        
        index_list_counter = dict(sorted(index_list_counter.items(), key=lambda item: item[1]))
        
        if deleted_counter == 0:            
            del_1 = list(index_list_counter.keys())[0]
            del_2 = list(index_list_counter.keys())[1]
            
            blackListNew[list_number].append(del_1)
            useBlacklist.append(del_1)
            blackListNew[list_number].append(del_2)
            useBlacklist.append(del_2)
        
        if deleted_counter == 1:     
            deleteIndex = list(index_list_counter.keys())[0]
            
            if deleteIndex in deleted_index_list:
                deleteIndex = list(index_list_counter.keys())[1]

            blackListNew[list_number].append(deleteIndex)
            useBlacklist.append(deleteIndex)
    
    deletedMore = 0
    for j in range(2, list_number+1):
        deletedMore = deletedMore + len(deleted_List[j])
    
    for j in range(1, list_number+1):
        print(f'{j} count: {len(deleted_List[j])}')
    
    print(f'New blacklist length: {len(blackListNew[list_number])}')
    print(f'Deleted once        : {len(deleted_List[1])}')
    print(f'Deleted more        : {deletedMore}')
    print(f'Sum                 : {len(blackListNew[list_number]) + len(deleted_List[1]) + deletedMore}')
    print(f'Use Blacklist lenght: {len(useBlacklist)}')
    print("")

List 3 word count: 1538
1 count: 169
2 count: 12
3 count: 0
New blacklist length: 2883
Deleted once        : 169
Deleted more        : 12
Sum                 : 3064
Use Blacklist lenght: 10412

List 4 word count: 1280
1 count: 158
2 count: 15
3 count: 1
4 count: 0
New blacklist length: 2370
Deleted once        : 158
Deleted more        : 16
Sum                 : 2544
Use Blacklist lenght: 12782

List 5 word count: 1063
1 count: 176
2 count: 20
3 count: 2
4 count: 0
5 count: 0
New blacklist length: 1906
Deleted once        : 176
Deleted more        : 22
Sum                 : 2104
Use Blacklist lenght: 14688

List 10 word count: 549
1 count: 130
2 count: 30
3 count: 1
4 count: 0
5 count: 0
6 count: 0
7 count: 0
8 count: 0
9 count: 0
10 count: 0
New blacklist length: 906
Deleted once        : 130
Deleted more        : 31
Sum                 : 1067
Use Blacklist lenght: 15594



# Punch all Blacklists together

In [55]:
completeBlacklist = copy.deepcopy(blacklist)

for bList in importantLists:
    print(f'Blacklist {bList} length: {len(blackListNew[bList])}')
    completeBlacklist += blackListNew[bList]

print("complete length: ", len(completeBlacklist))
print(f'Use Blacklist lenght: {len(useBlacklist)}')

Blacklist 2 length: 2158
Blacklist 3 length: 1450
Blacklist 4 length: 1205
Blacklist 5 length: 964
Blacklist 10 length: 476
complete length:  9488
Use Blacklist lenght: 9488


In [37]:
# remove duplicates
cleanedCompleteBlacklist = list(set(useBlacklist))

# sort list
completeBlacklist_sorted = copy.deepcopy(cleanedCompleteBlacklist)
completeBlacklist_sorted.sort()

print("Cleaned complete length: ", len(completeBlacklist_sorted))


Cleaned complete length:  15594


# Evaluate complete Blacklist

In [36]:
from collections import Counter

importantLists = [1,2,3,4,5,10]

useBlacklist = copy.deepcopy(useBlacklist)

for list_number in importantLists:
    print(f'List {list_number} word count: {len(index_test_train[list_number])}')
    
    deleted_List = {}
    
    for j in range(1, list_number + 1):
        deleted_List[j] = [] 

    for word in index_test_train[list_number]:
        deleted_counter = 0
                
        for index in index_test_train[list_number][word]:         
            if index in useBlacklist:
                deleted_counter = deleted_counter + 1
            
        for j in range(1,list_number+1):
            if deleted_counter == j:
                deleted_List[j].append(word)                
    
    deletedMore = 0
    for j in range(2, list_number+1):
        deletedMore = deletedMore + len(deleted_List[j])
    
    for j in range(1, len(deleted_List)+1):
        print(f'{j} count: {len(deleted_List[j])}')
    
    
    if list_number != 1:
        print(f'Deleted twice : {len(deleted_List[2])}')
        print(f'%            : {len(deleted_List[2]) / len(index_test_train[list_number])}')
        print(f'Sum          : {len(deleted_List[1]) + deletedMore}')
    print("")

List 1 word count: 3235
1 count: 3235

List 2 word count: 2239
1 count: 18
2 count: 2221
Deleted twice : 2221
%            : 0.991960696739616
Sum          : 2239

List 3 word count: 1538
1 count: 0
2 count: 1536
3 count: 2
Deleted twice : 1536
%            : 0.9986996098829649
Sum          : 1538

List 4 word count: 1280
1 count: 0
2 count: 1279
3 count: 1
4 count: 0
Deleted twice : 1279
%            : 0.99921875
Sum          : 1280

List 5 word count: 1063
1 count: 0
2 count: 1061
3 count: 2
4 count: 0
5 count: 0
Deleted twice : 1061
%            : 0.9981185324553151
Sum          : 1063

List 10 word count: 549
1 count: 0
2 count: 548
3 count: 1
4 count: 0
5 count: 0
6 count: 0
7 count: 0
8 count: 0
9 count: 0
10 count: 0
Deleted twice : 548
%            : 0.9981785063752276
Sum          : 549



In [38]:
pickle.dump(completeBlacklist_sorted, open("data/completeBlacklist_2.p", "wb"))

In [39]:
completeBlacklist_2 = pickle.load(open("data/completeBlacklist_2.p", "rb"))

In [40]:
print(len(completeBlacklist_2))

15594


In [42]:
print(completeBlacklist_2)

[7, 14, 18, 20, 28, 31, 35, 52, 58, 81, 82, 83, 84, 114, 152, 158, 173, 179, 183, 186, 195, 201, 206, 212, 215, 226, 231, 236, 238, 241, 245, 252, 260, 261, 268, 269, 283, 290, 293, 302, 305, 317, 326, 338, 346, 350, 351, 361, 381, 394, 395, 396, 403, 418, 422, 423, 434, 436, 437, 441, 444, 450, 454, 458, 486, 521, 531, 533, 536, 543, 551, 554, 558, 561, 562, 563, 586, 587, 595, 597, 608, 612, 613, 615, 624, 632, 644, 646, 659, 669, 671, 677, 684, 690, 696, 721, 728, 737, 750, 754, 759, 760, 776, 781, 793, 807, 812, 813, 818, 826, 831, 833, 840, 858, 862, 870, 894, 921, 923, 927, 930, 956, 965, 970, 978, 981, 982, 993, 996, 1009, 1015, 1042, 1043, 1081, 1089, 1099, 1108, 1116, 1130, 1165, 1173, 1182, 1223, 1238, 1247, 1264, 1267, 1286, 1297, 1310, 1313, 1317, 1319, 1322, 1327, 1343, 1346, 1351, 1355, 1358, 1378, 1383, 1390, 1396, 1401, 1420, 1428, 1437, 1439, 1445, 1463, 1467, 1473, 1474, 1482, 1504, 1521, 1523, 1530, 1532, 1533, 1540, 1557, 1558, 1566, 1587, 1591, 1596, 1602, 1607, 16

In [43]:
print(index_test_train[1])

{'tauernbach': [158], 'korrektes': [215], 'böhlau': [226], 'reichhaltiger': [350], 'abad': [381], 'trajektorie': [434], 'ertrage': [450], 'nachgedreht': [533], 'schneebericht': [597], 'gyptenrundfahrt': [613], 'whitefield': [644], 'lößboden': [690], 'oberhaunstadt': [818], 'lahmgelegt': [862], 'adsorption': [996], 'sitzheizung': [1043], 'zitty': [1081], 'vereinsarbeit': [1182], 'scopes': [1238], 'ate': [1264], 'orthopäden': [1310], 'legacy': [1358], 'rümpfe': [1390], 'felmy': [1504], 'scheußlich': [1521], 'musikkorps': [1530], 'lohnquote': [1566], 'störenfriede': [1587], 'derlei': [1596], 'herumgesprochen': [1633], 'kapuze': [1822], 'kampfflugzeugen': [1916], 'redebeiträgen': [1936], 'bruchlandung': [1954], 'finanzgerichtsbarkeit': [1967], 'zweistufig': [1967], 'proprietärer': [1995], 'baubestand': [2052], 'grasbewachsene': [2172], 'anschieben': [2204], 'rollenspielen': [2235], 'eincremen': [2248], 'kommunalkredit': [2258], 'poststempel': [2351], 'einhundertfünfzig': [2429], 'stundenki

# Load real datasets

In [45]:
from datasets import load_from_disk

test_sampled_pro = load_from_disk("E:/Master/data/test_sampled_pro")
train_sampled_pro = load_from_disk("E:/Master/data/train_sampled_pro")

In [46]:
train_text_raw = load_from_disk("E:/Master/data/0_text/train_text")

### Verify to delete only the relevant data

In [47]:
print(len(train_sampled_pro))
print(len(train_text_raw))
print(len(completeBlacklist_2))
print(len(completeBlacklist_2) / len(train_text_raw))

246525
246525
15594
0.06325524794645573


In [48]:
dumpList = []

for i in range(0,len(train_text_raw)):
    dumpList.append(i)
    
print(dumpList[158])
print(dumpList[215])
print(dumpList[226])

keepList = list(set(dumpList)-set(completeBlacklist_2))

print(len(keepList))

print(keepList[158])
print(keepList[215])
print(keepList[226])

158
215
226
230931
175
246
258


### 158 --> 161 ; 215 --> 219 ; 226 --> 231 works!
Try it on the huggingface text dataset

In [49]:
train_text_raw_cut_2 = train_text_raw.select(keepList)

In [52]:
print(len(train_text_raw))
print(len(completeBlacklist_2))
print(len(train_text_raw) - len(completeBlacklist_2))
print(len(train_text_raw_cut_2))

246525
15594
230931
230931


In [54]:
train_text_raw_cut_2.save_to_disk("E:/Master/data/0_text/train_text_raw_cut_2")

In [55]:
train_text_raw_cut_2_loaded = load_from_disk("E:/Master/data/0_text/train_text_raw_cut_2")

# works on the text dataset --> cut real train set

In [56]:
train_sampled_pro_cut_2 = train_sampled_pro.select(keepList)

In [57]:
train_sampled_pro_cut_2.save_to_disk("E:/Master/data/train_sampled_pro_cut_2")