# Load spaCy

In [1]:
import spacy
import pickle
import numpy as np
from tqdm import tqdm
from collections import Counter

# Optional GPU preference is left disabled; uncomment the call to enable it.
# spacy.prefer_gpu()

# Load the "de_dep_news_trf" pipeline and set its maximum accepted text
# length to 17,000,000.
nlp = spacy.load("de_dep_news_trf")
nlp.max_length = 17_000_000

# Load pre-processed Datasets

In [3]:
# Restore the pickled training documents from disk.
with open('data/train_doc_trf.pickle', 'rb') as fh:
    train_doc_trf_loaded = pickle.load(fh)

In [4]:
# Restore the pickled validation documents from disk.
with open('data/val_doc_trf.pickle', 'rb') as fh:
    val_doc_trf_loaded = pickle.load(fh)

In [5]:
# Restore the pickled test documents from disk.
with open('data/test_doc_trf.pickle', 'rb') as fh:
    test_doc_trf_loaded = pickle.load(fh)

In [8]:
# Report how many documents each split contains, one line per split.
for split_label, split_docs in (
    ("Train: ", train_doc_trf_loaded),
    ("Val:   ", val_doc_trf_loaded),
    ("Test:  ", test_doc_trf_loaded),
):
    print(split_label, len(split_docs))

Train:  246525
Val:    15588
Test:   15588


## Load tokenized Datasets

In [9]:
# TRAIN: restore the four pickled token lists of the training split.

# Token texts.
with open('data/train_tok_text.pickle', 'rb') as fh:
    train_tok_text = pickle.load(fh)

# Token lemmas.
with open('data/train_tok_lemma.pickle', 'rb') as fh:
    train_tok_lemma = pickle.load(fh)

# Noun texts.
with open('data/train_nouns_text.pickle', 'rb') as fh:
    train_nouns_text = pickle.load(fh)

# Noun lemmas.
with open('data/train_nouns_lemma.pickle', 'rb') as fh:
    train_nouns_lemma = pickle.load(fh)

# Analyze Train Data

In [10]:
# Frequency table of the training tokens: word -> number of occurrences.
train_words_freq = Counter(train_tok_text)

# Total token count first, then the number of distinct words.
print("train words: ", len(train_tok_text))
print("train words dict: ", len(train_words_freq))

train words:  1138469
train words dict:  167707


In [17]:
# Group the training vocabulary by exact frequency, for frequencies 1 to 10.
train_words_freq_keys = train_words_freq.keys()

train_rare_words = {}

for freq in range(1, 11):
    # Words that occur exactly `freq` times, in vocabulary order.
    train_rare_words[freq] = [
        word for word in train_words_freq_keys if train_words_freq[word] == freq
    ]
    print("Number of rare words " + str(freq) + " : ", len(train_rare_words[freq]))

Number of rare words 1 :  91121
Number of rare words 2 :  27144
Number of rare words 3 :  12429
Number of rare words 4 :  7111
Number of rare words 5 :  4604
Number of rare words 6 :  3234
Number of rare words 7 :  2433
Number of rare words 8 :  1914
Number of rare words 9 :  1544
Number of rare words 10 :  1284


In [18]:
# Persist the frequency buckets. The context manager closes (and flushes) the
# file as soon as the dump finishes instead of leaving the handle open.
with open("data/train_rare_words.p", "wb") as fh:
    pickle.dump(train_rare_words, fh)

In [None]:
# Reload the cached frequency buckets. The context manager guarantees the
# file handle is closed once the pickle has been read.
with open("data/train_rare_words.p", "rb") as fh:
    train_rare_words = pickle.load(fh)

# Analyze Test Data

In [12]:
# Collect the text of every test token that is neither a stop word nor
# punctuation, in document order.
test_tok_text = []
for doc in tqdm(test_doc_trf_loaded):
    # extend() appends in place; rebuilding the list with `+` on every
    # iteration would re-copy the whole accumulator each time (quadratic).
    test_tok_text.extend(
        token.text for token in doc if not token.is_stop and not token.is_punct
    )

100%|██████████| 15588/15588 [00:03<00:00, 4367.13it/s]


In [13]:
# Frequency table of the test tokens: word -> number of occurrences.
test_words_freq = Counter(test_tok_text)

# Total token count first, then the number of distinct words.
print("Test words: ", len(test_tok_text))
print("Test words dict: ", len(test_words_freq))

Test words:  67545
Test words dict:  29553


In [19]:
# Vocabulary of the test split, taken from the keys of its frequency table.
test_words_freq_keys = test_words_freq.keys()

# Keep the test words whose count in the training frequency table is 0.
test_only_words = [
    word for word in test_words_freq_keys if train_words_freq[word] == 0
]

print("Test only words: ", len(test_only_words))

Test only words:  5810


In [20]:
# For each training frequency 1 to 10, keep the rare training words that also
# occur at least once in the test frequency table.
test_train_set = {}

for freq in range(1, 11):
    test_train_set[freq] = [
        word for word in train_rare_words[freq] if test_words_freq[word] >= 1
    ]
    print("Number of words " + str(freq) + " : ", len(test_train_set[freq]))

Number of words 1 :  3235
Number of words 2 :  2239
Number of words 3 :  1538
Number of words 4 :  1280
Number of words 5 :  1063
Number of words 6 :  912
Number of words 7 :  766
Number of words 8 :  647
Number of words 9 :  638
Number of words 10 :  549


In [21]:
# Persist the train/test overlap buckets. The previous version pickled
# `train_rare_words` into this file, so the saved data did not match the file
# name. The context manager also makes sure the file is closed after writing.
with open("data/test_train_set.p", "wb") as fh:
    pickle.dump(test_train_set, fh)

In [33]:
# One empty index per frequency bucket 1 to 10: each tracked word maps to its
# own fresh list.
index_test_train = {
    j: {word: [] for word in test_train_set[j]}
    for j in range(1, 11)
}

In [34]:
# For every tracked word, record the indices of the training documents that
# contain it (ignoring stop words and punctuation), in ascending order.
#
# The tracked words of each bucket are turned into a set once, so that each
# document only costs a set intersection instead of scanning its token list
# once per tracked word.
tracked_words = {j: set(test_train_set[j]) for j in range(1, 11)}

for i in tqdm(range(len(train_doc_trf_loaded))):
    # Distinct content words of document i; a word is indexed at most once
    # per document, exactly as a membership test on the token list would do.
    doc_words = {
        token.text
        for token in train_doc_trf_loaded[i]
        if not token.is_stop and not token.is_punct
    }
    for j, bucket_words in tracked_words.items():
        for word in doc_words & bucket_words:
            index_test_train[j][word].append(i)

100%|██████████| 246525/246525 [05:53<00:00, 697.71it/s]


In [41]:
# Persist the word -> training-document index. The previous version pickled
# `train_rare_words` into this file, so the computed index was never saved.
# The context manager also makes sure the file is closed after writing.
with open("data/index_test_train.p", "wb") as fh:
    pickle.dump(index_test_train, fh)

In [40]:
# Sanity check: number of words tracked in frequency bucket 10.
print(len(index_test_train[10]))

549
