In [None]:
### Interactive medword testing tool ###

import json
import preprocess as pp
import embedding
import model_validation as mv
import shared.load_config 
import os


In [None]:
### script settings ###

# COMPUTE_NEW_TRAIN_DATA
#   True  -> new training data is computed from the raw data
#   False -> nothing is computed; you must then provide the training data
#            yourself at train_data_src so that a new model can be trained
COMPUTE_NEW_TRAIN_DATA = True


# TRAIN_NEW_MODEL
#   True  -> a new model is trained on the training data
#   False -> nothing is trained; you must then provide the model yourself
#            at emb_model_src, since the validation step needs it
TRAIN_NEW_MODEL = True


In [None]:
### define filepaths and names ###
#
# Caution: training-data generation considers every *.txt file in any
#          subdirectory of raw_data_dir. Do not store the generated
#          train_data inside raw_data_dir, otherwise a following run would
#          pick it up again as raw input (undesired).

# configuration file: file name and the directory it lives in
config_filename = "configuration.json"
config_data_dir = ""

# input of the preprocessing: directory holding the raw *.txt files
raw_data_dir = "dev_data/train_data/raw_data/"

# output of the preprocessing: the training-data file
train_data_src = "dev_data/train_data/train.txt"

# trained embedding model: file name, target directory and the full path
# obtained by joining the two
emb_model_filename = "emb_model.bin"
emb_model_dir = "dev_data/embeddings/"
emb_model_src = os.path.join(emb_model_dir, emb_model_filename)

# Load configuration (don't change)
config_src = os.path.join(config_data_dir, config_filename)
config = shared.load_config.Configuration(config_src)


In [None]:
### choose your tokenizer for the preprocessing and validation ###
#
# Possible values for config.config['tokenizer']:
#
#   'nst' -- NonStemmingTokenizer
#            - no stemming, only remove punctuation marks
#            - lowercase letters
#
#   'sgt' -- SimpleGermanTokenizer
#            - remove punctuation marks
#            - stemming
#            - lowercase letters
#
# To switch tokenizers, replace 'nst' in the assignment below with 'sgt'.
config.config['tokenizer'] = 'nst'

# Smoke test: build the chosen tokenizer and run it on a sample text that
# mixes several quote styles, brackets, punctuation and embedded newlines.
tokenizer = pp.get_tokenizer(config)
example = """!! „asdf“ 'asdf' "asdf" ??Ein Arzt bzw. eine Ärztin ist ein medizinisch\n \n 
             ausgebildeter Heilkundiger, beschäftigt sich mit der
             Vorbeugung (Prävention), [Erkennung] (Diagnose),
             Behandlung (Therapie) [...] und Nachsorge von Krankheiten
             und Leiden bzw. gesundheitlichen Beeinträchtigungen,
             und stellt sich damit in den Dienst der Gesundheit."""

# `example` is already a str, so it is handed to the tokenizer directly
tk_example = tokenizer.tokenize(example)

print(tk_example)

In [None]:
### compute new training data from raw *txt files ###

# Only runs when COMPUTE_NEW_TRAIN_DATA is enabled in the script settings.
# Arguments: where to store the preprocessing output, where to find the
# raw *.txt files, and the loaded configuration.
if COMPUTE_NEW_TRAIN_DATA:
    pp.create_train_data(train_data_src, raw_data_dir, config)


In [None]:
### train a new model on your training data ###
# note: the configuration file in use is saved together with the model
#       in the emb_model_dir
#

# dimension of the embedding vectors
config.config['embedding_vector_dim'] = 400

# minimum number of appearances for a token to be considered in the
# model (raise this if you want to ignore rare tokens)
config.config['min_token_appearance'] = 1


# train embeddings using word2vec; skipped unless TRAIN_NEW_MODEL is
# enabled in the script settings
if TRAIN_NEW_MODEL:
    embedding.make_emb_from_file(
        train_data_src,
        emb_model_dir,
        emb_model_filename,
        config,
    )



In [None]:
### Validation of the embedding model ###

# Test files to validate against; pick them from your validation folder
config.config['doesntfit_file'] = 'german_doesntfit1.txt'
config.config['synonyms_file'] = 'german_synonyms3.txt'


# Run the validation on the model at emb_model_src and keep the returned
# object in `model` for later inspection.
model = mv.validate_model(
    emb_model_src,
    config,
)

In [None]:
### Get model information ###

# note that the token '</s>' is always added to the model
print("vocab-size:", model.vocab.size)



In [None]:
# IPython line magic: print the traceback of the most recent exception.
# NOTE(review): looks like a debugging leftover -- consider removing this
# cell before sharing the notebook.
%tb 