In [2]:
### Interactive medword testing tool ###

import json
import preprocess as pp
import embedding
import model_validation as mv
import shared.load_config 

# Load the project configuration from 'configuration.json' (don't change).
# The resulting object carries its settings in `config.config`; the cells
# below override individual keys there and pass `config` on to the
# preprocessing, training and validation helpers.
config = shared.load_config.Configuration('configuration.json')


In [3]:
### Make your own settings ###

# All user-tunable paths in one place, listed in pipeline order.
raw_data_dir, train_data_src, emb_model_src = (
    'dev_data/train_data/raw_data/',      # input: raw *.txt files for preprocessing
    'dev_data/train_data/train.txt',      # where the preprocessing output is stored
    'dev_data/embeddings/emb_model.bin',  # where the trained embedding model is stored
)


In [4]:
### choose your tokenizer for the preprocessing and validation ###

# Selectable values for config.config['tokenizer']:
#   'nst' - NonStemmingTokenizer:  removes punctuation marks, lowercases
#           letters, no stemming
#   'sgt' - SimpleGermanTokenizer: removes punctuation marks, lowercases
#           letters, applies stemming
config.config['tokenizer'] = 'nst'
# config.config['tokenizer'] = 'sgt'

# Build the selected tokenizer and run it on a sample text as a quick
# sanity check of what the tokens look like.
tokenizer = pp.get_tokenizer(config)
example = """!! „asdf“ 'asdf' "asdf" ??Ein Arzt bzw. eine Ärztin ist ein medizinisch\n \n 
             ausgebildeter Heilkundiger, beschäftigt sich mit der
             Vorbeugung (Prävention), [Erkennung] (Diagnose),
             Behandlung (Therapie) [...] und Nachsorge von Krankheiten
             und Leiden bzw. gesundheitlichen Beeinträchtigungen,
             und stellt sich damit in den Dienst der Gesundheit."""

# `example` is already a str, so it is handed to the tokenizer directly.
tk_example = tokenizer.tokenize(example)
print(tk_example)

['asdf', 'asdf', 'asdf', 'ein', 'arzt', 'bzw', 'eine', 'aerztin', 'ist', 'ein', 'medizinisch', 'ausgebildeter', 'heilkundiger', 'beschaeftigt', 'sich', 'mit', 'der', 'vorbeugung', 'praevention', 'erkennung', 'diagnose', 'behandlung', 'therapie', 'und', 'nachsorge', 'von', 'krankheiten', 'und', 'leiden', 'bzw', 'gesundheitlichen', 'beeintraechtigungen', 'und', 'stellt', 'sich', 'damit', 'in', 'den', 'dienst', 'der', 'gesundheit']


In [5]:
### compute new training data from raw *txt files ###

# Switch: set to False to skip preprocessing. In that case the training
# data must already be provided at train_data_src before a new model
# can be trained.
COMPUTE_NEW_TRAIN_DATA = True

# Run the preprocessing step only when it was requested above.
if COMPUTE_NEW_TRAIN_DATA:
    pp.create_train_data(train_data_src, raw_data_dir, config)


Creating new training data. 
Using this Tokenizer:  NonStemmingTokenizer
Making tokenSet from directory ' dev_data/train_data/raw_data/ '
Found 6165 different tokens in 32 articles, total training size: 17391 tokens.


In [6]:
### train a new model on your training data ###

# Training hyper-parameters, stored in the shared configuration.
# - dimension of the embedding vectors
config.config['embedding_vector_dim'] = 400
# - minimum number of appearances for a token to be considered in the
#   model (you may want to ignore rare tokens)
config.config['min_token_appearance'] = 1

# Switch: set to False to skip training. In that case a model must
# already be provided at emb_model_src for the validation.
TRAIN_NEW_MODEL = True

# train embeddings using word2vec (only when requested above)
if TRAIN_NEW_MODEL:
    embedding.make_emb_from_file(train_data_src, emb_model_src, config)



Start training the model.
Training finsihed. 
Model saved at: dev_data/embeddings/emb_model.bin


In [8]:
### Validation of the embedding model ###

# Choose test files from your validation folder. Only the bare file names
# are given here; in the recorded run below they were looked up under
# 'data/validation_data/'.
config.config['doesntfit_file'] = 'german_doesntfit1.txt'
config.config['synonyms_file'] = 'german_synonyms3.txt'


# Validate the model stored at emb_model_src with the settings above.
# The returned object is kept in `model` so later cells can inspect it.
model = mv.validate_model(emb_model_src, config)

Start validation. Loading model. 

Validating 'doesntfit' with file data/validation_data/german_doesntfit1.txt

*** Doesn't fit ***
Doesn't fit correct:  18.2% (2/11)
Doesn't fit coverage: 10.0% (11/110) 

Validating 'synonyms' with file data/validation_data/german_synonyms3.txt
Synonyms: 42175 pairs in input. 31 pairs in model-vocabulary.

*** Cosine-Similarity ***
Synonyms avg-cos-similarity (SACS): 0.0410093004673 
Random avg-cos-similarity (RACS): 0.000848137110041 
Ratio SACS/RACS: 48.3522062433

*** Synonym Recognition ***
Synonyms correct:  1.6% (1/62), checked 40 closest embedding-vectors.checked per word.
Synonyms coverage: 0.1% (62/84350)



In [9]:
### Get model information ###

# Number of entries in the model vocabulary. Note that the token '</s>'
# is always added to the model, so it is included in this count.
vocab_size = model.vocab.size
print("vocab-size:", vocab_size)



vocab-size: 6166
