## Medtext Pipeline

This notebook assembles, parses, and crawls all components needed to build the medtext databases. Further information can be found in the wiki: http://wiki.eonum.ch/doku.php?id=data:medicaltext




In [24]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

import load_config
import json

### define a new configuration
# Pipeline settings. All keys and values are plain strings, so the dict can be
# serialised as JSON without any conversion.
config_dict = dict(
    # --- databases: source folders/files and the locations of derived text ---
    medical_books_pdf_folder="/media/data/medical_books/medical_books_pdf",
    medical_books_txt_folder="/media/data/medical_books/medical_books_plaintxt/",
    wiki_dump_url="https://dumps.wikimedia.org/dewiki/latest/dewiki-latest-pages-articles.xml.bz2",
    wiki_dump="/media/data/wiki_dumps/dewiki-latest-pages-articles.xml.bz2",
    wiki_dump_extracted="/media/data/wiki_dumps/dewiki-latest-pages-articles",
    wiki_medical_txt="/media/data/wiki_dumps/wiki_med_txts",
    wiki_non_medical_txt="/media/data/wiki_dumps/wiki_non_med_txts",
    crawler_output_html="/media/data/medtextcollector/data/output/crawler/raw",
    crawler_output_txt="/media/data/medtextcollector/data/output/crawler/pages",

    # --- models (left empty for now) ---
    medtext_classifier_config="",
    embedding_model="",

    # --- tokenizer: choose one of "nst" or "sgt" ---
    #   "nst" = NonStemmingTokenizer:  removes punctuation marks, lowercases, no stemming
    #   "sgt" = SimpleGermanTokenizer: removes punctuation marks, lowercases, stems
    tokenizer="nst",

    # --- where to store the configuration file ---
    config_path="/media/data/configuration-medtext-notebook-pipeline.json",
)


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [25]:
### save config file (don't change)
# The dict names its own storage location; serialise it there as indented JSON.
config_src = config_dict["config_path"]
serialized_config = json.dumps(config_dict, indent=4)

with open(config_src, 'w+') as config_file:
    config_file.write(serialized_config)

# load config object based on config file (don't change)
# NOTE(review): the meaning of the positional `True` is defined by
# load_config.Configuration, which is not visible in this notebook — confirm.
config = load_config.Configuration(config_src, True)

In [43]:
# sentence detection
# Load the German Punkt model from the NLTK data path; the resulting
# `sentence_detector` is reused by the statistics cell further down.

import nltk

sentence_detector = nltk.data.load('tokenizers/punkt/german.pickle')

# Quick smoke test on a short German snippet containing "!", a line break and
# a double full stop; the returned value is shown as the cell output.
sentence_detector.tokenize(
    "Dies ist ein Beispielsatz! Und hier noch einmal.\n"
    "Neuer Absatz: Aber hallo.. Du"
)

['Dies', 'ist', 'ein', 'Beispielsatz!']

In [None]:
# get number of files (raw and cleaned databases) and sentences (cleaned
# databases only) for each database
# TODO: token and vocabulary (distinct token) counts are not computed yet; add
#       the matching header columns together with the values once they are.
from data_analysis_toolkit import get_files_from_folder, load_documents, extract_sentences

from IPython.display import HTML, display
import tabulate

databases_raw = [config["medical_books_pdf_folder"], config["wiki_dump_extracted"], config["crawler_output_html"]]
databases_cleaned = [config["medical_books_txt_folder"], config["wiki_medical_txt"],
                     config["wiki_non_medical_txt"], config["crawler_output_txt"]]

print("Raw data bases")
table = [["database", "files"]]
for db in databases_raw:
    table.append([db, len(get_files_from_folder(db))])
# headers="firstrow" makes tabulate render the first row as the table header
# instead of treating it as an ordinary data row.
display(HTML(tabulate.tabulate(table, headers="firstrow", tablefmt='html')))

print("Cleaned data bases")
# The header lists exactly the values appended per database below, so header
# and data rows always have the same number of columns.
table = [["database", "files", "sentences"]]
for db in databases_cleaned:
    files = get_files_from_folder(db)
    # NOTE(review): load_errors is returned by load_documents but never
    # inspected here, so documents that failed to load are silently left out
    # of the sentence count — consider reporting them.
    documents, load_errors = load_documents(files)
    # Flatten the per-document sentence collections into one list for counting.
    sentences = [
        sentence
        for document in documents
        for sentence in extract_sentences(document, sentence_detector)
    ]
    table.append([db, len(files), len(sentences)])
display(HTML(tabulate.tabulate(table, headers="firstrow", tablefmt='html')))


    

Raw data bases


0,1
database,files
/media/data/medical_books/medical_books_pdf,53869
/media/data/wiki_dumps/dewiki-latest-pages-articles,0
/media/data/medtextcollector/data/output/crawler/raw,8009


Cleaned data bases


In [None]:
# download and extract wiki dump

# TODO

In [None]:
# extract medical and non medical articles from wiki dump and convert XML to TXT

# TODO

In [None]:
# train medtext classifier

# TODO

In [None]:
# start crawler

# TODO

In [None]:
# extract TXT from PDFs

# TODO

In [None]:
# train embedding / start medword

# TODO