## Medtext Pipeline

Assembling, parsing, crawling of all components to build the medtext databases. Further information can be found in the wiki: http://wiki.eonum.ch/doku.php?id=data:medicaltext




In [24]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

import load_config
import json

### define a new configuration
config_dict = {
    # databases
    "medical_books_pdf_folder": "/media/data/medical_books/medical_books_pdf",
    "medical_books_txt_folder": "/media/data/medical_books/medical_books_plaintxt/",
    "wiki_dump_url": "https://dumps.wikimedia.org/dewiki/latest/dewiki-latest-pages-articles.xml.bz2",
    "wiki_dump": "/media/data/wiki_dumps/dewiki-latest-pages-articles.xml.bz2",
    "wiki_dump_extracted": "/media/data/wiki_dumps/dewiki-latest-pages-articles",
    "wiki_medical_txt": "/media/data/wiki_dumps/wiki_med_txts",
    "wiki_non_medical_txt": "/media/data/wiki_dumps/wiki_non_med_txts",
    "crawler_output_html": "/media/data/medtextcollector/data/output/crawler/raw",
    "crawler_output_txt": "/media/data/medtextcollector/data/output/crawler/pages",
    
    # models
    "medtext_classifier_config": "",
    "embedding_model": "",
    
    # chose tokenizer. possible values: "nst" and "sgt"
    ## NonStemmingTokenizer: 'nst'
    # - no stemming, only remove punctuation marks
    # - lowercase letters

    ## SimpleGermanTokenizer: 'sgt'
    # - remove punctuation marks
    # - stemming
    # - lowercase letters
    "tokenizer": "nst",
    
    # where to store the configuration file
    "config_path": "/media/data/configuration-medtext-notebook-pipeline.json"
}


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [25]:
### save config file (don't change)
config_src = config_dict["config_path"]

with open(config_src, 'w+') as f:
    json.dump(config_dict, f, indent=4)
    
# load config object based on config file (don't change)
config = load_config.Configuration(config_src, True)

In [51]:
# load sentence detector and tokenizer

import nltk
from notebooks.tokenizer import get_tokenizer

sentence_detector = nltk.data.load('tokenizers/punkt/german.pickle')
tokenizer = get_tokenizer(config['tokenizer'])

sentence_detector.tokenize("Dies ist ein Beispielsatz! Und hier noch einmal.\nNeuer Absatz: Aber hallo.. Du")

tokenizer.tokenize("Dies ist ein Beispielastz!")

['dies', 'ist', 'ein', 'beispielastz']

In [55]:
# get number of files, sentences, tokens, vocabulary and cahracter histograms in each database. 
# This cell takes quite a while to execute!
from data_analysis_toolkit import get_files_from_folder, load_documents , extract_sentences

databases_raw = [config["medical_books_pdf_folder"], config["wiki_dump_extracted"], config["crawler_output_html"],
                config["wiki_non_medical_txt"], ]
databases_cleaned = [config["medical_books_txt_folder"], config["wiki_medical_txt"], 
                     config["crawler_output_txt"]]

from IPython.display import HTML, display
import tabulate
table = [["database", "files"]]

print("Raw data bases")
for db in  databases_raw:
    table.append([db, len(get_files_from_folder(db))])
display(HTML(tabulate.tabulate(table, tablefmt='html')))

print("Cleaned data bases")
table = [["database", "files", "documents", "sentences", "tokens", "vocabulary"]]
vocs = []
for db in  databases_cleaned:
    files = get_files_from_folder(db)
    documents, load_errors = load_documents(files)
    sentences = []
    tokens = []
    for document in documents:
        sentences.extend(extract_sentences(document, sentence_detector))
    for sentence in sentences:
        tokens.extend(tokenizer.tokenize(sentence))
    voc = set(tokens)
    vocs.append(voc)
    print([db, len(files), len(documents), len(sentences), len(tokens), len(voc)])
    table.append([db, "{:,}".format(len(files)), "{:,}".format(len(documents)), 
                  "{:,}".format(len(sentences)), "{:,}".format(len(tokens)), 
                  "{:,}".format(len(voc))])
display(HTML(tabulate.tabulate(table, tablefmt='html')))

voc_union = vocs[0]
voc_intersection = vocs[0]
for voc in vocs:
    voc_union = voc_union.union(voc)
    voc_intersection = voc_intersection.intersection(voc)

print("Union vocabulary: " + "{:,}".format(len(voc_union)))
print("Intersection vocabulary: " + "{:,}".format(len(voc_intersection)))

    

Raw data bases


0,1
database,files
/media/data/medical_books/medical_books_pdf,53869
/media/data/wiki_dumps/dewiki-latest-pages-articles,0
/media/data/medtextcollector/data/output/crawler/raw,8009
/media/data/wiki_dumps/wiki_non_med_txts,0


Cleaned data bases
['/media/data/medical_books/medical_books_plaintxt/', 22894, 22881, 8301228, 104682611, 2101323]
['/media/data/wiki_dumps/wiki_med_txts', 10485, 10485, 249609, 5194030, 395226]
['/media/data/medtextcollector/data/output/crawler/pages', 29666, 29666, 3805554, 65325417, 481399]


0,1,2,3,4,5
database,files,documents,sentences,tokens,vocabulary
/media/data/medical_books/medical_books_plaintxt/,22894,22881,8301228,104682611,2101323
/media/data/wiki_dumps/wiki_med_txts,10485,10485,249609,5194030,395226
/media/data/medtextcollector/data/output/crawler/pages,29666,29666,3805554,65325417,481399


Union vocabulary2495233
Intersection vocabulary125415


In [65]:
# download and extract wiki dump
import urllib.request

import tqdm
#from tqdm import tnrange, tqdm_notebook

class TqdmUpTo(tqdm):
    """Provides `update_to(n)` which uses `tqdm.update(delta_n)`."""
    def update_to(self, b=1, bsize=1, tsize=None):
        """
        b  : int, optional
            Number of blocks transferred so far [default: 1].
        bsize  : int, optional
            Size of each block (in tqdm units) [default: 1].
        tsize  : int, optional
            Total size (in tqdm units). If [default: None] remains unchanged.
        """
        if tsize is not None:
            self.total = tsize
        self.update(b * bsize - self.n)  # will also set self.n = b * bsize

with TqdmUpTo() as t:  # all optional kwargs
    urllib.request.urlretrieve(config["wiki_dump_url"], config["wiki_dump"], reporthook=t.update_to, data=None)
                                  

                         
                         

# TODO

TypeError: module.__init__() takes at most 2 arguments (3 given)

In [None]:
# extract medical and non medical articles from wiki dump and convert XML to TXT

# TODO

In [None]:
# train medtext classifier

# TODO

In [None]:
# start crawler

# TODO

In [None]:
# extract TXT from PDFs

# TODO

In [None]:
# train embedding / start medword

# TODO