first commit

dmis-lab · Jan 6, 2022 · 20cef6b · 20cef6b
commit 20cef6b
Show file tree

Hide file tree

Showing 85 changed files with 12,453 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,113 @@
+# BERN
+logs
+resources
+det_token_test*
+bern2_spacy
+input
+output
+vendor
+
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# pyenv
+.python-version
+
+# celery beat schedule file
+celerybeat-schedule
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
diff --git a/README.md b/README.md
@@ -0,0 +1,85 @@
+# BERN2
+
+We present **BERN2** (Advanced **B**iomedical **E**ntity **R**ecognition and **N**ormalization), a tool that improves the previous neural network-based NER tool by employing a multi-task NER model and neural network-based NEN models to achieve much faster and more accurate inference. This repository provides a way to host your own BERN2 server. See our [paper](https://arxiv.org/) for more details.
+
+***** **Try BERN2 at [http://bern2.korea.ac.kr](http://bern2.korea.ac.kr)** ***** 
+
+## Installing BERN2
+
+You first need to install BERN2 and its dependencies.
+
+```bash
+# Install torch with conda (please check your CUDA version)
+conda create -n bern2 python=3.7
+conda activate bern2
+conda install pytorch==1.9.0 cudatoolkit=10.2 -c pytorch
+conda install faiss-gpu libfaiss-avx2 -c conda-forge
+
+# Check if cuda is available
+python -c "import torch;print(torch.cuda.is_available())"
+
+# Install BERN2
+git clone git@github.com:dmis-lab/BERN2.git
+cd BERN2
+pip install -r requirements.txt
+
+```
+
+(Optional) If you want to use MongoDB as a caching database, you need to install and run it.
+```
+# https://docs.mongodb.com/manual/tutorial/install-mongodb-on-ubuntu/#install-mongodb-community-edition-using-deb-packages
+sudo systemctl start mongod
+sudo systemctl status mongod
+```
+
+Then, you need to download resources (e.g., external modules or dictionaries) for running BERN2. Note that you will need 70GB of free disk space.
+
+```
+wget http://nlp.dmis.korea.edu/projects/bern2/resources.tar.gz
+tar -zxvf resources.tar.gz
+rm -rf resources.tar.gz
+# install CRF
+cd resources/GNormPlusJava/CRF
+./configure --prefix="$HOME"
+make
+make install
+cd ../../..
+```
+
+## Running BERN2
+
+The following command runs BERN2.
+```
+export CUDA_VISIBLE_DEVICES=0
+cd scripts
+bash run_bern2.sh
+```
+
+(Optional) To restart BERN2, you need to run the following commands.
+```
+export CUDA_VISIBLE_DEVICES=0
+cd scripts
+bash stop_bern2.sh
+bash start_bern2.sh
+```
+
+## Annotations
+
+Click [here](http://nlp.dmis.korea.edu/projects/bern2/annotations/anntation_v1.1.tar.gz) to download the annotations (NER and normalization) for 25.7+ million PubMed articles (from pubmed21n0001 to pubmed21n1057 (2021.01.12)) (compressed, 18 GB).
+
+The data provided by BERN2 is post-processed and may differ from the most current/accurate data available from [U.S. National Library of Medicine (NLM)](https://www.nlm.nih.gov/).
+
+## Citation
+```bibtex
+@article{sung2021bern2,
+    title={BERN2: an advanced neural biomedical named entity recognition and normalization tool},
+    author={Sung, Mujeen and Jeong, Minbyul and Choi, Yonghwa and Kim, Donghyeon and Lee, Jinhyuk and Kang, Jaewoo},
+    year={2022},
+    eprint={TBD},
+    archivePrefix={arXiv},
+    primaryClass={cs.CL}
+}
+```
+
+## Contact Information
+For help or issues using BERN2, please submit a GitHub issue. Please contact Mujeen Sung (`mujeensung (at) korea.ac.kr`), or Minbyul Jeong (`minbyuljeong (at) korea.ac.kr`) for communication related to BERN2.
diff --git a/app/__init__.py b/app/__init__.py
@@ -0,0 +1,165 @@
+import os
+import json
+
+from flask import Flask, render_template, request
+
+try:
+    from .result_parser import ResultParser
+except ImportError:
+    from result_parser import ResultParser
+
+# Import Engine
+import bern2
+
+import time
+
+def del_keys_from_dict(_dict, keys):
+    for _key in keys:
+        _dict.pop(_key, None)
+    return _dict
+
+def create_app(args):
+    app = Flask(__name__, instance_relative_config=False)
+    app.config.from_mapping(
+        SECRET_KEY="@#$%^BERN2%^FLASK@#$%^"
+    )
+
+    print(app.root_path)
+
+    # LOAD MODEL
+    if args.front_dev:
+        model = None
+    else:
+        model = bern2.BERN2(
+            mtner_home=args.mtner_home,
+            mtner_port=args.mtner_port,
+            gnormplus_home=args.gnormplus_home,
+            gnormplus_port=args.gnormplus_port,
+            tmvar2_home=args.tmvar2_home,
+            tmvar2_port=args.tmvar2_port,
+            gene_norm_port=args.gene_norm_port,
+            disease_norm_port=args.disease_norm_port,
+            cache_host=args.cache_host,
+            cache_port=args.cache_port,
+            use_neural_normalizer=args.use_neural_normalizer,
+            keep_files=args.keep_files
+        )
+
+    r_parser = ResultParser()
+
+    @app.route('/', methods=['GET'])
+    def index():
+        return render_template('index.html', debug=False)
+
+    @app.route('/documentation', methods=['GET'])
+    def doc_view():
+        return render_template('documentation.html')
+
+    @app.route('/debug', methods=['GET'])
+    def debug():
+        return render_template('index.html', debug=True)
+
+    @app.route('/pubmed/<pmids>', methods=['GET'])
+    def pubmed_api(pmids):
+        pmids = [pmid.strip() for pmid in pmids.split(",")]
+        if len(pmids) == 0:
+            return "[]"
+
+        result_dicts = [model.annotate_pmid(pmid=pmid) for pmid in pmids]
+        for r in result_dicts:
+            del_keys_from_dict(r, ["sourcedb", "sourceid", "project", "elapse_time"])
+        return json.dumps(result_dicts, sort_keys=True)
+
+    @app.route('/plain', methods=['POST'])
+    def plain_api():
+        params = request.get_json()
+        sample_text = params['text']
+
+        # annotate input
+        result_dict = model.annotate_text(text=sample_text)
+        del_keys_from_dict(result_dict, ["sourcedb", "sourceid", "project", "elapse_time"])
+
+        return json.dumps(result_dict, sort_keys=True)
+
+
+
+    @app.route('/senddata', methods=['POST'])
+    def send_data():
+        start = time.time()
+
+        res_items = []
+        draw_keys = json.loads(request.form['draw_keys'])
+        req_type = request.form['req_type']
+        # is_neural_normalized = (request.form['use_neural'] == 'true')
+        # print(is_neural_normalized)
+        _debug = False
+        if 'debug' in request.form:
+            if request.form['debug'] == 'True':
+                _debug = True
+
+        # print("DEBUG:", _debug)
+
+        if req_type == "text":
+            sample_text = request.form['sample_text']
+            # parse from BERN2 Model
+            if not args.front_dev:
+                result_dict = model.annotate_text(text=sample_text)
+            else:
+                dummy_path = os.path.join(app.root_path, "temp/dummy1_20211129.json")
+                with open(dummy_path, 'r') as rf:
+                    result_dict = json.load(rf)
+            _code, parse_res, tooltip_box, keys_in_dict = r_parser.parse_result(result_dict, draw_keys, result_id="text")
+
+            if not _debug:
+                del_keys_from_dict(result_dict, ["sourcedb", "sourceid", "project", "elapse_time"])
+
+            latency = time.time() - start
+
+            res_items.append({
+                'parsed_response': parse_res,
+                'tooltip_box': tooltip_box,
+                'keys': {k: r_parser.entity_type_dict[k] for k in keys_in_dict}
+            })
+
+            return render_template('result_text.html', result_items=res_items, latency = f'{latency*1000:9.2f}', result_str=json.dumps(result_dict, sort_keys=True, indent=4))
+        elif req_type == "pmid":
+            sample_pmid = request.form['sample_text']
+            _pmids = list(map(str.strip, sample_pmid.split(",")))
+
+            if not args.front_dev:
+                result_dicts = [model.annotate_pmid(pmid=pmid) for pmid in _pmids]
+            else:
+                dummy_path = os.path.join(app.root_path, "temp/dummy2_20111129.json")
+                with open(dummy_path, 'r') as rf:
+                    result_dicts = json.load(rf)
+
+            pmid2result_dicts = {f"{result_dict['pmid']}_{i}":result_dict for i, result_dict in enumerate(result_dicts)}
+
+            for _pmid, result_dict in pmid2result_dicts.items():
+                _code, parse_res, tooltip_box, keys_in_dict = r_parser.parse_result(result_dict, draw_keys, result_id=_pmid)
+
+                if _code == "error":
+                    # TODO: logging ERROR case
+                    print("ERROR PMID:", _pmid)
+
+                if not _debug:
+                    del_keys_from_dict(result_dict, ["sourcedb", "sourceid", "project", "elapse_time"])
+
+                legend_items = {k: r_parser.entity_type_dict[k] for k in keys_in_dict}
+
+                res_item = {
+                    'parsed_response': parse_res,
+                    'tooltip_box': tooltip_box,
+                    'keys': legend_items
+                }
+                if 'pmid' in result_dict.keys():
+                    res_item['title'] = result_dict['pmid']
+
+                res_items.append(res_item)
+
+
+            latency = time.time() - start
+
+            return render_template('result_text.html', result_items=res_items, latency = f'{latency*1000:9.2f}', result_str=json.dumps(result_dicts, sort_keys=True, indent=4))
+
+    return app