Skip to content

Commit

Permalink
first commit
Browse files Browse the repository at this point in the history
  • Loading branch information
mjeensung committed Jan 6, 2022
0 parents commit 20cef6b
Show file tree
Hide file tree
Showing 85 changed files with 12,453 additions and 0 deletions.
113 changes: 113 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
# BERN
logs
resources
det_token_test*
bern2_spacy
input
output
vendor

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
85 changes: 85 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
# BERN2

We present **BERN2** (Advanced **B**iomedical **E**ntity **R**ecognition and **N**ormalization), a tool that improves the previous neural network-based NER tool by employing a multi-task NER model and neural network-based NEN models to achieve much faster and more accurate inference. This repository provides a way to host your own BERN2 server. See our [paper](https://arxiv.org/) for more details.

***** **Try BERN2 at [http://bern2.korea.ac.kr](http://bern2.korea.ac.kr)** *****

## Installing BERN2

You first need to install BERN2 and its dependencies.

```bash
# Install torch with conda (please check your CUDA version)
conda create -n bern2 python=3.7
conda activate bern2
conda install pytorch==1.9.0 cudatoolkit=10.2 -c pytorch
conda install faiss-gpu libfaiss-avx2 -c conda-forge

# Check if cuda is available
python -c "import torch;print(torch.cuda.is_available())"

# Install BERN2
git clone git@github.com:dmis-lab/BERN2.git
cd BERN2
pip install -r requirements.txt

```

(Optional) If you want to use MongoDB as a caching database, you need to install and run it.
```
# https://docs.mongodb.com/manual/tutorial/install-mongodb-on-ubuntu/#install-mongodb-community-edition-using-deb-packages
sudo systemctl start mongod
sudo systemctl status mongod
```

Then, you need to download resources (e.g., external modules or dictionaries) for running BERN2. Note that you will need 70GB of free disk space.

```
wget http://nlp.dmis.korea.edu/projects/bern2/resources.tar.gz
tar -zxvf resources.tar.gz
rm -rf resources.tar.gz
# install CRF
cd resources/GNormPlusJava/CRF
./configure --prefix="$HOME"
make
make install
cd ../../..
```

## Running BERN2

The following command runs BERN2.
```
export CUDA_VISIBLE_DEVICES=0
cd scripts
bash run_bern2.sh
```

(Optional) To restart BERN2, you need to run the following commands.
```
export CUDA_VISIBLE_DEVICES=0
cd scripts
bash stop_bern2.sh
bash start_bern2.sh
```

## Annotations

Click [here](http://nlp.dmis.korea.edu/projects/bern2/annotations/anntation_v1.1.tar.gz) to download the annotations (NER and normalization) for more than 25.7 million PubMed articles (from pubmed21n0001 to pubmed21n1057 (2021.01.12)) (compressed, 18 GB).

The data provided by BERN2 is post-processed and may differ from the most current/accurate data available from [U.S. National Library of Medicine (NLM)](https://www.nlm.nih.gov/).

## Citation
```bibtex
@article{sung2021bern2,
title={BERN2: an advanced neural biomedical named entity recognition and normalization tool},
author={Sung, Mujeen and Jeong, Minbyul and Choi, Yonghwa and Kim, Donghyeon and Lee, Jinhyuk and Kang, Jaewoo},
year={2022},
eprint={TBD},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
```

## Contact Information
For help or issues using BERN2, please submit a GitHub issue. Please contact Mujeen Sung (`mujeensung (at) korea.ac.kr`), or Minbyul Jeong (`minbyuljeong (at) korea.ac.kr`) for communication related to BERN2.
165 changes: 165 additions & 0 deletions app/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
import os
import json

from flask import Flask, render_template, request

try:
from .result_parser import ResultParser
except ImportError:
from result_parser import ResultParser

# Import Engine
import bern2

import time

def del_keys_from_dict(_dict, keys):
    """Remove the given keys from ``_dict`` in place and return it.

    Keys that are not present are skipped silently, so callers can pass a
    fixed list of fields to strip without checking which ones exist.

    Args:
        _dict: Dictionary to prune; it is mutated, not copied.
        keys: Iterable of keys to remove.

    Returns:
        The same ``_dict`` object that was passed in.
    """
    for _key in keys:
        # pop() with a default never raises for a missing key.
        _dict.pop(_key, None)
    return _dict

def create_app(args):
    """Build and configure the BERN2 Flask application.

    Registers the HTML pages (``/``, ``/documentation``, ``/debug``), the
    programmatic endpoints (``/pubmed/<pmids>``, ``/plain``) and the form
    handler used by the web UI (``/senddata``).

    Args:
        args: Object exposing the settings read below: ``front_dev`` and,
            when ``front_dev`` is falsy, ``mtner_home``, ``mtner_port``,
            ``gnormplus_home``, ``gnormplus_port``, ``tmvar2_home``,
            ``tmvar2_port``, ``gene_norm_port``, ``disease_norm_port``,
            ``cache_host``, ``cache_port``, ``use_neural_normalizer`` and
            ``keep_files``.

    Returns:
        The configured ``Flask`` application.
    """
    app = Flask(__name__, instance_relative_config=False)
    app.config.from_mapping(
        # The literal stays as the fallback so existing deployments keep
        # working; set BERN2_SECRET_KEY to avoid running with a key that is
        # published in the source tree.
        SECRET_KEY=os.environ.get("BERN2_SECRET_KEY", "@#$%^BERN2%^FLASK@#$%^")
    )

    print(app.root_path)

    # Fields stripped from annotation results before they are returned
    # (kept only when the web UI explicitly asks for debug output).
    hidden_keys = ["sourcedb", "sourceid", "project", "elapse_time"]

    # LOAD MODEL
    if args.front_dev:
        # Front-end development mode: no model is loaded; /senddata serves
        # canned JSON files instead.
        model = None
    else:
        model = bern2.BERN2(
            mtner_home=args.mtner_home,
            mtner_port=args.mtner_port,
            gnormplus_home=args.gnormplus_home,
            gnormplus_port=args.gnormplus_port,
            tmvar2_home=args.tmvar2_home,
            tmvar2_port=args.tmvar2_port,
            gene_norm_port=args.gene_norm_port,
            disease_norm_port=args.disease_norm_port,
            cache_host=args.cache_host,
            cache_port=args.cache_port,
            use_neural_normalizer=args.use_neural_normalizer,
            keep_files=args.keep_files
        )

    r_parser = ResultParser()

    def _split_pmids(raw):
        """Split a comma-separated PMID string, dropping empty entries."""
        stripped = (piece.strip() for piece in raw.split(","))
        return [piece for piece in stripped if piece]

    def _model_unavailable():
        """JSON 503 response for API calls made while no model is loaded."""
        return json.dumps({"error": "BERN2 model is not loaded (front_dev mode)"}), 503

    @app.route('/', methods=['GET'])
    def index():
        return render_template('index.html', debug=False)

    @app.route('/documentation', methods=['GET'])
    def doc_view():
        return render_template('documentation.html')

    @app.route('/debug', methods=['GET'])
    def debug():
        return render_template('index.html', debug=True)

    @app.route('/pubmed/<pmids>', methods=['GET'])
    def pubmed_api(pmids):
        # str.split always yields at least one element, so empty entries
        # (e.g. "," or a trailing comma) have to be filtered explicitly.
        pmids = _split_pmids(pmids)
        if not pmids:
            return "[]"
        if model is None:
            return _model_unavailable()

        result_dicts = [model.annotate_pmid(pmid=pmid) for pmid in pmids]
        for r in result_dicts:
            del_keys_from_dict(r, hidden_keys)
        return json.dumps(result_dicts, sort_keys=True)

    @app.route('/plain', methods=['POST'])
    def plain_api():
        # silent=True yields None for a missing or malformed JSON body, so it
        # can be reported as a client error below.
        params = request.get_json(silent=True)
        if not isinstance(params, dict) or 'text' not in params:
            return json.dumps({"error": "request body must be a JSON object with a 'text' field"}), 400
        if model is None:
            return _model_unavailable()
        sample_text = params['text']

        # annotate input
        result_dict = model.annotate_text(text=sample_text)
        del_keys_from_dict(result_dict, hidden_keys)

        return json.dumps(result_dict, sort_keys=True)

    @app.route('/senddata', methods=['POST'])
    def send_data():
        start = time.time()

        res_items = []
        draw_keys = json.loads(request.form['draw_keys'])
        req_type = request.form['req_type']
        # Only the exact string 'True' enables debug output.
        _debug = request.form.get('debug') == 'True'

        if req_type == "text":
            sample_text = request.form['sample_text']
            # parse from BERN2 Model
            if not args.front_dev:
                result_dict = model.annotate_text(text=sample_text)
            else:
                dummy_path = os.path.join(app.root_path, "temp/dummy1_20211129.json")
                with open(dummy_path, 'r') as rf:
                    result_dict = json.load(rf)
            _code, parse_res, tooltip_box, keys_in_dict = r_parser.parse_result(result_dict, draw_keys, result_id="text")

            if not _debug:
                del_keys_from_dict(result_dict, hidden_keys)

            latency = time.time() - start

            res_items.append({
                'parsed_response': parse_res,
                'tooltip_box': tooltip_box,
                'keys': {k: r_parser.entity_type_dict[k] for k in keys_in_dict}
            })

            return render_template(
                'result_text.html',
                result_items=res_items,
                latency=f'{latency*1000:9.2f}',
                result_str=json.dumps(result_dict, sort_keys=True, indent=4),
            )

        if req_type == "pmid":
            sample_pmid = request.form['sample_text']
            _pmids = _split_pmids(sample_pmid)

            if not args.front_dev:
                result_dicts = [model.annotate_pmid(pmid=pmid) for pmid in _pmids]
            else:
                # NOTE(review): "20111129" differs from the "20211129" used for
                # dummy1 above -- confirm this file name is intentional.
                dummy_path = os.path.join(app.root_path, "temp/dummy2_20111129.json")
                with open(dummy_path, 'r') as rf:
                    result_dicts = json.load(rf)

            # The index suffix keeps ids unique when the same PMID appears
            # twice. 'pmid' is treated as optional here, matching the check
            # made when the title is set below.
            pmid2result_dicts = {
                f"{result_dict.get('pmid', 'unknown')}_{i}": result_dict
                for i, result_dict in enumerate(result_dicts)
            }

            for _pmid, result_dict in pmid2result_dicts.items():
                _code, parse_res, tooltip_box, keys_in_dict = r_parser.parse_result(result_dict, draw_keys, result_id=_pmid)

                if _code == "error":
                    # TODO: logging ERROR case
                    print("ERROR PMID:", _pmid)

                if not _debug:
                    del_keys_from_dict(result_dict, hidden_keys)

                legend_items = {k: r_parser.entity_type_dict[k] for k in keys_in_dict}

                res_item = {
                    'parsed_response': parse_res,
                    'tooltip_box': tooltip_box,
                    'keys': legend_items
                }
                if 'pmid' in result_dict:
                    res_item['title'] = result_dict['pmid']

                res_items.append(res_item)

            latency = time.time() - start

            return render_template(
                'result_text.html',
                result_items=res_items,
                latency=f'{latency*1000:9.2f}',
                result_str=json.dumps(result_dicts, sort_keys=True, indent=4),
            )

        # Without this a view returning None makes Flask raise a server error
        # for what is really a malformed client request.
        return "Unsupported req_type: expected 'text' or 'pmid'", 400

    return app

0 comments on commit 20cef6b

Please sign in to comment.