<a href="https://colab.research.google.com/github/cskyan/lecture-bionlp/blob/master/notebooks/bionlp_intro.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Hands-on Tasks for Biomedical Text Mining (BioNLP)
  - Basic Tasks
    * Tokenization
    * POS Tagging, Lemmatization, Stemming
    * Stop Words
  - Advanced Tasks
    * NER
    * Document Classification
  - Neural Network
    * Word Embedding
    * Language Model

# Data
  - [PubMed](https://github.com/cskyan/lecture-bionlp/blob/master/data/pubmed_samples.csv)
  - [BLUE](https://github.com/ncbi-nlp/BLUE_Benchmark)

In [None]:
import pandas as pd
# Read the sample PubMed records into a frame indexed by the 'pmid' column.
url = 'https://github.com/cskyan/lecture-bionlp/blob/master/data/pubmed_samples.csv?raw=true'
pubmed_df = pd.read_csv(url, index_col='pmid')
# Text of the first record; used as the running example in the cells below.
paragraph = pubmed_df['text'].iloc[0]

In [None]:
!wget https://github.com/ncbi-nlp/BLUE_Benchmark/releases/download/0.1/data_v0.2.zip
!unzip data_v0.2.zip

In [None]:
!wget https://github.com/cskyan/lecture-bionlp/blob/master/data/BC5CDR.zip?raw=true -O BC5CDR.zip
!unzip BC5CDR.zip -d data/BC5CDR

# Pre-processing

In [None]:
# Install ftfy, which the next cell uses to clean the abstract text.
!pip install ftfy

In [None]:
import ftfy

# Run ftfy.fix_text over every abstract.
pubmed_df['text'] = pubmed_df['text'].apply(ftfy.fix_text)
# `paragraph` was extracted before this clean-up; refresh it so the
# following cells work on the fixed text rather than the raw one.
paragraph = pubmed_df['text'].iloc[0]

# Tokenization

## [NLTK](https://www.nltk.org/api/nltk.tokenize.html)


###  Sentence segmentation

In [None]:
import nltk
# sent_tokenize relies on the Punkt model; download it here so this cell
# runs on a fresh kernel instead of depending on the cell below.
nltk.download('punkt')
nltk.sent_tokenize(paragraph)

In [None]:
# Download the model
import nltk
nltk.download('punkt')
# Construct the tokenizer
# NOTE(review): newer NLTK releases may no longer provide this pickle resource — confirm against the installed version.
punkt_sent_tknzr = nltk.data.load('tokenizers/punkt/english.pickle')

In [None]:
# Split the example paragraph into sentences with the explicitly loaded Punkt tokenizer.
punkt_sent_tknzr.tokenize(paragraph)

### Word Tokenization

In [None]:
# One string per sentence, with its word tokens joined by ' ## '.
[
    ' ## '.join(nltk.word_tokenize(sentence))
    for sentence in nltk.sent_tokenize(paragraph)
]

#### Penn Treebank Tokenizer

In [None]:
from nltk.tokenize.treebank import TreebankWordTokenizer, TreebankWordDetokenizer
# Tokenize every sentence with the Penn Treebank tokenizer and show the
# tokens joined by ' ## '.
treebank_tknzr = TreebankWordTokenizer()
[
    ' ## '.join(treebank_tknzr.tokenize(sentence))
    for sentence in nltk.sent_tokenize(paragraph)
]

### Pipeline

#### Stanford CoreNLP

In [None]:
# Deprecated
# from nltk.parse import corenlp
# stanford_tknzr = corenlp.CoreNLPParser(url=CORENLP_URL)
# [' '.join(sent.leaves()) for sent in stanford_tknzr.parse_text(paragraph[:258])]

In [None]:
# Install stanza and Stanford CoreNLP

!pip install stanza
import os, stanza
# Directory that receives the CoreNLP installation.
corenlp_dir = './corenlp'
# Export the location through CORENLP_HOME before installing into it.
os.environ["CORENLP_HOME"] = corenlp_dir
stanza.install_corenlp(dir=corenlp_dir)

In [None]:
# Endpoint passed to every CoreNLPClient created below.
CORENLP_URL = 'http://localhost:9001'

In [None]:
# Start Server
# !killall java
from stanza.server import CoreNLPClient
# Create a CoreNLP client bound to CORENLP_URL, restricted to tokenization and
# sentence splitting, then start it. It is stopped explicitly in a later cell.
client = CoreNLPClient(
    # annotators=['tokenize', 'ssplit', 'pos', 'lemma', 'ner'],
    annotators=['tokenize', 'ssplit'], 
    memory='4G', 
    endpoint=CORENLP_URL,
    be_quiet=True)
client.start()

In [None]:
# [' '.join([tkn.word for tkn in sent.token]) for sent in client.annotate(paragraph).sentence]

In [None]:
# Remember to close the server when finished.
# This stops the client started in the "Start Server" cell.
client.stop()

In [None]:
# Alternative: let a `with` block manage the client, so it is started on
# entry and closed on exit without explicit start()/stop() calls.
with CoreNLPClient(
    # annotators=['tokenize', 'ssplit', 'pos', 'lemma', 'ner'],
    annotators=['tokenize', 'ssplit'],
    memory='4G',
    endpoint=CORENLP_URL,
    be_quiet=True) as client:
    annotation = client.annotate(paragraph)
    # One space-joined token string per detected sentence.
    print([' '.join(tkn.word for tkn in sent.token) for sent in annotation.sentence])

## [SpaCy](https://spacy.io)

In [None]:
# !pip install -U spacy
# Download the small English spaCy model loaded in the next cell.
!python -m spacy download en_core_web_sm

In [None]:
import spacy

# Ask spaCy to use a GPU if it can, then load the small English pipeline.
spacy.prefer_gpu()
spacy_nlp = spacy.load("en_core_web_sm")

In [None]:
# Sentence-split and tokenize with spaCy. The entity recognizer is skipped:
# its component name is 'ner' ('entity' matches no component and was ignored).
[' ## '.join([word.text for word in sent]) for sent in spacy_nlp(paragraph, disable=['ner']).sents]

### [SciSpaCy](https://allenai.github.io/scispacy/)

In [None]:
!pip install scispacy
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_core_sci_sm-0.4.0.tar.gz

In [None]:
import scispacy
# Show the installed scispacy version.
scispacy.__version__

In [None]:
import scispacy
import spacy

# Load the en_core_sci_sm model installed above.
scispacy_nlp = spacy.load("en_core_sci_sm")

In [None]:
# One ' ## '-joined token string per sentence found by the SciSpaCy pipeline.
[
    ' ## '.join([token.text for token in sentence])
    for sentence in scispacy_nlp(paragraph).sents
]

## [Stanza](https://stanfordnlp.github.io/stanza/)

In [None]:
!pip install stanza
# Passed as use_gpu to the Stanza pipelines.
USE_GPU = True
import stanza
# Download the English resources, then build a tokenization-only pipeline.
stanza.download('en')
stanza_nlp = stanza.Pipeline('en', processors='tokenize', use_gpu=USE_GPU)

In [None]:
# One ' ## '-joined word string per sentence found by the Stanza pipeline.
[
    ' ## '.join([w.text for w in sentence.words])
    for sentence in stanza_nlp(paragraph).sentences
]

### [Stanza for BioNLP](https://doi.org/10.1093/jamia/ocab090)

In [None]:
USE_GPU = True
# Download the 'craft' package and build a tokenization-only pipeline from it.
stanza.download('en', package='craft')
stanza_bionlp = stanza.Pipeline('en', package='craft', processors='tokenize', use_gpu=USE_GPU)
# Alternative configurations using the 'mimic' package:
# stanza.download('en', package='mimic')
# stanza_bionlp = stanza.Pipeline('en', package='mimic', use_gpu=USE_GPU)
# stanza_bionlp = stanza.Pipeline('en', package='mimic', processors={'ner':'i2b2'}, use_gpu=USE_GPU)

In [None]:
# One ' ## '-joined word string per sentence found by the 'craft' pipeline.
[
    ' ## '.join([w.text for w in sentence.words])
    for sentence in stanza_bionlp(paragraph).sentences
]

# POS tagging, Lemmatization, Stemming

In [None]:
import nltk
# Use the first sentence of the example paragraph for the POS/lemma/stem demos.
text = nltk.sent_tokenize(paragraph)[0]
text

## [NLTK](https://www.nltk.org/api/nltk.stem.html)

In [None]:
import nltk
# Resources for the POS tagger and the WordNet lemmatizer used below.
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
# NOTE(review): this assumes the corpora were saved under /usr/share/nltk_data — confirm for the target runtime.
!cd /usr/share/nltk_data/corpora && unzip wordnet.zip

In [None]:
# POS
# Tag the word tokens of the example sentence and print the resulting pairs.
nltk_pos = nltk.pos_tag(nltk.word_tokenize(text))
print(nltk_pos)

In [None]:
# lemma
from nltk.stem import WordNetLemmatizer
# Lemmatize each word token of the example sentence with WordNet and show
# the lemmas joined by ' ## '.
wnl = WordNetLemmatizer()
' ## '.join(wnl.lemmatize(token) for token in nltk.word_tokenize(text))

In [None]:
# stem
from nltk.stem import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
# Compare the Porter and Snowball (English) stemmers on the same tokens.
ps_stemmer = PorterStemmer()
# Previously this was a second PorterStemmer, so both printed lines were identical.
sb_stemmer = SnowballStemmer('english')
print(' ## '.join([ps_stemmer.stem(word) for word in nltk.word_tokenize(text)]))
print(' ## '.join([sb_stemmer.stem(word) for word in nltk.word_tokenize(text)]))

## SpaCy

In [None]:
import spacy

# (text, POS, lemma) for every token of the first sentence. The entity
# recognizer is skipped: its component name is 'ner' ('entity' matches nothing).
[(word.text, word.pos_, word.lemma_) for word in list(spacy_nlp(text, disable=['ner']).sents)[0]]

## Stanza

In [None]:
# Rebuild the 'craft' pipeline with POS tagging and lemmatization added to tokenization.
stanza_bionlp = stanza.Pipeline('en', package='craft', processors='tokenize,pos,lemma', use_gpu=USE_GPU)

In [None]:
# (text, POS, lemma) for every word of the first sentence.
[
    (w.text, w.pos, w.lemma)
    for w in stanza_bionlp(text).sentences[0].words
]

# Stop Words

In [None]:
import nltk
# Fetch the stop-word corpus read in the next cell.
nltk.download('stopwords')

In [None]:
from nltk.corpus import stopwords as nltk_stopwords
nltk_stop_words = set(nltk_stopwords.words('english'))

def nltk_stopwords_filter(x):
  """Return True when token `x` should be kept, i.e. it is not an English stop word.

  The token is lower-cased before the lookup so that capitalised stop words
  such as a sentence-initial "The" are filtered as well.
  """
  return x.lower() not in nltk_stop_words

In [None]:
# Show the tokens of the first sentence before and after stop-word removal.
text = nltk.sent_tokenize(paragraph)[0]
tokens = nltk.word_tokenize(text)
print(tokens)
print([token for token in tokens if nltk_stopwords_filter(token)])

In [None]:
spacy_stop_words = spacy_nlp.Defaults.stop_words

def spacy_stopwords_filter(x):
  """Return True when token `x` should be kept, i.e. it is not a spaCy stop word.

  The token is lower-cased before the lookup so that capitalised stop words
  are filtered as well.
  """
  return x.lower() not in spacy_stop_words

In [None]:
# Tokens of the first sentence before and after spaCy stop-word removal.
# The entity recognizer is skipped: its component name is 'ner' ('entity' matches nothing).
tokens = [word.text for word in list(spacy_nlp(text, disable=['ner']).sents)[0]]
print(tokens)
print(list(filter(spacy_stopwords_filter, tokens)))

# Named Entity Recognition (NER)

## API Calling

In [None]:
# Remove any previous checkout, then clone the bionlp repository imported in the next cell.
!rm bionlp -rf
!git clone https://github.com/cskyan/bionlp.git
!pip install ftfy

In [None]:
from bionlp.spider import pubtator
# Query the PubTator API wrapper for all concept types of one PubMed ID.
# NOTE(review): this rebinds `client`, which earlier referred to the CoreNLP client.
client = pubtator.PubTatorAPI() 
client.get_concepts_pmid(ctype='all', pmid='28483577')

## Pre-installed NER models

### Spacy

In [None]:
# Per sentence: (text, token start, token end, char start, char end, label)
# for every entity spaCy recognises.
[
    [
        (ent.text, ent.start, ent.end, ent.start_char, ent.end_char, ent.label_)
        for ent in sentence.ents
    ]
    for sentence in spacy_nlp(paragraph).sents
]

### Stanza

In [None]:
# The 'craft' package provides the syntactic models but no NER model, so the
# NER model is selected explicitly. Any of Stanza's biomedical NER packages
# can be substituted for 'bionlp13cg'.
stanza.download('en', package='craft', processors={'ner': 'bionlp13cg'})
stanza_bionlp = stanza.Pipeline('en', package='craft', processors={'ner': 'bionlp13cg'}, use_gpu=USE_GPU)

In [None]:
# Per sentence: (text, char start, char end, type) for every recognised entity.
# Uses `stanza_bionlp`, the pipeline built with an NER processor in the previous
# cell; `stanza_nlp` only tokenizes and therefore never yields entities.
[[(ent.text, ent.start_char, ent.end_char, ent.type) for ent in sent.ents] for sent in stanza_bionlp(paragraph).sentences]

## BLUE Tasks

In [None]:
# Load the three BC5CDR-disease splits as header-less, tab-separated frames.
# NOTE(review): pandas' default quoting and NA handling apply here — confirm
# that tokens such as '"' or 'NA' in the files are read as intended.
bc5cdrdz_train, bc5cdrdz_dev, bc5cdrdz_test = (
    pd.read_csv(split_path, sep='\t', header=None)
    for split_path in (
        'data/BC5CDR/BC5CDR-disease/train.tsv',
        'data/BC5CDR/BC5CDR-disease/dev.tsv',
        'data/BC5CDR/BC5CDR-disease/test.tsv',
    )
)

In [None]:
import itertools
import numpy as np

def _sentence_boundaries(token_df):
  """Locate sentence boundaries in a one-token-per-row frame.

  A row whose token (column 0) is exactly '.' is treated as the end of a
  sentence.

  Args:
    token_df: frame with the token in column 0; needs at least two rows.

  Returns:
    (selector, boundaries): the boolean Series flagging sentence-final rows,
    and the list of row positions delimiting sentences, starting with 0.
  """
  selector = token_df[0].apply(lambda x: x == '.')
  # Make the last row close the final sentence, unless the row before it
  # already does.
  selector.iloc[-1] = not selector.iloc[-2]
  # A boundary sits one position after every flagged row.
  return selector, [0] + (np.flatnonzero(selector.values) + 1).tolist()


sep_selector_train, boundaries_train = _sentence_boundaries(bc5cdrdz_train)
sep_selector_dev, boundaries_dev = _sentence_boundaries(bc5cdrdz_dev)
sep_selector_test, boundaries_test = _sentence_boundaries(bc5cdrdz_test)

In [None]:
def _split_sentences(token_df, boundaries):
  """Cut column 0 of `token_df` into one string Series per consecutive boundary pair."""
  return [
      token_df.iloc[start:end][0].apply(str)
      for start, end in zip(boundaries, boundaries[1:])
  ]


tokens_train = _split_sentences(bc5cdrdz_train, boundaries_train)
tokens_dev = _split_sentences(bc5cdrdz_dev, boundaries_dev)
tokens_test = _split_sentences(bc5cdrdz_test, boundaries_test)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn import svm
from sklearn.multiclass import OneVsRestClassifier
from sklearn import preprocessing

# Encode the tag column (3) as integer class ids.
le = preprocessing.LabelEncoder()
y = le.fit_transform(bc5cdrdz_train[3])

# Token-level classifier: character-bigram TF-IDF features into a one-vs-rest SVM.
ner_steps = [
  ('tfidf', TfidfVectorizer(analyzer='char_wb', stop_words='english', ngram_range=(2, 2), use_idf=True)),
  ('clf', OneVsRestClassifier(svm.SVC())),
]
text_clf = Pipeline(ner_steps)
# Missing tokens are replaced by empty strings before vectorizing.
text_clf.fit(bc5cdrdz_train[0].fillna('').values, y)

In [None]:
# Predict a tag id for every test token (missing tokens become empty strings).
predict_test = text_clf.predict(bc5cdrdz_test[0].fillna('').values)

In [None]:
from sklearn.metrics import classification_report

# Pass `labels` explicitly so that every name in `target_names` is paired with
# its class id, even when a class is absent from the test tags and predictions.
print(classification_report(le.transform(bc5cdrdz_test[3]), predict_test, labels=text_clf.classes_, target_names=le.inverse_transform(text_clf.classes_)))

# Document Classification

## Bag-of-words (N-Gram)

In [None]:
import spacy, stanza
from sklearn.feature_extraction.text import CountVectorizer

# (Re)load the small English pipeline used by spacy_tokenizer below.
spacy_nlp = spacy.load("en_core_web_sm")

def spacy_tokenizer(doc):
  """Split the string `doc` into token strings with spaCy, skipping the 'ner' component."""
  parsed = spacy_nlp(doc, disable=['ner'])
  return [token.orth_ for token in parsed]

def stanza_tokenizer(doc):
  """Split the string `doc` into word strings with the Stanza pipeline `stanza_nlp`.

  Words are collected across all sentences of the parsed document, in order.
  """
  return [word.text for sent in stanza_nlp(doc).sentences for word in sent.words]

# Candidate values for the vectorizers' `tokenizer` argument; index 2 selects spacy_tokenizer.
TKNZR = [None, nltk.word_tokenize, spacy_tokenizer]
tknzr = TKNZR[2]

In [None]:
# Bigram count features for every abstract.
cnt_vctrzr = CountVectorizer(analyzer='word', tokenizer=tknzr, stop_words='english', ngram_range=(2, 2))
cnt_X = cnt_vctrzr.fit_transform(pubmed_df.text.values)
print('Numeric Features:')
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement (available from 1.0).
pd.DataFrame.sparse.from_spmatrix(cnt_X, index=pubmed_df.index, columns=cnt_vctrzr.get_feature_names_out())

In [None]:
# Binary (presence/absence) bigram features for every abstract.
bin_vctrzr = CountVectorizer(analyzer='word', tokenizer=tknzr, stop_words='english', ngram_range=(2, 2), binary=True)
bin_X = bin_vctrzr.fit_transform(pubmed_df.text.values)
print('Binary Features:')
# Column names come from the vectorizer that produced bin_X (not cnt_vctrzr).
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement (available from 1.0).
pd.DataFrame.sparse.from_spmatrix(bin_X, index=pubmed_df.index, columns=bin_vctrzr.get_feature_names_out())

## Normalization

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer

In [None]:
# Term-frequency bigram features (IDF weighting switched off).
tf_vctrzr = TfidfVectorizer(analyzer='word', tokenizer=tknzr, stop_words='english', ngram_range=(2, 2), use_idf=False)
tf_X = tf_vctrzr.fit_transform(pubmed_df.text.values)
print('TF Features:')
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement (available from 1.0).
pd.DataFrame.sparse.from_spmatrix(tf_X, index=pubmed_df.index, columns=tf_vctrzr.get_feature_names_out())

In [None]:
# TF-IDF weighted bigram features.
tfidf_vctrzr = TfidfVectorizer(analyzer='word', tokenizer=tknzr, stop_words='english', ngram_range=(2, 2), use_idf=True)
tfidf_X = tfidf_vctrzr.fit_transform(pubmed_df.text.values)
print('TF-IDF Features:')
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement (available from 1.0).
pd.DataFrame.sparse.from_spmatrix(tfidf_X, index=pubmed_df.index, columns=tfidf_vctrzr.get_feature_names_out())

In [None]:
from sklearn import svm

# Toy binary labels, one per abstract in pubmed_df.
y = [0, 1, 1, 0, 1]

# Fit a support vector classifier on the TF-IDF matrix computed above.
clf = svm.SVC()
clf.fit(tfidf_X, y)

In [None]:
from sklearn.pipeline import Pipeline
# Two-stage variant: raw counts followed by a TF-IDF transformer.
# text_clf = Pipeline([
#   ('vect', CountVectorizer(analyzer='word', tokenizer=tknzr, stop_words='english', ngram_range=(2, 2))),
#   ('tfidf', TfidfTransformer(use_idf=True)),
#   ('clf', svm.SVC()),])

# Equivalent to
# a single TfidfVectorizer step feeding the classifier.
text_clf = Pipeline([
  ('tfidf', TfidfVectorizer(analyzer='word', tokenizer=tknzr, stop_words='english', ngram_range=(2, 2), use_idf=True)),
  ('clf', svm.SVC()),])

In [None]:
# Fit the whole pipeline on the raw abstract strings and the labels `y`.
text_clf.fit(pubmed_df.text.values, y)

## Multi-class & Multi-label



In [None]:
from sklearn.multiclass import OneVsRestClassifier

# Toy multi-class labels (three classes), one per abstract.
y = [1, 2, 0, 0, 2]

# One-vs-rest SVM on TF-IDF bigram features.
multiclass_steps = [
  ('tfidf', TfidfVectorizer(analyzer='word', tokenizer=tknzr, stop_words='english', ngram_range=(2, 2), use_idf=True)),
  ('clf', OneVsRestClassifier(svm.SVC())),
]
text_clf = Pipeline(multiclass_steps)
text_clf.fit(pubmed_df.text.values, y)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Toy multi-label targets: one row per abstract, one 0/1 column per label.
Y = [[1,0,1],[0,1,0],[1,1,1],[0,0,0],[0,0,1]]

# Random forest on TF-IDF bigram features, fitted on the 2-D target.
multilabel_steps = [
  ('tfidf', TfidfVectorizer(analyzer='word', tokenizer=tknzr, stop_words='english', ngram_range=(2, 2), use_idf=True)),
  ('clf', RandomForestClassifier(n_estimators=10)),
]
text_clf = Pipeline(multilabel_steps)
text_clf.fit(pubmed_df.text.values, Y)

## BLUE Tasks

In [None]:
# Load the three ChemProt splits, indexed by their 'index' column.
chemprot_train, chemprot_dev, chemprot_test = (
    pd.read_csv(split_path, sep='\t', index_col='index')
    for split_path in (
        'data/ChemProt/train.tsv',
        'data/ChemProt/dev.tsv',
        'data/ChemProt/test.tsv',
    )
)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.multiclass import OneVsRestClassifier
from sklearn import preprocessing

# Encode the relation labels as integer class ids.
le = preprocessing.LabelEncoder()
y = le.fit_transform(chemprot_train.label)

# One-vs-rest SVM on TF-IDF bigram features of each sentence.
chemprot_steps = [
  ('tfidf', TfidfVectorizer(analyzer='word', tokenizer=tknzr, stop_words='english', ngram_range=(2, 2), use_idf=True)),
  ('clf', OneVsRestClassifier(svm.SVC())),
]
text_clf = Pipeline(chemprot_steps)
text_clf.fit(chemprot_train.sentence.values, y)

In [None]:
# Predict a class id for every test sentence.
predict_test = text_clf.predict(chemprot_test.sentence.values)

In [None]:
from sklearn.metrics import classification_report

# Pass `labels` explicitly so that every name in `target_names` is paired with
# its class id, even when a class is absent from the test labels and predictions.
print(classification_report(le.transform(chemprot_test.label), predict_test, labels=text_clf.classes_, target_names=le.inverse_transform(text_clf.classes_)))

# Word Embedding

In [None]:
from gensim.models import Word2Vec
from gensim.models import KeyedVectors

In [None]:
# model_path = 'PATH_TO_W2V_MODEL'
# model = KeyedVectors.load(model_path)

In [None]:
# Pre-process corpus: tokenize every abstract and keep only the tokens
# accepted by the NLTK stop-word filter.
corpus = pubmed_df.text.values
processed_corpus = [
    [token for token in nltk.word_tokenize(document) if nltk_stopwords_filter(token)]
    for document in corpus
]

In [None]:
# Train the model
# gensim 4 renamed the `size` argument to `vector_size`; try the new name
# first and fall back to the old one so the cell runs on both major versions.
w2v_kwargs = dict(sentences=processed_corpus, window=5, min_count=1, workers=4)
try:
  w2v_model = Word2Vec(vector_size=100, **w2v_kwargs)  # gensim >= 4.0
except TypeError:
  w2v_model = Word2Vec(size=100, **w2v_kwargs)  # gensim < 4.0
w2v_model.save('word2vec.model')

In [None]:
# Load the model saved by the previous cell.
# The file holds a full Word2Vec model (the cells below access its `.wv`),
# so it is loaded through Word2Vec rather than KeyedVectors.
model_path = 'word2vec.model'
w2v_model = Word2Vec.load(model_path)

In [None]:
def word2idx(lm_model, word, inexistence=-1):
  """Return the vocabulary index of `word` in a word-embedding model.

  The lookup is done on the lower-cased word. Both vocabulary layouts are
  supported: `wv.key_to_index` (gensim >= 4) and `wv.vocab` (gensim < 4).

  Args:
    lm_model: model exposing its keyed vectors as `.wv`.
    word: the token to look up.
    inexistence: value returned when the word is not in the vocabulary.

  Returns:
    The integer index of the word, or `inexistence` if it is unknown.
  """
  key = word.lower()
  keyed_vectors = lm_model.wv
  key_to_index = getattr(keyed_vectors, 'key_to_index', None)
  if key_to_index is not None:
    # gensim >= 4: plain dict from word to index.
    return key_to_index.get(key, inexistence)
  try:
    # gensim < 4: dict from word to an object carrying `.index`.
    return keyed_vectors.vocab[key].index
  except KeyError:
    return inexistence


def get_embedding_layer(lm_model, **kwargs):
  """Build a Keras Embedding layer initialised with the model's word vectors.

  Args:
    lm_model: model exposing its vector matrix as `.wv.vectors`
      (rows = vocabulary entries, columns = embedding dimensions).
    **kwargs: forwarded to `keras.layers.Embedding`.

  Returns:
    An Embedding layer whose input/output sizes match the vector matrix.
  """
  from keras.layers import Embedding
  from keras.initializers import Constant
  weights = lm_model.wv.vectors
  vocab_size, embd_dim = weights.shape
  # Seed the layer through an initializer instead of the `weights=` argument;
  # a caller-supplied initializer still takes precedence.
  kwargs.setdefault('embeddings_initializer', Constant(weights))
  return Embedding(input_dim=vocab_size, output_dim=embd_dim, **kwargs)

In [None]:
import numpy as np
from keras.preprocessing import sequence
from keras.layers import Input, LSTM, Dense, Dropout
from keras.models import Model
from tensorflow.keras.optimizers import SGD


# Construct the model: embedding -> LSTM -> dense hidden layer -> dropout -> sigmoid output.
X_inputs = Input(shape=(128,), dtype='int64', name='X')
embd_layer = get_embedding_layer(w2v_model)
lstm_layer = LSTM(128, name='LSTM')(embd_layer(X_inputs))
hidden_state = Dense(32, activation='relu', name='Hidden-State')(lstm_layer)
# Dropout regularises the hidden representation; applying it to the final
# output would randomly zero the prediction itself.
dropped_state = Dropout(0.2, name='Dropout')(hidden_state)
# binary_crossentropy expects a probability in [0, 1], hence sigmoid (not relu).
output = Dense(1, activation='sigmoid', name='CLF')(dropped_state)

# Compile the model
model = Model(X_inputs, output)
# `learning_rate` replaces the deprecated `lr` alias; the `decay` argument is
# dropped because newer Keras optimizers no longer accept it.
optmzr = SGD(learning_rate=0.1, momentum=0.9, nesterov=True)
model.compile(optimizer=optmzr, loss='binary_crossentropy', metrics=['acc', 'mse'])

# Feed inputs: unknown words and padding both map to the last vocabulary index.
pad_idx = len(w2v_model.wv.vectors)-1
input_seqs = [[word2idx(w2v_model, word, inexistence=pad_idx) for word in doc] for doc in processed_corpus]
input_ids = sequence.pad_sequences(input_seqs, maxlen=128, dtype='int64', padding='post', truncating='post', value=pad_idx)
y = np.array([0, 1, 1, 0, 1])
model.fit(input_ids, y)

## Exercise: Use word embedding for the BLUE classification tasks

# Language Model

In [None]:
!pip install transformers
# Download the archive from Google Drive; the inner wget call extracts the
# confirmation token that is inserted into the outer download URL.
!wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=1R84voFKHfWV9xjzeLzWBbmY1uOMYpnyD' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=1R84voFKHfWV9xjzeLzWBbmY1uOMYpnyD" -O biobert_weights && rm -rf /tmp/cookies.txt
!tar -xzf biobert_weights
# Convert the TensorFlow checkpoint into a PyTorch weight file.
# NOTE(review): the `transformers-cli convert` command may not exist in every transformers release — confirm for the installed version.
!transformers-cli convert --model_type bert --tf_checkpoint biobert_v1.1_pubmed/model.ckpt-1000000 --config biobert_v1.1_pubmed/bert_config.json --pytorch_dump_output biobert_v1.1_pubmed/pytorch_model.bin
# Rename the config file to config.json inside the model directory loaded below.
!mv biobert_v1.1_pubmed/bert_config.json biobert_v1.1_pubmed/config.json

In [None]:
import torch
from torch import nn
import torch.nn.functional as F
from transformers import BertConfig, BertTokenizer, BertModel, AutoTokenizer, BertForSequenceClassification

In [None]:
# Load models
# Tokenizer and sequence-classification model both come from the local
# biobert_v1.1_pubmed directory prepared above.
tokenizer = AutoTokenizer.from_pretrained('biobert_v1.1_pubmed')
model = BertForSequenceClassification.from_pretrained('biobert_v1.1_pubmed')
# Loss applied to the model's logits in the training cell below.
loss_func = nn.CrossEntropyLoss()

In [None]:
# Construct inputs and get outputs
# All abstracts are tokenized as one padded, truncated batch of PyTorch tensors.
inputs = tokenizer(pubmed_df.text.values.tolist(), padding=True, truncation=True, return_tensors="pt")
output = model(**inputs)
logits = output.logits
# predictions = F.softmax(output.logits, -1).argmax(-1)

In [None]:
# Calculate the loss and backpropagate for training
# Toy labels, one per abstract in the batch.
y = torch.tensor([0, 1, 1, 0, 1])
loss = loss_func(logits, y)
loss.backward()

## Exercise: Use BERT for BLUE NER tasks

# The End