In [1]:
import json
import os

from google.colab import files

# Prompt for the license JSON in the browser. The upload result is a dict keyed
# by uploaded file name, and the file itself is saved to the working directory.
uploaded_files = files.upload()
if not uploaded_files:
    # A cancelled upload would otherwise surface as an opaque IndexError below.
    raise RuntimeError(
        "No license file was uploaded; re-run this cell and select the license JSON."
    )

# Only the first uploaded file is read.
with open(next(iter(uploaded_files))) as f:
    license_keys = json.load(f)

# Defining license key-value pairs as notebook-level variables (e.g. SECRET,
# JSL_VERSION). globals() is used explicitly: it is the same mapping as locals()
# at cell top level, but mutating it is documented as supported.
globals().update(license_keys)

# Adding license key-value pairs to environment variables
os.environ.update(license_keys)

Saving spark_nlp_for_healthcare_spark_ocr_4919.json to spark_nlp_for_healthcare_spark_ocr_4919.json


In [2]:
# List the entry names found in the license JSON (names only, not the secret values).
license_keys.keys()

dict_keys(['SPARK_NLP_LICENSE', 'SECRET', 'JSL_VERSION', 'PUBLIC_VERSION', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY', 'AWS_SESSION_TOKEN', 'SPARK_OCR_LICENSE', 'SPARK_OCR_SECRET', 'OCR_VERSION'])

In [3]:
# Version string used for the spark-nlp-jsl install and jar URL further down.
license_keys['JSL_VERSION']

'3.5.0'

In [4]:
# Version string used for the open-source spark-nlp install further down.
license_keys['PUBLIC_VERSION']

'3.4.2'

In [5]:
# NOTE: $PUBLIC_VERSION, $JSL_VERSION and $SECRET are expected to be filled in from
# the values loaded out of the license JSON in the first cell, so that cell must
# have been run before this one.

# Installing pyspark and spark-nlp
! pip install --upgrade -q pyspark==3.1.2 spark-nlp==$PUBLIC_VERSION

# Installing Spark NLP Healthcare
# The license secret is part of the extra index URL; avoid sharing logs that echo this command.
! pip install --upgrade -q spark-nlp-jsl==$JSL_VERSION  --extra-index-url https://pypi.johnsnowlabs.com/$SECRET

# Installing Spark NLP Display Library for visualization
# NOTE(review): unlike the packages above, this one is not version-pinned.
! pip install -q spark-nlp-display

[K     |████████████████████████████████| 212.4 MB 63 kB/s 
[K     |████████████████████████████████| 142 kB 21.0 MB/s 
[K     |████████████████████████████████| 198 kB 57.9 MB/s 
[?25h  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 188 kB 11.3 MB/s 
[K     |████████████████████████████████| 95 kB 3.4 MB/s 
[K     |████████████████████████████████| 66 kB 5.1 MB/s 
[?25h

In [6]:
import json
import os

import sparknlp
import sparknlp_jsl

from sparknlp.base import *
from sparknlp.annotator import *
from sparknlp_jsl.annotator import *

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.ml import Pipeline,PipelineModel

import warnings
warnings.filterwarnings('ignore')

# Spark settings forwarded to sparknlp_jsl.start() through its `params` argument.
params = {
    "spark.driver.memory": "16G",
    "spark.kryoserializer.buffer.max": "2000M",
    "spark.driver.maxResultSize": "2000M",
}

# Report the installed library versions before the session is started.
print("Spark NLP Version :", sparknlp.version())
print("Spark NLP_JSL Version :", sparknlp_jsl.version())

# Start the licensed session using the secret from the license JSON.
spark = sparknlp_jsl.start(license_keys["SECRET"], params=params)

spark

Spark NLP Version : 3.4.2
Spark NLP_JSL Version : 3.5.0


In [7]:
# if you want to start the session with custom params as in start function above
from pyspark.sql import SparkSession

def start(SECRET):
    """Build (or reuse) a local Spark session configured with the Spark NLP jars.

    Manual alternative to ``sparknlp_jsl.start`` for when the Spark settings
    need to be spelled out by hand.

    Args:
        SECRET: License secret; it is embedded in the download URL of the
            spark-nlp-jsl jar.

    Returns:
        Whatever ``SparkSession.builder...getOrCreate()`` returns.

    Note:
        Reads the notebook-level ``PUBLIC_VERSION`` and ``JSL_VERSION``
        variables, so the license cell must have been run first.
    """
    public_package = "com.johnsnowlabs.nlp:spark-nlp_2.12:" + PUBLIC_VERSION
    licensed_jar = (
        "https://pypi.johnsnowlabs.com/" + SECRET
        + "/spark-nlp-jsl-" + JSL_VERSION + ".jar"
    )

    return (
        SparkSession.builder
        .appName("Spark NLP Licensed")
        .master("local[*]")
        .config("spark.driver.memory", "16G")
        .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
        .config("spark.kryoserializer.buffer.max", "2000M")
        .config("spark.jars.packages", public_package)
        .config("spark.jars", licensed_jar)
        .getOrCreate()
    )

#spark = start(SECRET)

In [8]:
# Entry stage: wraps the raw "text" column into a "document" annotation.
documentAssembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Pretrained healthcare sentence detector: document -> sentence.
sentenceDetector = (
    SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare", "en", "clinical/models")
    .setInputCols(["document"])
    .setOutputCol("sentence")
)

# Tokenizer: sentence -> token.
tokenizer = (
    Tokenizer()
    .setInputCols(["sentence"])
    .setOutputCol("token")
)

# Clinical word embeddings trained on PubMED dataset.
word_embeddings = (
    WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models")
    .setInputCols(["sentence", "token"])
    .setOutputCol("embeddings")
)

# NER model trained on i2b2 (sampled from MIMIC) dataset.
clinical_ner = (
    MedicalNerModel.pretrained("ner_clinical_large", "en", "clinical/models")
    .setInputCols(["sentence", "token", "embeddings"])
    .setOutputCol("ner")
    .setLabelCasing("upper")  # return the tags in upper case ("lower" is the alternative)
)

# Converts the per-token tags in "ner" into chunk annotations in "ner_chunk".
ner_converter = (
    NerConverter()
    .setInputCols(["sentence", "token", "ner"])
    .setOutputCol("ner_chunk")
)

# Stage order matters: each stage consumes columns produced by the ones before it.
nlpPipeline = Pipeline(
    stages=[
        documentAssembler,
        sentenceDetector,
        tokenizer,
        word_embeddings,
        clinical_ner,
        ner_converter,
    ]
)

# Fit on a one-row frame holding an empty string to obtain the fitted pipeline
# model; the sentence, embedding and NER stages above are already pretrained.
empty_data = spark.createDataFrame([[""]]).toDF("text")

model = nlpPipeline.fit(empty_data)


sentence_detector_dl_healthcare download started this may take some time.
Approximate size to download 367.3 KB
[OK!]
embeddings_clinical download started this may take some time.
Approximate size to download 1.6 GB
[OK!]
ner_clinical_large download started this may take some time.
Approximate size to download 13.9 MB
[OK!]


In [9]:
# Inspect the parameters set on the NER model (batch sizes, classes, input columns, ...).
clinical_ner.extractParamMap()

{Param(parent='MedicalNerModel_1a8637089929', name='batchSize', doc='Size of every batch'): 64,
 Param(parent='MedicalNerModel_1a8637089929', name='classes', doc='get the tags used to trained this MedicalNerModel'): ['O',
  'B-TREATMENT',
  'I-TREATMENT',
  'B-PROBLEM',
  'I-PROBLEM',
  'B-TEST',
  'I-TEST'],
 Param(parent='MedicalNerModel_1a8637089929', name='includeAllConfidenceScores', doc='whether to include all confidence scores in annotation metadata or just the score of the predicted tag'): False,
 Param(parent='MedicalNerModel_1a8637089929', name='includeConfidence', doc='whether to include confidence scores in annotation metadata'): True,
 Param(parent='MedicalNerModel_1a8637089929', name='inferenceBatchSize', doc='number of sentences to process in a single batch during inference'): 1,
 Param(parent='MedicalNerModel_1a8637089929', name='inputCols', doc='previous annotations columns, if renamed'): ['sentence',
  'token',
  'embeddings'],
 Param(parent='MedicalNerModel_1a8637089

In [10]:
# Tags the NER model can emit: "O" plus B-/I- prefixed entity labels.
clinical_ner.getClasses()

['O',
 'B-TREATMENT',
 'I-TREATMENT',
 'B-PROBLEM',
 'I-PROBLEM',
 'B-TEST',
 'I-TEST']

In [11]:
# Show the model's storage reference.
# NOTE(review): presumably identifies the embeddings the model expects — confirm.
clinical_ner.getStorageRef()

'clinical'

In [12]:
from sparknlp_jsl.compatibility import Compatibility 
import pandas as pd

# Look up the model entries returned for the query 'ner'.
compatibility = Compatibility(spark)

models = compatibility.findVersion('ner')

# Each entry is converted to a plain dict so pandas builds one row per model.
models_df = pd.DataFrame(list(map(dict, models)))
models_df

Unnamed: 0,name,sparkVersion,version,language,date,readyToUse
0,nerdl_tumour_demo,2,1.7.3,en,2018-12-19T16:52:37.735,true
1,nercrf_tumour_demo,2,1.7.3,en,2018-12-19T17:23:53.776,true
2,nerdl_tumour_demo,2.4,1.8.0,en,2018-12-22T04:21:25.574,true
3,nercrf_tumour_demo,2.4,1.8.0,en,2018-12-22T04:46:26.992,true
4,nercrf_deid,2.4,1.8.0,en,2018-12-23T00:44:17.698,true
...,...,...,...,...,...,...
453,ner_anatomy_biobert_pipeline,3.0,3.4.1,en,2022-03-21T14:43:26.641,true
454,ner_ade_healthcare_pipeline,3.0,3.4.1,en,2022-03-22T10:16:20.015,true
455,ner_ade_clinical_pipeline,3.0,3.4.1,en,2022-03-21T14:55:30.624,true
456,ner_deid_subentity,3.0,3.4.2,pt,2022-04-13T09:04:03.338,true


In [13]:
from google.colab import drive
# Mount Google Drive at /content/gdrive; the dataset cell below reads its CSV from there.
drive.mount("/content/gdrive")

Mounted at /content/gdrive


In [19]:
import os
import pandas as pd
from ast import literal_eval

# Load the article CSV from the mounted Drive.
# - The absolute path matches the mount point used by drive.mount(), so the cell
#   no longer depends on the current working directory being /content.
# - The two MeSH columns hold Python list literals stored as text, so they are
#   parsed with literal_eval.
# - on_bad_lines="skip" replaces error_bad_lines=False, which is deprecated since
#   pandas 1.3 and removed in pandas 2.0; malformed rows are still skipped.
mt_samples = pd.read_csv(
    "/content/gdrive/MyDrive/data/negative_articles.csv",
    converters={
        "MeSHDescriptorName": literal_eval,
        "MeSHQualifierName": literal_eval,
    },
    on_bad_lines="skip",
)
mt_samples.head(10)

Unnamed: 0,PMID,Title,Journal,Abstract,PublicationYear,MeSHDescriptorName,MeSHQualifierName
0,10021330,The surface ectoderm is essential for nephric ...,Development,The nephric duct is the first epithelial tubul...,1999,"[Animals, Bone Morphogenetic Protein 4, Bone M...","[None, None, metabolism, None, None, biosynthe..."
1,10021331,Separation of shoot and floral identity in Ara...,Development,The overall morphology of an Arabidopsis plant...,1999,"[Arabidopsis, Arabidopsis Proteins, Cell Diffe...","[cytology, None, None, biosynthesis, None, Non..."
2,10021332,Novel regulation of the homeotic gene Scr asso...,Development,Homeotic genes are known to be involved in pat...,1999,"[Amino Acid Sequence, Animals, Antigens, Diffe...","[None, None, None, None, None, None, embryolog..."
3,10021333,Apontic binds the translational repressor Brun...,Development,The product of the oskar gene directs posterio...,1999,"[3' Untranslated Regions, Animals, Cell Compar...","[metabolism, None, None, None, genetics, None,..."
4,10021334,Stromal cells mediate retinoid-dependent funct...,Development,The essential role of vitamin A and its metabo...,1999,"[Animals, Cell Differentiation, Down-Regulatio...","[None, None, None, None, None, None, abnormali..."
5,10021336,Inhibition of in vitro enteric neuronal develo...,Development,The terminal colon is aganglionic in mice lack...,1999,"[Animals, Cell Adhesion, Cell Differentiation,...","[None, None, None, None, embryology, pharmacol..."
6,10021337,The Drosophila kismet gene is related to chrom...,Development,The Drosophila kismet gene was identified in a...,1999,"[Amino Acid Sequence, Animals, Bacterial Prote...","[None, None, None, genetics, None, physiology,..."
7,10021338,FGF8 induces formation of an ectopic isthmic o...,Development,Beads containing recombinant FGF8 (FGF8-beads)...,1999,"[Animals, Body Patterning, Brain Stem, Cerebel...","[None, drug effects, drug effects, drug effect..."
8,10021339,Transcriptional repression by the Drosophila g...,Development,Early developmental patterning of the Drosophi...,1999,"[Animals, Animals, Genetically Modified, Bindi...","[None, None, None, genetics, genetics, None, e..."
9,10021341,The homeobox gene Pitx2: mediator of asymmetri...,Development,Left-right asymmetry in vertebrates is control...,1999,"[Abnormalities, Multiple, Activins, Amino Acid...","[embryology, None, None, None, None, None, emb..."


In [20]:
# Convert the pandas frame into a Spark DataFrame so the pipeline model can transform it.
mt_samples_df = spark.createDataFrame(mt_samples)

In [25]:
# The DocumentAssembler is configured with setInputCol("text"), so expose Abstract under that name.
samples_df = mt_samples_df.withColumnRenamed("Abstract", "text")

In [28]:
# Preview the Spark DataFrame; long cell values are truncated in the printed table.
samples_df.show()

+--------+--------------------+----------------+--------------------+---------------+--------------------+--------------------+
|    PMID|               Title|         Journal|                text|PublicationYear|  MeSHDescriptorName|   MeSHQualifierName|
+--------+--------------------+----------------+--------------------+---------------+--------------------+--------------------+
|10021330|The surface ectod...|     Development|The nephric duct ...|           1999|[Animals, Bone Mo...|[null, null, meta...|
|10021331|Separation of sho...|     Development|The overall morph...|           1999|[Arabidopsis, Ara...|[cytology, null, ...|
|10021332|Novel regulation ...|     Development|Homeotic genes ar...|           1999|[Amino Acid Seque...|[null, null, null...|
|10021333|Apontic binds the...|     Development|The product of th...|           1999|[3' Untranslated ...|[metabolism, null...|
|10021334|Stromal cells med...|     Development|The essential rol...|           1999|[Animals, Cell Di..

In [29]:
# Print the untruncated abstract of one row to see the raw text the pipeline receives.
print(samples_df.limit(1).collect()[0]['text'])

The nephric duct is the first epithelial tubule to differentiate from intermediate mesoderm that is essential for all further urogenital development. In this study we identify the domain of intermediate mesoderm that gives rise to the nephric duct and demonstrate that the surface ectoderm is required for its differentiation. Removal of the surface ectoderm resulted in decreased levels of Sim-1 and Pax-2 mRNA expression in mesenchymal nephric duct progenitors, and caused inhibition of nephric duct formation and subsequent kidney development. The surface ectoderm expresses BMP-4 and we show that it is required for the maintenance of high-level BMP-4 expression in lateral plate mesoderm. Addition of a BMP-4-coated bead to embryos lacking the surface ectoderm restored normal levels of Sim-1 and Pax-2 mRNA expression in nephric duct progenitors, nephric duct formation and the initiation of nephrogenesis. Thus, BMP-4 signaling can substitute for the surface ectoderm in supporting nephric duc

In [31]:
# Annotate a 100-row subset (presumably to keep runtime manageable) and cache it,
# since the cells below query `result` several times.
result = model.transform(samples_df.limit(100)).cache()

In [32]:
# Preview the annotated frame: the input columns plus one output column per pipeline stage.
result.show()

+--------+--------------------+----------------+--------------------+---------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|    PMID|               Title|         Journal|                text|PublicationYear|  MeSHDescriptorName|   MeSHQualifierName|            document|            sentence|               token|          embeddings|                 ner|           ner_chunk|
+--------+--------------------+----------------+--------------------+---------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|10021330|The surface ectod...|     Development|The nephric duct ...|           1999|[Animals, Bone Mo...|[null, null, meta...|[{document, 0, 12...|[{document, 0, 14...|[{token, 0, 2, Th...|[{word_embeddings...|[{named_entity, 0...|[{chun

In [34]:
# Show tokens next to their predicted tags, one row per input text (cells cut at 100 characters).
result.select('token.result','ner.result').show(truncate=100)

+----------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------+
|                                                                                              result|                                                                                              result|
+----------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------+
|[The, nephric, duct, is, the, first, epithelial, tubule, to, differentiate, from, intermediate, m...|[O, O, O, O, O, O, O, O, O, O, O, B-PROBLEM, I-PROBLEM, O, O, O, O, B-PROBLEM, I-PROBLEM, I-PROBL...|
|[The, overall, morphology, of, an, Arabidopsis, plant, depends, on, the, behaviour, of, its, meri...|[O, O, O, O, B-PROBLEM, I-PROBLEM, I-PROBLEM, O, O, O, O, O, O, O, O, B-PROBLEM, O

In [35]:
# Flatten to one row per token: zip the token and tag arrays, explode the zipped
# array, then pull the two struct fields ('0' and '1') out into named columns.
result_df = (
    result
    .select(F.explode(F.arrays_zip("token.result", "ner.result")).alias("cols"))
    .select(
        F.expr("cols['0']").alias("token"),
        F.expr("cols['1']").alias("ner_label"),
    )
)

result_df.show(50, truncate=100)

+---------------+---------+
|          token|ner_label|
+---------------+---------+
|            The|        O|
|        nephric|        O|
|           duct|        O|
|             is|        O|
|            the|        O|
|          first|        O|
|     epithelial|        O|
|         tubule|        O|
|             to|        O|
|  differentiate|        O|
|           from|        O|
|   intermediate|B-PROBLEM|
|       mesoderm|I-PROBLEM|
|           that|        O|
|             is|        O|
|      essential|        O|
|            for|        O|
|            all|B-PROBLEM|
|        further|I-PROBLEM|
|     urogenital|I-PROBLEM|
|    development|I-PROBLEM|
|              .|        O|
|             In|        O|
|           this|   B-TEST|
|          study|   I-TEST|
|             we|        O|
|       identify|        O|
|            the|        O|
|         domain|        O|
|             of|        O|
|   intermediate|B-PROBLEM|
|       mesoderm|I-PROBLEM|
|           that|   

In [36]:
# Count tokens per tag, most frequent first.
(
    result_df
    .select("token", "ner_label")
    .groupBy('ner_label')
    .count()
    .orderBy('count', ascending=False)
    .show(truncate=False)
)

+-----------+-----+
|ner_label  |count|
+-----------+-----+
|O          |16346|
|I-PROBLEM  |2763 |
|B-PROBLEM  |1207 |
|I-TREATMENT|1061 |
|I-TEST     |822  |
|B-TREATMENT|789  |
|B-TEST     |570  |
+-----------+-----+



In [37]:
# One row per extracted chunk: zip each chunk's text with its metadata map, explode,
# and read the entity label out of the metadata.
(
    result
    .select(F.explode(F.arrays_zip('ner_chunk.result', 'ner_chunk.metadata')).alias("cols"))
    .select(
        F.expr("cols['0']").alias("chunk"),
        F.expr("cols['1']['entity']").alias("ner_label"),
    )
    .show(truncate=False)
)

+------------------------------------+---------+
|chunk                               |ner_label|
+------------------------------------+---------+
|intermediate mesoderm               |PROBLEM  |
|all further urogenital development  |PROBLEM  |
|this study                          |TEST     |
|intermediate mesoderm               |PROBLEM  |
|Removal of the surface ectoderm     |TREATMENT|
|decreased levels                    |PROBLEM  |
|Sim-1                               |TEST     |
|mesenchymal nephric duct progenitors|TEST     |
|inhibition of nephric duct formation|PROBLEM  |
|subsequent kidney development       |PROBLEM  |
|BMP-4                               |TREATMENT|
|high-level BMP-4 expression         |TREATMENT|
|a BMP-4-coated bead                 |TREATMENT|
|Sim-1                               |TEST     |
|nephric duct formation              |PROBLEM  |
|BMP-4 signaling                     |TREATMENT|
|nephric duct morphogenesis          |PROBLEM  |
|that inductive inte

In [42]:
# fullAnnotate in LightPipeline
import pandas as pd

# Collect the sample row once. The original ran limit(1).collect() twice, which
# costs two Spark jobs and, with no ordering on the frame, does not guarantee
# that the printed text and the annotated text are the same row.
sample_text = samples_df.limit(1).collect()[0]['text']
print(sample_text)

# LightPipeline wraps the fitted pipeline model; fullAnnotate is called on the
# plain string and the first element of its result is read below.
light_model = LightPipeline(model)

light_result = light_model.fullAnnotate(sample_text)

# Pull the chunk annotations of the (single) annotated text into parallel lists.
ner_chunks = light_result[0]['ner_chunk']

chunks = [chunk.result for chunk in ner_chunks]
entities = [chunk.metadata['entity'] for chunk in ner_chunks]
sentence = [chunk.metadata['sentence'] for chunk in ner_chunks]
begin = [chunk.begin for chunk in ner_chunks]
end = [chunk.end for chunk in ner_chunks]

# One row per chunk, with its begin/end offsets, sentence id and entity label.
df_clinical = pd.DataFrame({'chunks': chunks, 'begin': begin, 'end': end,
                            'sentence_id': sentence, 'entities': entities})

df_clinical.head(20)

The nephric duct is the first epithelial tubule to differentiate from intermediate mesoderm that is essential for all further urogenital development. In this study we identify the domain of intermediate mesoderm that gives rise to the nephric duct and demonstrate that the surface ectoderm is required for its differentiation. Removal of the surface ectoderm resulted in decreased levels of Sim-1 and Pax-2 mRNA expression in mesenchymal nephric duct progenitors, and caused inhibition of nephric duct formation and subsequent kidney development. The surface ectoderm expresses BMP-4 and we show that it is required for the maintenance of high-level BMP-4 expression in lateral plate mesoderm. Addition of a BMP-4-coated bead to embryos lacking the surface ectoderm restored normal levels of Sim-1 and Pax-2 mRNA expression in nephric duct progenitors, nephric duct formation and the initiation of nephrogenesis. Thus, BMP-4 signaling can substitute for the surface ectoderm in supporting nephric duc

Unnamed: 0,chunks,begin,end,sentence_id,entities
0,intermediate mesoderm,70,90,0,PROBLEM
1,all further urogenital development,114,147,0,PROBLEM
2,this study,153,162,1,TEST
3,intermediate mesoderm,190,210,1,PROBLEM
4,Removal of the surface ectoderm,327,357,2,TREATMENT
5,decreased levels,371,386,2,PROBLEM
6,Sim-1,391,395,2,TEST
7,mesenchymal nephric duct progenitors,426,461,2,TEST
8,inhibition of nephric duct formation,475,510,2,PROBLEM
9,subsequent kidney development,516,544,2,PROBLEM


In [43]:
from sparknlp_display import NerVisualizer

# Render the chunks of the annotated sample over its document text.
visualiser = NerVisualizer()

# NOTE(review): save_path presumably also writes the rendering to display_result.html — confirm.
visualiser.display(light_result[0], label_col='ner_chunk', document_col='document', save_path="display_result.html")

# Change color of an entity label

#visualiser.set_label_colors({'PROBLEM':'#008080', 'TEST':'#800080', 'TREATMENT':'#808080'})
#visualiser.display(light_result[0], label_col='ner_chunk')

# Set label filter

# visualiser.display(light_result[0], label_col='ner_chunk', document_col='document',
                   #labels=['PROBLEM','TEST'])