In [22]:
! pip install -q pyspark==3.2.0 spark-nlp

In [33]:
from pyspark.sql import functions as F
from pyspark.sql.functions import lit
from pyspark.sql.types import StringType, DataType,ArrayType
from pyspark.sql.functions import udf, struct
from IPython.core.display import display, HTML
import re

  from IPython.core.display import display, HTML


In [34]:
from pyspark.ml import PipelineModel
from sparknlp.annotator import *
from sparknlp.base import *
import sparknlp

spark = sparknlp.start(spark32 = True)

print("Spark NLP version", sparknlp.version())
print("Apache Spark version:", spark.version)

spark

Spark NLP version 3.4.1
Apache Spark version: 3.2.1


In [35]:
stopwords = StopWordsCleaner().getStopWords()

In [36]:
stopwords[:5]

['i', 'me', 'my', 'myself', 'we']

## YAKE Keyword Extractor

In [38]:
import pandas as pd
data = pd.read_csv('mtsamples.csv')
data.head()

Unnamed: 0,Type/Specialty,Sample Name,Description,Transcript,Keywords
0,Allergy / Immunology,Allergic Rhinitis,A 23-year-old white female presents with compl...,SUBJECTIVE: This 23-year-old white female pres...,"allergy / immunology, allergic rhinitis, aller..."
1,Allergy / Immunology,Allergy Evaluation Consult,"Acute allergic reaction, etiology uncertain, h...",HISTORY: A 34-year-old male presents today sel...,"allergy / immunology, keflex, acute allergic r..."
2,Allergy / Immunology,Asthma in a 5-year-old,Mother states he has been wheezing and coughing.,CHIEF COMPLAINT: This 5-year-old male presents...,"allergy / immunology, breathing treatment, air..."
3,Allergy / Immunology,Chronic Sinusitis,Patient having severe sinusitis about two to t...,HISTORY: I had the pleasure of meeting and eva...,"allergy / immunology, nasal congestion, facial..."
4,Allergy / Immunology,Evaluation of Allergies,"Chronic glossitis, xerostomia, probable enviro...",HISTORY: A 55-year-old female presents self-re...,"allergy / immunology, chronic glossitis, xeros..."


In [43]:
data['Transcript'][0]

'SUBJECTIVE: This 23-year-old white female presents with complaint of allergies. She used to have allergies when she lived in Seattle but she thinks they are worse here. In the past, she has tried Claritin, and Zyrtec. Both worked for short time but then seemed to lose effectiveness. She has used Allegra also. She used that last summer and she began using it again two weeks ago. It does not appear to be working very well. She has used over-the-counter sprays but no prescription nasal sprays. She does have asthma but doest not require daily medication for this and does not think it is flaring up. MEDICATIONS: Her only medication currently is Ortho Tri-Cyclen and the Allegra. ALLERGIES: She has no known medicine allergies. OBJECTIVE: Vitals: Weight was 130 pounds and blood pressure 124/78. HEENT: Her throat was mildly erythematous without exudate. Nasal mucosa was erythematous and swollen. Only clear drainage was seen. TMs were clear. Neck: Supple without adenopathy. Lungs: Clear. ASSESS

In [59]:
document = DocumentAssembler() \
    .setInputCol("Transcript") \
    .setOutputCol("document")

sentenceDetector = SentenceDetector() \
    .setInputCols("document") \
    .setOutputCol("sentence")

token = Tokenizer() \
    .setInputCols("sentence") \
    .setOutputCol("token") \
    .setContextChars(["(", ")", "?", "!", ".", ","])

keywords = YakeKeywordExtraction() \
    .setInputCols("token") \
    .setOutputCol("keywords") \
    .setMinNGrams(1) \
    .setMaxNGrams(3)\
    .setNKeywords(20)\
    .setStopWords(stopwords)

yake_pipeline = Pipeline(stages=[document, sentenceDetector, token, keywords])

empty_df = spark.createDataFrame([['']]).toDF("Transcript")

yake_Model = yake_pipeline.fit(empty_df)

In [60]:
# LightPipeline

light_model = LightPipeline(yake_Model)

text = """SUBJECTIVE: This 23-year-old white female presents with complaint of allergies. She used to have allergies when she lived in Seattle but she thinks they are worse here. In the past, she has tried Claritin, and Zyrtec. Both worked for short time but then seemed to lose effectiveness. She has used Allegra also. She used that last summer and she began using it again two weeks ago. It does not appear to be working very well. She has used over-the-counter sprays but no prescription nasal sprays. She does have asthma but doest not require daily medication for this and does not think it is flaring up. MEDICATIONS: Her only medication currently is Ortho Tri-Cyclen and the Allegra. ALLERGIES: She has no known medicine allergies. OBJECTIVE: Vitals: Weight was 130 pounds and blood pressure 124/78. HEENT: Her throat was mildly erythematous without exudate. Nasal mucosa was erythematous and swollen. Only clear drainage was seen. TMs were clear. Neck: Supple without adenopathy. Lungs: Clear. ASSESSMENT: Allergic rhinitis. PLAN: 1. She will try Zyrtec instead of Allegra again. Another option will be to use loratadine. She does not think she has prescription coverage so that might be cheaper. 2. Samples of Nasonex two sprays in each nostril given for three weeks. A prescription was written as well."""

light_result = light_model.fullAnnotate(text)[0]

[(s.metadata['sentence'], s.result) for s in light_result['sentence']]

[('0',
  'SUBJECTIVE: This 23-year-old white female presents with complaint of allergies.'),
 ('1',
  'She used to have allergies when she lived in Seattle but she thinks they are worse here.'),
 ('2', 'In the past, she has tried Claritin, and Zyrtec.'),
 ('3', 'Both worked for short time but then seemed to lose effectiveness.'),
 ('4', 'She has used Allegra also.'),
 ('5',
  'She used that last summer and she began using it again two weeks ago.'),
 ('6', 'It does not appear to be working very well.'),
 ('7',
  'She has used over-the-counter sprays but no prescription nasal sprays.'),
 ('8',
  'She does have asthma but doest not require daily medication for this and does not think it is flaring up.'),
 ('9',
  'MEDICATIONS: Her only medication currently is Ortho Tri-Cyclen and the Allegra.'),
 ('10', 'ALLERGIES: She has no known medicine allergies.'),
 ('11', 'OBJECTIVE: Vitals: Weight was 130 pounds and blood pressure 124/78.'),
 ('12', 'HEENT: Her throat was mildly erythematous witho

In [61]:
import pandas as pd

keys_df = pd.DataFrame([(k.result, k.begin, k.end, k.metadata['score'],  k.metadata['sentence']) for k in light_result['keywords']],
                       columns = ['keywords','begin','end','score','sentence'])
keys_df['score'] = keys_df['score'].astype(float)

# ordered by relevance 
keys_df.sort_values(['sentence','score']).head(30)

Unnamed: 0,keywords,begin,end,score,sentence
0,allergies,69,77,0.224708,0
1,used,84,87,0.184242,1
2,allergies,97,105,0.224708,1
3,seattle,125,131,0.536178,1
22,allergies,719,727,0.224708,10
23,weight,749,754,0.687564,11
25,without,840,846,0.428513,12
24,erythematous,827,838,0.428628,12
27,erythematous,874,885,0.428628,13
26,nasal,857,861,0.43084,13


### Keyword dari Dataframe

In [62]:
df = spark.read\
                .option("header", "true")\
                .csv("mtsamples.csv")\
                
df.show(truncate=50)

+--------------------+--------------------------------------------+--------------------------------------------------+--------------------------------------------------+--------------------------------------------------+
|      Type/Specialty|                                 Sample Name|                                       Description|                                        Transcript|                                          Keywords|
+--------------------+--------------------------------------------+--------------------------------------------------+--------------------------------------------------+--------------------------------------------------+
|Allergy / Immunology|                           Allergic Rhinitis|A 23-year-old white female presents with compla...|SUBJECTIVE: This 23-year-old white female prese...|allergy / immunology, allergic rhinitis, allerg...|
|Allergy / Immunology|                  Allergy Evaluation Consult|Acute allergic reaction, etiology uncertain, ho..

In [63]:
df = df.select('Transcript')

In [64]:
result = yake_pipeline.fit(df).transform(df)

In [65]:
result = result.withColumn('unique_keywords', F.array_distinct("keywords.result"))

In [66]:
result = result.withColumn('unique_keywords', F.array_distinct("keywords.result"))
def highlight(text, keywords):
    for k in keywords:
        text = (re.sub(r'(\b%s\b)'%k, r'<span style="background-color: yellow;">\1</span>', text, flags=re.IGNORECASE))
    return text

In [67]:
highlight_udf = udf(highlight, StringType())

In [69]:
result = result.withColumn("highlighted_keywords",highlight_udf('Transcript','unique_keywords'))
