<a href="https://colab.research.google.com/github/gokhanturer/JSL/blob/main/Document_classifier_MTSamples.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Colab Setup

In [1]:
import json
import os

from google.colab import files

license_keys = files.upload()

with open(list(license_keys.keys())[0]) as f:
    license_keys = json.load(f)

# Defining license key-value pairs as local variables
locals().update(license_keys)

# Adding license key-value pairs to environment variables
os.environ.update(license_keys)

Saving spark_nlp_for_healthcare_spark_ocr.json to spark_nlp_for_healthcare_spark_ocr.json


In [2]:
# Installing pyspark and spark-nlp
! pip install --upgrade -q pyspark==3.1.2 spark-nlp==$PUBLIC_VERSION

# Installing Spark NLP Healthcare
! pip install --upgrade -q spark-nlp-jsl==$JSL_VERSION  --extra-index-url https://pypi.johnsnowlabs.com/$SECRET

# Installing Spark NLP Display Library for visualization
! pip install -q spark-nlp-display

[K     |████████████████████████████████| 212.4 MB 71 kB/s 
[K     |████████████████████████████████| 140 kB 22.1 MB/s 
[K     |████████████████████████████████| 198 kB 69.9 MB/s 
[?25h  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 144 kB 10.0 MB/s 
[K     |████████████████████████████████| 95 kB 3.4 MB/s 
[K     |████████████████████████████████| 66 kB 6.4 MB/s 
[?25h

In [3]:
import json
import os
from pyspark.ml import Pipeline,PipelineModel
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

from sparknlp.annotator import *
from sparknlp_jsl.annotator import *
from sparknlp.common import *
from sparknlp.base import *
import sparknlp_jsl
import sparknlp

import warnings
warnings.filterwarnings('ignore')

print ("Spark NLP Version :", sparknlp.version())
print ("Spark NLP_JSL Version :", sparknlp_jsl.version())


spark = sparknlp_jsl.start(license_keys['SECRET'])

spark

Spark NLP Version : 3.4.0
Spark NLP_JSL Version : 3.4.0


In [4]:
import pandas as pd
import numpy as np

from pyspark.sql import functions as F

from pyspark.sql.functions import lit

from pyspark.ml.feature import CountVectorizer, HashingTF, IDF, OneHotEncoder, StringIndexer, VectorAssembler, SQLTransformer

from pyspark.ml.classification import LogisticRegression , RandomForestClassifier

from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
Gastroenterology = spark.sparkContext.wholeTextFiles("/content/drive/MyDrive/train-data/Gastroenterology/*.txt")
sdf_gastroenterology = Gastroenterology.toDF(schema=['path','text'])

Neurology = spark.sparkContext.wholeTextFiles("/content/drive/MyDrive/train-data/Neurology/*.txt")
sdf_neurology = Neurology.toDF(schema=['path','text'])

Orthopedic = spark.sparkContext.wholeTextFiles("/content/drive/MyDrive/train-data/Orthopedic/*.txt")
sdf_orthopedic = Orthopedic.toDF(schema=['path', 'text'])

Radiology = spark.sparkContext.wholeTextFiles("/content/drive/MyDrive/train-data/Radiology/*.txt")
sdf_radiology = Radiology.toDF(schema=['path','text'])

Urology = spark.sparkContext.wholeTextFiles("/content/drive/MyDrive/train-data/Urology/*.txt")
sdf_urology = Urology.toDF(schema=['path','text'])


In [None]:
sdf_gastroenterology = sdf_gastroenterology.withColumn("category", lit('gastroenterology'))
sdf_neurology = sdf_neurology.withColumn("category", lit('neurology'))
sdf_orthopedic = sdf_orthopedic.withColumn("category", lit('orthopedic'))
sdf_radiology = sdf_radiology.withColumn("category", lit('radiology'))
sdf_urology = sdf_urology.withColumn("category", lit('urology'))

In [None]:
sdf_gastroenterology = sdf_gastroenterology.select('text','category')
sdf_neurology = sdf_neurology.select('text','category')
sdf_orthopedic = sdf_orthopedic.select('text','category')
sdf_radiology = sdf_radiology.select('text','category')
sdf_urology = sdf_urology.select('text','category')

In [None]:
from functools import reduce  
from pyspark.sql import DataFrame

def unionAll(*dfs):
    return reduce(DataFrame.unionAll, dfs)

sdf = unionAll(sdf_gastroenterology, sdf_neurology, sdf_orthopedic, sdf_radiology, sdf_urology)

In [None]:
sdf.printSchema()

root
 |-- text: string (nullable = true)
 |-- category: string (nullable = false)



## Data Preprocessing and Save


In [None]:
documentAssembler = DocumentAssembler() \
    .setInputCol('text') \
    .setOutputCol('document')

cleanUpPatterns = ["<[^>]*>"]

documentNormalizer = DocumentNormalizer() \
    .setInputCols("document") \
    .setOutputCol("normalizedDocument") \
    .setAction("clean") \
    .setPatterns(cleanUpPatterns) \
    .setReplacement("") \
    .setPolicy("pretty_all") \
    .setLowercase(False)

docPatternRemoverPipeline = Pipeline() \
    .setStages([
        documentAssembler,
        documentNormalizer])
    
empty_df = spark.createDataFrame([['']]).toDF("text")

pipelineModel = docPatternRemoverPipeline.fit(empty_df)

In [None]:
result = pipelineModel.transform(sdf)

result = result.select("category", F.explode('normalizedDocument.result').alias('text'))

result.printSchema()

#result.show()

result.write.parquet("/content/drive/MyDrive/JSLTask/mtsamples_test.parquet")

## Read MTSamples Data

In [6]:
spark_df = spark.read.parquet("/content/drive/MyDrive/JSLTask/mtsamples_test.parquet")
spark_df.printSchema()
#spark_df.show(truncate=False)

root
 |-- category: string (nullable = true)
 |-- text: string (nullable = true)



In [None]:
spark_df.groupBy("category").count().show()

+----------------+-----+
|        category|count|
+----------------+-----+
|gastroenterology|  157|
|      orthopedic|  223|
|       neurology|  143|
|         urology|  115|
|       radiology|  188|
+----------------+-----+



In [7]:
spark_df = spark_df.filter(spark_df.text != "")
# None values in the text column  were removed 

In [None]:
spark_df.groupBy("category").count().show()

+----------------+-----+
|        category|count|
+----------------+-----+
|gastroenterology|  157|
|      orthopedic|  222|
|       neurology|  141|
|         urology|  110|
|       radiology|  188|
+----------------+-----+



In [8]:
!wget https://raw.githubusercontent.com/kavgan/clinical-concepts/master/clinical-stopwords.txt


# Since we will build a model with clinical data, clinical stop words data was used instead of general stopwords data.

--2022-02-07 22:02:39--  https://raw.githubusercontent.com/kavgan/clinical-concepts/master/clinical-stopwords.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 6582 (6.4K) [text/plain]
Saving to: ‘clinical-stopwords.txt’


2022-02-07 22:02:39 (74.1 MB/s) - ‘clinical-stopwords.txt’ saved [6582/6582]



In [9]:
with open ('clinical-stopwords.txt', 'r') as f:
    clinicalstops = f.readlines()

# LogReg with CV

In [10]:
documentAssembler = DocumentAssembler() \
    .setInputCol("text")\
    .setOutputCol("document")\

tokenizer = Tokenizer()\
    .setInputCols(["document"])\
    .setOutputCol("token")

normalizer = Normalizer() \
    .setInputCols(["token"]) \
    .setOutputCol("normalized")\
    .setCleanupPatterns(["[^\w\d\s]"])

stopwords_cleaner = StopWordsCleaner()\
    .setInputCols("normalized")\
    .setOutputCol("cleanTokens")\
    .setStopWords(clinicalstops)\
    .setCaseSensitive(False)

lemmatizer = LemmatizerModel.pretrained("lemma_antbnc") \
    .setInputCols(["cleanTokens"]) \
    .setOutputCol("lemma")

finisher = Finisher() \
    .setInputCols(["lemma"]) \
    .setOutputCols(["token_features"]) \
    .setOutputAsArray(True) \
    .setCleanAnnotations(False)

countVectors = CountVectorizer(inputCol="token_features", outputCol="features", vocabSize=10000, minDF=5)

label_stringIdx = StringIndexer(inputCol = "category", outputCol = "label")

nlpPipeline = Pipeline(stages=[
            documentAssembler,
            tokenizer,
            normalizer,
            stopwords_cleaner,
            lemmatizer,
            finisher,
            countVectors,
            label_stringIdx
            ])

lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[OK!]


In [12]:
lr_cv = nlpPipeline.fit(spark_df)

processed_cv = lr_cv.transform(spark_df)

processed_cv.count()

818

In [13]:
processed_cv.select('features').show(5, truncate=80)

+--------------------------------------------------------------------------------+
|                                                                        features|
+--------------------------------------------------------------------------------+
|(4848,[0,1,2,3,4,6,7,8,12,13,15,16,17,20,21,25,27,28,32,33,35,36,45,47,48,50,...|
|(4848,[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,20,21,22,25,27,28,29,30,31...|
|(4848,[0,1,2,3,4,5,6,7,8,10,12,13,16,20,23,24,25,27,28,29,30,31,32,34,37,43,4...|
|(4848,[0,1,2,3,4,5,6,7,8,9,10,12,13,14,16,17,18,19,20,23,24,25,26,28,29,30,31...|
|(4848,[0,1,2,3,4,5,6,7,8,9,10,12,13,14,15,16,17,19,20,22,23,25,26,27,28,29,30...|
+--------------------------------------------------------------------------------+
only showing top 5 rows



In [None]:
processed_cv.select("text","features","label","category").show(truncate = 50)

+--------------------------------------------------+--------------------------------------------------+-----+----------+
|                                              text|                                          features|label|  category|
+--------------------------------------------------+--------------------------------------------------+-----+----------+
| PREOPERATIVE DIAGNOSES 1. EMG-proven left carp...|(4848,[0,1,2,3,4,6,7,8,12,13,15,16,17,20,21,25,...|  0.0|orthopedic|
| PREOPERATIVE DIAGNOSIS: Bunion, left foot. POS...|(4848,[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16...|  0.0|orthopedic|
| RICE stands for the most important elements of...|(4848,[0,1,2,3,4,5,6,7,8,10,12,13,16,20,23,24,2...|  0.0|orthopedic|
| The patient is an 84-year-old retired male who...|(4848,[0,1,2,3,4,5,6,7,8,9,10,12,13,14,16,17,18...|  0.0|orthopedic|
| PREOPERATIVE DIAGNOSES: 1. Left carpal tunnel ...|(4848,[0,1,2,3,4,5,6,7,8,9,10,12,13,14,15,16,17...|  0.0|orthopedic|
| PREOPERATIVE DIAGNOSIS: Hernia

In [14]:
(trainingData, testData) = processed_cv.randomSplit([0.8, 0.2], seed = 42)

print("Training Dataset Count: " + str(trainingData.count()))
print("Test Dataset Count: " + str(testData.count()))

Training Dataset Count: 679
Test Dataset Count: 139


In [None]:
lr = LogisticRegression(maxIter = 10, regParam=0.3)

lrModel = lr.fit(trainingData)

predictions_cv = lrModel.transform(testData)

In [None]:
preds_df = predictions_cv.select('category','text',"prediction",'label').toPandas()

print (classification_report(preds_df['label'], preds_df['prediction']))
print (accuracy_score(preds_df['label'], preds_df['prediction']))

              precision    recall  f1-score   support

         0.0       0.76      0.73      0.75        30
         1.0       0.70      0.82      0.76        34
         2.0       0.97      0.88      0.92        32
         3.0       0.50      0.47      0.48        17
         4.0       1.00      0.96      0.98        26

    accuracy                           0.80       139
   macro avg       0.78      0.77      0.78       139
weighted avg       0.81      0.80      0.80       139

0.7985611510791367


In [None]:
preds_df1 = predictions_cv.select("label").toPandas()

preds = pd.DataFrame(confusion_matrix(list(preds_df.label.astype(int)), list(preds_df.prediction.astype(int))), columns = np.unique(preds_df1['label']), index =  np.unique(preds_df1['label']))

preds

Unnamed: 0,0.0,1.0,2.0,3.0,4.0
0.0,22,5,0,3,0
1.0,2,28,0,4,0
2.0,0,3,28,1,0
3.0,5,4,0,8,0
4.0,0,0,1,0,25


In [None]:
predictions_cv.select("label","category").distinct().show()

+-----+----------------+
|label|        category|
+-----+----------------+
|  4.0|         urology|
|  1.0|       radiology|
|  3.0|       neurology|
|  2.0|gastroenterology|
|  0.0|      orthopedic|
+-----+----------------+



# LogReg with TF-IDF

In [15]:
hashingTF = HashingTF(inputCol="token_features", outputCol="rawFeatures", numFeatures=10000)

idf = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq=5) #minDocFreq: remove sparse terms

nlp_pipeline_tf = Pipeline(
    stages=[documentAssembler, 
            tokenizer,
            normalizer,
            stopwords_cleaner, 
            lemmatizer, 
            finisher,
            hashingTF,
           idf,
           label_stringIdx])

nlp_model_tf = nlp_pipeline_tf.fit(spark_df)

processed_tf = nlp_model_tf.transform(spark_df)

processed_tf.select("text","features","label","category").show(truncate = 50)

print("processed_tf.count : " +  str(processed_tf.count()))

+--------------------------------------------------+--------------------------------------------------+-----+----------+
|                                              text|                                          features|label|  category|
+--------------------------------------------------+--------------------------------------------------+-----+----------+
| PREOPERATIVE DIAGNOSES 1. EMG-proven left carp...|(10000,[86,95,138,157,164,186,228,264,269,278,2...|  0.0|orthopedic|
| PREOPERATIVE DIAGNOSIS: Bunion, left foot. POS...|(10000,[47,63,86,120,130,138,157,164,174,178,19...|  0.0|orthopedic|
| RICE stands for the most important elements of...|(10000,[7,57,78,86,222,316,354,374,387,415,427,...|  0.0|orthopedic|
| The patient is an 84-year-old retired male who...|(10000,[47,49,63,79,130,144,157,164,174,199,209...|  0.0|orthopedic|
| PREOPERATIVE DIAGNOSES: 1. Left carpal tunnel ...|(10000,[95,122,130,131,157,165,266,278,286,328,...|  0.0|orthopedic|
| PREOPERATIVE DIAGNOSIS: Hernia

In [16]:
(trainingData, testData) = processed_tf.randomSplit([0.8, 0.2], seed = 42)

In [None]:
preds_df = predictions.select('category','text',"prediction",'label').toPandas()

print (classification_report(preds_df['label'], preds_df['prediction']))
print (accuracy_score(preds_df['label'], preds_df['prediction']))

              precision    recall  f1-score   support

         0.0       0.76      0.73      0.75        30
         1.0       0.68      0.82      0.75        34
         2.0       0.97      0.91      0.94        32
         3.0       0.60      0.53      0.56        17
         4.0       1.00      0.92      0.96        26

    accuracy                           0.81       139
   macro avg       0.80      0.78      0.79       139
weighted avg       0.81      0.81      0.81       139

0.8057553956834532


In [None]:
#preds_df1 = predictions_cv.select("label").toPandas()

preds = pd.DataFrame(confusion_matrix(list(preds_df.label.astype(int)), list(preds_df.prediction.astype(int))), columns = np.unique(preds_df1['label']), index =  np.unique(preds_df1['label']))

preds

Unnamed: 0,0.0,1.0,2.0,3.0,4.0
0.0,22,6,0,2,0
1.0,2,28,0,4,0
2.0,0,3,29,0,0
3.0,4,4,0,9,0
4.0,1,0,1,0,24


# Random Forest with TFIDF



In [17]:
rf = RandomForestClassifier(labelCol="label", \
                            featuresCol="features", \
                            numTrees = 500, \
                            maxDepth = 16, \
                            maxBins = 64)

# Train model with Training Data
rfModel = rf.fit(trainingData)
predictions_rf = rfModel.transform(testData)

In [None]:
preds_df = predictions_rf.select('category','text',"prediction",'label').toPandas()

print (classification_report(preds_df['label'], preds_df['prediction']))
print (accuracy_score(preds_df['label'], preds_df['prediction']))

              precision    recall  f1-score   support

         0.0       0.65      0.73      0.69        30
         1.0       0.61      0.79      0.69        34
         2.0       0.90      0.81      0.85        32
         3.0       0.57      0.47      0.52        17
         4.0       1.00      0.69      0.82        26

    accuracy                           0.73       139
   macro avg       0.75      0.70      0.71       139
weighted avg       0.75      0.73      0.73       139

0.7266187050359713


# Clinical Embeddings + LogReg Model

In [None]:
clinical_embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models")\
    .setInputCols(["document", "lemma"])\
    .setOutputCol("embeddings")\
    .setCaseSensitive(False)

embeddingsSentence = SentenceEmbeddings() \
    .setInputCols(["document", "embeddings"]) \
    .setOutputCol("sentence_embeddings") \
    .setPoolingStrategy("AVERAGE")

embeddings_finisher = EmbeddingsFinisher() \
    .setInputCols(["sentence_embeddings"]) \
    .setOutputCols(["finished_sentence_embeddings"]) \
    .setOutputAsVector(True)\
    .setCleanAnnotations(False)

explodeVectors = SQLTransformer(statement=
      "SELECT EXPLODE(finished_sentence_embeddings) AS features, * FROM __THIS__")

nlp_pipeline_w2v = Pipeline(
    stages=[documentAssembler,
            tokenizer,
            normalizer,
            stopwords_cleaner,
            lemmatizer,
            clinical_embeddings,
            embeddingsSentence,
            embeddings_finisher,
            explodeVectors,
            label_stringIdx])

nlp_model_w2v = nlp_pipeline_w2v.fit(spark_df)

processed_w2v = nlp_model_w2v.transform(spark_df)

embeddings_clinical download started this may take some time.
Approximate size to download 1.6 GB
[OK!]


In [None]:
(trainingData, testData) = processed_w2v.randomSplit([0.8, 0.2], seed = 42)

In [None]:
lrModel_w2v = lr.fit(trainingData)

predictions_w2v = lrModel_w2v.transform(testData)

In [None]:
preds_df = predictions_w2v.select('category','text',"prediction",'label').toPandas()

print (classification_report(preds_df['label'], preds_df['prediction']))
print (accuracy_score(preds_df['label'], preds_df['prediction']))

              precision    recall  f1-score   support

         0.0       0.82      0.94      0.87        33
         1.0       0.90      0.47      0.62        38
         2.0       0.75      0.83      0.79        29
         3.0       0.46      0.65      0.54        17
         4.0       0.64      0.73      0.68        22

    accuracy                           0.72       139
   macro avg       0.71      0.72      0.70       139
weighted avg       0.75      0.72      0.71       139

0.7194244604316546


# ClassifierDL with BioBert Embeddings

In [None]:
(trainingData, testData) = spark_df.randomSplit([0.8, 0.2], seed = 42)

print("Training Dataset Count: " + str(trainingData.count()))
print("Test Dataset Count: " + str(testData.count()))

Training Dataset Count: 679
Test Dataset Count: 139


In [None]:
bert_embeddings = BertEmbeddings.pretrained("biobert_pubmed_base_cased", "en", "public/models")\
    .setInputCols(["document", "token"])\
    .setOutputCol("embeddings")\

embeddingsSentence = SentenceEmbeddings() \
    .setInputCols(["document", "embeddings"]) \
    .setOutputCol("sentence_embeddings") \
    .setPoolingStrategy("AVERAGE")

classifierdl = ClassifierDLApproach()\
    .setInputCols(["sentence_embeddings"])\
    .setOutputCol("class")\
    .setLabelColumn("category")\
    .setBatchSize(8)\
    .setMaxEpochs(64)\
    .setLr(0.0003)\
    .setEnableOutputLogs(True)

biobert_clf_pipeline = Pipeline(
    stages = [
        documentAssembler,
        tokenizer,
        bert_embeddings,
        embeddingsSentence,
        classifierdl
    ])

biobert_pubmed_base_cased download started this may take some time.
Approximate size to download 386.4 MB
[OK!]


In [None]:
%%time 

biobert_clf_model = biobert_clf_pipeline.fit(trainingData)

CPU times: user 2.57 s, sys: 306 ms, total: 2.88 s
Wall time: 8min 30s


In [None]:
!cd ~/annotator_logs/ && ls -lt

total 16
-rw-r--r-- 1 root root 4523 Jan 30 15:45 ClassifierDLApproach_8adb8d569980.log
-rw-r--r-- 1 root root 4528 Jan 30 15:15 ClassifierDLApproach_3766b9ecc866.log


In [None]:
#!cat ~/annotator_logs/ClassifierDLApproach_a823ba370d3c.log

In [None]:
!tail -n 5 ~/annotator_logs/ClassifierDLApproach_8adb8d569980.log

Epoch 59/64 - 0.45s - loss: 134.68219 - acc: 0.30994898 - batches: 85
Epoch 60/64 - 0.45s - loss: 134.67667 - acc: 0.30994898 - batches: 85
Epoch 61/64 - 0.45s - loss: 134.67117 - acc: 0.30994898 - batches: 85
Epoch 62/64 - 0.45s - loss: 134.66588 - acc: 0.30994898 - batches: 85
Epoch 63/64 - 0.46s - loss: 134.66058 - acc: 0.30994898 - batches: 85


In [None]:
preds = biobert_clf_model.transform(testData)

preds_df = preds.select("category","text","class.result").toPandas()

preds_df['result'] = preds_df['result'].apply(lambda x : x[0])

print (classification_report(preds_df['category'], preds_df['result']))

print (accuracy_score(preds_df['category'], preds_df['result']))

                  precision    recall  f1-score   support

gastroenterology       0.00      0.00      0.00        32
       neurology       0.00      0.00      0.00        17
      orthopedic       0.22      0.97      0.36        30
       radiology       0.43      0.09      0.15        34
         urology       0.00      0.00      0.00        26

        accuracy                           0.23       139
       macro avg       0.13      0.21      0.10       139
    weighted avg       0.15      0.23      0.11       139

0.2302158273381295


# ClassifierDL with Clinical Embeddings


In [None]:
classsifierdl = ClassifierDLApproach()\
  .setInputCols(["sentence_embeddings"])\
  .setOutputCol("class")\
  .setLabelColumn("category")\
  .setBatchSize(8)\
  .setMaxEpochs(200)\
  .setLr(0.003)\
  .setEnableOutputLogs(True)

clf_pipeline = Pipeline(
    stages=[documentAssembler, 
            tokenizer,
            normalizer,
            stopwords_cleaner, 
            lemmatizer, 
            clinical_embeddings,
            embeddingsSentence,
            classsifierdl])

In [None]:
%%time

clf_pipelineModel = clf_pipeline.fit(trainingData)

preds_clf = clf_pipelineModel.transform(testData)

CPU times: user 1.32 s, sys: 228 ms, total: 1.55 s
Wall time: 2min 42s


In [None]:
!cd ~/annotator_logs/ && ls -lt

total 16
-rw-r--r-- 1 root root 14306 Jan 30 18:58 ClassifierDLApproach_2174e13a3327.log


In [None]:
!tail -n 5 ~/annotator_logs/ClassifierDLApproach_2174e13a3327.log

Epoch 195/200 - 0.67s - loss: 105.74354 - acc: 0.6660289 - batches: 85
Epoch 196/200 - 0.64s - loss: 105.740166 - acc: 0.6660289 - batches: 85
Epoch 197/200 - 0.66s - loss: 105.7368 - acc: 0.6660289 - batches: 85
Epoch 198/200 - 0.65s - loss: 105.73342 - acc: 0.6660289 - batches: 85
Epoch 199/200 - 0.63s - loss: 105.73008 - acc: 0.6660289 - batches: 85


In [None]:
preds_df = preds_clf.select("category","text","class.result").toPandas()

preds_df["result"] = preds_df["result"].apply(lambda x : x[0])

print (classification_report(preds_df["category"], preds_df["result"]))

print (accuracy_score(preds_df['category'], preds_df['result']))


                  precision    recall  f1-score   support

gastroenterology       0.58      0.88      0.70        32
       neurology       0.48      0.59      0.53        17
      orthopedic       0.68      0.87      0.76        30
       radiology       0.72      0.68      0.70        34
         urology       0.00      0.00      0.00        26

        accuracy                           0.63       139
       macro avg       0.49      0.60      0.54       139
    weighted avg       0.52      0.63      0.56       139

0.6258992805755396


# ClassifierDL with Universal Sentence Embeddings 

In [None]:
use = UniversalSentenceEncoder.pretrained()\
    .setInputCols(["document"])\
    .setOutputCol("sentence_embeddings")

classsifierdl = ClassifierDLApproach()\
    .setInputCols(["sentence_embeddings"])\
    .setOutputCol("class")\
    .setLabelColumn("category")\
    .setBatchSize(8)\
    .setMaxEpochs(250)\
    .setLr(0.003)\
    .setEnableOutputLogs(True)\

use_clf_pipeline = Pipeline(
    stages = [
        documentAssembler,
        use,
        classsifierdl
    ])


tfhub_use download started this may take some time.
Approximate size to download 923.7 MB
[OK!]


In [None]:
%%time
use_clf_pipelineModel = use_clf_pipeline.fit(trainingData)

CPU times: user 597 ms, sys: 64.3 ms, total: 661 ms
Wall time: 2min 7s


In [None]:
!cd ~/annotator_logs/ && ls -lt

total 40
-rw-r--r-- 1 root root 17599 Jan 30 10:28 ClassifierDLApproach_834b0a48e47e.log
-rw-r--r-- 1 root root 17814 Jan 30 10:24 ClassifierDLApproach_02e9e77f91d7.log


In [None]:
!tail -n 5 ~/annotator_logs/ClassifierDLApproach_834b0a48e47e.log

Epoch 245/250 - 0.47s - loss: 97.41925 - acc: 0.8244048 - batches: 85
Epoch 246/250 - 0.48s - loss: 97.416885 - acc: 0.8244048 - batches: 85
Epoch 247/250 - 0.47s - loss: 97.41458 - acc: 0.8244048 - batches: 85
Epoch 248/250 - 0.47s - loss: 97.41225 - acc: 0.8244048 - batches: 85
Epoch 249/250 - 0.48s - loss: 97.40986 - acc: 0.8244048 - batches: 85


In [None]:
preds = use_clf_pipelineModel.transform(testData)

preds_df = preds.select("category","text","class.result").toPandas()

preds_df["result"] = preds_df["result"].apply(lambda x : x[0])

print (classification_report(preds_df["category"], preds_df["result"]))

print (accuracy_score(preds_df['category'], preds_df['result']))

                  precision    recall  f1-score   support

gastroenterology       0.83      0.78      0.81        32
       neurology       0.69      0.65      0.67        17
      orthopedic       0.76      0.87      0.81        30
       radiology       0.79      0.76      0.78        34
         urology       0.85      0.85      0.85        26

        accuracy                           0.79       139
       macro avg       0.78      0.78      0.78       139
    weighted avg       0.79      0.79      0.79       139

0.7913669064748201


In [None]:
use_clf_pipelineModel.stages

[DocumentAssembler_42a4fb3b1d99,
 UNIVERSAL_SENTENCE_ENCODER_4de71669b7ec,
 ClassifierDLModel_f58a7605b396]

In [None]:
use_clf_pipelineModel.stages[2].write().overwrite().save("/content/drive/MyDrive/SparkNLPTask/use_clf_e500_b8_lr003")

# ClassifierDL with BertSentenceEmbeddings



In [None]:
bert_sent = BertSentenceEmbeddings.pretrained("sent_small_bert_L12_768")\
 .setInputCols(["document"])\
 .setOutputCol("sentence_embeddings")

classsifierdl = ClassifierDLApproach()\
  .setInputCols(["sentence_embeddings"])\
  .setOutputCol("class")\
  .setLabelColumn("category")\
  .setMaxEpochs(10)\
  .setBatchSize(8)\
  .setLr(0.0004)\
  .setEnableOutputLogs(True)

bert_clf_pipeline = Pipeline(
    stages = [
        documentAssembler,
        bert_sent,
        classsifierdl
    ])

sent_small_bert_L12_768 download started this may take some time.
Approximate size to download 392.9 MB
[OK!]


In [None]:
classsifierdl.getDropout()

0.5

In [None]:
%%time 

bert_clf_pipeline = bert_clf_pipeline.fit(trainingData)

CPU times: user 3.52 s, sys: 434 ms, total: 3.96 s
Wall time: 9min 51s


In [None]:
!cd ~/annotator_logs/ && ls -lt

total 4
-rw-r--r-- 1 root root 800 Feb  7 14:48 ClassifierDLApproach_0d2373912dd1.log


In [None]:
!tail -n 5 ~/annotator_logs/ClassifierDLApproach_0d2373912dd1.log

Epoch 5/10 - 0.59s - loss: 122.12793 - acc: 0.45897108 - batches: 85
Epoch 6/10 - 0.59s - loss: 121.756996 - acc: 0.45897108 - batches: 85
Epoch 7/10 - 0.85s - loss: 121.463005 - acc: 0.45897108 - batches: 85
Epoch 8/10 - 0.61s - loss: 121.214775 - acc: 0.45897108 - batches: 85
Epoch 9/10 - 0.59s - loss: 121.000824 - acc: 0.46343535 - batches: 85


In [None]:
preds = bert_clf_pipeline.transform(testData)

preds_df = preds.select('category','text',"class.result").toPandas()

preds_df['result'] = preds_df['result'].apply(lambda x : x[0])

print (classification_report(preds_df['result'], preds_df['category']))

print (accuracy_score(preds_df['category'], preds_df['result']))

                  precision    recall  f1-score   support

gastroenterology       0.03      1.00      0.06         1
       neurology       0.00      0.00      0.00         0
      orthopedic       0.83      0.33      0.48        75
       radiology       0.94      0.51      0.66        63
         urology       0.00      0.00      0.00         0

        accuracy                           0.42       139
       macro avg       0.36      0.37      0.24       139
    weighted avg       0.88      0.42      0.56       139

0.4172661870503597


# ClassifierDL with Elmo Embeddings

In [None]:
elmo_embeddings = ElmoEmbeddings.pretrained()\
      .setPoolingLayer("word_emb")\
      .setInputCols(["document",'cleanTokens'])\
      .setOutputCol("elmo")

embeddingsSentence = SentenceEmbeddings() \
      .setInputCols(["document", "elmo"]) \
      .setOutputCol("sentence_embeddings") \
      .setPoolingStrategy("AVERAGE")    

classsifierdl = ClassifierDLApproach()\
  .setInputCols(["sentence_embeddings"])\
  .setOutputCol("class")\
  .setLabelColumn("category")\
  .setMaxEpochs(5)\
  .setBatchSize(8)\
  .setLr(0.0004)\
  .setEnableOutputLogs(True)

elmo_clf_pipeline = Pipeline(
stages=[documentAssembler, 
            tokenizer, 
            elmo_embeddings,
            embeddingsSentence,
            embeddings_finisher,
            classsifierdl])

elmo download started this may take some time.
Approximate size to download 334.1 MB
[OK!]


In [None]:
processed_trainingData = elmo_clf_pipeline.fit(trainingData)

In [None]:
!cd ~/annotator_logs/ && ls -lt

total 20
-rw-r--r-- 1 root root  444 Jan 30 16:25 ClassifierDLApproach_961ff3b17026.log
-rw-r--r-- 1 root root 4523 Jan 30 15:45 ClassifierDLApproach_8adb8d569980.log
-rw-r--r-- 1 root root 4528 Jan 30 15:15 ClassifierDLApproach_3766b9ecc866.log


In [None]:
!tail -n 5 ~/annotator_logs/ClassifierDLApproach_961ff3b17026.log

Epoch 0/5 - 0.71s - loss: 136.74052 - acc: 0.3622449 - batches: 85
Epoch 1/5 - 0.59s - loss: 136.43806 - acc: 0.43962583 - batches: 85
Epoch 2/5 - 0.60s - loss: 136.27179 - acc: 0.442602 - batches: 85
Epoch 3/5 - 0.60s - loss: 136.14882 - acc: 0.44557822 - batches: 85
Epoch 4/5 - 0.57s - loss: 136.05234 - acc: 0.44409013 - batches: 85


In [None]:
preds = processed_trainingData.transform(testData)

preds_df = preds.select('category','text',"class.result").toPandas()

preds_df['result'] = preds_df['result'].apply(lambda x : x[0])

print (classification_report(preds_df['result'], preds_df['category']))

print (accuracy_score(preds_df['category'], preds_df['result']))

                  precision    recall  f1-score   support

gastroenterology       0.00      0.00      0.00         0
       neurology       0.00      0.00      0.00         0
      orthopedic       0.83      0.32      0.47        77
       radiology       0.91      0.50      0.65        62
         urology       0.00      0.00      0.00         0

        accuracy                           0.40       139
       macro avg       0.35      0.16      0.22       139
    weighted avg       0.87      0.40      0.55       139

0.4028776978417266


# Prediction Pipeline

In [None]:
documentAssembler = DocumentAssembler()\
    .setInputCol("text")\
    .setOutputCol("document")

use = UniversalSentenceEncoder.pretrained()\
    .setInputCols(["document"])\
    .setOutputCol("sentence_embeddings")

loaded_clf_model = ClassifierDLModel.load("/content/drive/MyDrive/SparkNLPTask/use_clf_e500_b8_lr003")\

prediction_pipeline = Pipeline(stages = [
      documentAssembler,
      use,
      loaded_clf_model,
  ])

empty_data = spark.createDataFrame([['']]).toDF("text")

prediction_model = prediction_pipeline.fit(empty_data)

tfhub_use download started this may take some time.
Approximate size to download 923.7 MB
[OK!]


In [None]:
test = spark.sparkContext.wholeTextFiles("/content/drive/MyDrive/unlabeled-test-data/*.txt")
spark_test = test.toDF(schema=["path","text"])

In [None]:
spark_test = spark_test.select("text")

In [None]:
test = pipelineModel.transform(spark_test)

test = test.select(F.explode('normalizedDocument.result').alias('text'))

test.write.parquet("/content/drive/MyDrive/SparkNLPTask/mtsamples_test.parquet")

In [None]:
test = spark.read.parquet("/content/drive/MyDrive/JSLTask/mtsamples_test.parquet")

test = test.filter(test.text != "")

test.show(truncate=100)

+----------+----------------------------------------------------------------------------------------------------+
|  category|                                                                                                text|
+----------+----------------------------------------------------------------------------------------------------+
|orthopedic| PREOPERATIVE DIAGNOSES 1. EMG-proven left carpal tunnel syndrome. 2. Tenosynovitis of the left t...|
|orthopedic| PREOPERATIVE DIAGNOSIS: Bunion, left foot. POSTOPERATIVE DIAGNOSIS: Bunion, left foot. PROCEDURE...|
|orthopedic| RICE stands for the most important elements of treatment for many injuries---rest, ice, compress...|
|orthopedic| The patient is an 84-year-old retired male who is referred to our office by Dr. O. He comes in t...|
|orthopedic| PREOPERATIVE DIAGNOSES: 1. Left carpal tunnel syndrome (354.0). 2. Left ulnar nerve entrapment a...|
|orthopedic| PREOPERATIVE DIAGNOSIS: Herniated nucleus pulposus T8-T9. POSTOPERATIVE DIA

In [None]:
pd.set_option("display.max_colwidth", 500)

preds = prediction_model.transform(test)

result_df = preds.select("text","class.result").toPandas()

result_df.head(30)

Unnamed: 0,text,result
0,"PREOPERATIVE DIAGNOSES 1. EMG-proven left carpal tunnel syndrome. 2. Tenosynovitis of the left third and fourth fingers at the A1 and A2 pulley level. 3. Dupuytren's nodule in the palm. POSTOPERATIVE DIAGNOSES 1. EMG-proven left carpal tunnel syndrome. 2. Tenosynovitis of the left third and fourth fingers at the A1 and A2 pulley level. 3. Dupuytren's nodule in the palm. PROCEDURE: Left carpal tunnel release with flexor tenosynovectomy; cortisone injection of trigger fingers, left third and ...",[orthopedic]
1,"PREOPERATIVE DIAGNOSIS: Bunion, left foot. POSTOPERATIVE DIAGNOSIS: Bunion, left foot. PROCEDURE PERFORMED: 1. Bunionectomy with first metatarsal osteotomy base wedge type with internal screw fixation. 2. Akin osteotomy with internal wire fixation of left foot. HISTORY: This 19-year-old Caucasian female presents to ABCD General Hospital with the above chief complaint. The patient states she has had worsening bunion deformity for as long as she could not remember. She does have a history of ...",[orthopedic]
2,"RICE stands for the most important elements of treatment for many injuries---rest, ice, compression, and elevation. REST: Stop using the injured part as soon as you realize that an injury has taken place. Use crutches to avoid bearing weight on injuries of the foot, ankle, knee, or leg. Use splints for injuries of the hand, wrist, elbow, or arm. Continued exercise or activity could cause further injury, increased pain, or a delay in healing. ICE: Ice helps stop bleeding from injured blood v...",[orthopedic]
3,"The patient is an 84-year-old retired male who is referred to our office by Dr. O. He comes in today with the chief complaint of low back pain which started about six to eight months ago. He states that he does live here and also travels between here and Iowa and he does have a family in Iowa, which he is very active with his grandchildren doing shopping and plenty of walking. He also recently cut down some trees. He states that he started noticing some pain in his back and his hips and dif...",[orthopedic]
4,PREOPERATIVE DIAGNOSES: 1. Left carpal tunnel syndrome (354.0). 2. Left ulnar nerve entrapment at the elbow (354.2). POSTOPERATIVE DIAGNOSES: 1. Left carpal tunnel syndrome (354.0). 2. Left ulnar nerve entrapment at the elbow (354.2). OPERATIONS PERFORMED: 1. Left carpal tunnel release (64721). 2. Left ulnar nerve anterior submuscular transposition at the elbow (64718). 3. Lengthening of the flexor pronator muscle mass in the proximal forearm to accommodate the submuscular position of the u...,[orthopedic]
5,"PREOPERATIVE DIAGNOSIS: Herniated nucleus pulposus T8-T9. POSTOPERATIVE DIAGNOSIS: Herniated nucleus pulposus T8-T9. OPERATION PERFORMED: Thoracic right-sided discectomy at T8-T9. BRIEF HISTORY AND INDICATION FOR OPERATION: The patient is a 53-year-old female with a history of right thoracic rib pain related to a herniated nucleus pulposus at T8-T9. She has failed conservative measures and sought operative intervention for relief of her symptoms. For details of workup, please see the dictat...",[orthopedic]
6,"This is a 30-year-old female with pain and swelling, status post injury. FINDINGS: There are posttraumatic cysts along the volar midline and volar lateral aspects of the lunate which are likely posttraumatic. There is no acute marrow edema (series #12 images #5-7). Marrow signal is otherwise normal in the distal radius and ulna, throughout the carpals and throughout the proximal metacarpals. There is a partial tear of the volar component of the scapholunate ligament in the region of the pos...",[radiology]
7,"PREOPERATIVE DIAGNOSES: 1. Left diabetic foot abscess and infection. 2. Left calcaneus fracture with infection. 3. Right first ray amputation. POSTOP DIAGNOSES: 1. Left diabetic foot abscess and infection. 2. Left calcaneus fracture with infection. 3. Right first ray amputation. OPERATION AND PROCEDURE: 1. Left below-the-knee amputation. 2. Dressing change, right foot. ANESTHESIA: General. BLOOD LOSS: Less than 100 mL. TOURNIQUET TIME: 24 minutes on the left, 300 mmHg. COMPLICATIONS: None. ...",[orthopedic]
8,"PREOPERATIVE DIAGNOSES: 1. Mass, left second toe. 2. Tumor. 3. Left hallux bone invasion of the distal phalanx. POSTOPERATIVE DIAGNOSES: 1. Mass, left second toe. 2. Tumor. 3. Left hallux with bone invasion of the distal phalanx. PROCEDURE PERFORMED: 1. Excision of mass, left second toe. 2. Distal Syme's amputation, left hallux with excisional biopsy. HISTORY: This 47-year-old Caucasian male presents to ABCD General Hospital with a history of tissue mass on his left foot. The patient states...",[orthopedic]
9,"PREOPERATIVE DIAGNOSIS: Carpal tunnel syndrome. POSTOPERATIVE DIAGNOSIS: Carpal tunnel syndrome. TITLE OF PROCEDURE: Open carpal tunnel release. COMPLICATIONS: None. PROCEDURE IN DETAIL: After administering appropriate antibiotics and general anesthesia the Left upper extremity was prepped and draped in the usual standard fashion. The arm was exsanguinated with Esmarch, and the tourniquet inflated to 250 mmHg. A longitudinal incision was made in line with the 4th ray. The dissection was car...",[orthopedic]
