## DocumentNormalizer

In [2]:
!pip install -q pyspark spark-nlp

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.3/317.3 MB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m620.8/620.8 kB[0m [31m16.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for pyspark (setup.py) ... [?25l[?25hdone


In [3]:

import sparknlp
from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.sql import functions as F

spark = sparknlp.start()
spark

In [8]:
text = '''
    <title>spark-nlp-workshop/Spark_NLP_Udemy_MOOC/Open_Source/07.01.DocumentNormalizer.ipynb at master · JohnSnowLabs/spark-nlp-workshop</title>
    <div>THE WORLD'S LARGEST WEB DEVELOPER SITE
    THE WORLD'S LARGEST WEB DEVELOPER SITE
    Lorem Ipsum is simply dummy text of the printing and typesetting industry. Lorem Ipsum has been the industry's standard dummy text ever since the 1500s, when an unknown printer took a galley of type and scrambled it to make a type specimen book. It has survived not only five centuries, but also the leap into electronic typesetting, remaining essentially unchanged. It was popularised in the 1960s with the release of Letraset sheets containing Lorem Ipsum passages, and more recently with desktop publishing software like Aldus PageMaker including versions of Lorem Ipsum..
    </div>

'''

In [9]:
spark_df = spark.createDataFrame([[text]]).toDF("text")
spark_df.show(truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|text                                                                                                                                                           

## Action
Action to perform applying regex patterns on text, i.e. (clean | extract).

Default Action: "clean"

In [11]:
documenter = DocumentAssembler()\
    .setInputCol("text")\
    .setOutputCol("document")

#default
cleanUpPatterns = ["<[^>]*>"] #remove HTML tags

documentNormalizer = DocumentNormalizer() \
    .setInputCols("document") \
    .setOutputCol("normalizedDocument") \
    .setPatterns(cleanUpPatterns) \
    .setReplacement(" ") \
    .setPolicy("pretty_all") \
    .setLowercase(True)

pipeline = Pipeline(stages=[documenter, documentNormalizer])

result = pipeline.fit(spark_df).transform(spark_df)
result.select('normalizedDocument').show(truncate=False)
result.select('normalizedDocument.result').show(truncate=False)

+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|normalizedDocument                                                                                                                                                          

As the default action is clean, it removes the cleanUpPatterns that we defined above. So all HTML tags are removed in this case.

## Action : "extract"

In [13]:
#Download demo data : https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/annotation/english/document-normalizer/xml-docs/C-CDAsample.xml
!mkdir xml-docs
!wget -O xml-docs/demo.xml https://github.com/sichkar-valentyn/XML_files_in_Python/blob/master/example.xml


mkdir: cannot create directory ‘xml-docs’: File exists
--2024-09-25 21:32:03--  https://github.com/sichkar-valentyn/XML_files_in_Python/blob/master/example.xml
Resolving github.com (github.com)... 20.27.177.113
Connecting to github.com (github.com)|20.27.177.113|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [text/html]
Saving to: ‘xml-docs/demo.xml’

xml-docs/demo.xml       [ <=>                ] 165.41K  --.-KB/s    in 0.1s    

2024-09-25 21:32:04 (1.46 MB/s) - ‘xml-docs/demo.xml’ saved [169382]



In [14]:

# Data loading
data = spark.sparkContext.wholeTextFiles("xml-docs")
df = data.toDF(schema=["filename", "text"]).select("text")
df.show(truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [20]:
# Specify the action as extract
action = "extract"

tag = "name"
patterns = [tag]

document_assembler = DocumentAssembler() \
    .setInputCol("text") \
    .setOutputCol("document")

documentNormalizer = DocumentNormalizer() \
    .setInputCols("document") \
    .setOutputCol("normalizedDocument") \
    .setAction(action) \
    .setPatterns(patterns) \
    .setReplacement("") \
    .setPolicy("pretty_all") \

sentenceDetector = SentenceDetector() \
      .setInputCols(["normalizedDocument"]) \
      .setOutputCol("sentence")

regexTokenizer = Tokenizer() \
      .setInputCols(["sentence"]) \
      .setOutputCol("token") \
      .fit(df)

docPatternRemoverPipeline = \
  Pipeline() \
    .setStages([
        document_assembler,
        documentNormalizer,
        sentenceDetector,
        regexTokenizer])

ds = docPatternRemoverPipeline.fit(df).transform(df)

ds.select("normalizedDocument.result").show(10, False)

+------+
|result|
+------+
|[]    |
+------+



## Lowercase

In [21]:

text = '''

    THE WORLD'S LARGEST WEB DEVELOPER SITE
    THE WORLD'S LARGEST WEB DEVELOPER SITE
    Lorem Ipsum is simply dummy text of the printing and typesetting industry. Lorem Ipsum has been the industry's standard dummy text ever since the 1500s, when an unknown printer took a galley of type and scrambled it to make a type specimen book. It has survived not only five centuries, but also the leap into electronic typesetting, remaining essentially unchanged. It was popularised in the 1960s with the release of Letraset sheets containing Lorem Ipsum passages, and more recently with desktop publishing software like Aldus PageMaker including versions of Lorem Ipsum..


'''
spark_df = spark.createDataFrame([[text]]).toDF("text")

In [23]:
spark_df.show(truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|text                                                                                                                                                                                                                                                                                                                       

In [25]:
cleanuoPatterns = ["<[^>]*>"]

document_assembler = DocumentAssembler() \
    .setInputCol("text") \
    .setOutputCol("document")

documentNormalizer = DocumentNormalizer() \
    .setInputCols("document") \
    .setOutputCol("normalizedDocument") \
    .setAction("clean") \
    .setPatterns(cleanuoPatterns) \
    .setReplacement(" ") \
    .setPolicy("pretty_all") \
    .setLowercase(True)

pipeline = Pipeline(stages=[document_assembler,documentNormalizer])

result = pipeline.fit(spark_df).transform(spark_df)

result.select('normalizedDocument.result').show(truncate=False)

+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|result                                                                                                                                                                                                                                                                                                                                              

## Patterns

In [29]:
text = '''
    <title>spark-nlp-workshop/Spark_NLP_Udemy_MOOC/Open_Source/07.01.DocumentNormalizer.ipynb at master · JohnSnowLabs/spark-nlp-workshop</title>
    <div>THE WORLD'S LARGEST WEB DEVELOPER SITE
    <p>THE WORLD'S LARGEST WEB DEVELOPER SITE</p>
    Lorem Ipsum is simply dummy text of the printing and typesetting industry. Lorem Ipsum has been the industry's standard dummy text ever since the 1500s, when an unknown printer took a galley of type and scrambled it to make a type specimen book. It has survived not only five centuries, but also the leap into electronic typesetting, remaining essentially unchanged. It was popularised in the 1960s with the release of Letraset sheets containing Lorem Ipsum passages, and more recently with desktop publishing software like Aldus PageMaker including versions of Lorem Ipsum..
    </div>

'''

In [30]:
spark_df = spark.createDataFrame([[text]]).toDF("text")
spark_df.show(truncate=False)

+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|text                                                                                                                                                    

In [28]:
documentAssembler = DocumentAssembler() \
    .setInputCol("text") \
    .setOutputCol("document")

documentNormalizer = DocumentNormalizer() \
    .setInputCols("document") \
    .setOutputCol("normalizedDocument") \
    .setAction("clean") \
    .setReplacement(" ") \
    .setPolicy("pretty_all") \
    .setLowercase(True)


pipeline = Pipeline() \
    .setStages([documentAssembler,
                documentNormalizer])

result = pipeline.fit(spark_df).transform(spark_df)
result.select('normalizedDocument.result').show(truncate=False)

+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|result                                                                                                                                                                                                                                                                                                                                              

## After specifying a pattern

In [33]:

#Specify cleanUpPatterns to remove the paragraph tag and its content
cleanUpPatterns = ["(.*?)"]

documenter = DocumentAssembler()\
    .setInputCol("text")\
    .setOutputCol("document")

documentNormalizer = DocumentNormalizer() \
    .setInputCols("document") \
    .setOutputCol("normalizedDocument") \
    .setPatterns(cleanUpPatterns) \
    .setReplacement(" ") \
    .setPolicy("pretty_all") \
    .setLowercase(True)

pipeline = Pipeline(stages=[documenter, documentNormalizer])

result = pipeline.fit(spark_df).transform(spark_df)
result.select('normalizedDocument').show(truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

## Replacement
Replacement string to apply when regexes match (Default: " ")

In [34]:
documenter = DocumentAssembler()\
    .setInputCol("text")\
    .setOutputCol("document")

documentNormalizer = DocumentNormalizer() \
    .setInputCols("document") \
    .setOutputCol("normalizedDocument") \
    .setAction("clean") \
    .setPolicy("pretty_all") \
    .setLowercase(True)

pipeline = Pipeline(stages=[documenter, documentNormalizer])

result = pipeline.fit(spark_df).transform(spark_df)
result.select('normalizedDocument').show(truncate=False)

+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|normalizedDocument                                                                                                                                                          

Here as we can see, Replacement takes it default value (" ")

Using Replacement to obfuscate PII such as ages in HTML content

In [35]:

text = """



w3schools.com
This is a heading
This is a paragraph containing some PII like jonhdoe@myemail.com ! John is now 42 years old.
48% of cardiologists treated patients aged 65+.


 """

In [37]:
df = spark.createDataFrame([[text]]).toDF("text")
df.show(truncate=False)

+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|text                                                                                                                                                                                          |
+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|\n\n\n\nw3schools.com\nThis is a heading\nThis is a paragraph containing some PII like jonhdoe@myemail.com ! John is now 42 years old.\n48% of cardiologists treated patients aged 65+.\n\n\n |
+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+



In [39]:
action = "clean"
patterns = ["\d+(?=[\s]?year)", "(aged)[\s]?\d+"]

replacement = "***OBFUSCATED PII***"

documenter = DocumentAssembler()\
    .setInputCol("text")\
    .setOutputCol("document")

documentNormalizer = DocumentNormalizer() \
    .setInputCols("document") \
    .setOutputCol("normalizedDocument") \
    .setAction(action) \
    .setPatterns(patterns) \
    .setReplacement(replacement) \
    .setPolicy("pretty_all") \
    .setLowercase(True)

pipeline = Pipeline(stages=[documenter, documentNormalizer])

result = pipeline.fit(df).transform(df)
result.select('normalizedDocument.result').show(truncate=False)

+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|result                                                                                                                                                                                                        |
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|[ w3schools.com this is a heading this is a paragraph containing some pii like jonhdoe@myemail.com ! john is now ***obfuscated pii*** years old. 48% of cardiologists treated patients ***obfuscated pii***+.]|
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------

## Policy

RemovalPolicy to remove patterns from text with a given policy (Default: "pretty_all").


Valid policy values are:


* all
* pretty_all
* first
* pretty_first

In [40]:
text = '''
    <title>spark-nlp-workshop/Spark_NLP_Udemy_MOOC/Open_Source/07.01.DocumentNormalizer.ipynb at master · JohnSnowLabs/spark-nlp-workshop</title>
    <div>THE WORLD'S LARGEST WEB DEVELOPER SITE
    <p>THE WORLD'S LARGEST WEB DEVELOPER SITE</p>
    Lorem Ipsum is simply dummy text of the printing and typesetting industry. Lorem Ipsum has been the industry's standard dummy text ever since the 1500s, when an unknown printer took a galley of type and scrambled it to make a type specimen book. It has survived not only five centuries, but also the leap into electronic typesetting, remaining essentially unchanged. It was popularised in the 1960s with the release of Letraset sheets containing Lorem Ipsum passages, and more recently with desktop publishing software like Aldus PageMaker including versions of Lorem Ipsum..
    </div>

'''

In [42]:
df = spark.createDataFrame([[text]]).toDF("text")
df.show(truncate=False)

+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|text                                                                                                                                                    

In [41]:
documenter = DocumentAssembler()\
    .setInputCol("text")\
    .setOutputCol("document")

documentNormalizer = DocumentNormalizer() \
    .setInputCols("document") \
    .setOutputCol("normalizedDocument") \
    .setAction("clean") \
    .setReplacement(" ") \
    .setLowercase(True)

pipeline = Pipeline(stages=[documenter, documentNormalizer])

result = pipeline.fit(spark_df).transform(spark_df)
result.select('normalizedDocument.result').show(truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|result                                                                                                                                                                                                               

In [43]:
documenter = DocumentAssembler()\
    .setInputCol("text")\
    .setOutputCol("document")

documentNormalizer = DocumentNormalizer() \
    .setInputCols("document") \
    .setOutputCol("normalizedDocument") \
    .setAction("clean") \
    .setReplacement(" ") \
    .setPolicy("all") \
    .setLowercase(True)

pipeline = Pipeline(stages=[documenter, documentNormalizer])

result = pipeline.fit(spark_df).transform(spark_df)
result.select('normalizedDocument.result').show(truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|result                                                                                                                                                                           

In [44]:
documenter = DocumentAssembler()\
    .setInputCol("text")\
    .setOutputCol("document")

documentNormalizer = DocumentNormalizer() \
    .setInputCols("document") \
    .setOutputCol("normalizedDocument") \
    .setAction("clean") \
    .setReplacement(" ") \
    .setPolicy("first") \
    .setLowercase(True)

pipeline = Pipeline(stages=[documenter, documentNormalizer])

result = pipeline.fit(spark_df).transform(spark_df)
result.select('normalizedDocument.result').show(truncate=False)

+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|result                                                                                                                                                      

In [45]:
documenter = DocumentAssembler()\
    .setInputCol("text")\
    .setOutputCol("document")

documentNormalizer = DocumentNormalizer() \
    .setInputCols("document") \
    .setOutputCol("normalizedDocument") \
    .setAction("clean") \
    .setReplacement(" ") \
    .setPolicy("pretty_first") \
    .setLowercase(True)

pipeline = Pipeline(stages=[documenter, documentNormalizer])

result = pipeline.fit(spark_df).transform(spark_df)
result.select('normalizedDocument.result').show(truncate=False)

+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|result                                                                                                                                                                                    