In [1]:
!pip install pyspark spark-nlp

Collecting pyspark
  Downloading pyspark-3.5.3.tar.gz (317.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.3/317.3 MB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting spark-nlp
  Downloading spark_nlp-5.5.0-py2.py3-none-any.whl.metadata (19 kB)
Downloading spark_nlp-5.5.0-py2.py3-none-any.whl (620 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m620.8/620.8 kB[0m [31m25.6 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.3-py2.py3-none-any.whl size=317840625 sha256=93bea76f13fd3bbb932724dbb74fe23d53a3f244fc3e359a6c14ee6ee4464f43
  Stored in directory: /root/.cache/pip/wheels/1b/3a/92/28b93e2fbfdbb07509ca4d6f50c5e407f48dce4ddbda69a4ab
Successfully built pyspark
Installing collected packages: spark-nlp, pyspark
Successfully installed pysp

In [2]:
# Spark NLP entry point. DocumentAssembler, SentenceDetector and RegexTokenizer
# are not imported by name anywhere in view, so they reach the namespace
# through the two star imports below.
import sparknlp
from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline

# Session handle that the later cells use via `spark.createDataFrame`.
# NOTE(review): presumably a SparkSession preconfigured for Spark NLP — confirm.
spark = sparknlp.start()

In [4]:
from pyspark.sql.types import StringType

# One sample line mixing list numbering, ranges, a date, a price and a percentage.
content = "1. T1-T2 DATE**[12/24/13] $1.99 () (10/12) ph+ 90%"

# The single string column comes out named "value"; rename it to the "text"
# column that the pipeline's first stage reads.
df = (
    spark.createDataFrame([content], StringType())
    .withColumnRenamed("value", "text")
)

df.show(truncate=False)

+--------------------------------------------------+
|text                                              |
+--------------------------------------------------+
|1. T1-T2 DATE**[12/24/13] $1.99 () (10/12) ph+ 90%|
+--------------------------------------------------+



In [8]:
# Split on runs of whitespace and, through zero-width lookahead / lookbehind,
# immediately before and after each listed punctuation character, so that every
# such character ends up as a token of its own.
pattern = '\\s+|(?=[-.:;"*+,$&?!%\\[\\]\\(\\)\\/])|(?<=[-.:;"*+,$&?!%\\[\\]\\(\\)\\/])'

# Chains are wrapped in parentheses instead of trailing backslashes: a backslash
# after the last call would continue the statement onto the following line.
documenter = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

sentencer = (
    SentenceDetector()
    .setInputCols(["document"])
    .setOutputCol("sentence")
)

# Baseline tokenizer: no pattern is set, so it runs with its default.
tokenizer = (
    RegexTokenizer()
    .setInputCols(["sentence"])
    .setOutputCol("RegexToken")
)

# Same annotator, this time driven by the custom split pattern above.
regex_tokenizer = (
    RegexTokenizer()
    .setInputCols(["sentence"])
    .setOutputCol("Regextoken_with_pattern")
    .setPattern(pattern)
)

pipeline = Pipeline(stages=[documenter, sentencer, tokenizer, regex_tokenizer])

result = pipeline.fit(df).transform(df)

# Show both token lists side by side for comparison.
result.selectExpr(
    "RegexToken.result as Regextoken",
    "Regextoken_with_pattern.result as Regextoken_with_pattern",
).show(truncate=False)


+-----------------------------------------------------------+-------------------------------------------------------------------------------------------------------+
|Regextoken                                                 |Regextoken_with_pattern                                                                                |
+-----------------------------------------------------------+-------------------------------------------------------------------------------------------------------+
|[1., T1-T2, DATE**[12/24/13], $1.99, (), (10/12), ph+, 90%]|[1, ., T1, -, T2, DATE, *, *, [, 12, /, 24, /, 13, ], $, 1, ., 99, (, ), (, 10, /, 12, ), ph, +, 90, %]|
+-----------------------------------------------------------+-------------------------------------------------------------------------------------------------------+



When no pattern was given, RegexTokenizer split the text on whitespace, using its default pattern `\s+`. When a pattern was passed to `setPattern`, it split the text using that pattern instead.

In [9]:
# Display every parameter of the tokenizer that was built without a setPattern call.
tokenizer.extractParamMap()

{Param(parent='RegexTokenizer_dac73b2b1c11', name='lazyAnnotator', doc='Whether this AnnotatorModel acts as lazy in RecursivePipelines'): False,
 Param(parent='RegexTokenizer_dac73b2b1c11', name='inputCols', doc='previous annotations columns, if renamed'): ['sentence'],
 Param(parent='RegexTokenizer_dac73b2b1c11', name='outputCol', doc='output annotation column. can be left default.'): 'RegexToken',
 Param(parent='RegexTokenizer_dac73b2b1c11', name='toLowercase', doc='Indicates whether to convert all characters to lowercase before tokenizing.'): False,
 Param(parent='RegexTokenizer_dac73b2b1c11', name='minLength', doc='Set the minimum allowed length for each token'): 1,
 Param(parent='RegexTokenizer_dac73b2b1c11', name='pattern', doc='regex pattern used for tokenizing. Defaults \\S+'): '\\s+',
 Param(parent='RegexTokenizer_dac73b2b1c11', name='positionalMask', doc='Using a positional mask to guarantee the incremental progression of the tokenization.'): False,
 Param(parent='RegexTokeni

In [12]:
# Display every parameter of the tokenizer that was configured with the custom pattern.
regex_tokenizer.extractParamMap()

{Param(parent='RegexTokenizer_a30e6dab1241', name='lazyAnnotator', doc='Whether this AnnotatorModel acts as lazy in RecursivePipelines'): False,
 Param(parent='RegexTokenizer_a30e6dab1241', name='inputCols', doc='previous annotations columns, if renamed'): ['sentence'],
 Param(parent='RegexTokenizer_a30e6dab1241', name='outputCol', doc='output annotation column. can be left default.'): 'Regextoken_with_pattern',
 Param(parent='RegexTokenizer_a30e6dab1241', name='toLowercase', doc='Indicates whether to convert all characters to lowercase before tokenizing.'): False,
 Param(parent='RegexTokenizer_a30e6dab1241', name='minLength', doc='Set the minimum allowed length for each token'): 1,
 Param(parent='RegexTokenizer_a30e6dab1241', name='pattern', doc='regex pattern used for tokenizing. Defaults \\S+'): '\\s+|(?=[-.:;"*+,$&?!%\\[\\]\\(\\)\\/])|(?<=[-.:;"*+,$&?!%\\[\\]\\(\\)\\/])',
 Param(parent='RegexTokenizer_a30e6dab1241', name='positionalMask', doc='Using a positional mask to guarantee t

In [16]:
# A tab is the only separator; the runs of spaces around each field are part of the data.
regex_pattern = """\t"""
sampleText = "   Jack   \t    registered \t with \t   id:7354632112   \t    on    \t      23/3/2022    "

df = spark.createDataFrame([[sampleText]]).toDF("text")

# Whitespace trimming is switched off for this run.
regex_tokenizer = (
    RegexTokenizer()
    .setInputCols("sentence")
    .setOutputCol("token")
    .setPattern(regex_pattern)
    .setTrimWhitespace(False)
)

# documenter, sentencer and tokenizer are the stages built in an earlier cell.
pipeline = Pipeline(
    stages=[documenter, sentencer, tokenizer, regex_tokenizer]
)

result = pipeline.fit(df).transform(df)
result.selectExpr("token.result as Regextoken").show(truncate=False)

+------------------------------------------------------------------------------------+
|Regextoken                                                                          |
+------------------------------------------------------------------------------------+
|[Jack   ,     registered ,  with ,    id:7354632112   ,     on    ,       23/3/2022]|
+------------------------------------------------------------------------------------+



In [18]:
# A tab is the only separator; the runs of spaces around each field are part of the data.
regex_pattern = """\t"""
sampleText = "   Jack   \t    registered \t with \t   id:7354632112   \t    on    \t      23/3/2022    "

df = spark.createDataFrame([[sampleText]]).toDF("text")

# Trim each token and leave position preservation off. In the recorded output
# the begin/end offsets are those of the trimmed text (e.g. "Jack" at 3-6).
regex_tokenizer = (
    RegexTokenizer()
    .setInputCols("sentence")
    .setOutputCol("token")
    .setPattern(regex_pattern)
    .setTrimWhitespace(True)
    .setPreservePosition(False)
)

# documenter, sentencer and tokenizer are the stages built in an earlier cell.
pipeline = Pipeline(
    stages=[documenter, sentencer, tokenizer, regex_tokenizer]
)

result = pipeline.fit(df).transform(df)
result.selectExpr(
    "token.result as Regextoken",
    "token.begin as Regextoken_begin",
    "token.end as Regextoken_end",
).show(truncate=False)

+------------------------------------------------------+-----------------------+-----------------------+
|Regextoken                                            |Regextoken_begin       |Regextoken_end         |
+------------------------------------------------------+-----------------------+-----------------------+
|[Jack, registered, with, id:7354632112, on, 23/3/2022]|[3, 15, 28, 37, 58, 71]|[6, 24, 31, 49, 59, 79]|
+------------------------------------------------------+-----------------------+-----------------------+



In [19]:
# A tab is the only separator; the runs of spaces around each field are part of the data.
regex_pattern = """\t"""
sampleText = "   Jack   \t    registered \t with \t   id:7354632112   \t    on    \t      23/3/2022    "

df = spark.createDataFrame([[sampleText]]).toDF("text")

# Trim each token but turn position preservation on. In the recorded output the
# begin/end offsets cover the untrimmed span (e.g. 3-9 for "Jack").
regex_tokenizer = (
    RegexTokenizer()
    .setInputCols("sentence")
    .setOutputCol("token")
    .setPattern(regex_pattern)
    .setTrimWhitespace(True)
    .setPreservePosition(True)
)

# documenter, sentencer and tokenizer are the stages built in an earlier cell.
pipeline = Pipeline(
    stages=[documenter, sentencer, tokenizer, regex_tokenizer]
)

result = pipeline.fit(df).transform(df)
result.selectExpr(
    "token.result as Regextoken",
    "token.begin as Regextoken_begin",
    "token.end as Regextoken_end",
).show(truncate=False)

+------------------------------------------------------+-----------------------+-----------------------+
|Regextoken                                            |Regextoken_begin       |Regextoken_end         |
+------------------------------------------------------+-----------------------+-----------------------+
|[Jack, registered, with, id:7354632112, on, 23/3/2022]|[3, 11, 27, 34, 54, 65]|[9, 25, 32, 52, 63, 79]|
+------------------------------------------------------+-----------------------+-----------------------+



In [20]:
from pyspark.sql.types import StringType

content = "1. The investments made reached a value of £4.5Million, gaining __85.6% on DATE**[24/12/2022]."
# Split on whitespace and around each character of the bracketed set.
pattern = "\\s+|(?=[-:;*__+,$&\\[\\]])|(?<=[-:;*__+,$&\\[\\]])"

df = (
    spark.createDataFrame([content], StringType())
    .withColumnRenamed("value", "text")
)

documenter = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

sentencer = (
    SentenceDetector()
    .setInputCols(["document"])
    .setOutputCol("sentence")
)

# Custom pattern plus lowercasing of the produced tokens.
regex_tokenizer = (
    RegexTokenizer()
    .setInputCols(["sentence"])
    .setOutputCol("token")
    .setPattern(pattern)
    .setToLowercase(True)
)

pipeline = Pipeline(stages=[documenter, sentencer, regex_tokenizer])

result = pipeline.fit(df).transform(df)
result.selectExpr(
    "sentence.result as sentence",
    "token.result as Regextoken",
).show(truncate=False)

+------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------+
|sentence                                                                                        |Regextoken                                                                                                                    |
+------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------+
|[1. The investments made reached a value of £4.5Million, gaining __85.6% on DATE**[24/12/2022].]|[1., the, investments, made, reached, a, value, of, £4.5million, ,, gaining, _, _, 85.6%, on, date, *, *, [, 24/12/2022, ], .]|
+-----------------------------------------------------------------------------------------------

In [22]:
from pyspark.sql.types import StringType

content = "1. The investments made reached a value of £4.5Million, gaining __85.6% on DATE**[24/12/2022]."
# Split on whitespace and around each character of the bracketed set.
pattern = "\\s+|(?=[-:;*__+,$&\\[\\]])|(?<=[-:;*__+,$&\\[\\]])"

df = (
    spark.createDataFrame([content], StringType())
    .withColumnRenamed("value", "text")
)

documenter = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

sentencer = (
    SentenceDetector()
    .setInputCols(["document"])
    .setOutputCol("sentence")
)

# Custom pattern, lowercasing, and a length window: only tokens of 3 to 6
# characters remain in the recorded output.
regex_tokenizer = (
    RegexTokenizer()
    .setInputCols(["sentence"])
    .setOutputCol("token")
    .setPattern(pattern)
    .setToLowercase(True)
    .setMaxLength(6)
    .setMinLength(3)
)

pipeline = Pipeline(stages=[documenter, sentencer, regex_tokenizer])

result = pipeline.fit(df).transform(df)
result.selectExpr(
    "sentence.result as sentence",
    "token.result as Regextoken",
).show(truncate=False)

+------------------------------------------------------------------------------------------------+-------------------------------+
|sentence                                                                                        |Regextoken                     |
+------------------------------------------------------------------------------------------------+-------------------------------+
|[1. The investments made reached a value of £4.5Million, gaining __85.6% on DATE**[24/12/2022].]|[the, made, value, 85.6%, date]|
+------------------------------------------------------------------------------------------------+-------------------------------+

