In [None]:
!pip install johnsnowlabs

In [2]:
from johnsnowlabs import nlp

# Tweet-style sample containing a mention, hashtags and underscores.
text = "@CKL_IT says that #normalizers are pretty useful to clean #structured_strings in #NLU like tweets"

# Load the "norm" pipeline and run it on the sample; being the last
# expression of the cell, the prediction is what gets displayed.
nlp.load("norm").predict(text)

sentence_detector_dl download started this may take some time.
Approximate size to download 354.6 KB
[OK!]


Unnamed: 0,norm,token
0,CKLIT,@CKL_IT
0,says,says
0,that,that
0,normalizers,#normalizers
0,are,are
0,pretty,pretty
0,useful,useful
0,to,to
0,clean,clean
0,structuredstrings,#structured_strings


In [5]:
# Multi-sentence sample. The literal is triple-quoted, so the indentation of its
# second and third lines is part of the string value.
text = """John and Peter are brothers.
          However they don't support each other that much.
          John is 20 years old and Peter is 26"""

# Load the "norm" pipeline and run it on the sample; the prediction is the cell output.
nlp.load("norm").predict(text)

sentence_detector_dl download started this may take some time.
Approximate size to download 354.6 KB
[OK!]


Unnamed: 0,norm,token
0,John,John
0,and,and
0,Peter,Peter
0,are,are
0,brothers,brothers
0,However,.
0,they,However
0,dont,they
0,support,don't
0,each,support


In [4]:
# Multi-sentence sample. The literal is triple-quoted, so the indentation of its
# second and third lines is part of the string value.
text = """John and Peter are brothers.
          However they don't support each other that much.
          John is 20 years old and Peter is 26"""

# Load the 'ner' pipeline and run it on the sample; the prediction is the cell output.
nlp.load('ner').predict(text)

onto_recognize_entities_sm download started this may take some time.
Approx size to download 159 MB
[OK!]


Unnamed: 0,document,entities,entities_class,entities_confidence,entities_origin_chunk,entities_origin_sentence,sentence_pragmatic,word_embedding_embeddings
0,John and Peter are brothers. \n Howev...,John,PERSON,0.999,0,0,"[John and Peter are brothers., However they do...","[[-0.2747400104999542, 0.48680999875068665, -0..."
0,John and Peter are brothers. \n Howev...,Peter,PERSON,0.9849,1,0,"[John and Peter are brothers., However they do...","[[-0.2747400104999542, 0.48680999875068665, -0..."
0,John and Peter are brothers. \n Howev...,John,PERSON,0.9988,2,2,"[John and Peter are brothers., However they do...","[[-0.2747400104999542, 0.48680999875068665, -0..."
0,John and Peter are brothers. \n Howev...,20 years old,DATE,0.83563334,3,2,"[John and Peter are brothers., However they do...","[[-0.2747400104999542, 0.48680999875068665, -0..."
0,John and Peter are brothers. \n Howev...,Peter,PERSON,0.9977,4,2,"[John and Peter are brothers., However they do...","[[-0.2747400104999542, 0.48680999875068665, -0..."
0,John and Peter are brothers. \n Howev...,26,DATE,0.6028,5,2,"[John and Peter are brothers., However they do...","[[-0.2747400104999542, 0.48680999875068665, -0..."


In [9]:
!pip install -q pyspark spark-nlp

In [7]:

import sparknlp
from sparknlp.base import *
from sparknlp.annotator import *
# Pipeline is used by the cells below; import it explicitly instead of
# relying on it being re-exported through the wildcard imports above.
from pyspark.ml import Pipeline
from pyspark.sql import functions as F

# Start the Spark session used by all following cells.
spark = sparknlp.start()
spark




## CleanupPatterns
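If we don't set CleanupPatterns, the Normalizer keeps only alphabetic letters: every character matching `[^A-Za-z]` is removed, which is why the numbers 20 and 26 are missing from the output below.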

In [12]:
# Stage 1: turn the raw "text" column into a "document" annotation column.
documenter = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Stage 2: split each document into tokens.
tokenizer = (
    Tokenizer()
    .setInputCols(["document"])
    .setOutputCol("token")
)

# Stage 3: Normalizer with no cleanup patterns supplied, i.e. default settings.
normalizer = (
    Normalizer()
    .setInputCols(["token"])
    .setOutputCol("normalized")
)

pipeline = Pipeline(stages=[documenter, tokenizer, normalizer])

# Single-row DataFrame holding the sample sentence in a "text" column.
data = spark.createDataFrame(
    [["John and Peter are brothers. However they don't support each other that much. John is 20 years old and Peter is 26"]]
).toDF("text")

result = pipeline.fit(data).transform(data)

# Show only the normalized token strings, without truncating the row.
result.selectExpr("normalized.result").show(truncate=False)

+------------------------------------------------------------------------------------------------------------------------------+
|result                                                                                                                        |
+------------------------------------------------------------------------------------------------------------------------------+
|[John, and, Peter, are, brothers, However, they, dont, support, each, other, that, much, John, is, years, old, and, Peter, is]|
+------------------------------------------------------------------------------------------------------------------------------+



## Specifying CleanupPatterns

In [13]:
# Stage 1: turn the raw "text" column into a "document" annotation column.
documenter = DocumentAssembler()\
  .setInputCol("text")\
  .setOutputCol("document")

# Stage 2: split each document into tokens.
tokenizer = Tokenizer()\
  .setInputCols(["document"])\
  .setOutputCol("token")

# Stage 3: Normalizer with a custom cleanup pattern.
# The pattern is written as a raw string so that \w, \d and \s reach the regex
# engine unchanged and are not treated as (invalid) Python string escapes.
normalizer = Normalizer()\
  .setInputCols(["token"])\
  .setOutputCol("normalized") \
  .setCleanupPatterns([r"""[^\w\d\s]"""]) # removes all non-word, non-digit and non-space characters.

pipeline = Pipeline(stages=[
    documenter,
    tokenizer,
    normalizer
])

# Single-row DataFrame holding the sample sentence in a "text" column.
data = spark.createDataFrame([["John and Peter are brothers. However they don't support each other that much. John is 20 years old and Peter is 26"]]) \
    .toDF("text")
result = pipeline.fit(data).transform(data)
# Show only the normalized token strings, without truncating the row.
result.selectExpr("normalized.result").show(truncate = False)


+--------------------------------------------------------------------------------------------------------------------------------------+
|result                                                                                                                                |
+--------------------------------------------------------------------------------------------------------------------------------------+
|[John, and, Peter, are, brothers, However, they, dont, support, each, other, that, much, John, is, 20, years, old, and, Peter, is, 26]|
+--------------------------------------------------------------------------------------------------------------------------------------+



## Lowercase

In [15]:
# Stage 1: turn the raw "text" column into a "document" annotation column.
documenter = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Stage 2: split each document into tokens.
tokenizer = (
    Tokenizer()
    .setInputCols(["document"])
    .setOutputCol("token")
)

# Stage 3: Normalizer with lowercasing switched on.
normalizer = (
    Normalizer()
    .setInputCols(["token"])
    .setOutputCol("normalized")
    .setLowercase(True)
)

pipeline = Pipeline(stages=[documenter, tokenizer, normalizer])

# Single-row DataFrame holding the sample sentence in a "text" column.
data = spark.createDataFrame(
    [["John and Peter are brothers. However they don't support each other that much. John is 20 years old and Peter is 26"]]
).toDF("text")

result = pipeline.fit(data).transform(data)

# Show only the normalized token strings, without truncating the row.
result.selectExpr("normalized.result").show(truncate=False)


+------------------------------------------------------------------------------------------------------------------------------+
|result                                                                                                                        |
+------------------------------------------------------------------------------------------------------------------------------+
|[john, and, peter, are, brothers, however, they, dont, support, each, other, that, much, john, is, years, old, and, peter, is]|
+------------------------------------------------------------------------------------------------------------------------------+



## MaxLength and MinLength
Sets the maximum and minimum allowed length for each token.

In [16]:
# Stage 1: turn the raw "text" column into a "document" annotation column.
documenter = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Stage 2: split each document into tokens.
tokenizer = (
    Tokenizer()
    .setInputCols(["document"])
    .setOutputCol("token")
)

# Stage 3: Normalizer with lowercasing plus a token length window of 3 to 4.
normalizer = (
    Normalizer()
    .setInputCols(["token"])
    .setOutputCol("normalized")
    .setLowercase(True)
    .setMaxLength(4)
    .setMinLength(3)
)

pipeline = Pipeline(stages=[documenter, tokenizer, normalizer])

# Single-row DataFrame holding the sample sentence in a "text" column.
data = spark.createDataFrame(
    [["John and Peter are brothers. However they don't support each other that much. John is 20 years old and Peter is 26"]]
).toDF("text")

result = pipeline.fit(data).transform(data)

# Show only the normalized token strings, without truncating the row.
result.selectExpr("normalized.result").show(truncate=False)

+--------------------------------------------------------------+
|result                                                        |
+--------------------------------------------------------------+
|[john, and, are, they, dont, each, that, much, john, old, and]|
+--------------------------------------------------------------+



## SlangDictionary
Provide a delimited file that lists custom (slang) words together with the corrected form each one should be replaced with.

In [18]:
import csv

# Keys of the slang entries below, in the order they are written: slang form, then replacement.
field_names = ['Slang', 'Correct_Word']

slangs = [
{'Slang': "bros", 'Correct_Word': 'brothers'},
{'Slang': "approx", 'Correct_Word': 'approximately'},
{'Slang': "AFAIK", 'Correct_Word': 'As far as I know'}
]

# newline="" is what the csv module requires of the file object it writes to
# (otherwise platforms that translate "\n" produce extra blank lines), and the
# encoding is pinned so the file does not depend on the platform default.
# No header row is written: the file is handed to the Normalizer as a
# delimiter-separated slang,replacement list, where a header line would be
# read as one more slang entry ("Slang" -> "Correct_Word").
with open("slangs.csv", "w", newline="", encoding="utf-8") as csvfile:
  writer = csv.DictWriter(csvfile, fieldnames=field_names)
  writer.writerows(slangs)




In [20]:
import os

# Stage 1: turn the raw "text" column into a "document" annotation column.
documenter = DocumentAssembler()\
  .setInputCol("text")\
  .setOutputCol("document")

# Stage 2: split each document into tokens.
tokenizer = Tokenizer()\
  .setInputCols(["document"])\
  .setOutputCol("token")

# Stage 3: Normalizer that replaces slang tokens using the comma-delimited dictionary.
# The dictionary was written to "slangs.csv" in the working directory, so its
# absolute path is resolved from there instead of hard-coding Colab's /content folder.
normalizer = Normalizer()\
.setInputCols(["token"])\
.setOutputCol("normalized") \
.setSlangDictionary(os.path.abspath("slangs.csv"), delimiter=',')

pipeline = Pipeline(stages=[
    documenter,
    tokenizer,
    normalizer
])

# Sample sentence containing the slang words "bros", "AFAIK" and "approx".
data = spark.createDataFrame([["John and Peter are bros. However they don't support each other that much. AFAIK, John is 20 years old and Peter is approx 26"]]) \
    .toDF("text")
result = pipeline.fit(data).transform(data)
# Show only the normalized token strings, without truncating the row.
result.selectExpr("normalized.result").show(truncate = False)

+-------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|result                                                                                                                                                             |
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|[John, and, Peter, are, brothers, However, they, dont, support, each, other, that, much, As, far, as, I, know, John, is, years, old, and, Peter, is, approximately]|
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------+



## SlangMatchCase
Whether slang entries are matched case-sensitively (Default: false).

In [22]:
import os

# Stage 1: turn the raw "text" column into a "document" annotation column.
documenter = DocumentAssembler()\
  .setInputCol("text")\
  .setOutputCol("document")

# Stage 2: split each document into tokens.
tokenizer = Tokenizer()\
  .setInputCols(["document"])\
  .setOutputCol("token")

# Stage 3: Normalizer using the slang dictionary with case-sensitive matching enabled.
# The dictionary was written to "slangs.csv" in the working directory, so its
# absolute path is resolved from there instead of hard-coding Colab's /content folder.
normalizer = Normalizer()\
.setInputCols(["token"])\
.setOutputCol("normalized") \
.setSlangDictionary(os.path.abspath("slangs.csv"), delimiter=',') \
.setSlangMatchCase(True)

pipeline = Pipeline(stages=[
    documenter,
    tokenizer,
    normalizer
])

# Same sample as before, but with "afaik" in lowercase while the dictionary holds "AFAIK".
data = spark.createDataFrame([["John and Peter are bros. However they don't support each other that much. afaik, John is 20 years old and Peter is approx 26"]]) \
    .toDF("text")
result = pipeline.fit(data).transform(data)
# Show only the normalized token strings, without truncating the row.
result.selectExpr("normalized.result").show(truncate = False)

+----------------------------------------------------------------------------------------------------------------------------------------------------+
|result                                                                                                                                              |
+----------------------------------------------------------------------------------------------------------------------------------------------------+
|[John, and, Peter, are, brothers, However, they, dont, support, each, other, that, much, afaik, John, is, years, old, and, Peter, is, approximately]|
+----------------------------------------------------------------------------------------------------------------------------------------------------+



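Because SlangMatchCase is True, slang entries are matched case-sensitively, so a token whose case differs from the dictionary entry (here `afaik` vs. `AFAIK`) is not replaced.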

afaik -> afaik (Remains Same)

## Token Indices are preserved

In [23]:
# Stage 1: turn the raw "text" column into a "document" annotation column.
documenter = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Stage 2: split each document into tokens.
tokenizer = (
    Tokenizer()
    .setInputCols(["document"])
    .setOutputCol("token")
)

# Stage 3: Normalizer left at its default settings.
normalizer = (
    Normalizer()
    .setInputCols(["token"])
    .setOutputCol("normalized")
)

pipeline = Pipeline(stages=[documenter, tokenizer, normalizer])

# Single-row DataFrame holding a short sample sentence in a "text" column.
data = spark.createDataFrame(
    [["John is 20 and Peter is 26 years old."]]
).toDF("text")

result = pipeline.fit(data).transform(data)

# First table: token strings with their begin/end character offsets.
result.select("token.result", "token.begin", "token.end").show(truncate=False)

# Second table: the same three fields for the normalized tokens, so the
# offsets can be compared against the table above.
(
    result
    .select("normalized.result", "normalized.begin", "normalized.end")
    .withColumnRenamed("result", "normalized result")
    .show(truncate=False)
)



+-------------------------------------------------+-------------------------------------+-------------------------------------+
|result                                           |begin                                |end                                  |
+-------------------------------------------------+-------------------------------------+-------------------------------------+
|[John, is, 20, and, Peter, is, 26, years, old, .]|[0, 5, 8, 11, 15, 21, 24, 27, 33, 36]|[3, 6, 9, 13, 19, 22, 25, 31, 35, 36]|
+-------------------------------------------------+-------------------------------------+-------------------------------------+

+--------------------------------------+--------------------------+--------------------------+
|normalized result                     |begin                     |end                       |
+--------------------------------------+--------------------------+--------------------------+
|[John, is, and, Peter, is, years, old]|[0, 5, 11, 15, 21, 27, 33]|[3, 6, 