In [1]:
!pip install -q pyspark spark-nlp

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.3/317.3 MB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.6/55.6 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m579.5/579.5 kB[0m [31m22.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for pyspark (setup.py) ... [?25l[?25hdone


In [2]:
import sparknlp
from sparknlp.base import DocumentAssembler, Pipeline
from sparknlp.annotator import Lemmatizer, LemmatizerModel, Tokenizer
import pyspark.sql.functions as F


spark = sparknlp.start()

## setDictionary()

External dictionary to be used by the lemmatizer, which needs keyDelimiter (separates lemmas from the word forms) and valueDelimiter (separator between different word forms of the same lemma) for parsing the resource.

In [3]:

!wget -q https://raw.githubusercontent.com/mahavivo/vocabulary/master/lemmas/AntBNC_lemmas_ver_001.txt

In [4]:
!head -5 AntBNC_lemmas_ver_001.txt


aaah	->	aaahed	aaah
aac	->	aac	aacs
aah	->	aah	aahs	aahing	aahed	aahhing
aam	->	aams	aam
aardvark	->	aardvark	aardvarks


In [6]:
lemmatizer = Lemmatizer() \
.setInputCols(["token"]) \
.setOutputCol("lemma") \
.setDictionary("./AntBNC_lemmas_ver_001.txt", key_delimiter = "->", value_delimiter = "\t")

In [8]:
documentAssembler = DocumentAssembler() \
.setInputCol("text") \
.setOutputCol("document")

tokenizer = Tokenizer() \
.setInputCols(["document"]) \
.setOutputCol("token")

nlpPipeline = Pipeline(stages=[documentAssembler, tokenizer, lemmatizer])

sample_texts = [
    ["I love working with SparkNLP."],
    ["I am living in Canada."]
]

data = spark.createDataFrame(sample_texts).toDF("text")

result = nlpPipeline.fit(data).transform(data)

result.show(truncate=40)

+-----------------------------+----------------------------------------+----------------------------------------+----------------------------------------+
|                         text|                                document|                                   token|                                   lemma|
+-----------------------------+----------------------------------------+----------------------------------------+----------------------------------------+
|I love working with SparkNLP.|[{document, 0, 28, I love working wit...|[{token, 0, 0, I, {sentence -> 0}, []...|[{token, 0, 0, I, {sentence -> 0}, []...|
|       I am living in Canada.|[{document, 0, 21, I am living in Can...|[{token, 0, 0, I, {sentence -> 0}, []...|[{token, 0, 0, I, {sentence -> 0}, []...|
+-----------------------------+----------------------------------------+----------------------------------------+----------------------------------------+



In [9]:
result.select("token.result").show(truncate=False)

+-------------------------------------+
|result                               |
+-------------------------------------+
|[I, love, working, with, SparkNLP, .]|
|[I, am, living, in, Canada, .]       |
+-------------------------------------+



In [10]:
result.select("lemma.result").show(truncate=False)

+----------------------------------+
|result                            |
+----------------------------------+
|[I, love, work, with, SparkNLP, .]|
|[I, be, live, in, Canada, .]      |
+----------------------------------+



In [11]:

result_df = result.select(F.explode(F.arrays_zip(result.token.result,
                                                 result.lemma.result)).alias("cols")) \
                  .select(F.expr("cols['0']").alias("token"),
                          F.expr("cols['1']").alias("lemma")).toPandas()

result_df.head(10)

Unnamed: 0,token,lemma
0,I,I
1,love,love
2,working,work
3,with,with
4,SparkNLP,SparkNLP
5,.,.
6,I,I
7,am,be
8,living,live
9,in,in


##  Using pretrained models
The LemmatizerModel annotator can automatically download pretrained models with the .pretrained() method.

In [12]:
!wget -q -O news_category_test.csv https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/resources/en/classifier-dl/news_Category/news_category_test.csv


In [13]:
!head -5 news_category_test.csv

category,description
Business,Unions representing workers at Turner   Newall say they are 'disappointed' after talks with stricken parent firm Federal Mogul.
Sci/Tech," TORONTO, Canada    A second team of rocketeers competing for the  #36;10 million Ansari X Prize, a contest for privately funded suborbital space flight, has officially announced the first launch date for its manned rocket."
Sci/Tech," A company founded by a chemistry researcher at the University of Louisville won a grant to develop a method of producing better peptides, which are short chains of amino acids, the building blocks of proteins."
Sci/Tech," It's barely dawn when Mike Fitzpatrick starts his shift with a blur of colorful maps, figures and endless charts, but already he knows what the day will bring. Lightning will strike in places he expects. Winds will pick up, moist places will dry and flames will roar."


In [15]:
import pyspark.sql.functions as F

news_df = spark.read\
.option("header", "true")\
.csv("news_category_test.csv")\
.withColumnRenamed("description", "text")

news_df.show(truncate=100)

+--------+----------------------------------------------------------------------------------------------------+
|category|                                                                                                text|
+--------+----------------------------------------------------------------------------------------------------+
|Business|Unions representing workers at Turner   Newall say they are 'disappointed' after talks with stric...|
|Sci/Tech| TORONTO, Canada    A second team of rocketeers competing for the  #36;10 million Ansari X Prize,...|
|Sci/Tech| A company founded by a chemistry researcher at the University of Louisville won a grant to devel...|
|Sci/Tech| It's barely dawn when Mike Fitzpatrick starts his shift with a blur of colorful maps, figures an...|
|Sci/Tech| Southern California's smog fighting agency went after emissions of the bovine variety Friday, ad...|
|Sci/Tech|"The British Department for Education and Skills (DfES) recently launched a ""Music Manifesto"

In [16]:
lemmatizer = LemmatizerModel.pretrained("lemma_antbnc", "en") \
.setInputCols(["token"]) \
.setOutputCol("lemma")



lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[OK!]


In [17]:
documentAssembler = DocumentAssembler() \
.setInputCol("text") \
.setOutputCol("document")

tokenizer = Tokenizer() \
.setInputCols(["document"]) \
.setOutputCol("token")

nlpPipeline = Pipeline(stages=[documentAssembler,
                               tokenizer,
                               lemmatizer])

empty_df = spark.createDataFrame([['']]).toDF("text")

pipelineModel = nlpPipeline.fit(empty_df)

result = pipelineModel.transform(news_df)

result.show(5)

+--------+--------------------+--------------------+--------------------+--------------------+
|category|                text|            document|               token|               lemma|
+--------+--------------------+--------------------+--------------------+--------------------+
|Business|Unions representi...|[{document, 0, 12...|[{token, 0, 5, Un...|[{token, 0, 5, Un...|
|Sci/Tech| TORONTO, Canada ...|[{document, 0, 22...|[{token, 1, 7, TO...|[{token, 1, 7, TO...|
|Sci/Tech| A company founde...|[{document, 0, 20...|[{token, 1, 1, A,...|[{token, 1, 1, A,...|
|Sci/Tech| It's barely dawn...|[{document, 0, 26...|[{token, 1, 4, It...|[{token, 1, 4, It...|
|Sci/Tech| Southern Califor...|[{document, 0, 17...|[{token, 1, 8, So...|[{token, 1, 8, So...|
+--------+--------------------+--------------------+--------------------+--------------------+
only showing top 5 rows



In [23]:
result.select("token.result", "lemma.result").show(5, truncate=50)

+--------------------------------------------------+--------------------------------------------------+
|                                            result|                                            result|
+--------------------------------------------------+--------------------------------------------------+
|[Unions, representing, workers, at, Turner, New...|[Unions, represent, worker, at, Turner, Newall,...|
|[TORONTO, ,, Canada, A, second, team, of, rocke...|[TORONTO, ,, Canada, A, second, team, of, rocke...|
|[A, company, founded, by, a, chemistry, researc...|[A, company, founded, by, a, chemistry, researc...|
|[It's, barely, dawn, when, Mike, Fitzpatrick, s...|[It's, barely, dawn, when, Mike, Fitzpatrick, s...|
|[Southern, California's, smog, fighting, agency...|[Southern, California's, smog, fight, agency, g...|
+--------------------------------------------------+--------------------------------------------------+
only showing top 5 rows

