**Spark NLP Basics and Pretrained Pipelines**



In [1]:
! pip install -q pyspark==3.1.2 spark-nlp

[K     |████████████████████████████████| 212.4 MB 67 kB/s 
[K     |████████████████████████████████| 133 kB 50.5 MB/s 
[K     |████████████████████████████████| 198 kB 45.9 MB/s 
[?25h  Building wheel for pyspark (setup.py) ... [?25l[?25hdone


In [2]:
import sparknlp

# sparknlp.start() returns the Spark session object used below.
spark = sparknlp.start()

# Print library versions so the recorded outputs can be tied to a specific environment.
print(f"Spark NLP version: {sparknlp.version()}")
print(f"Apache Spark version: {spark.version}")

Soark NLP version: 3.3.4
Apache Spark version: 3.1.2


In [3]:
# List the Spark NLP jar(s) in the local Ivy cache, most recently modified first.
! cd ~/.ivy2/cache/com.johnsnowlabs.nlp/spark-nlp_2.12/jars && ls -lt

total 38992
-rw-r--r-- 1 root root 39925796 Nov 25 14:01 spark-nlp_2.12-3.3.4.jar


In [4]:
from sparknlp.pretrained import PretrainedPipeline

In [5]:
# Sample input used by the pipelines below. The misspellings ("misson", "Frce",
# "nywhere", ...) are intentional: they give the spell-checking stages something to fix.
# The text starts and ends with a newline, exactly like the original triple-quoted literal.
testDoc = (
    "\n"
    "The misson of the United States Air Frce is to fly, fight and win - airpower anytime, nywhere. "
    "Whether full time, part time, in or out of uniorm, everyone who serves plays a critial role in helping us achieve mission sccess."
    "\n"
)

**Explain Document ML**

In [6]:
# Load the pretrained English 'explain_document_ml' pipeline (about 9.1 MB in the recorded run).
pipeline = PretrainedPipeline('explain_document_ml', lang= 'en')

explain_document_ml download started this may take some time.
Approx size to download 9.1 MB
[OK!]


In [7]:
# Inspect the stages that make up the loaded pipeline model.
pipeline.model.stages

[document_811d40a38b24,
 SENTENCE_ce56851acebe,
 REGEX_TOKENIZER_78daa3b4692f,
 SPELL_79c88338ef12,
 LEMMATIZER_c62ad8f355f9,
 STEMMER_caf11d1f4d0e,
 POS_dbb704204f6f]

In [8]:
# Uncomment to print the built-in help for PretrainedPipeline.
#help(PretrainedPipeline)

In [9]:
%%time
# annotate() on a single string returns a dict keyed by output column (keys are shown in the next cell).
result = pipeline.annotate(testDoc)

CPU times: user 32.3 ms, sys: 9.56 ms, total: 41.9 ms
Wall time: 1.83 s


In [10]:
# Output columns produced by explain_document_ml.
result.keys()

dict_keys(['document', 'spell', 'pos', 'lemmas', 'token', 'stems', 'sentence'])

In [11]:
# Sentences detected in testDoc.
result['sentence']

['The misson of the United States Air Frce is to fly, fight and win - airpower anytime, nywhere.',
 'Whether full time, part time, in or out of uniorm, everyone who serves plays a critial role in helping us achieve mission sccess.']

In [12]:
# Tokens in document order; punctuation marks are separate tokens.
result['token']

['The',
 'misson',
 'of',
 'the',
 'United',
 'States',
 'Air',
 'Frce',
 'is',
 'to',
 'fly',
 ',',
 'fight',
 'and',
 'win',
 '-',
 'airpower',
 'anytime',
 ',',
 'nywhere',
 '.',
 'Whether',
 'full',
 'time',
 ',',
 'part',
 'time',
 ',',
 'in',
 'or',
 'out',
 'of',
 'uniorm',
 ',',
 'everyone',
 'who',
 'serves',
 'plays',
 'a',
 'critial',
 'role',
 'in',
 'helping',
 'us',
 'achieve',
 'mission',
 'sccess',
 '.']

In [13]:
# Pair each token with its part-of-speech tag.
list(zip(result['token'],result['pos']))

[('The', 'DT'),
 ('misson', 'NN'),
 ('of', 'IN'),
 ('the', 'DT'),
 ('United', 'NNP'),
 ('States', 'NNPS'),
 ('Air', 'NNP'),
 ('Frce', 'NNP'),
 ('is', 'VBZ'),
 ('to', 'TO'),
 ('fly', 'VB'),
 (',', ','),
 ('fight', 'NN'),
 ('and', 'CC'),
 ('win', 'VB'),
 ('-', ':'),
 ('airpower', 'NN'),
 ('anytime', 'RB'),
 (',', ','),
 ('nywhere', 'RB'),
 ('.', '.'),
 ('Whether', 'IN'),
 ('full', 'JJ'),
 ('time', 'NN'),
 (',', ','),
 ('part', 'NN'),
 ('time', 'NN'),
 (',', ','),
 ('in', 'IN'),
 ('or', 'CC'),
 ('out', 'IN'),
 ('of', 'IN'),
 ('uniorm', 'NN'),
 (',', ','),
 ('everyone', 'NN'),
 ('who', 'WP'),
 ('serves', 'VBZ'),
 ('plays', 'VBZ'),
 ('a', 'DT'),
 ('critial', 'JJ'),
 ('role', 'NN'),
 ('in', 'IN'),
 ('helping', 'VBG'),
 ('us', 'PRP'),
 ('achieve', 'VBP'),
 ('mission', 'NN'),
 ('sccess', 'NN'),
 ('.', '.')]

In [14]:
# Compare each token with its lemma, stem and spell-corrected form.
list(zip(result['token'], result['lemmas'], result['stems'], result['spell']))

[('The', 'The', 'the', 'The'),
 ('misson', 'mission', 'mission', 'mission'),
 ('of', 'of', 'of', 'of'),
 ('the', 'the', 'the', 'the'),
 ('United', 'United', 'unit', 'United'),
 ('States', 'States', 'state', 'States'),
 ('Air', 'Air', 'air', 'Air'),
 ('Frce', 'Free', 'free', 'Free'),
 ('is', 'be', 'i', 'is'),
 ('to', 'to', 'to', 'to'),
 ('fly', 'fly', 'fly', 'fly'),
 (',', ',', ',', ','),
 ('fight', 'fight', 'fight', 'fight'),
 ('and', 'and', 'and', 'and'),
 ('win', 'win', 'win', 'win'),
 ('-', '-', '-', '-'),
 ('airpower', 'airpower', 'airpow', 'airpower'),
 ('anytime', 'anytime', 'anytim', 'anytime'),
 (',', ',', ',', ','),
 ('nywhere', 'anywhere', 'anywher', 'anywhere'),
 ('.', '.', '.', '.'),
 ('Whether', 'Whether', 'whether', 'Whether'),
 ('full', 'full', 'full', 'full'),
 ('time', 'time', 'time', 'time'),
 (',', ',', ',', ','),
 ('part', 'part', 'part', 'part'),
 ('time', 'time', 'time', 'time'),
 (',', ',', ',', ','),
 ('in', 'in', 'in', 'in'),
 ('or', 'or', 'or', 'or'),
 ('out',

In [15]:
import pandas as pd

# Raise the display.max_rows option to 100 (same effect as pd.set_option).
pd.options.display.max_rows = 100

In [16]:
# One row per token. Each DataFrame column is filled from the matching key of the
# explain_document_ml `result` dict (column name -> result key).
df = pd.DataFrame({
    column: result[key]
    for column, key in [
        ('token', 'token'),
        ('corrected', 'spell'),
        ('POS', 'pos'),
        ('lemmas', 'lemmas'),
        ('stems', 'stems'),
    ]
})
df

Unnamed: 0,token,corrected,POS,lemmas,stems
0,The,The,DT,The,the
1,misson,mission,NN,mission,mission
2,of,of,IN,of,of
3,the,the,DT,the,the
4,United,United,NNP,United,unit
5,States,States,NNPS,States,state
6,Air,Air,NNP,Air,air
7,Frce,Free,NNP,Free,free
8,is,is,VBZ,be,i
9,to,to,TO,to,to


**Explain Document DL**

In [17]:
# Load the pretrained English 'explain_document_dl' pipeline (about 169.4 MB in the recorded run).
pipeline_dl = PretrainedPipeline('explain_document_dl', lang= 'en')

explain_document_dl download started this may take some time.
Approx size to download 169.4 MB
[OK!]


In [18]:
# Inspect the stages of the DL pipeline; the listing includes embeddings and NER stages.
pipeline_dl.model.stages

[document_7939d5bf1083,
 SENTENCE_05265b07c745,
 REGEX_TOKENIZER_c5c312143f63,
 SPELL_e4ea67180337,
 LEMMATIZER_c62ad8f355f9,
 STEMMER_ba49f7631065,
 POS_d01c734956fe,
 WORD_EMBEDDINGS_MODEL_48cffc8b9a76,
 NerDLModel_d4424c9af5f4,
 NER_CONVERTER_a81db9af2d23]

In [23]:
# stages[-2] is the NerDLModel stage in the listing above; the recorded storage ref is 'glove_100d'
# (presumably the embeddings the model expects; confirm in the Spark NLP docs).
pipeline_dl.model.stages[-2].getStorageRef()

'glove_100d'

In [29]:
# Labels reported by the NerDLModel stage via getClasses().
pipeline_dl.model.stages[-2].getClasses()

['O', 'B-ORG', 'B-LOC', 'B-PER', 'I-PER', 'I-ORG', 'B-MISC', 'I-LOC', 'I-MISC']

In [30]:
%%time

# Reuses the name `result`: from here on it holds the explain_document_dl output.
result = pipeline_dl.annotate(testDoc)

# In the recorded run this cell printed only the timing; the next cell displays the keys.
result.keys()

CPU times: user 38.2 ms, sys: 5.35 ms, total: 43.5 ms
Wall time: 1.49 s


In [31]:
# Output columns produced by explain_document_dl.
result.keys()

dict_keys(['entities', 'stem', 'checked', 'lemma', 'document', 'pos', 'token', 'ner', 'embeddings', 'sentence'])

In [32]:
# Recognized entity chunks, as plain text.
result['entities']

['United States Air Frce']

In [34]:
# Lemmas. Note the key is 'lemma' here, whereas explain_document_ml used 'lemmas'.
result['lemma']

['The',
 'mission',
 'of',
 'the',
 'United',
 'States',
 'Air',
 'Free',
 'be',
 'to',
 'fly',
 ',',
 'fight',
 'and',
 'win',
 '-',
 'airpower',
 'anytime',
 ',',
 'anywhere',
 '.',
 'Whether',
 'full',
 'time',
 ',',
 'part',
 'time',
 ',',
 'in',
 'or',
 'out',
 'of',
 'uniform',
 ',',
 'everyone',
 'who',
 'serve',
 'play',
 'a',
 'critical',
 'role',
 'in',
 'help',
 'we',
 'achieve',
 'mission',
 'access',
 '.']

In [35]:
# One row per token. Each DataFrame column is filled from the matching key of the
# explain_document_dl `result` dict (column name -> result key).
df = pd.DataFrame({
    column: result[key]
    for column, key in [
        ('token', 'token'),
        ('ner_label', 'ner'),
        ('spell_corrected', 'checked'),
        ('POS', 'pos'),
        ('lemmas', 'lemma'),
        ('stems', 'stem'),
    ]
})

In [36]:
# Display the per-token table built in the previous cell.
df

Unnamed: 0,token,ner_label,spell_corrected,POS,lemmas,stems
0,The,O,The,DT,The,the
1,misson,O,mission,NN,mission,mission
2,of,O,of,IN,of,of
3,the,O,the,DT,the,the
4,United,B-ORG,United,NNP,United,unit
5,States,I-ORG,States,NNPS,States,state
6,Air,I-ORG,Air,NNP,Air,air
7,Frce,I-ORG,Free,NNP,Free,free
8,is,O,is,VBZ,be,i
9,to,O,to,TO,to,to


**Recognize Entities DL**

In [37]:
# Load the pretrained English 'recognize_entities_dl' pipeline (about 160.1 MB in the recorded run).
recognize_entities = PretrainedPipeline('recognize_entities_dl', lang='en')

recognize_entities_dl download started this may take some time.
Approx size to download 160.1 MB
[OK!]


In [38]:
# Inspect the stages of the NER-only pipeline.
recognize_entities.model.stages

[document_1c58bc1aca5d,
 SENTENCE_328d8a47c1a8,
 REGEX_TOKENIZER_b6c4cbc5a4ea,
 WORD_EMBEDDINGS_MODEL_48cffc8b9a76,
 NerDLModel_d4424c9af5f4,
 NER_CONVERTER_389b80afbf7d]

In [41]:
# stages[3] is the word-embeddings stage in the listing above; the recorded storage ref is 'glove_100d'.
recognize_entities.model.stages[3].getStorageRef()

'glove_100d'

In [43]:
# stages[4] is the NerDLModel stage in the listing above; getClasses() reports its labels.
recognize_entities.model.stages[4].getClasses()

['O', 'B-ORG', 'B-LOC', 'B-PER', 'I-PER', 'I-ORG', 'B-MISC', 'I-LOC', 'I-MISC']

In [44]:
# Show the raw input string again; the repr includes its leading and trailing newline.
testDoc

'\nThe misson of the United States Air Frce is to fly, fight and win - airpower anytime, nywhere. Whether full time, part time, in or out of uniorm, everyone who serves plays a critial role in helping us achieve mission sccess.\n'

In [46]:
# `result` is reassigned: it now holds the recognize_entities_dl output for testDoc.
result = recognize_entities.annotate(testDoc)

In [47]:
# Pair each token with its NER tag.
list(zip(result['token'], result['ner']))

[('The', 'O'),
 ('misson', 'O'),
 ('of', 'O'),
 ('the', 'O'),
 ('United', 'B-ORG'),
 ('States', 'I-ORG'),
 ('Air', 'I-ORG'),
 ('Frce', 'I-ORG'),
 ('is', 'O'),
 ('to', 'O'),
 ('fly', 'O'),
 (',', 'O'),
 ('fight', 'O'),
 ('and', 'O'),
 ('win', 'O'),
 ('-', 'O'),
 ('airpower', 'O'),
 ('anytime', 'O'),
 (',', 'O'),
 ('nywhere', 'O'),
 ('.', 'O'),
 ('Whether', 'O'),
 ('full', 'O'),
 ('time', 'O'),
 (',', 'O'),
 ('part', 'O'),
 ('time', 'O'),
 (',', 'O'),
 ('in', 'O'),
 ('or', 'O'),
 ('out', 'O'),
 ('of', 'O'),
 ('uniorm', 'O'),
 (',', 'O'),
 ('everyone', 'O'),
 ('who', 'O'),
 ('serves', 'O'),
 ('plays', 'O'),
 ('a', 'O'),
 ('critial', 'O'),
 ('role', 'O'),
 ('in', 'O'),
 ('helping', 'O'),
 ('us', 'O'),
 ('achieve', 'O'),
 ('mission', 'O'),
 ('sccess', 'O'),
 ('.', 'O')]

**Clean Stop Words**

In [48]:
# Load the pretrained English 'clean_stop' pipeline (about 22.8 KB in the recorded run).
clean_stop = PretrainedPipeline('clean_stop', lang='en')

clean_stop download started this may take some time.
Approx size to download 22.8 KB
[OK!]


In [49]:
clean_stop.model.stages # stages of the clean_stop pipeline; the last one listed is the stop-words cleaner

[document_90b4be8a6e0b,
 SENTENCE_8ba1e4f73af0,
 REGEX_TOKENIZER_fb4f98b445ce,
 STOPWORDS_CLEANER_b5d381c851f5]

In [50]:
# `result` is reassigned: run the stop-word pipeline on testDoc and list its output columns.
result = clean_stop.annotate(testDoc)
result.keys()

dict_keys(['document', 'sentence', 'token', 'cleanTokens'])

In [51]:
# Tokens left after stop-word removal; the recorded output shows punctuation is kept.
result['cleanTokens']

['misson',
 'United',
 'States',
 'Air',
 'Frce',
 'fly',
 ',',
 'fight',
 'win',
 '-',
 'airpower',
 'anytime',
 ',',
 'nywhere',
 '.',
 'time',
 ',',
 'time',
 ',',
 'uniorm',
 ',',
 'serves',
 'plays',
 'critial',
 'role',
 'helping',
 'achieve',
 'mission',
 'sccess',
 '.']

In [52]:
# Re-join the remaining tokens with single spaces.
' '.join(result['cleanTokens'])

'misson United States Air Frce fly , fight win - airpower anytime , nywhere . time , time , uniorm , serves plays critial role helping achieve mission sccess .'

**Spell Checker**

In [53]:
# Load the pretrained English 'check_spelling' pipeline (about 913.5 KB in the recorded run).
spell_checker = PretrainedPipeline('check_spelling', lang='en')

check_spelling download started this may take some time.
Approx size to download 913.5 KB
[OK!]


In [54]:
# `result` is reassigned: it now holds the check_spelling output for testDoc.
result = spell_checker.annotate(testDoc)

In [55]:
# Output columns produced by check_spelling.
result.keys()

dict_keys(['document', 'sentence', 'token', 'checked'])

In [56]:
# Original token next to its spell-checked form.
list(zip(result['token'], result['checked']))

[('The', 'The'),
 ('misson', 'mission'),
 ('of', 'of'),
 ('the', 'the'),
 ('United', 'United'),
 ('States', 'States'),
 ('Air', 'Air'),
 ('Frce', 'Free'),
 ('is', 'is'),
 ('to', 'to'),
 ('fly', 'fly'),
 (',', ','),
 ('fight', 'fight'),
 ('and', 'and'),
 ('win', 'win'),
 ('-', '-'),
 ('airpower', 'airpower'),
 ('anytime', 'anytime'),
 (',', ','),
 ('nywhere', 'anywhere'),
 ('.', '.'),
 ('Whether', 'Whether'),
 ('full', 'full'),
 ('time', 'time'),
 (',', ','),
 ('part', 'part'),
 ('time', 'time'),
 (',', ','),
 ('in', 'in'),
 ('or', 'or'),
 ('out', 'out'),
 ('of', 'of'),
 ('uniorm', 'uniform'),
 (',', ','),
 ('everyone', 'everyone'),
 ('who', 'who'),
 ('serves', 'serves'),
 ('plays', 'plays'),
 ('a', 'a'),
 ('critial', 'critical'),
 ('role', 'role'),
 ('in', 'in'),
 ('helping', 'helping'),
 ('us', 'us'),
 ('achieve', 'achieve'),
 ('mission', 'mission'),
 ('sccess', 'access'),
 ('.', '.')]

**Parsing a list of texts**

In [57]:
# Three short texts to annotate in one call. The misspellings ("pioner", "wrate",
# "aircrast") are intentional input for the spell checker.
testDoc_list = [
    "French author who helped pioner the science-fiction genre.",
    "Verne wrate about space, air, and underwater travel before navigable aircrast",
    "Practical submarines were invented, and before any means of space travel had been devised.",
]

testDoc_list

['French author who helped pioner the science-fiction genre.',
 'Verne wrate about space, air, and underwater travel before navigable aircrast',
 'Practical submarines were invented, and before any means of space travel had been devised.']

In [58]:
# Same 'explain_document_ml' pipeline that was loaded into `pipeline` earlier; the name is re-assigned here.
pipeline = PretrainedPipeline('explain_document_ml', lang='en')

explain_document_ml download started this may take some time.
Approx size to download 9.1 MB
[OK!]


In [59]:
# Passing a list of strings to annotate() returns a list of result dicts.
result_list = pipeline.annotate(testDoc_list)

# One result per input text (3 in the recorded run).
len (result_list)

3

In [61]:
# Full output: a list of dicts, each mapping an output column to its values for one text.
result_list

[{'document': ['French author who helped pioner the science-fiction genre.'],
  'lemmas': ['French',
   'author',
   'who',
   'help',
   'pioneer',
   'the',
   'sciencefiction',
   'genre',
   '.'],
  'pos': ['JJ', 'NN', 'WP', 'VBD', 'NN', 'DT', 'NN', 'NN', '.'],
  'sentence': ['French author who helped pioner the science-fiction genre.'],
  'spell': ['French',
   'author',
   'who',
   'helped',
   'pioneer',
   'the',
   'sciencefiction',
   'genre',
   '.'],
  'stems': ['french',
   'author',
   'who',
   'help',
   'pioneer',
   'the',
   'sciencefict',
   'genr',
   '.'],
  'token': ['French',
   'author',
   'who',
   'helped',
   'pioner',
   'the',
   'science-fiction',
   'genre',
   '.']},
 {'document': ['Verne wrate about space, air, and underwater travel before navigable aircrast'],
  'lemmas': ['Verne',
   'write',
   'about',
   'space',
   ',',
   'air',
   ',',
   'and',
   'underwater',
   'travel',
   'before',
   'navigable',
   'aircraft'],
  'pos': ['NNP',
   'VB

In [60]:
# Annotations for the first text only.
result_list[0]

{'document': ['French author who helped pioner the science-fiction genre.'],
 'lemmas': ['French',
  'author',
  'who',
  'help',
  'pioneer',
  'the',
  'sciencefiction',
  'genre',
  '.'],
 'pos': ['JJ', 'NN', 'WP', 'VBD', 'NN', 'DT', 'NN', 'NN', '.'],
 'sentence': ['French author who helped pioner the science-fiction genre.'],
 'spell': ['French',
  'author',
  'who',
  'helped',
  'pioneer',
  'the',
  'sciencefiction',
  'genre',
  '.'],
 'stems': ['french',
  'author',
  'who',
  'help',
  'pioneer',
  'the',
  'sciencefict',
  'genr',
  '.'],
 'token': ['French',
  'author',
  'who',
  'helped',
  'pioner',
  'the',
  'science-fiction',
  'genre',
  '.']}

**Using fullAnnotate to get more details**



```
annotatorType: String, 
begin: Int, 
end: Int, 
result: String, (this is what annotate returns)
metadata: Map[String, String], 
embeddings: Array[Float]
```



In [64]:
# Short example sentence used for the fullAnnotate demonstration.
text = 'Peter Parker is a nice guy and lives in New York'

In [65]:
# pipeline_dl is the explain_document_dl pipeline loaded earlier.
# fullAnnotate() returns a list (indexed with [0] in later cells) whose entries map each
# output column to Annotation objects rather than plain strings.

detailed_result = pipeline_dl.fullAnnotate(text)

In [68]:
# Full structure: each Annotation shows its type, begin/end offsets, result text and metadata.
detailed_result


[{'checked': [Annotation(token, 0, 4, Peter, {'confidence': '1.0', 'sentence': '0'}),
   Annotation(token, 6, 11, Parker, {'confidence': '1.0', 'sentence': '0'}),
   Annotation(token, 13, 14, is, {'confidence': '1.0', 'sentence': '0'}),
   Annotation(token, 16, 16, a, {'confidence': '1.0', 'sentence': '0'}),
   Annotation(token, 18, 21, nice, {'confidence': '1.0', 'sentence': '0'}),
   Annotation(token, 23, 25, guy, {'confidence': '1.0', 'sentence': '0'}),
   Annotation(token, 27, 29, and, {'confidence': '1.0', 'sentence': '0'}),
   Annotation(token, 31, 35, lives, {'confidence': '1.0', 'sentence': '0'}),
   Annotation(token, 37, 38, in, {'confidence': '1.0', 'sentence': '0'}),
   Annotation(token, 40, 42, New, {'confidence': '1.0', 'sentence': '0'}),
   Annotation(token, 44, 47, York, {'confidence': '1.0', 'sentence': '0'})],
  'document': [Annotation(document, 0, 47, Peter Parker is a nice guy and lives in New York, {})],
  'embeddings': [Annotation(word_embeddings, 0, 4, Peter, {'is

In [69]:
# Entity chunks as Annotation objects; the metadata carries the entity label.
detailed_result[0]['entities']

[Annotation(chunk, 0, 11, Peter Parker, {'entity': 'PER', 'sentence': '0', 'chunk': '0'}),
 Annotation(chunk, 40, 47, New York, {'entity': 'LOC', 'sentence': '0', 'chunk': '1'})]

In [72]:
# .result holds the chunk text; index 1 is the second entity.
detailed_result[0]['entities'][1].result

'New York'

In [73]:
# Collect the text and the entity label of every NER chunk found in the first
# (and only) document of detailed_result, then show them side by side.
chunks = [annotation.result for annotation in detailed_result[0]['entities']]
entities = [annotation.metadata['entity'] for annotation in detailed_result[0]['entities']]

df = pd.DataFrame({'chunks': chunks, 'entities': entities})
df

Unnamed: 0,chunks,entities
0,Peter Parker,PER
1,New York,LOC


In [74]:
# Build one record per token by walking the token, POS and NER annotations in parallel:
# (sentence id, token text, begin offset, end offset, POS tag, NER tag).
tuples = [
    (
        int(token.metadata['sentence']),
        token.result,
        token.begin,
        token.end,
        pos_tag.result,
        ner_tag.result,
    )
    for token, pos_tag, ner_tag in zip(
        detailed_result[0]["token"],
        detailed_result[0]["pos"],
        detailed_result[0]["ner"],
    )
]

df = pd.DataFrame(tuples, columns=['sent_id','token','start','end','pos', 'ner'])

df

Unnamed: 0,sent_id,token,start,end,pos,ner
0,0,Peter,0,4,NNP,B-PER
1,0,Parker,6,11,NNP,I-PER
2,0,is,13,14,VBZ,O
3,0,a,16,16,DT,O
4,0,nice,18,21,JJ,O
5,0,guy,23,25,NN,O
6,0,and,27,29,CC,O
7,0,lives,31,35,NNS,O
8,0,in,37,38,IN,O
9,0,New,40,42,NNP,B-LOC


**Sentiment Analysis**

**Vivek algo**



```
Vivek algorithm
Paper: "Fast and accurate sentiment classification using an enhanced Naive Bayes model"
https://arxiv.org/abs/1305.6143

Code: https://github.com/vivekn/sentiment
```



In [75]:
# Load the pretrained English 'analyze_sentiment' pipeline (about 4.9 MB in the recorded run).
sentiment = PretrainedPipeline('analyze_sentiment', lang='en')

analyze_sentiment download started this may take some time.
Approx size to download 4.9 MB
[OK!]


In [76]:
# Score a single sentence with the analyze_sentiment pipeline; `result` is reassigned again.
result = sentiment.annotate("The movie I watched today was not a good one")

# The predicted label(s) are under the 'sentiment' key.
result['sentiment']

['negative']

**DL version (trained on IMDb)**

In [80]:
# NOTE(review): large download (about 935.7 MB in the recorded run), and `sentiment_imdb`
# is not referenced by any later cell visible in this notebook.
sentiment_imdb = PretrainedPipeline('analyze_sentimentdl_use_imdb', lang='en')

analyze_sentimentdl_use_imdb download started this may take some time.
Approx size to download 935.7 MB
[OK!]


In [85]:
# Load the pretrained English 'analyze_sentimentdl_glove_imdb' pipeline (about 155.3 MB in the recorded run).
sentiment_imdb_glove = PretrainedPipeline('analyze_sentimentdl_glove_imdb', lang='en')

analyze_sentimentdl_glove_imdb download started this may take some time.
Approx size to download 155.3 MB
[OK!]


In [86]:
# Multi-sentence movie review used as input; the literal starts and ends with a newline.
comment = '''
It's a very scary film but what impressed me was how true the film sticks to the original's tricks; it isn't filled with loud in-your-face jump scares, in fact, a lot of what makes this film scary is the slick cinematography and intricate shadow play. The use of lighting and creation of atmosphere is what makes this film so tense, which is why it's perfectly suited for those who like Horror movies but without the obnoxious gore.
'''
# `result` is reassigned to the analyze_sentimentdl_glove_imdb output for the review.
result = sentiment_imdb_glove.annotate(comment)

# The predicted label(s) are under the 'sentiment' key ('pos' in the recorded run).
result['sentiment']

['pos']

In [87]:
# fullAnnotate exposes the Annotation object; in the recorded run its metadata holds 'pos' and 'neg' scores.
sentiment_imdb_glove.fullAnnotate(comment)[0]['sentiment']

[Annotation(category, 0, 433, pos, {'sentence': '0', 'pos': '0.98675287', 'neg': '0.013247096'})]