In [4]:
import spacy

# spacy.blank("en") creates a bare English pipeline: it can tokenize text,
# but no trained components (tagger, parser, NER, ...) are attached to it.
nlp = spacy.blank("en")

sentence = "Captain america ate %100 of samosa. Then he said I can do this all day."
doc = nlp(sentence)

# Iterating over a Doc yields its tokens in order; print one token per line.
for tok in doc:
    print(tok)



Captain
america
ate
%
100
of
samosa
.
Then
he
said
I
can
do
this
all
day
.


In [5]:
nlp.pipe_names # empty list here: the blank pipeline has no components registered (tokenization still works, as shown above)

[]

In [11]:
# A pretrained pipeline has to be downloaded once before it can be loaded, e.g.
#     python3 -m spacy download en_core_web_sm
# See https://spacy.io/usage/models#quickstart
# Each language has its own pretrained pipelines; en_core_web_sm is an English one.
nlp = spacy.load("en_core_web_sm")

# Unlike the blank pipeline, the pretrained one comes with trained components:
# ['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']
print(nlp.pipe_names)

doc = nlp("Captain america ate 100$ of samosa. Then he said I can do this all day.")

# With these components every token carries linguistic annotations:
#   pos_   -> coarse part of speech (noun, verb, num, punct, ...)
#   lemma_ -> base form of the word
sep = " | "
for tok in doc:
    print(tok, sep, tok.pos_, sep, tok.lemma_)


['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']
Captain  |  PROPN  |  Captain
america  |  PROPN  |  america
ate  |  VERB  |  eat
100  |  NUM  |  100
$  |  NUM  |  $
of  |  ADP  |  of
samosa  |  PROPN  |  samosa
.  |  PUNCT  |  .
Then  |  ADV  |  then
he  |  PRON  |  he
said  |  VERB  |  say
I  |  PRON  |  I
can  |  AUX  |  can
do  |  VERB  |  do
this  |  PRON  |  this
all  |  DET  |  all
day  |  NOUN  |  day
.  |  PUNCT  |  .


In [12]:
doc = nlp("Tesla Inc is going to aquire twitter for $45 billion")

# doc.ents holds the named entities that the pipeline's NER component recognized
# in the document. A named entity is a real-world object that can be referred to
# by a proper name: a person, location, organization, product, date, etc.
# For each entity print its text, its label, and spaCy's description of that label.
sep = " | "
for entity in doc.ents:
    label = entity.label_
    print(entity.text, sep, label, sep, spacy.explain(label))

Tesla Inc  |  ORG  |  Companies, agencies, institutions, etc.
$45 billion  |  MONEY  |  Monetary values, including unit


In [14]:
# displaCy is the visualizer that ships with spaCy.
from spacy import displacy

# Visualize `doc` (the document parsed in the previous cell) using the
# entity view, which is selected with style="ent".
displacy.render(doc, style="ent")