**spaCy’s trained pipelines** can be installed as **Python packages**. This means that they’re a component of your application, just like any other module. They’re versioned and can be defined as a dependency in your requirements.txt. Trained pipelines can be installed from a download URL or a local directory, manually or via pip. Their data can be located anywhere on your file system.


In [1]:
import spacy
# Load the medium English pipeline; the "en_core_web_md" package must already be installed.
nlp = spacy.load("en_core_web_md")

In [2]:
# Run the pipeline on a sample sentence and look at the resulting object:
# its type, its length in tokens, and the text of each token.
doc = nlp("We are learning NLP using spaCy.")
token_texts = [tok.text for tok in doc]
print(type(doc))
print(len(doc))
print(token_texts)

<class 'spacy.tokens.doc.Doc'>
7
['We', 'are', 'learning', 'NLP', 'using', 'spaCy', '.']


In [3]:
# List every attribute name exposed by the Doc object (dir() returns them sorted).
dir(doc)

['_',
 '__bytes__',
 '__class__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__ne__',
 '__new__',
 '__pyx_vtable__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__unicode__',
 '_bulk_merge',
 '_context',
 '_get_array_attrs',
 '_realloc',
 '_vector',
 '_vector_norm',
 'cats',
 'char_span',
 'copy',
 'count_by',
 'doc',
 'ents',
 'extend_tensor',
 'from_array',
 'from_bytes',
 'from_dict',
 'from_disk',
 'from_docs',
 'from_json',
 'get_extension',
 'get_lca_matrix',
 'has_annotation',
 'has_extension',
 'has_unknown_spaces',
 'has_vector',
 'is_nered',
 'is_parsed',
 'is_sentenced',
 'is_tagged',
 'lang',
 'lang_',
 'mem',
 'noun_chunks',
 'noun_chunks_iterator',
 'remove_extension',
 'retokenize',
 'sentiment',
 'sents',
 'set

In [4]:
# Same inspection through the dunder method directly; the names come back unsorted.
doc.__dir__()

['__new__',
 '__repr__',
 '__str__',
 '__iter__',
 '__init__',
 '__len__',
 '__getitem__',
 'set_extension',
 'get_extension',
 'has_extension',
 'remove_extension',
 'has_annotation',
 '__unicode__',
 '__bytes__',
 'char_span',
 'similarity',
 'set_ents',
 'to_array',
 'count_by',
 '_realloc',
 'from_array',
 'from_docs',
 'get_lca_matrix',
 'copy',
 'to_disk',
 'from_disk',
 'to_bytes',
 'from_bytes',
 'to_dict',
 'from_dict',
 'extend_tensor',
 'retokenize',
 '_bulk_merge',
 'from_json',
 'to_json',
 'to_utf8_array',
 '_get_array_attrs',
 '_',
 'is_tagged',
 'is_parsed',
 'is_nered',
 'is_sentenced',
 'doc',
 'has_vector',
 'vector',
 'vector_norm',
 'text',
 'text_with_ws',
 'ents',
 'noun_chunks',
 'sents',
 'lang',
 'lang_',
 'mem',
 'vocab',
 '_vector',
 '_vector_norm',
 'tensor',
 'cats',
 'user_data',
 'spans',
 'sentiment',
 'user_hooks',
 'user_token_hooks',
 'user_span_hooks',
 'has_unknown_spaces',
 '_context',
 'noun_chunks_iterator',
 '__doc__',
 '__pyx_vtable__',
 '__re

https://spacy.io/usage/spacy-101<br>
spaCy is designed specifically for production use and helps you build applications that process and “understand” large volumes of text. It can be used to build information extraction or natural language understanding systems, or to pre-process text for deep learning.

|NAME             |  DESCRIPTION    |
|-----------------|-----------------|
|1.Tokenization	| Segmenting text into words, punctuation marks etc.|
|2.Part-of-speech (POS) Tagging |	Assigning word types to tokens, like verb or noun.|
|3.Dependency Parsing	 | Assigning syntactic dependency labels, describing the relations between individual tokens, like subject or object.|
|4.Lemmatization |	Assigning the base forms of words. For example, the lemma of “was” is “be”, and the lemma of “rats” is “rat”.|
|5.Sentence Boundary Detection (SBD) |	Finding and segmenting individual sentences.|
|6.Named Entity Recognition (NER) |	Labelling named “real-world” objects, like persons, companies or locations.|
|7.Entity Linking (EL)|	Disambiguating textual entities to unique identifiers in a knowledge base.|
|8.Similarity |	Comparing words, text spans and documents and how similar they are to each other.|
|9.Text Classification |	Assigning categories or labels to a whole document, or parts of a document.|
|10.Rule-based Matching |	Finding sequences of tokens based on their texts and linguistic annotations, similar to regular expressions.|
|11.Training |	Updating and improving a statistical model’s predictions.|
|12.Serialization | Saving objects to files or byte strings.|

In [5]:
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")
# Print each token's text, part-of-speech tag and dependency label.
for token in doc:
    print(token.text, token.pos_, token.dep_)

Apple PROPN nsubj
is AUX aux
looking VERB ROOT
at ADP prep
buying VERB pcomp
U.K. PROPN compound
startup NOUN dobj
for ADP prep
$ SYM quantmod
1 NUM compound
billion NUM pobj


# 1.Tokenization

In [6]:
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

# Print the raw text of every token, one per line.
for token in doc:
    print(token.text)
# These two prints sit outside the loop, so they only see the last token
# ("billion"), which the loop variable still refers to.
print(f" token.text is type :{type(token.text)}")
print(f" token.text[0:5] is : {token.text[0:5]}")

Apple
is
looking
at
buying
U.K.
startup
for
$
1
billion
 token.text is type :<class 'str'>
 token.text[0:5] is : billi


In [7]:
# To keep the tokens around, materialize the Doc into a list: a Doc is iterable,
# so list() collects its Token objects without a manual append loop.
lista = list(doc)
print(lista)

[Apple, is, looking, at, buying, U.K., startup, for, $, 1, billion]


# 2.Part-of-speech tags and dependencies NEEDS MODEL

In [8]:
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

# One row per token: text, lemma, coarse POS, fine-grained tag, dependency label,
# word shape, and whether the token is alphabetic / a stop word.
for token in doc:
    print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
            token.shape_, token.is_alpha, token.is_stop)

Apple Apple PROPN NNP nsubj Xxxxx True False
is be AUX VBZ aux xx True True
looking look VERB VBG ROOT xxxx True False
at at ADP IN prep xx True True
buying buy VERB VBG pcomp xxxx True False
U.K. U.K. PROPN NNP compound X.X. False False
startup startup NOUN NN dobj xxxx True False
for for ADP IN prep xxx True True
$ $ SYM $ quantmod $ False False
1 1 NUM CD compound d False False
billion billion NUM CD pobj xxxx True False


Visualizing the dependency parse

In [9]:
from spacy import displacy
displacy.render(doc, style="dep") # changed 'serve' to 'render'
# I stopped the kernel because I thought it had hung.
# Once the kernel was stopped, the diagram appeared.
# And the original kernel was still running.
# Apparently it launches another kernel in parallel.
# NOTE(review): the "Serving on http://0.0.0.0:5000" output of the later serve() cell
# suggests serve() starts a blocking web server rather than a second kernel — confirm.

In [10]:
# Liveness check: the message means "I still have a kernel".
print("Sigo teniendo kernell")

Sigo teniendo kernell


Visualizing the entity recognizer

In [11]:
text = "When Sebastian Thrun started working on self-driving cars at Google in 2007, few people outside of the company took him seriously."

# Parse the sentence and render the entity visualization (style "ent").
doc = nlp(text)
displacy.render(doc, style="ent")

In [12]:
# Liveness check: the message means "I interrupted one kernel, but I carry on with another".
print("He interrumpido un Kernell, pero sigo con otro")

He interrumpido un Kernell, pero sigo con otro


Adding titles to documents

In [13]:
# Attach a title to the document via doc.user_data["title"] before visualizing it.
# The parsed text must be bound to `doc` (the original bound it to `oc`), otherwise
# the title and the visualization both apply to the Doc left over from an earlier cell.
doc = nlp("This is a sentence about Google.")
doc.user_data["title"] = "This is a title"
# serve() starts a local web server and occupies the cell until it is shut down.
displacy.serve(doc, style="ent")




Using the 'ent' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.


# 3.Named Entities NEEDS MODEL

In [14]:
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

# For each named entity: its text, start/end character offsets in the input, and its label.
for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

Apple 0 5 ORG
U.K. 27 31 GPE
$1 billion 44 54 MONEY


Named Entity Recognition<br>
To learn more about entity recognition in spaCy, how to add your own entities to a document and how to train and update the entity predictions of a model, see the usage guides on named entity recognition and training pipelines.

# 4.Lemmatization V3.0
https://spacy.io/usage/linguistic-features#lemmatization

In [24]:
# Fetch the lemmatizer component from the pipeline and show which mode it runs in.
lemmatizer = nlp.get_pipe("lemmatizer")
print(lemmatizer.mode)  # 'rule'

# Lemmatize a sample sentence and print the base form of every token.
doc = nlp("I was reading the paper.")
print([token.lemma_ for token in doc])
# ['I', 'be', 'read', 'the', 'paper', '.']


rule
['I', 'be', 'read', 'the', 'paper', '.']


# 4.Word vectors and similarity NEEDS MODEL

In [16]:
tokens = nlp("dog cat banana afskfsd")

# For each token: whether it has a vector, the vector's norm, and whether it is
# out-of-vocabulary (the made-up word "afskfsd" has no vector and is OOV).
for token in tokens:
    print(token.text, token.has_vector, token.vector_norm, token.is_oov)

dog True 75.254234 False
cat True 63.188496 False
banana True 31.620354 False
afskfsd False 0.0 True


In [17]:
doc1 = nlp("I like salty fries and hamburgers.")
doc2 = nlp("Fast food tastes very good.")

# Similarity of two documents
print(doc1, "<->", doc2, doc1.similarity(doc2))
# Similarity of tokens and spans
french_fries = doc1[2:4]  # despite the name, this span is "salty fries"
burgers = doc1[5]  # single token "hamburgers"
print(french_fries, "<->", burgers, french_fries.similarity(burgers))

I like salty fries and hamburgers. <-> Fast food tastes very good. 0.691649353055761
salty fries <-> hamburgers 0.6938489675521851


# 20.Morphology

In [18]:
# Show the pipeline components, then inspect the morphological features of the first token.
print("Pipeline:", nlp.pipe_names)
doc = nlp("I was reading the paper.")
token = doc[0]  # 'I'
print(token.morph)  # 'Case=Nom|Number=Sing|Person=1|PronType=Prs'
# get() returns the values of a single feature as a list.
print(token.morph.get("PronType"))  # ['Prs']

Pipeline: ['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']
Case=Nom|Number=Sing|Person=1|PronType=Prs
['Prs']


In [19]:
# token.morph is a MorphAnalysis object, not a plain string.
print(type(token.morph))


<class 'spacy.tokens.morphanalysis.MorphAnalysis'>


In [20]:
# List the attribute and method names available on the MorphAnalysis object.
token.morph.__dir__()

['__new__',
 '__repr__',
 '__hash__',
 '__str__',
 '__lt__',
 '__le__',
 '__eq__',
 '__ne__',
 '__gt__',
 '__ge__',
 '__iter__',
 '__init__',
 '__len__',
 '__contains__',
 'from_id',
 'get',
 'to_json',
 'to_dict',
 'vocab',
 'key',
 '__doc__',
 '__reduce__',
 '__setstate__',
 '__getattribute__',
 '__setattr__',
 '__delattr__',
 '__reduce_ex__',
 '__subclasshook__',
 '__init_subclass__',
 '__format__',
 '__sizeof__',
 '__dir__',
 '__class__']

In [21]:
# Morphological features as a feature-name -> value dictionary.
token.morph.to_dict()

{'Case': 'Nom', 'Number': 'Sing', 'Person': '1', 'PronType': 'Prs'}

In [22]:
# Despite the name, to_json() yields the features as one "Feature=Value|..." string.
token.morph.to_json()

'Case=Nom|Number=Sing|Person=1|PronType=Prs'

In [23]:
# Inspect the Vocab object attached to the morphological analysis.
print(token.morph.vocab)
print(type(token.morph.vocab))
# NOTE: __repr__ is referenced here without being called, so this prints the bound
# method-wrapper itself; use repr(token.morph.vocab) to get the string instead.
print(token.morph.vocab.__repr__)

<spacy.vocab.Vocab object at 0x7fa477aee560>
<class 'spacy.vocab.Vocab'>
<method-wrapper '__repr__' of spacy.vocab.Vocab object at 0x7fa477aee560>
