In [2]:
import spacy.cli
from pathlib import Path

spacy.cli.download("en_core_web_sm")
#spacy.cli.download("en_core_web_md")
#spacy.cli.download("en_core_web_lg")

#Carregando o modelo e o texto
nlp = spacy.load("en_core_web_sm")
texto = "There were 70 children there. Preliminary findings were reported in today's New England Journal of Medicine. Earnings growth took a back seat. A small building in the back. A clear majority of senators back the bill. Enable the country to buy back debt. I was twenty-one back then."
doc = nlp(texto)
print(doc.text)

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
There were 70 children there. Preliminary findings were reported in today's New England Journal of Medicine. Earnings growth took a back seat. A small building in the back. A clear majority of senators back the bill. Enable the country to buy back debt. I was twenty-one back then.


In [3]:
#Tokenização, Lematização e POS.
#https://spacy.io/api/token#attributes
print("\n(TOKEN, LEMA)")
for token in doc:
    print((token.text, token.lemma_))


(TOKEN, LEMA)
('There', 'there')
('were', 'be')
('70', '70')
('children', 'child')
('there', 'there')
('.', '.')
('Preliminary', 'preliminary')
('findings', 'finding')
('were', 'be')
('reported', 'report')
('in', 'in')
('today', 'today')
("'s", "'s")
('New', 'New')
('England', 'England')
('Journal', 'Journal')
('of', 'of')
('Medicine', 'Medicine')
('.', '.')
('Earnings', 'earning')
('growth', 'growth')
('took', 'take')
('a', 'a')
('back', 'back')
('seat', 'seat')
('.', '.')
('A', 'a')
('small', 'small')
('building', 'building')
('in', 'in')
('the', 'the')
('back', 'back')
('.', '.')
('A', 'a')
('clear', 'clear')
('majority', 'majority')
('of', 'of')
('senators', 'senator')
('back', 'back')
('the', 'the')
('bill', 'bill')
('.', '.')
('Enable', 'enable')
('the', 'the')
('country', 'country')
('to', 'to')
('buy', 'buy')
('back', 'back')
('debt', 'debt')
('.', '.')
('I', 'I')
('was', 'be')
('twenty', 'twenty')
('-', '-')
('one', 'one')
('back', 'back')
('then', 'then')
('.', '.')


In [4]:
print("\n(TOKEN, LEMA,POS)")
for token in doc:
    print((token.text, token.lemma_, token.pos_))


(TOKEN, LEMA,POS)
('There', 'there', 'PRON')
('were', 'be', 'VERB')
('70', '70', 'NUM')
('children', 'child', 'NOUN')
('there', 'there', 'ADV')
('.', '.', 'PUNCT')
('Preliminary', 'preliminary', 'ADJ')
('findings', 'finding', 'NOUN')
('were', 'be', 'AUX')
('reported', 'report', 'VERB')
('in', 'in', 'ADP')
('today', 'today', 'NOUN')
("'s", "'s", 'PART')
('New', 'New', 'PROPN')
('England', 'England', 'PROPN')
('Journal', 'Journal', 'PROPN')
('of', 'of', 'ADP')
('Medicine', 'Medicine', 'PROPN')
('.', '.', 'PUNCT')
('Earnings', 'earning', 'NOUN')
('growth', 'growth', 'NOUN')
('took', 'take', 'VERB')
('a', 'a', 'DET')
('back', 'back', 'ADJ')
('seat', 'seat', 'NOUN')
('.', '.', 'PUNCT')
('A', 'a', 'DET')
('small', 'small', 'ADJ')
('building', 'building', 'NOUN')
('in', 'in', 'ADP')
('the', 'the', 'DET')
('back', 'back', 'NOUN')
('.', '.', 'PUNCT')
('A', 'a', 'DET')
('clear', 'clear', 'ADJ')
('majority', 'majority', 'NOUN')
('of', 'of', 'ADP')
('senators', 'senator', 'NOUN')
('back', 'back',

In [5]:
print("\n(TOKEN, CARACTERISTICAS MORFOLOGICAS)")
for token in doc:
    print((token.text, token.morph))


(TOKEN, CARACTERISTICAS MORFOLOGICAS)
('There', )
('were', Mood=Ind|Tense=Past|VerbForm=Fin)
('70', NumType=Card)
('children', Number=Plur)
('there', PronType=Dem)
('.', PunctType=Peri)
('Preliminary', Degree=Pos)
('findings', Number=Plur)
('were', Mood=Ind|Tense=Past|VerbForm=Fin)
('reported', Aspect=Perf|Tense=Past|VerbForm=Part)
('in', )
('today', Number=Sing)
("'s", )
('New', Number=Sing)
('England', Number=Sing)
('Journal', Number=Sing)
('of', )
('Medicine', Number=Sing)
('.', PunctType=Peri)
('Earnings', Number=Plur)
('growth', Number=Sing)
('took', Tense=Past|VerbForm=Fin)
('a', Definite=Ind|PronType=Art)
('back', Degree=Pos)
('seat', Number=Sing)
('.', PunctType=Peri)
('A', Definite=Ind|PronType=Art)
('small', Degree=Pos)
('building', Number=Sing)
('in', )
('the', Definite=Def|PronType=Art)
('back', Number=Sing)
('.', PunctType=Peri)
('A', Definite=Ind|PronType=Art)
('clear', Degree=Pos)
('majority', Number=Sing)
('of', )
('senators', Number=Plur)
('back', Tense=Pres|VerbForm=

In [6]:
#https://emorynlp.github.io/nlp4j/components/dependency-parsing.html
texto = "Citing high fuel prices, United Airlines said Friday it has increased fares by $6 per round trip on flights to some cities also served by lower-cost carriers. American Airlines, a unit of AMR Corp., immediately matched the move, spokesman Tim Wagner said. United, a unit UAK Corp. said the increase took effect Thursday and applies to most routes where it competes against discount carriers, such as Chicago to Dallas and Denver to San Francisco."
doc = nlp(texto)

print(doc)
print("\n(TOKEN, DEPENDENCIAS, HEAD)")
for token in doc:
    print((token.text, token.dep_, token.head ))

saida = spacy.displacy.render(doc,style="dep")
arquivo = Path("exemplo.svg")
arquivo.open("w", encoding="utf-8").write(saida)

Citing high fuel prices, United Airlines said Friday it has increased fares by $6 per round trip on flights to some cities also served by lower-cost carriers. American Airlines, a unit of AMR Corp., immediately matched the move, spokesman Tim Wagner said. United, a unit UAK Corp. said the increase took effect Thursday and applies to most routes where it competes against discount carriers, such as Chicago to Dallas and Denver to San Francisco.

(TOKEN, DEPENDENCIAS, HEAD)
('Citing', 'advcl', said)
('high', 'amod', prices)
('fuel', 'compound', prices)
('prices', 'dobj', Citing)
(',', 'punct', said)
('United', 'compound', Airlines)
('Airlines', 'nsubj', said)
('said', 'ROOT', said)
('Friday', 'npadvmod', said)
('it', 'nsubj', increased)
('has', 'aux', increased)
('increased', 'ccomp', said)
('fares', 'dobj', increased)
('by', 'prep', increased)
('$', 'nmod', 6)
('6', 'pobj', by)
('per', 'prep', 6)
('round', 'amod', trip)
('trip', 'pobj', per)
('on', 'prep', trip)
('flights', 'pobj', on)
(

62432

In [7]:
print("\n(TOKEN, NER)")
for token in doc:
    print((token.text,  token.ent_type_ ))


(TOKEN, NER)
('Citing', '')
('high', '')
('fuel', '')
('prices', '')
(',', '')
('United', 'ORG')
('Airlines', 'ORG')
('said', '')
('Friday', 'DATE')
('it', '')
('has', '')
('increased', '')
('fares', '')
('by', '')
('$', '')
('6', 'MONEY')
('per', '')
('round', '')
('trip', '')
('on', '')
('flights', '')
('to', '')
('some', '')
('cities', '')
('also', '')
('served', '')
('by', '')
('lower', '')
('-', '')
('cost', '')
('carriers', '')
('.', '')
('American', 'ORG')
('Airlines', 'ORG')
(',', '')
('a', '')
('unit', '')
('of', '')
('AMR', 'ORG')
('Corp.', 'ORG')
(',', '')
('immediately', '')
('matched', '')
('the', '')
('move', '')
(',', '')
('spokesman', '')
('Tim', 'PERSON')
('Wagner', 'PERSON')
('said', '')
('.', '')
('United', 'ORG')
(',', '')
('a', '')
('unit', '')
('UAK', 'ORG')
('Corp.', 'ORG')
('said', '')
('the', '')
('increase', '')
('took', '')
('effect', '')
('Thursday', 'DATE')
('and', '')
('applies', '')
('to', '')
('most', '')
('routes', '')
('where', '')
('it', '')
('compet

In [8]:
print("\n(TOKEN, PALAVRA, OOV, DIGITO)")
for token in doc:
    print((token.text, token.is_alpha, token.is_oov, token.is_digit))


(TOKEN, PALAVRA, OOV, DIGITO)
('Citing', True, True, False)
('high', True, True, False)
('fuel', True, True, False)
('prices', True, True, False)
(',', False, True, False)
('United', True, True, False)
('Airlines', True, True, False)
('said', True, True, False)
('Friday', True, True, False)
('it', True, True, False)
('has', True, True, False)
('increased', True, True, False)
('fares', True, True, False)
('by', True, True, False)
('$', False, True, False)
('6', False, True, True)
('per', True, True, False)
('round', True, True, False)
('trip', True, True, False)
('on', True, True, False)
('flights', True, True, False)
('to', True, True, False)
('some', True, True, False)
('cities', True, True, False)
('also', True, True, False)
('served', True, True, False)
('by', True, True, False)
('lower', True, True, False)
('-', False, True, False)
('cost', True, True, False)
('carriers', True, True, False)
('.', False, True, False)
('American', True, True, False)
('Airlines', True, True, False)
(

In [9]:
print("\n(TOKEN, TERMINACOES)")
for token in doc:
    print((token.text,token.suffix_))


(TOKEN, TERMINACOES)
('Citing', 'ing')
('high', 'igh')
('fuel', 'uel')
('prices', 'ces')
(',', ',')
('United', 'ted')
('Airlines', 'nes')
('said', 'aid')
('Friday', 'day')
('it', 'it')
('has', 'has')
('increased', 'sed')
('fares', 'res')
('by', 'by')
('$', '$')
('6', '6')
('per', 'per')
('round', 'und')
('trip', 'rip')
('on', 'on')
('flights', 'hts')
('to', 'to')
('some', 'ome')
('cities', 'ies')
('also', 'lso')
('served', 'ved')
('by', 'by')
('lower', 'wer')
('-', '-')
('cost', 'ost')
('carriers', 'ers')
('.', '.')
('American', 'can')
('Airlines', 'nes')
(',', ',')
('a', 'a')
('unit', 'nit')
('of', 'of')
('AMR', 'AMR')
('Corp.', 'rp.')
(',', ',')
('immediately', 'ely')
('matched', 'hed')
('the', 'the')
('move', 'ove')
(',', ',')
('spokesman', 'man')
('Tim', 'Tim')
('Wagner', 'ner')
('said', 'aid')
('.', '.')
('United', 'ted')
(',', ',')
('a', 'a')
('unit', 'nit')
('UAK', 'UAK')
('Corp.', 'rp.')
('said', 'aid')
('the', 'the')
('increase', 'ase')
('took', 'ook')
('effect', 'ect')
('Thu

In [10]:
#https://machinelearningknowledge.ai/tutorial-for-stopwords-in-spacy/
#Lista de Stopwords
stopwords = nlp.Defaults.stop_words
print(len(stopwords))
print(stopwords)
#for token in stopwords:
#  print(token)
print(doc)
print("\n(TOKEN, STOPWORD)")
for token in doc:
    print((token.text, token.is_stop ))

326
{'mine', 'our', 'upon', 'thru', 'above', 'now', 'their', 'otherwise', 'my', 'anywhere', 'whither', 'last', 'both', 'get', 'or', 'one', 'either', 'yet', '‘m', 'not', 'which', 'next', 'her', 'yourselves', 'nevertheless', 'each', 'regarding', 'no', 'further', 'of', "'ll", 'sometimes', 'except', 'keep', 'moreover', 'over', 'i', 'therein', 'whoever', 'others', 'ourselves', 'eleven', 'all', 'whole', '’d', 'say', 'must', 'hereupon', 'rather', 'so', 'using', 'thence', 'n‘t', 'there', 'would', 'for', 'less', 'someone', 'mostly', 'after', 'whence', 'can', 'into', 'only', 'often', 'why', 'just', 'nothing', 'yours', 'front', 'meanwhile', 'will', '’m', 'also', 'thus', 'whereas', 'been', 'themselves', 'whose', 'unless', 'could', 'other', 'towards', "'m", 'if', 'via', 'well', 'still', 'everyone', 'latter', 'call', '‘ve', 'whereby', 'that', 'again', 'always', "'re", 'whenever', 'an', 'please', 'therefore', 'fifty', 'around', 'nowhere', 'seems', 'the', 'whereupon', 'she', 'serious', 'those', 'very'

In [30]:
texto = "Washington was born into slavery on the farm of James Burroughs. Washington went up 2 games to 1 in the four-game series. Blair arrived in Washington for what may weçç be his last state visit. In June, Washington passed a primary seatbelt law."
doc = nlp(texto)
print(doc)

print("\nLista de labels do Spacy para entidades nomeadas:")
print(nlp.pipe_labels['ner'])
for label in nlp.pipe_labels['ner']:
  print(label, ' - ', spacy.explain(label))
print("\n(TOKEN, NER)")

for token in doc:
    print((token.text,  token.ent_type_ ))

Washington was born into slavery on the farm of James Burroughs. Washington went up 2 games to 1 in the four-game series. Blair arrived in Washington for what may weçç be his last state visit. In June, Washington passed a primary seatbelt law.

Lista de labels do Spacy para entidades nomeadas:
['CARDINAL', 'DATE', 'EVENT', 'FAC', 'GPE', 'LANGUAGE', 'LAW', 'LOC', 'MONEY', 'NORP', 'ORDINAL', 'ORG', 'PERCENT', 'PERSON', 'PRODUCT', 'QUANTITY', 'TIME', 'WORK_OF_ART']
CARDINAL  -  Numerals that do not fall under another type
DATE  -  Absolute or relative dates or periods
EVENT  -  Named hurricanes, battles, wars, sports events, etc.
FAC  -  Buildings, airports, highways, bridges, etc.
GPE  -  Countries, cities, states
LANGUAGE  -  Any named language
LAW  -  Named documents made into laws.
LOC  -  Non-GPE locations, mountain ranges, bodies of water
MONEY  -  Monetary values, including unit
NORP  -  Nationalities or religious or political groups
ORDINAL  -  "first", "second", etc.
ORG  -  Comp

In [12]:
from pathlib import Path
html = spacy.displacy.render(doc, style="ent")
output_path = Path("entidades.html")
output_path.open("w", encoding='utf-8').write(html)

3274

In [33]:
import spacy.cli
from pathlib import Path

spacy.cli.download("pt_core_news_sm")
nlp = spacy.load("pt_core_news_sm")
texto = "São Paulo foi um dos apóstolos de Cristo. Eu conheço São Paulo. Eu conheço a cidade de São Paulo. Eu já viajei para o estado de São Paulo. Eu queria estudar na USP. Ele fez CC na Universidade de São Paulo. São Paulo perdeu para o Internacional."
doc = nlp(texto)
print(doc.text)

for token in doc:
    print((token.text,  token.ent_type_ ))

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('pt_core_news_sm')
São Paulo foi um dos apóstolos de Cristo. Eu conheço São Paulo. Eu conheço a cidade de São Paulo. Eu já viajei para o estado de São Paulo. Eu queria estudar na USP. Ele fez CC na Universidade de São Paulo. São Paulo perdeu para o Internacional.
('São', 'LOC')
('Paulo', 'LOC')
('foi', '')
('um', '')
('dos', '')
('apóstolos', '')
('de', '')
('Cristo', 'PER')
('.', '')
('Eu', '')
('conheço', '')
('São', 'LOC')
('Paulo', 'LOC')
('.', '')
('Eu', '')
('conheço', '')
('a', '')
('cidade', '')
('de', '')
('São', 'LOC')
('Paulo', 'LOC')
('.', '')
('Eu', '')
('já', '')
('viajei', '')
('para', '')
('o', '')
('estado', 'LOC')
('de', 'LOC')
('São', 'LOC')
('Paulo', 'LOC')
('.', '')
('Eu', '')
('queria', '')
('estudar', '')
('na', '')
('USP', 'LOC')
('.', '')
('Ele', '')
('fez', '')
('CC', '')
('na', '')
('Universidade', 'LOC')
('de', 'LOC')
('São', 'LOC')
('Paulo', 'LOC')
('.', '')
('Sã