In [50]:
import spacy
import benepar
from spacy.language import Language
from spacy_langdetect import LanguageDetector
from spacy import displacy

In [51]:
nlp = spacy.load("en_core_web_trf")

In [52]:
# Add LanguageDetector and assign it a string name
@Language.factory("language_detector")
def create_language_detector(nlp, name):
    return LanguageDetector(language_detection_function=None)

In [53]:
nlp.add_pipe("language_detector")

<spacy_langdetect.spacy_langdetect.LanguageDetector at 0x1c98d165330>

In [54]:
benepar.download('benepar_en3_large')

[nltk_data] Downloading package benepar_en3_large to
[nltk_data]     C:\Users\gs199\AppData\Roaming\nltk_data...
[nltk_data]   Package benepar_en3_large is already up-to-date!


True

In [55]:
nlp.add_pipe("benepar", config={"model": "benepar_en3"})

<benepar.integrations.spacy_plugin.BeneparComponent at 0x1c98d1668c0>

In [56]:
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")



In [57]:
print([(w.text, w.pos_) for w in doc])

[('Apple', 'PROPN'), ('is', 'AUX'), ('looking', 'VERB'), ('at', 'ADP'), ('buying', 'VERB'), ('U.K.', 'PROPN'), ('startup', 'NOUN'), ('for', 'ADP'), ('$', 'SYM'), ('1', 'NUM'), ('billion', 'NUM')]


In [58]:
for token in doc:
    print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
            token.shape_, token.is_alpha, token.is_stop, token.ent_type)

Apple Apple PROPN NNP nsubj Xxxxx True False 383
is be AUX VBZ aux xx True True 0
looking look VERB VBG ROOT xxxx True False 0
at at ADP IN prep xx True True 0
buying buy VERB VBG pcomp xxxx True False 0
U.K. U.K. PROPN NNP compound X.X. False False 384
startup startup NOUN NN dobj xxxx True False 0
for for ADP IN prep xxx True True 0
$ $ SYM $ quantmod $ False False 394
1 1 NUM CD compound d False False 394
billion billion NUM CD pobj xxxx True False 394


In [59]:
for ent in doc.ents:
    print(ent, ent.lemma_, ent.label_)

Apple Apple ORG
U.K. U.K. GPE
$1 billion $1 billion MONEY


In [60]:
for token in doc:
    print(token.text, token.morph.to_dict())

Apple {'Number': 'Sing'}
is {'Mood': 'Ind', 'Number': 'Sing', 'Person': '3', 'Tense': 'Pres', 'VerbForm': 'Fin'}
looking {'Aspect': 'Prog', 'Tense': 'Pres', 'VerbForm': 'Part'}
at {}
buying {'Aspect': 'Prog', 'Tense': 'Pres', 'VerbForm': 'Part'}
U.K. {'Number': 'Sing'}
startup {'Number': 'Sing'}
for {}
$ {}
1 {'NumType': 'Card'}
billion {'NumType': 'Card'}


In [61]:
# document level language detection. Think of it like average language of the document!
print(doc._.language)
# sentence level language detection
for sent in doc.sents:
   print(sent, sent._.language)

{'language': 'en', 'score': 0.9999962737761534}
Apple is looking at buying U.K. startup for $1 billion {'language': 'en', 'score': 0.9999954897589787}


In [62]:
for token in doc:
    print(token.text, token.dep_, token.head.text, token.morph.to_dict())

Apple nsubj looking {'Number': 'Sing'}
is aux looking {'Mood': 'Ind', 'Number': 'Sing', 'Person': '3', 'Tense': 'Pres', 'VerbForm': 'Fin'}
looking ROOT looking {'Aspect': 'Prog', 'Tense': 'Pres', 'VerbForm': 'Part'}
at prep looking {}
buying pcomp at {'Aspect': 'Prog', 'Tense': 'Pres', 'VerbForm': 'Part'}
U.K. compound startup {'Number': 'Sing'}
startup dobj buying {'Number': 'Sing'}
for prep buying {}
$ quantmod billion {}
1 compound billion {'NumType': 'Card'}
billion pobj for {'NumType': 'Card'}


In [63]:
sent = list(doc.sents)[0]
print(sent._.parse_string)
sent._.labels
list(sent._.children)[0]

(S (NP (NNP Apple)) (VP (VBZ is) (VP (VBG looking) (PP (IN at) (S (VP (VBG buying) (NP (NNP U.K.) (NN startup)) (PP (IN for) (NP (QP ($ $) (CD 1) (CD billion))))))))))


Apple

In [64]:
nlp_ru = spacy.load("ru_core_news_lg")

In [65]:
nlp_ru.add_pipe("language_detector")

<spacy_langdetect.spacy_langdetect.LanguageDetector at 0x1c8c45df550>

In [66]:
doc_ru = nlp_ru("Покажи все организации из города Минска и Пинска.")

In [67]:
print([(w.text, w.pos_) for w in doc_ru])

[('Покажи', 'VERB'), ('все', 'DET'), ('организации', 'NOUN'), ('из', 'ADP'), ('города', 'NOUN'), ('Минска', 'PROPN'), ('и', 'CCONJ'), ('Пинска', 'PROPN'), ('.', 'PUNCT')]


In [68]:
sent_n = 1
for sent in doc_ru.sents:
  print('sent #{}'.format(sent_n))
  sent_n += 1        
  for token in sent:
      print('############################################################')
      print(token.text, token.i)
      print(token.lemma_, token.pos_, token.tag_, token.shape_, token.is_alpha, token.is_stop)
      print(token.dep_, token.head.i, token.head.text, token.head.cluster)
      print([an.i for an in token.head.ancestors], [child for child in list(token.head.children)], list(token.head.conjuncts))
      print(token.morph.to_dict())        

sent #1
############################################################
Покажи 0
показать VERB VERB Xxxxx True False
ROOT 0 Покажи 0
[] [организации, .] []
{'Aspect': 'Perf', 'Mood': 'Imp', 'Number': 'Sing', 'Person': 'Second', 'VerbForm': 'Fin', 'Voice': 'Act'}
############################################################
все 1
весь DET DET xxx True True
det 2 организации 0
[0] [все, города] []
{'Animacy': 'Inan', 'Case': 'Acc', 'Number': 'Plur'}
############################################################
организации 2
организация NOUN NOUN xxxx True False
obj 0 Покажи 0
[] [организации, .] []
{'Animacy': 'Inan', 'Case': 'Acc', 'Gender': 'Fem', 'Number': 'Plur'}
############################################################
из 3
из ADP ADP xx True True
case 4 города 0
[2, 0] [из, Минска] []
{}
############################################################
города 4
город NOUN NOUN xxxx True False
nmod 2 организации 0
[0] [все, города] []
{'Animacy': 'Inan', 'Case': 'Gen', 'Gender': 'Masc', 'N

In [69]:
for token in doc_ru:
    print(token.text, token.dep_, token.head.text, token.morph.to_dict())

Покажи ROOT Покажи {'Aspect': 'Perf', 'Mood': 'Imp', 'Number': 'Sing', 'Person': 'Second', 'VerbForm': 'Fin', 'Voice': 'Act'}
все det организации {'Animacy': 'Inan', 'Case': 'Acc', 'Number': 'Plur'}
организации obj Покажи {'Animacy': 'Inan', 'Case': 'Acc', 'Gender': 'Fem', 'Number': 'Plur'}
из case города {}
города nmod организации {'Animacy': 'Inan', 'Case': 'Gen', 'Gender': 'Masc', 'Number': 'Sing'}
Минска appos города {'Animacy': 'Inan', 'Case': 'Gen', 'Gender': 'Masc', 'Number': 'Sing'}
и cc Пинска {}
Пинска conj Минска {'Animacy': 'Inan', 'Case': 'Gen', 'Gender': 'Masc', 'Number': 'Sing'}
. punct Покажи {}


In [70]:
# document level language detection. Think of it like average language of the document!
print(doc_ru._.language)
# sentence level language detection
for sent in doc_ru.sents:
   print(sent, sent._.language)

{'language': 'ru', 'score': 0.6673584059268076}
Покажи все организации из города Минска и Пинска. {'language': 'mk', 'score': 0.5383230320561317}


In [71]:
for ent in doc_ru.ents:
    print(ent, ent.lemma_, ent.label_)

Минска минск LOC
Пинска пинск LOC


In [106]:
doc_ru = nlp_ru("Василий Петрович Кукушкин купил машину жигули в городе Минске")

In [107]:
for ent in doc_ru.ents:
    print(ent, ent.ent_id, ent.lemma_, ent.label_, ent.start_char, ent.end_char)

Василий Петрович Кукушкин 0 василий петрович кукушкин PER 0 25
Минске 0 минск LOC 55 61


In [108]:
for token in doc_ru:
    print(token.text, token.dep_, token.head.text, token.morph.to_dict(), token._._start, token.ent_type_)

Василий nsubj купил {'Animacy': 'Anim', 'Case': 'Nom', 'Gender': 'Masc', 'Number': 'Sing'} 0 PER
Петрович appos Василий {'Animacy': 'Anim', 'Case': 'Nom', 'Gender': 'Masc', 'Number': 'Sing'} 8 PER
Кукушкин flat:name Петрович {'Animacy': 'Anim', 'Case': 'Nom', 'Gender': 'Masc', 'Number': 'Sing'} 17 PER
купил ROOT купил {'Aspect': 'Perf', 'Gender': 'Masc', 'Mood': 'Ind', 'Number': 'Sing', 'Tense': 'Past', 'VerbForm': 'Fin', 'Voice': 'Act'} 26 
машину obj купил {'Animacy': 'Inan', 'Case': 'Acc', 'Gender': 'Fem', 'Number': 'Sing'} 32 
жигули obj купил {'Animacy': 'Inan', 'Case': 'Acc', 'Gender': 'Masc', 'Number': 'Plur'} 39 
в case городе {} 46 
городе nmod жигули {'Animacy': 'Inan', 'Case': 'Loc', 'Gender': 'Masc', 'Number': 'Sing'} 48 
Минске appos городе {'Animacy': 'Inan', 'Case': 'Loc', 'Gender': 'Masc', 'Number': 'Sing'} 55 LOC


In [72]:
doc2 = nlp("Apple is looking at buying U.K. startup for $1 billion. Это предложение не на английском. Третье предложение.")

In [73]:
# document level language detection. Think of it like average language of the document!
print(doc2._.language)
# sentence level language detection
for sent in doc2.sents:
   print(sent, sent._.language)

{'language': 'bg', 'score': 0.8571383772323066}
Apple is looking at buying U.K. startup for $1 billion. Это предложение не на английском. Третье предложение. {'language': 'bg', 'score': 0.8571395960675282}


In [74]:
doc = nlp("show me all companies from minsk and pinsk")
#displacy.serve(doc, style="dep")