In [1]:
import spacy
import benepar
from spacy.language import Language
from spacy_langdetect import LanguageDetector
from spacy import displacy

In [2]:
nlp = spacy.load("en_core_web_trf")

In [3]:
# Add LanguageDetector and assign it a string name
@Language.factory("language_detector")
def create_language_detector(nlp, name):
    return LanguageDetector(language_detection_function=None)

In [4]:
nlp.add_pipe("language_detector")

<spacy_langdetect.spacy_langdetect.LanguageDetector at 0x1fe2decf670>

In [5]:
benepar.download('benepar_en3_large')

[nltk_data] Downloading package benepar_en3_large to
[nltk_data]     C:\Users\gs199\AppData\Roaming\nltk_data...
[nltk_data]   Package benepar_en3_large is already up-to-date!


True

In [6]:
nlp.add_pipe("benepar", config={"model": "benepar_en3"})

<benepar.integrations.spacy_plugin.BeneparComponent at 0x1fe2dece980>

In [7]:
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")



In [8]:
print([(w.text, w.pos_) for w in doc])

[('Apple', 'PROPN'), ('is', 'AUX'), ('looking', 'VERB'), ('at', 'ADP'), ('buying', 'VERB'), ('U.K.', 'PROPN'), ('startup', 'NOUN'), ('for', 'ADP'), ('$', 'SYM'), ('1', 'NUM'), ('billion', 'NUM')]


In [9]:
for token in doc:
    print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
            token.shape_, token.is_alpha, token.is_stop)

Apple Apple PROPN NNP nsubj Xxxxx True False
is be AUX VBZ aux xx True True
looking look VERB VBG ROOT xxxx True False
at at ADP IN prep xx True True
buying buy VERB VBG pcomp xxxx True False
U.K. U.K. PROPN NNP compound X.X. False False
startup startup NOUN NN dobj xxxx True False
for for ADP IN prep xxx True True
$ $ SYM $ quantmod $ False False
1 1 NUM CD compound d False False
billion billion NUM CD pobj xxxx True False


In [10]:
for token in doc:
    print(token.text, token.morph.to_dict())

Apple {'Number': 'Sing'}
is {'Mood': 'Ind', 'Number': 'Sing', 'Person': '3', 'Tense': 'Pres', 'VerbForm': 'Fin'}
looking {'Aspect': 'Prog', 'Tense': 'Pres', 'VerbForm': 'Part'}
at {}
buying {'Aspect': 'Prog', 'Tense': 'Pres', 'VerbForm': 'Part'}
U.K. {'Number': 'Sing'}
startup {'Number': 'Sing'}
for {}
$ {}
1 {'NumType': 'Card'}
billion {'NumType': 'Card'}


In [11]:
# document level language detection. Think of it like average language of the document!
print(doc._.language)
# sentence level language detection
for sent in doc.sents:
   print(sent, sent._.language)

{'language': 'en', 'score': 0.9999933217650165}
Apple is looking at buying U.K. startup for $1 billion {'language': 'en', 'score': 0.9999947322839472}


In [12]:
for token in doc:
    print(token.text, token.dep_, token.head.text, token.morph.to_dict())

Apple nsubj looking {'Number': 'Sing'}
is aux looking {'Mood': 'Ind', 'Number': 'Sing', 'Person': '3', 'Tense': 'Pres', 'VerbForm': 'Fin'}
looking ROOT looking {'Aspect': 'Prog', 'Tense': 'Pres', 'VerbForm': 'Part'}
at prep looking {}
buying pcomp at {'Aspect': 'Prog', 'Tense': 'Pres', 'VerbForm': 'Part'}
U.K. compound startup {'Number': 'Sing'}
startup dobj buying {'Number': 'Sing'}
for prep buying {}
$ quantmod billion {}
1 compound billion {'NumType': 'Card'}
billion pobj for {'NumType': 'Card'}


In [13]:
sent = list(doc.sents)[0]
print(sent._.parse_string)
sent._.labels
list(sent._.children)[0]

(S (NP (NNP Apple)) (VP (VBZ is) (VP (VBG looking) (PP (IN at) (S (VP (VBG buying) (NP (NNP U.K.) (NN startup)) (PP (IN for) (NP (QP ($ $) (CD 1) (CD billion))))))))))


Apple

In [14]:
nlp_ru = spacy.load("ru_core_news_lg")

In [15]:
nlp_ru.add_pipe("language_detector")

<spacy_langdetect.spacy_langdetect.LanguageDetector at 0x1ff131e5c30>

In [16]:
doc_ru = nlp_ru("Покажи все организации из города Минска и Пинска.")

In [17]:
print([(w.text, w.pos_) for w in doc_ru])

[('Покажи', 'VERB'), ('все', 'DET'), ('организации', 'NOUN'), ('из', 'ADP'), ('города', 'NOUN'), ('Минска', 'PROPN'), ('и', 'CCONJ'), ('Пинска', 'PROPN'), ('.', 'PUNCT')]


In [18]:
sent_n = 1
for sent in doc_ru.sents:
  print('sent #{}'.format(sent_n))
  sent_n += 1        
  for token in sent:
      print('############################################################')
      print(token.text, token.i)
      print(token.lemma_, token.pos_, token.tag_, token.shape_, token.is_alpha, token.is_stop)
      print(token.dep_, token.head.i, token.head.text, token.head.cluster)
      print([an.i for an in token.head.ancestors], [child for child in list(token.head.children)], list(token.head.conjuncts))
      print(token.morph.to_dict())        

sent #1
############################################################
Покажи 0
показать VERB VERB Xxxxx True False
ROOT 0 Покажи 0
[] [организации, .] []
{'Aspect': 'Perf', 'Mood': 'Imp', 'Number': 'Sing', 'Person': 'Second', 'VerbForm': 'Fin', 'Voice': 'Act'}
############################################################
все 1
весь DET DET xxx True True
det 2 организации 0
[0] [все, города] []
{'Animacy': 'Inan', 'Case': 'Acc', 'Number': 'Plur'}
############################################################
организации 2
организация NOUN NOUN xxxx True False
obj 0 Покажи 0
[] [организации, .] []
{'Animacy': 'Inan', 'Case': 'Acc', 'Gender': 'Fem', 'Number': 'Plur'}
############################################################
из 3
из ADP ADP xx True True
case 4 города 0
[2, 0] [из, Минска] []
{}
############################################################
города 4
город NOUN NOUN xxxx True False
nmod 2 организации 0
[0] [все, города] []
{'Animacy': 'Inan', 'Case': 'Gen', 'Gender': 'Masc', 'N

In [19]:
for token in doc_ru:
    print(token.text, token.dep_, token.head.text, token.morph.to_dict())

Покажи ROOT Покажи {'Aspect': 'Perf', 'Mood': 'Imp', 'Number': 'Sing', 'Person': 'Second', 'VerbForm': 'Fin', 'Voice': 'Act'}
все det организации {'Animacy': 'Inan', 'Case': 'Acc', 'Number': 'Plur'}
организации obj Покажи {'Animacy': 'Inan', 'Case': 'Acc', 'Gender': 'Fem', 'Number': 'Plur'}
из case города {}
города nmod организации {'Animacy': 'Inan', 'Case': 'Gen', 'Gender': 'Masc', 'Number': 'Sing'}
Минска appos города {'Animacy': 'Inan', 'Case': 'Gen', 'Gender': 'Masc', 'Number': 'Sing'}
и cc Пинска {}
Пинска conj Минска {'Animacy': 'Inan', 'Case': 'Gen', 'Gender': 'Masc', 'Number': 'Sing'}
. punct Покажи {}


In [20]:
# document level language detection. Think of it like average language of the document!
print(doc_ru._.language)
# sentence level language detection
for sent in doc_ru.sents:
   print(sent, sent._.language)

{'language': 'ru', 'score': 0.7735570081723655}
Покажи все организации из города Минска и Пинска. {'language': 'mk', 'score': 0.5713641372207318}


In [21]:
doc2 = nlp("Apple is looking at buying U.K. startup for $1 billion. Это предложение не на английском. Третье предложение.")

In [22]:
# document level language detection. Think of it like average language of the document!
print(doc2._.language)
# sentence level language detection
for sent in doc2.sents:
   print(sent, sent._.language)

{'language': 'bg', 'score': 0.8571391522991124}
Apple is looking at buying U.K. startup for $1 billion. Это предложение не на английском. Третье предложение. {'language': 'bg', 'score': 0.8571390036162498}


In [23]:
doc = nlp("show me all companies from minsk and pinsk")
displacy.serve(doc, style="dep")




Using the 'dep' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.
