In [23]:
import spacy
import benepar
from spacy.language import Language
from spacy_langdetect import LanguageDetector

In [24]:
# Load the English transformer pipeline; `nlp` is the pipeline object that the
# English-language cells below extend and run.
nlp = spacy.load("en_core_web_trf")

In [25]:
# Register LanguageDetector with spaCy under the string name "language_detector"
# so it can later be attached with nlp.add_pipe("language_detector").
@Language.factory("language_detector")
def create_language_detector(nlp, name):
    """Build the language-detection pipeline component.

    The ``nlp`` and ``name`` parameters are part of the factory signature but
    are not used to construct the detector.
    """
    detector = LanguageDetector(language_detection_function=None)
    return detector

In [26]:
# Attach the detector only when it is missing, so re-running this cell does not
# fail with spaCy's "already exists in pipeline" error.
if not nlp.has_pipe("language_detector"):
    nlp.add_pipe("language_detector")
# Last expression: echo the component that is now part of the pipeline.
nlp.get_pipe("language_detector")

<spacy_langdetect.spacy_langdetect.LanguageDetector at 0x2590c878d60>

In [27]:
# Make sure the benepar English parsing model is available locally
# (the recorded output shows it was already up to date).
benepar.download('benepar_en3_large')

[nltk_data] Downloading package benepar_en3_large to
[nltk_data]     C:\Users\gs199\AppData\Roaming\nltk_data...
[nltk_data]   Package benepar_en3_large is already up-to-date!


True

In [28]:
# Use the same model name that benepar.download() fetched ("benepar_en3_large");
# asking for a different model only works if that one was installed separately.
# The has_pipe guard keeps the cell re-runnable: adding a component whose name
# is already in the pipeline raises an error.
if not nlp.has_pipe("benepar"):
    nlp.add_pipe("benepar", config={"model": "benepar_en3_large"})
# Last expression: echo the parser component that is now part of the pipeline.
nlp.get_pipe("benepar")

<benepar.integrations.spacy_plugin.BeneparComponent at 0x2590c879ea0>

In [29]:
# Run the English pipeline on a sample sentence; `doc` is inspected by the cells below.
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

In [30]:
# Pair every token's text with its coarse-grained part-of-speech tag.
pos_pairs = [(token.text, token.pos_) for token in doc]
print(pos_pairs)

[('Apple', 'PROPN'), ('is', 'AUX'), ('looking', 'VERB'), ('at', 'ADP'), ('buying', 'VERB'), ('U.K.', 'PROPN'), ('startup', 'NOUN'), ('for', 'ADP'), ('$', 'SYM'), ('1', 'NUM'), ('billion', 'NUM')]


In [31]:
# One line per token: text, lemma, coarse POS, fine-grained tag, dependency
# label, word shape, and the is_alpha / is_stop flags.
for token in doc:
    fields = (
        token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
        token.shape_, token.is_alpha, token.is_stop,
    )
    print(*fields)

Apple Apple PROPN NNP nsubj Xxxxx True False
is be AUX VBZ aux xx True True
looking look VERB VBG ROOT xxxx True False
at at ADP IN prep xx True True
buying buy VERB VBG pcomp xxxx True False
U.K. U.K. PROPN NNP compound X.X. False False
startup startup NOUN NN dobj xxxx True False
for for ADP IN prep xxx True True
$ $ SYM $ quantmod $ False False
1 1 NUM CD compound d False False
billion billion NUM CD pobj xxxx True False


In [32]:
# Morphological features of each token, converted to a plain dict for printing.
for token in doc:
    morph_features = token.morph.to_dict()
    print(token.text, morph_features)

Apple {'Number': 'Sing'}
is {'Mood': 'Ind', 'Number': 'Sing', 'Person': '3', 'Tense': 'Pres', 'VerbForm': 'Fin'}
looking {'Aspect': 'Prog', 'Tense': 'Pres', 'VerbForm': 'Part'}
at {}
buying {'Aspect': 'Prog', 'Tense': 'Pres', 'VerbForm': 'Part'}
U.K. {'Number': 'Sing'}
startup {'Number': 'Sing'}
for {}
$ {}
1 {'NumType': 'Card'}
billion {'NumType': 'Card'}


In [33]:
# Document-level detection: one language guess for the text as a whole.
print(doc._.language)
# Sentence-level detection: one language guess per sentence.
for sent in doc.sents:
    sentence_language = sent._.language
    print(sent, sentence_language)

{'language': 'en', 'score': 0.9999958307002831}
Apple is looking at buying U.K. startup for $1 billion {'language': 'en', 'score': 0.9999945819208242}


In [34]:
# Dependency edge (label and head word) together with the token's morphology.
for token in doc:
    head_text = token.head.text
    print(token.text, token.dep_, head_text, token.morph.to_dict())

Apple nsubj looking {'Number': 'Sing'}
is aux looking {'Mood': 'Ind', 'Number': 'Sing', 'Person': '3', 'Tense': 'Pres', 'VerbForm': 'Fin'}
looking ROOT looking {'Aspect': 'Prog', 'Tense': 'Pres', 'VerbForm': 'Part'}
at prep looking {}
buying pcomp at {'Aspect': 'Prog', 'Tense': 'Pres', 'VerbForm': 'Part'}
U.K. compound startup {'Number': 'Sing'}
startup dobj buying {'Number': 'Sing'}
for prep buying {}
$ quantmod billion {}
1 compound billion {'NumType': 'Card'}
billion pobj for {'NumType': 'Card'}


In [35]:
# Inspect the constituency parse of the first sentence of the English doc
# (the `_.parse_string`, `_.labels` and `_.children` extensions are presumably
# supplied by the benepar component added earlier).
sent = list(doc.sents)[0]
# Bracketed parse tree of the sentence, printed as a string.
print(sent._.parse_string)
# NOTE(review): this bare expression is not the cell's last line, so its value
# is never displayed; wrap it in print()/display() if the labels should be shown.
sent._.labels
# Last expression, so the notebook echoes it: the first child constituent of
# the sentence (the recorded output shows "Apple").
list(sent._.children)[0]

(S (NP (NNP Apple)) (VP (VBZ is) (VP (VBG looking) (PP (IN at) (S (VP (VBG buying) (NP (NNP U.K.) (NN startup)) (PP (IN for) (NP (QP ($ $) (CD 1) (CD billion))))))))))


Apple

In [36]:
# Load a separate Russian pipeline; `nlp_ru` is used by the Russian-language cells below.
nlp_ru = spacy.load("ru_core_news_lg")

In [37]:
# Attach the detector only when it is missing, so re-running this cell does not
# fail with spaCy's "already exists in pipeline" error.
if not nlp_ru.has_pipe("language_detector"):
    nlp_ru.add_pipe("language_detector")
# Last expression: echo the component that is now part of the Russian pipeline.
nlp_ru.get_pipe("language_detector")

<spacy_langdetect.spacy_langdetect.LanguageDetector at 0x25a13167130>

In [38]:
# Run the Russian pipeline on a short two-sentence sample; `doc_ru` is inspected below.
doc_ru = nlp_ru("Юрий хороший и быстрый программист из города Минска. Сегодня идет дождь.")

In [39]:
# Pair every Russian token's text with its coarse-grained part-of-speech tag.
pos_pairs_ru = [(token.text, token.pos_) for token in doc_ru]
print(pos_pairs_ru)

[('Юрий', 'PROPN'), ('хороший', 'ADJ'), ('и', 'CCONJ'), ('быстрый', 'ADJ'), ('программист', 'NOUN'), ('из', 'ADP'), ('города', 'NOUN'), ('Минска', 'PROPN'), ('.', 'PUNCT'), ('Сегодня', 'ADV'), ('идет', 'VERB'), ('дождь', 'NOUN'), ('.', 'PUNCT')]


In [45]:
# Detailed per-token dump of the Russian doc, grouped by sentence.
# Sentences are numbered from 1 via enumerate instead of a hand-maintained counter.
separator = '############################################################'
for sent_n, sent in enumerate(doc_ru.sents, start=1):
    print('sent #{}'.format(sent_n))
    for token in sent:
        head = token.head
        print(separator)
        # Token text and its index within the document.
        print(token.text, token.i)
        # Lexical attributes of the token itself.
        print(token.lemma_, token.pos_, token.tag_, token.shape_, token.is_alpha, token.is_stop)
        # Dependency label, followed by attributes of the token's syntactic head.
        print(token.dep_, head.i, head.text, head.cluster)
        # NOTE: ancestors, children and conjuncts are those of the HEAD, not of
        # the token being printed.
        print([ancestor.i for ancestor in head.ancestors], list(head.children), list(head.conjuncts))
        # Morphological features of the token as a plain dict.
        print(token.morph.to_dict())

sent #1
############################################################
Юрий 0
юрий PROPN PROPN Xxxx True False
nsubj 4 программист 0
[] [Юрий, хороший, города, .] []
{'Animacy': 'Anim', 'Case': 'Nom', 'Gender': 'Masc', 'Number': 'Sing'}
############################################################
хороший 1
хороший ADJ ADJ xxxx True False
amod 4 программист 0
[] [Юрий, хороший, города, .] []
{'Case': 'Nom', 'Degree': 'Pos', 'Gender': 'Masc', 'Number': 'Sing'}
############################################################
и 2
и CCONJ CCONJ x True True
cc 3 быстрый 0
[1, 4] [и] [хороший]
{}
############################################################
быстрый 3
быстрый ADJ ADJ xxxx True False
conj 1 хороший 0
[4] [быстрый] [быстрый]
{'Case': 'Nom', 'Degree': 'Pos', 'Gender': 'Masc', 'Number': 'Sing'}
############################################################
программист 4
программист NOUN NOUN xxxx True False
ROOT 4 программист 0
[] [Юрий, хороший, города, .] []
{'Animacy': 'Anim', 'Case': '

In [41]:
# Dependency edge (label and head word) together with morphology, for the Russian doc.
for token in doc_ru:
    head_text = token.head.text
    print(token.text, token.dep_, head_text, token.morph.to_dict())

Юрий nsubj программист {'Animacy': 'Anim', 'Case': 'Nom', 'Gender': 'Masc', 'Number': 'Sing'}
хороший amod программист {'Case': 'Nom', 'Degree': 'Pos', 'Gender': 'Masc', 'Number': 'Sing'}
и cc быстрый {}
быстрый conj хороший {'Case': 'Nom', 'Degree': 'Pos', 'Gender': 'Masc', 'Number': 'Sing'}
программист ROOT программист {'Animacy': 'Anim', 'Case': 'Nom', 'Gender': 'Masc', 'Number': 'Sing'}
из case города {}
города nmod программист {'Animacy': 'Inan', 'Case': 'Gen', 'Gender': 'Masc', 'Number': 'Sing'}
Минска appos города {'Animacy': 'Inan', 'Case': 'Gen', 'Gender': 'Masc', 'Number': 'Sing'}
. punct программист {}
Сегодня advmod идет {'Degree': 'Pos'}
идет ROOT идет {'Aspect': 'Imp', 'Mood': 'Ind', 'Number': 'Sing', 'Person': 'Third', 'Tense': 'Pres', 'VerbForm': 'Fin', 'Voice': 'Act'}
дождь nsubj идет {'Animacy': 'Inan', 'Case': 'Nom', 'Gender': 'Masc', 'Number': 'Sing'}
. punct идет {}


In [42]:
# Document-level detection: one language guess for the Russian text as a whole.
print(doc_ru._.language)
# Sentence-level detection: one language guess per sentence.
for sent in doc_ru.sents:
    sentence_language = sent._.language
    print(sent, sentence_language)

{'language': 'ru', 'score': 0.9999974726520381}
Юрий хороший и быстрый программист из города Минска. {'language': 'ru', 'score': 0.9999977042411683}
Сегодня идет дождь. {'language': 'ru', 'score': 0.9999969421773983}


In [43]:
# Feed a mixed English/Russian text through the English pipeline to see how
# language detection behaves on multilingual input.
doc2 = nlp("Apple is looking at buying U.K. startup for $1 billion. Это предложение не на английском. Третье предложение.")

In [44]:
# Document-level detection on the mixed-language text.
print(doc2._.language)
# Sentence-level detection on the mixed-language text.
# NOTE: in the recorded output the whole text comes back as a single sentence
# and is labelled 'bg', so the per-sentence result here is only as good as the
# sentence segmentation of the pipeline that produced `doc2`.
for sent in doc2.sents:
    sentence_language = sent._.language
    print(sent, sentence_language)

{'language': 'bg', 'score': 0.8571378300637023}
Apple is looking at buying U.K. startup for $1 billion. Это предложение не на английском. Третье предложение. {'language': 'bg', 'score': 0.9999952002347805}
