This notebook illustrates the `NLP()` pipeline on all available languages.

If dependency parse information is available for a language, the dependency tree of its first example sentence is printed, too.

In [1]:
from cltk import NLP
from cltk.dependency.tree import DependencyTree
from cltk.languages.example_texts import get_example_text
from cltk.languages.pipelines import *

In [2]:
# Language code -> CLTK pipeline class, one entry per language exercised by
# the demo loop. Keyword order below is the iteration order of the loop.
iso_to_pipeline = dict(
    akk=AkkadianPipeline,
    ang=OldEnglishPipeline,
    arb=ArabicPipeline,
    arc=AramaicPipeline,
    chu=OCSPipeline,
    cop=CopticPipeline,
    enm=MiddleEnglishPipeline,
    frm=MiddleFrenchPipeline,
    fro=OldFrenchPipeline,
    gmh=MiddleHighGermanPipeline,
    got=GothicPipeline,
    grc=GreekPipeline,
    hin=HindiPipeline,
    lat=LatinPipeline,
    lzh=ChinesePipeline,
    non=OldNorsePipeline,
    pan=PanjabiPipeline,
    pli=PaliPipeline,
    san=SanskritPipeline,
)

In [3]:
# For every language in `iso_to_pipeline`: run the default `NLP()` pipeline on
# that language's example text, show the first `Word` of the first sentence,
# and print the sentence's dependency tree when one can be built.
for lang, pipeline in iso_to_pipeline.items():
    print(f"{pipeline.language.name} ('{pipeline.language.iso_639_3_code}') ...")
    text = get_example_text(lang)
    cltk_nlp = NLP(language=lang)
    cltk_doc = cltk_nlp.analyze(text=text)

    # Look the first sentence up once and reuse it for every step below.
    first_sentence = cltk_doc.sentences[0]
    word = first_sentence[0]
    print("Example `Word`:", word)

    # Only attempt a tree when every word in the sentence carries features.
    if all(w.features for w in first_sentence):
        print("Printing dependency tree of first sentence ...")
        try:
            a_tree = DependencyTree.to_tree(first_sentence)
        except Exception:
            # Best effort: building a tree fails for some pipelines. Catch
            # `Exception` rather than using a bare `except:` so that
            # KeyboardInterrupt / SystemExit still stop the notebook.
            print("Dependency tree not available")
        else:
            a_tree.print_tree()
    print("")

Akkadian ('akk') ...
Example `Word`: Word(index_char_start=0, index_char_stop=2, index_token=0, index_sentence=None, string=('u2-wa-a-ru', 'akkadian'), pos=None, lemma=None, stem=None, scansion=None, xpos=None, upos=None, dependency_relation=None, governor=None, features={}, category={}, embedding=None, stop=False, named_entity=None, syllables=None, phonetic_transcription=None)
Printing dependency tree of first sentence ...
Dependency tree not available

Old English (ca. 450-1100) ('ang') ...
This part of the CLTK depends upon models from the CLTK project.
Do you want to download 'https://github.com/cltk/ang_models_cltk' to '~/cltk_data/ang'? [Y/n] 
Y
CLTK message: This part of the CLTK depends upon word embedding models from the Fasttext project.
Do you want to download file 'https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ang.vec' to '/Users/kylejohnson/cltk_data/ang/embeddings/fasttext/wiki.ang.vec'? [Y/n] 
Y


100%|██████████| 34.0M/34.0M [00:01<00:00, 28.3MiB/s]


Example `Word`: Word(index_char_start=0, index_char_stop=5, index_token=0, index_sentence=None, string='Hwæt.', pos=None, lemma='Hwæt.', stem=None, scansion=None, xpos=None, upos=None, dependency_relation=None, governor=None, features={}, category={}, embedding=array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 

100%|██████████| 1.61G/1.61G [00:50<00:00, 31.7MiB/s]


Example `Word`: Word(index_char_start=0, index_char_stop=5, index_token=0, index_sentence=None, string='كهيعص', pos=None, lemma=None, stem=None, scansion=None, xpos=None, upos=None, dependency_relation=None, governor=None, features={}, category={}, embedding=array([-0.15707  ,  0.21029  , -0.28787  , -0.24637  , -0.05615  ,
        0.034211 ,  0.11321  ,  0.3705   ,  0.30849  , -0.29225  ,
        0.080341 ,  0.014407 ,  0.28549  ,  0.15705  ,  0.2821   ,
        0.47509  , -0.24151  ,  0.29752  , -0.10217  , -0.11619  ,
       -0.089928 ,  0.34778  , -0.30897  , -0.28535  ,  0.12797  ,
        0.54138  , -0.44139  , -0.15473  , -0.082999 ,  0.13541  ,
       -0.19935  ,  0.019855 ,  0.32807  ,  0.22452  , -0.28052  ,
       -0.056193 , -0.03538  ,  0.11615  , -0.11289  ,  0.11154  ,
        0.18671  ,  0.034115 , -0.07035  , -0.21205  ,  0.18525  ,
       -0.47282  ,  0.46789  , -0.10681  , -0.27898  , -0.12462  ,
        0.03468  ,  0.13603  ,  0.034263 , -0.099267 , -0.19761  ,
    

100%|██████████| 8.66M/8.66M [00:00<00:00, 18.4MiB/s]


Example `Word`: Word(index_char_start=0, index_char_stop=1, index_token=0, index_sentence=None, string='ܒ', pos=None, lemma=None, stem=None, scansion=None, xpos=None, upos=None, dependency_relation=None, governor=None, features={}, category={}, embedding=array([ 7.1381e-02,  2.3568e-02, -1.0312e-01, -2.8307e-01, -1.8968e-01,
       -1.0439e-01, -1.0263e-01, -4.3027e-01, -7.8323e-02,  3.3742e-01,
        1.7024e-01, -5.4787e-02, -2.0458e-01, -1.9334e-01, -2.2998e-02,
        3.0028e-01,  2.2353e-01, -2.7873e-02,  2.3857e-01,  8.2796e-02,
        5.0257e-02, -1.0200e-01,  2.1636e-01,  6.7363e-03, -2.4750e-01,
        1.2792e-01, -1.0595e-01,  2.0944e-01, -2.5115e-01,  7.5266e-02,
       -7.3780e-02, -1.7672e-01,  8.0944e-02,  1.2199e-01,  1.1890e-01,
       -2.6457e-01, -1.6926e-01, -2.7594e-01, -4.2432e-02, -6.7452e-03,
       -2.5720e-01, -8.1060e-03, -3.3508e-01, -1.9403e-02,  5.7786e-02,
        2.1173e-01,  2.0974e-01, -7.5308e-02, -2.0661e-01,  3.4155e-01,
        3.4408e-01, -7.55

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/master/resources_1.1.0.json: 122kB [00:00, 3.62MB/s]                    
2021-02-14 09:54:23 INFO: Downloading these customized packages for language: cu (Old_Church_Slavonic)...
| Processor | Package |
-----------------------
| tokenize  | proiel  |
| pos       | proiel  |
| lemma     | proiel  |
| depparse  | proiel  |
| pretrain  | proiel  |

Downloading http://nlp.stanford.edu/software/stanza/1.1.0/cu/tokenize/proiel.pt: 100%|██████████| 629k/629k [00:03<00:00, 204kB/s]
Downloading http://nlp.stanford.edu/software/stanza/1.1.0/cu/pos/proiel.pt: 100%|██████████| 19.2M/19.2M [00:03<00:00, 5.61MB/s]
Downloading http://nlp.stanford.edu/software/stanza/1.1.0/cu/lemma/proiel.pt: 100%|██████████| 2.58M/2.58M [00:00<00:00, 9.03MB/s]
Downloading http://nlp.stanford.edu/software/stanza/1.1.0/cu/depparse/proiel.pt: 100%|██████████| 96.4M/96.4M [00:05<00:00, 16.2MB/s]
Downloading http://nlp.stanford.edu/software/stanza/

Example `Word`: Word(index_char_start=None, index_char_stop=None, index_token=0, index_sentence=0, string='отьчє', pos=noun, lemma='отьць', stem=None, scansion=None, xpos='Nb', upos='NOUN', dependency_relation='vocative', governor=7, features={Case: [vocative], Gender: [masculine], Number: [singular]}, category={F: [neg], N: [pos], V: [neg]}, embedding=None, stop=None, named_entity=None, syllables=None, phonetic_transcription=None)
Printing dependency tree of first sentence ...
root | свѧтитъ_7/verb
    └─ vocative | отьчє_0/noun
        └─ nmod | нашь·_1/adjective
    └─ obl | ѥси_3/noun
        └─ case | ижє_2/adposition
    └─ obl | нєбєсѣхъ:_5/noun
        └─ case | на_4/adposition
    └─ mark | да_6/subordinating_conjunction
    └─ expl | сѧ_8/pronoun
    └─ nsubj | имѧ_9/noun
        └─ nmod | твоѥ·_10/adjective
    └─ ccomp | придєтъ_12/verb
        └─ mark | да_11/subordinating_conjunction
        └─ nsubj | цѣсар҄ьствиѥ_13/noun
            └─ nmod | твоѥ·_14/adjective
        

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/master/resources_1.1.0.json: 122kB [00:00, 5.17MB/s]                    
2021-02-14 09:54:49 INFO: Downloading these customized packages for language: cop (Coptic)...
| Processor | Package     |
---------------------------
| tokenize  | scriptorium |
| mwt       | scriptorium |
| pos       | scriptorium |
| lemma     | scriptorium |
| depparse  | scriptorium |

Downloading http://nlp.stanford.edu/software/stanza/1.1.0/cop/tokenize/scriptorium.pt: 100%|██████████| 622k/622k [00:00<00:00, 4.50MB/s]
Downloading http://nlp.stanford.edu/software/stanza/1.1.0/cop/mwt/scriptorium.pt: 100%|██████████| 794k/794k [00:00<00:00, 5.48MB/s]
Downloading http://nlp.stanford.edu/software/stanza/1.1.0/cop/pos/scriptorium.pt: 100%|██████████| 20.0M/20.0M [00:01<00:00, 14.4MB/s]
Downloading http://nlp.stanford.edu/software/stanza/1.1.0/cop/lemma/scriptorium.pt: 100%|██████████| 2.06M/2.06M [00:00<00:00, 10.3MB/s]
Downloading http:/

Example `Word`: Word(index_char_start=None, index_char_stop=None, index_token=0, index_sentence=0, string='ⲧⲏⲛ', pos=verb, lemma='ⲧⲏⲛ', stem=None, scansion=None, xpos='VSTAT', upos='VERB', dependency_relation='root', governor=-1, features={VerbForm: [finite]}, category={F: [neg], N: [neg], V: [pos]}, embedding=None, stop=False, named_entity=None, syllables=None, phonetic_transcription=None)
Printing dependency tree of first sentence ...
Dependency tree not available

Middle English ('enm') ...
Example `Word`: Word(index_char_start=0, index_char_stop=6, index_token=0, index_sentence=None, string='Whilom', pos=None, lemma=None, stem=None, scansion=None, xpos=None, upos=None, dependency_relation=None, governor=None, features={}, category={}, embedding=None, stop=False, named_entity=None, syllables=None, phonetic_transcription=None)
Printing dependency tree of first sentence ...
Dependency tree not available

Middle French ('frm') ...
Example `Word`: Word(index_char_start=0, index_char_sto

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/master/resources_1.1.0.json: 122kB [00:00, 5.07MB/s]                    
2021-02-14 09:56:56 INFO: Downloading these customized packages for language: fro (Old_French)...
| Processor | Package |
-----------------------
| tokenize  | srcmf   |
| pos       | srcmf   |
| depparse  | srcmf   |
| pretrain  | srcmf   |

Downloading http://nlp.stanford.edu/software/stanza/1.1.0/fro/tokenize/srcmf.pt: 100%|██████████| 626k/626k [00:00<00:00, 4.52MB/s]
Downloading http://nlp.stanford.edu/software/stanza/1.1.0/fro/pos/srcmf.pt: 100%|██████████| 21.5M/21.5M [00:00<00:00, 26.4MB/s]
Downloading http://nlp.stanford.edu/software/stanza/1.1.0/fro/depparse/srcmf.pt: 100%|██████████| 102M/102M [00:03<00:00, 29.2MB/s] 
Downloading http://nlp.stanford.edu/software/stanza/1.1.0/fro/pretrain/srcmf.pt: 100%|██████████| 156M/156M [00:04<00:00, 32.7MB/s] 
2021-02-14 09:57:08 INFO: Finished downloading models and saved to /Users/kylejohn

This part of the CLTK depends upon models from the CLTK project.
Do you want to download 'https://github.com/cltk/fro_models_cltk' to '~/cltk_data/fro'? [Y/n] 
Y


INFO:CLTK:Cloning 'fro_models_cltk' from 'https://github.com/cltk/fro_models_cltk.git'


Example `Word`: Word(index_char_start=None, index_char_stop=None, index_token=0, index_sentence=0, string='Une', pos=determiner, lemma='Une', stem=None, scansion=None, xpos='DETndf', upos='DET', dependency_relation=None, governor=-1, features={Definiteness: [indefinite], PrononimalType: [article]}, category={F: [pos], N: [pos], V: [neg]}, embedding=None, stop=False, named_entity=False, syllables=None, phonetic_transcription=None)
Printing dependency tree of first sentence ...
Dependency tree not available

Middle High German ('gmh') ...
Example `Word`: Word(index_char_start=0, index_char_stop=3, index_token=0, index_sentence=None, string='Uns', pos=None, lemma=None, stem=None, scansion=None, xpos=None, upos=None, dependency_relation=None, governor=None, features={}, category={}, embedding=None, stop=False, named_entity=None, syllables=None, phonetic_transcription=None)
Printing dependency tree of first sentence ...
Dependency tree not available

Gothic ('got') ...
CLTK message: This pa

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/master/resources_1.1.0.json: 122kB [00:00, 5.26MB/s]                    
2021-02-14 09:59:29 INFO: Downloading these customized packages for language: got (Gothic)...
| Processor | Package |
-----------------------
| tokenize  | proiel  |
| pos       | proiel  |
| lemma     | proiel  |
| depparse  | proiel  |
| pretrain  | proiel  |

INFO:stanza:Downloading these customized packages for language: got (Gothic)...
| Processor | Package |
-----------------------
| tokenize  | proiel  |
| pos       | proiel  |
| lemma     | proiel  |
| depparse  | proiel  |
| pretrain  | proiel  |

Downloading http://nlp.stanford.edu/software/stanza/1.1.0/got/tokenize/proiel.pt: 100%|██████████| 623k/623k [00:00<00:00, 4.43MB/s]
Downloading http://nlp.stanford.edu/software/stanza/1.1.0/got/pos/proiel.pt: 100%|██████████| 19.1M/19.1M [00:00<00:00, 24.0MB/s]
Downloading http://nlp.stanford.edu/software/stanza/1.1.0/got/lemma/proiel.pt

CLTK message: This part of the CLTK depends upon word embedding models from the Fasttext project.
Do you want to download file 'https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.got.vec' to '/Users/kylejohnson/cltk_data/got/embeddings/fasttext/wiki.got.vec'? [Y/n] 
Y


100%|██████████| 6.94M/6.94M [00:00<00:00, 19.9MiB/s]


Example `Word`: Word(index_char_start=None, index_char_stop=None, index_token=0, index_sentence=0, string='swa', pos=adverb, lemma='swa', stem=None, scansion=None, xpos='Df', upos='ADV', dependency_relation='advmod', governor=1, features={}, category={F: [neg], N: [pos], V: [pos]}, embedding=array([ 0.039217  , -0.042306  ,  0.059032  , -0.048893  , -0.16648   ,
        0.057962  , -0.27924   , -0.12982   , -0.075599  , -0.1611    ,
       -0.073212  ,  0.21448   , -0.20002   , -0.027713  ,  0.053137  ,
       -0.085541  ,  0.071034  , -0.12977   ,  0.20651   , -0.18188   ,
        0.18756   ,  0.18363   ,  0.14246   ,  0.0041805 , -0.15311   ,
       -0.00089973, -0.046846  ,  0.080707  , -0.067758  , -0.020898  ,
        0.008187  , -0.14267   ,  0.027398  ,  0.026062  ,  0.076555  ,
        0.029481  , -0.17836   ,  0.2223    ,  0.23369   , -0.017388  ,
       -0.12866   , -0.015389  ,  0.05181   , -0.12208   ,  0.0045393 ,
       -0.11369   ,  0.16262   , -0.12083   ,  0.19537   , 

Example `Word`: Word(index_char_start=None, index_char_stop=None, index_token=0, index_sentence=0, string='Gallia', pos=noun, lemma='mallis', stem=None, scansion=None, xpos='A1|grn1|casA|gen2', upos='NOUN', dependency_relation='nsubj', governor=3, features={Case: [nominative], Degree: [positive], Gender: [feminine], Number: [singular]}, category={F: [neg], N: [pos], V: [neg]}, embedding=array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0.,

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/master/resources_1.1.0.json: 122kB [00:00, 4.83MB/s]                    
2021-02-14 10:02:38 INFO: Downloading these customized packages for language: lzh (Classical_Chinese)...
| Processor | Package |
-----------------------
| tokenize  | kyoto   |
| pos       | kyoto   |
| lemma     | kyoto   |
| depparse  | kyoto   |
| pretrain  | kyoto   |

INFO:stanza:Downloading these customized packages for language: lzh (Classical_Chinese)...
| Processor | Package |
-----------------------
| tokenize  | kyoto   |
| pos       | kyoto   |
| lemma     | kyoto   |
| depparse  | kyoto   |
| pretrain  | kyoto   |

Downloading http://nlp.stanford.edu/software/stanza/1.1.0/lzh/tokenize/kyoto.pt: 100%|██████████| 963k/963k [00:00<00:00, 5.19MB/s]
Downloading http://nlp.stanford.edu/software/stanza/1.1.0/lzh/pos/kyoto.pt: 100%|██████████| 27.2M/27.2M [00:00<00:00, 28.0MB/s]
Downloading http://nlp.stanford.edu/software/stanza/1.1.0

Example `Word`: Word(index_char_start=None, index_char_stop=None, index_token=0, index_sentence=0, string='黃', pos=noun, lemma='黃', stem=None, scansion=None, xpos='n,名詞,描写,形質', upos='NOUN', dependency_relation='nmod', governor=1, features={}, category={F: [neg], N: [pos], V: [neg]}, embedding=None, stop=None, named_entity=None, syllables=None, phonetic_transcription=None)
Printing dependency tree of first sentence ...
root | ，_3/verb
    └─ nsubj | 者_2/particle
        └─ nmod | 帝_1/noun
            └─ nmod | 黃_0/noun

Old Norse ('non') ...
Example `Word`: Word(index_char_start=0, index_char_stop=5, index_token=0, index_sentence=None, string='Gylfi', pos=None, lemma=None, stem=None, scansion=None, xpos=None, upos=None, dependency_relation=None, governor=None, features={}, category={}, embedding=None, stop=False, named_entity=None, syllables=None, phonetic_transcription=None)
Printing dependency tree of first sentence ...
Dependency tree not available

Eastern Panjabi ('pan') ...
Exampl

100%|██████████| 5.02M/5.02M [00:00<00:00, 14.1MiB/s]


Example `Word`: Word(index_char_start=0, index_char_stop=6, index_token=0, index_sentence=None, string='Raajaa', pos=None, lemma=None, stem=None, scansion=None, xpos=None, upos=None, dependency_relation=None, governor=None, features={}, category={}, embedding=array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.

100%|██████████| 129M/129M [00:05<00:00, 25.6MiB/s] 


Example `Word`: Word(index_char_start=0, index_char_stop=3, index_token=0, index_sentence=None, string='ईशा', pos=None, lemma=None, stem=None, scansion=None, xpos=None, upos=None, dependency_relation=None, governor=None, features={}, category={}, embedding=array([-0.023917  , -0.041564  ,  0.083404  , -0.12372   ,  0.011782  ,
       -0.08831   , -0.40554   , -0.37802   ,  0.10282   ,  0.093637  ,
        0.012014  , -0.48742   ,  0.015658  , -0.11113   , -0.012192  ,
        0.13728   ,  0.054348  , -0.23383   ,  0.14157   , -0.075517  ,
        0.17386   , -0.075346  ,  0.023557  , -0.048616  , -0.07696   ,
       -0.14991   ,  0.23854   , -0.22017   , -0.48325   , -0.079719  ,
       -0.22678   ,  0.013779  ,  0.23328   , -0.11949   , -0.13951   ,
        0.22836   , -0.0033215 ,  0.25486   , -0.020434  ,  0.056278  ,
        0.044272  ,  0.096074  ,  0.24392   ,  0.24831   , -0.22796   ,
       -0.067596  ,  0.22618   , -0.11153   ,  0.12319   ,  0.11551   ,
       -0.27604   , -0.