# NLP with Stanza

Stanza is a machine learning library for natural language processing in Python, from the Stanford NLP research group.

In [None]:
# Uncomment to install Stanza into the running kernel's environment.
# %pip install stanza

## Usage

In [45]:
import stanza

In [46]:
# Downloads the default English packages into the local stanza_resources directory.
stanza.download('en') # download English model

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.11.0.json:   0%|  …

2025-11-10 22:42:38 INFO: Downloaded file to /Users/dkl0pjh/stanza_resources/resources.json
2025-11-10 22:42:38 INFO: Downloading default packages for language: en (English) ...
2025-11-10 22:42:39 INFO: File exists: /Users/dkl0pjh/stanza_resources/en/default.zip
2025-11-10 22:42:43 INFO: Finished downloading models and saved to /Users/dkl0pjh/stanza_resources


In [47]:
# Build the default English pipeline. use_gpu=True asks for a GPU, but the log
# captured from this run reports "Using device: cpu".
nlp = stanza.Pipeline('en', use_gpu=True) # initialize English neural pipeline

2025-11-10 22:42:54 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.11.0.json:   0%|  …

2025-11-10 22:42:54 INFO: Downloaded file to /Users/dkl0pjh/stanza_resources/resources.json
2025-11-10 22:42:55 INFO: Loading these models for language: en (English):
| Processor    | Package                   |
--------------------------------------------
| tokenize     | combined                  |
| mwt          | combined                  |
| pos          | combined_charlm           |
| lemma        | combined_nocharlm         |
| constituency | ptb3-revised_charlm       |
| depparse     | combined_charlm           |
| sentiment    | sstplus_charlm            |
| ner          | ontonotes-ww-multi_charlm |

2025-11-10 22:42:55 INFO: Using device: cpu
2025-11-10 22:42:55 INFO: Loading: tokenize
2025-11-10 22:42:55 INFO: Loading: mwt
2025-11-10 22:42:55 INFO: Loading: pos
2025-11-10 22:42:57 INFO: Loading: lemma
2025-11-10 22:42:57 INFO: Loading: constituency
2025-11-10 22:42:58 INFO: Loading: depparse
2025-11-10 22:42:58 INFO: Loading: sentiment
2025-11-10 22:42:58 INFO: Loading: ner

In [48]:
# Probe the pipeline with a string of inflected and irregular word forms; the
# displayed document lists each token with its lemma (e.g. "boxes" -> "box",
# "was" -> "be").
doc = nlp('boxes was having children mice swam dug')
doc

[
  [
    {
      "id": 1,
      "text": "boxes",
      "lemma": "box",
      "upos": "NOUN",
      "xpos": "NNS",
      "feats": "Number=Plur",
      "head": 3,
      "deprel": "nsubj",
      "start_char": 0,
      "end_char": 5,
      "ner": "O",
      "multi_ner": [
        "O"
      ]
    },
    {
      "id": 2,
      "text": "was",
      "lemma": "be",
      "upos": "AUX",
      "xpos": "VBD",
      "feats": "Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin",
      "head": 3,
      "deprel": "aux",
      "start_char": 6,
      "end_char": 9,
      "ner": "O",
      "multi_ner": [
        "O"
      ]
    },
    {
      "id": 3,
      "text": "having",
      "lemma": "have",
      "upos": "VERB",
      "xpos": "VBG",
      "feats": "Tense=Pres|VerbForm=Part",
      "head": 0,
      "deprel": "root",
      "start_char": 10,
      "end_char": 16,
      "ner": "O",
      "multi_ner": [
        "O"
      ]
    },
    {
      "id": 4,
      "text": "children",
      "lemma": "child"

In [None]:
doc = nlp("Barack Obama was born in Hawaii.") # run annotation over a sentence

In [None]:
doc

In [None]:
print(doc.entities)

In [None]:
# Read the whole Carroll sample into memory. The encoding is given explicitly so
# the text decodes the same way regardless of the platform's default locale.
# NOTE(review): assumes carroll.txt is UTF-8 encoded — confirm.
with open('carroll.txt', encoding='utf-8') as f:
    carroll = f.read()

In [None]:
carroll

In [None]:
carroll_doc = nlp(carroll)

In [None]:
# One line per named entity found in the Carroll text: its surface text and type.
carroll_entity_lines = (
    f'entity: {ent.text}\ttype: {ent.type}' for ent in carroll_doc.ents
)
print('\n'.join(carroll_entity_lines))

In [None]:
carroll_doc

In [None]:
# Dependency listing, one line per word. `word.head` is used as a 1-based
# position in the sentence's word list; a head of 0 has no head word and is
# labelled "root".
dep_lines = []
for sent in carroll_doc.sentences:
    for word in sent.words:
        head_text = sent.words[word.head - 1].text if word.head > 0 else "root"
        dep_lines.append(
            f'id: {word.id}\tword: {word.text}\thead id: {word.head}'
            f'\thead: {head_text}\tdeprel: {word.deprel}'
        )
print('\n'.join(dep_lines))

## Sentiment analysis

In [None]:
# Show each sentence of the Carroll text next to its sentiment value, printed
# as one (sentiment, text) tuple per line.
sentiment_pairs = [(sent.sentiment, sent.text) for sent in carroll_doc.sentences]
print('\n'.join(str(pair) for pair in sentiment_pairs))

## Oliver Twist

In [None]:
import requests

In [None]:
# Request headers passed to the requests.get() downloads; only a User-Agent is set.
headers = {'User-Agent': 'Educational script'}

In [None]:
# Download the Project Gutenberg plain-text file for the Oliver Twist section.
# The timeout keeps the cell from hanging indefinitely on a stalled connection,
# and raise_for_status() stops here on an HTTP error instead of letting an
# error page flow on into the NLP pipeline.
twist_resp = requests.get("https://www.gutenberg.org/files/730/730-0.txt",
                          headers=headers, timeout=30)
twist_resp.raise_for_status()

In [None]:
twist_resp

In [None]:
# Decode the response bytes into text. str() applied to a bytes object returns
# its repr (a "b'...'" string with line breaks and non-ASCII bytes spelled out
# as backslash escapes), not the document itself, so decode explicitly.
# 'utf-8-sig' also drops a leading byte-order mark if one is present.
# NOTE(review): assumes the file is UTF-8 encoded — confirm.
twist = twist_resp.content.decode('utf-8-sig')

In [None]:
twist[0:200]

In [None]:
twist_doc = nlp(twist)

In [None]:
print(*[f'entity: {ent.text}\ttype: {ent.type}' for ent in twist_doc.ents], sep='\n')

## James Joyce, "The Dead" from _Dubliners_

In [None]:
# Download the Project Gutenberg plain-text file for the Dubliners section.
# The timeout keeps the cell from hanging indefinitely on a stalled connection,
# and raise_for_status() stops here on an HTTP error instead of letting an
# error page flow on into the NLP pipeline.
dub_resp = requests.get("https://www.gutenberg.org/files/2814/2814-0.txt",
                        headers=headers, timeout=30)
dub_resp.raise_for_status()

In [None]:
dub_resp

In [None]:
# Decode the downloaded bytes as UTF-8 text.
dub = dub_resp.content.decode('utf-8')

In [None]:
dub[0:100]

In [None]:
dub.index('THE DEAD')

In [None]:
# Keep everything from the first "THE DEAD" match onward. The offset is looked
# up rather than hard-coded so the slice still starts at that marker if the
# downloaded text changes length.
# NOTE(review): the original literal 290819 was presumably the result of
# dub.index('THE DEAD') — confirm the first match is the story heading and not
# an earlier mention.
dead = dub[dub.index('THE DEAD'):]

In [None]:
dead[:200]

In [None]:
dead_doc = nlp(dead)

In [None]:
print(*[f'entity: {ent.text}\ttype: {ent.type}' for ent in dead_doc.ents], sep='\n')

In [None]:
# Gather the text of every entity whose type is LOC or GPE, echoing each one
# as it is collected.
dead_places = []
for ent in dead_doc.ents:
    if ent.type not in ('LOC', 'GPE'):
        continue
    print(ent.text)
    dead_places.append(ent.text)

In [None]:
set(dead_places)

In [None]:
import re

In [None]:
# Replace each run of line breaks (plus any whitespace around it) inside a
# place name with a single space, echoing every cleaned name as it is stored.
linebreak_run = re.compile(r'\s*[\r\n]+\s*')
dead_places2 = []
for place in dead_places:
    place = linebreak_run.sub(' ', place)
    print(place)
    dead_places2.append(place)

In [None]:
set(dead_places2)

In [None]:
# Write the distinct place names to dead_places.txt, one per line.
# Sorting makes the file's line order reproducible (iteration order of a set of
# strings can differ between interpreter runs), and the explicit encoding keeps
# non-ASCII names from depending on the platform default.
with open('dead_places.txt', 'w', encoding='utf-8') as f:
    for place in sorted(set(dead_places2)):
        f.write(f"{place}\n")