In [29]:
# Install stanza; note that the prefix "!" is not needed if you are running in a terminal
!pip install stanza

# Import stanza
import stanza

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [30]:
# Download the Stanford CoreNLP package with Stanza's installation command
# This'll take several minutes, depending on the network speed
corenlp_dir = './corenlp'
stanza.install_corenlp(dir=corenlp_dir)

# Set the CORENLP_HOME environment variable to point to the installation location
import os
os.environ["CORENLP_HOME"] = corenlp_dir



In [31]:
# Import client module
from stanza.server import CoreNLPClient

In [36]:
# Construct a CoreNLPClient with some basic annotators, a memory allocation of 4GB, and port number 9001
client = CoreNLPClient(
    annotators=['tokenize','ssplit', 'pos', 'lemma', 'ner', 'depparse', 'coref'], 
    memory='4G', 
    endpoint='http://localhost:9001',
    be_quiet=True)
print(client)

# Start the background server and wait for some time
# Note that in practice this is totally optional, as by default the server will be started when the first annotation is performed
client.start()
import time; time.sleep(10)



INFO:stanza:Writing properties to tmp file: corenlp_server-7c071a5859ef4e18.props


<stanza.server.client.CoreNLPClient object at 0x7f03cd5a0880>


PermanentlyFailedException: ignored

In [None]:
text = '''Jackson Maine, a famous country music singer privately battling an alcohol and drug addiction, plays a concert in California. His main support is Bobby, his manager and older half-brother. After the show, Jackson visits a drag bar where he witnesses a performance by Ally, a waitress and singer-songwriter. Jackson is amazed by her performance, and they spend the night speaking to each other, where Ally discloses to him the troubles she has faced in pursuing a professional music career. Jackson invites Ally to his next show. Despite her initial refusal she attends and, with Jackson's encouragement, sings on stage with him. Jackson invites Ally to go on tour with him, and they form a romantic relationship. In Arizona, Ally and Jackson visit the ranch where Jackson grew up and where his father is buried, only to discover that Bobby sold the land. Angered at his betrayal, Jackson punches Bobby, who subsequently quits as his manager. Before doing so, Bobby reveals that he did inform Jackson about the sale, but the latter was too inebriated to notice.'''
doc2 = client.annotate(text)
chains = doc2.corefChain
chain_dict=dict()
for index_chain,chain in enumerate(chains):
    chain_dict[index_chain]={}
    chain_dict[index_chain]['ref']=''
    chain_dict[index_chain]['mentions']=[{'mentionID':mention.mentionID,
                                          'mentionType':mention.mentionType,
                                          'number':mention.number,
                                          'gender':mention.gender,
                                          'animacy':mention.animacy,
                                          'beginIndex':mention.beginIndex,
                                          'endIndex':mention.endIndex,
                                          'headIndex':mention.headIndex,
                                          'sentenceIndex':mention.sentenceIndex,
                                          'position':mention.position,
                                          'ref':'',
                                          } for mention in chain.mention ]

clusters = []
sentences = []
names = []
beginIndex = []

for k,v in chain_dict.items():
    mentions=v['mentions']
    for mention in mentions:
        words_list = doc2.sentence[mention['sentenceIndex']].token[mention['beginIndex']:mention['endIndex']]
        for t in words_list:
          if (t.pos == 'NNP' or t.pos == 'PRP'):
            mention['ref']=t.word
            clusters.append(str(k))
            sentences.append(mention['sentenceIndex'])
            names.append(mention['ref'])
            beginIndex.append(mention['beginIndex'])


In [None]:
import pandas as pd
clus_table = pd.DataFrame()
clus_table = clus_table.assign(cluster =clusters)
clus_table = clus_table.assign(sentence =sentences)
clus_table = clus_table.assign(name =names)
#clus_table = clus_table.assign(beginIndex =beginIndex)
clus_table

In [None]:
stanza.download('en') # download English model


In [None]:
nlp = stanza.Pipeline(lang='en', processors='tokenize, pos, lemma, depparse, ner')


In [None]:
text2 ='''Tarzan was a small orphan who was raised by an ape named Kala since he was a child. He believed that this was his family, but on an expedition Jane Porter is rescued by Tarzan. He then finds out that he's human.'''

In [None]:
print(text2)

In [None]:
doc = nlp(text)

In [None]:
def get_characters(doc):
    characters = []
    characters_name = []
    for sent in doc.sentences:
        for word in sent.ents:
            if word.type == 'PERSON' and word.text not in characters:
                characters.append([word.text])
                characters_name.append([word.text.split(' ')[0]])
    characters = list(np.unique(characters))
    characters_name = list(np.unique(characters_name))
    return characters, characters_name

In [None]:
import numpy as np

get_characters(doc)

In [None]:
chars = get_characters(doc)[1]
clus_table["char"] = np.where(clus_table["name"].isin(chars), clus_table["name"], 0)

clus2 = clus_table.loc[clus_table['char']!=0]
clus2 = clus2.groupby('cluster')['char'].unique()
clus3 = pd.merge(clus_table, clus2, on = 'cluster')  
char_clean = []
for i in clus3['char_y']:
  char_clean.append(i[0])
clus3['character'] = char_clean
clus3 = clus3.drop(columns = ['cluster','char_x', 'char_y'])
char_table = clus3

In [None]:
text

In [33]:
char_table

Unnamed: 0,sentence,name,character
0,2,Ally,Ally
1,3,Ally,Ally
2,4,Ally,Ally
3,6,Ally,Ally
4,7,Bobby,Bobby
5,8,Bobby,Bobby
6,9,Bobby,Bobby
7,6,him,Jackson
8,3,Jackson,Jackson
9,5,she,Jackson


In [34]:
names = []
sent_ix = []
charss = []
for i,sent in enumerate(doc.sentences):
  nouns = [w.text for w in sent.words if (w.pos == "PROPN" and w.text.isin(chars))]
  for n in nouns:
    sent_ix.append(i)
    names.append(n)
    charss.append(n)


AttributeError: ignored

In [35]:
def recursive_find_adjs(root, sentence):
    children = [w for w in sentence.words if w.head == root.id]
    if not children:
        pass 
    filtered_child = [w for w in children if (w.deprel == "conj" or w.deprel == "appos" 
                                              or w.deprel == "compound" or w.deprel == "nsubj"
                                              or w.deprel == "nmod"
                                               or w.deprel == "amod"
                                              or w.deprel == 'dep') and (w.pos == "ADJ"or w.pos == 'NOUN')] 
    results = [w for w in filtered_child if not any(sub.head == w.id and sub.upos == "NOUN" for sub in sentence.words)]
    for w in children:
        results += recursive_find_adjs(w, sentence)
    return results

In [128]:
for i,sent in enumerate(doc.sentences):
  for ix,si in enumerate(clus_table['sentence']):
    if i == si:
      print(i)
      print(clus_table['name'][ix])
      nouns = [w.text for w in sent.words if w.pos == "PROPN"]
      print(nouns)

2
Ally
['Jackson', 'Ally']
2
Jackson
['Jackson', 'Ally']
2
he
['Jackson', 'Ally']
3
Ally
['Jackson', 'Ally']
3
Jackson
['Jackson', 'Ally']
3
him
['Jackson', 'Ally']
3
she
['Jackson', 'Ally']
4
Ally
['Jackson']
4
Jackson
['Jackson']
5
she
[]
5
Jackson
[]
5
him
[]
6
Ally
['Jackson']
6
him
['Jackson']
6
Jackson
['Jackson']
7
Bobby
['Jackson']
7
Jackson
['Jackson']
7
Jackson
['Jackson']
8
Bobby
[]
8
Jackson
[]
9
Bobby
['Arizona', 'Ally', 'Jackson', 'Jackson', 'Bobby']
9
he
['Arizona', 'Ally', 'Jackson', 'Jackson', 'Bobby']
9
Jackson
['Arizona', 'Ally', 'Jackson', 'Jackson', 'Bobby']


In [71]:
def char_attributes(doc):
    names = []
    names_2 = []
    attributes = []
    attributes_2 = []
    for sent in doc.sentences:
        nouns = [w for w in sent.words if w.pos == "PROPN"]
        for noun in nouns:
            if noun.text in get_characters(doc)[1]:
                # Find constructions in the form of "The car is beautiful"
                # In this scenario, the adjective is the parent of the noun
                adj0 = sent.words[noun.head-1] #adjective directly related
                adjs = [adj0] + recursive_find_adjs(adj0, sent) if adj0.pos == "ADJ" or adj0.pos == "NOUN" else []
                #The recursive function finds adjectives related to the first one found,
                #and hence also linked to the target noun
                mod_adjs = [w for w in sent.words if w.head == noun.id and (w.pos == "ADJ" or w.pos == 'NOUN')]
                # This should only be one element because conjunctions are hierarchical
                if mod_adjs:
                    mod_adj = mod_adjs[0]
                    adjs.extend([mod_adj] + recursive_find_adjs(mod_adj, sent))
                if adjs:
                    unique_adjs = []
                    unique_ids = set()
                    for adj in adjs:
                        if adj.id not in unique_ids:
                            unique_adjs.append(adj)
                            unique_ids.add(adj.id)
                    names.append(noun.text)
                    attributes.append(" ".join([adj.text for adj in unique_adjs]))
    char_attributes = pd.DataFrame()
    char_attributes['Character Names'] = names
    char_attributes['Character Attributes'] = attributes
    char_attributes['Total Attributes'] = char_attributes.groupby('Character Names')['Character Attributes'].transform(lambda x: ' '.join(x))
    char_attributes= char_attributes[['Character Names','Total Attributes']]
    return (char_attributes.drop_duplicates().reset_index())

In [72]:
char_attributes(doc)

Unnamed: 0,index,Character Names,Total Attributes
0,0,Jackson,singer famous country music drug encouragement
1,1,Bobby,support main
2,2,Ally,performance waitress singer


In [15]:
# Convert dependency tree formats
sent = doc2.sentence[1]
print(sent.enhancedPlusPlusDependencies.edge[0])
print ( sent.token[3])


source: 3
target: 1
dep: "nmod:poss"
isExtra: false
sourceCopy: 0
targetCopy: 0
language: UniversalEnglish

word: "is"
pos: "VBZ"
value: "is"
before: " "
after: " "
originalText: "is"
ner: "O"
lemma: "be"
beginChar: 143
endChar: 145
utterance: 0
speaker: "PER0"
tokenBeginIndex: 25
tokenEndIndex: 26
hasXmlContext: false
isNewline: false
coarseNER: "O"
fineGrainedNER: "O"
nerLabelProbs: "O=0.9998249819540405"



In [5]:
from collections import defaultdict
from stanza.server import CoreNLPClient

text = "Barack Obama was born in Hawaii. In 2008 he became the president."

doc = client.annotate(text)

animacy = defaultdict(dict)
for x in doc.corefChain:
    for y in x.mention:
        print(y.animacy)
        for i in range(y.beginIndex, y.endIndex):
            animacy[y.sentenceIndex][i] = True
            print(y.sentenceIndex, i)

for sent_idx, sent in enumerate(doc.sentence):
    print("[Sentence {}]".format(sent_idx+1))
    for t_idx, token in enumerate(sent.token):
        animate = animacy[sent_idx].get(t_idx, False)
        print("{:12s}\t{:12s}\t{:6s}\t{:20s}\t{}".format(token.word, token.lemma, token.pos, token.ner, animate))
    print("")

ANIMATE
0 0
0 1
ANIMATE
1 2
[Sentence 1]
Barack      	Barack      	NNP   	PERSON              	True
Obama       	Obama       	NNP   	PERSON              	True
was         	be          	VBD   	O                   	False
born        	bear        	VBN   	O                   	False
in          	in          	IN    	O                   	False
Hawaii      	Hawaii      	NNP   	STATE_OR_PROVINCE   	False
.           	.           	.     	O                   	False

[Sentence 2]
In          	in          	IN    	O                   	False
2008        	2008        	CD    	DATE                	False
he          	he          	PRP   	O                   	True
became      	become      	VBD   	O                   	False
the         	the         	DT    	O                   	False
president   	president   	NN    	TITLE               	False
.           	.           	.     	O                   	False



In [None]:
enhancedPlusPlusDependencies

In [7]:
from stanza.server import CoreNLPClient
client = CoreNLPClient(
        annotators=['tokenize','ssplit','pos','lemma','ner', 'parse', 'openie','depparse','coref'],
        timeout=30000,
        memory='4G')

test ="A man and a woman came into the store."
matches = client.tregex(text, 'S')
print(matches['sentences'][0]['0']['match'])

INFO:stanza:Writing properties to tmp file: corenlp_server-2b4604e86dc04f69.props
INFO:stanza:Starting server with command: java -Xmx4G -cp ./corenlp/* edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 30000 -threads 5 -maxCharLength 100000 -quiet False -serverProperties corenlp_server-2b4604e86dc04f69.props -annotators tokenize,ssplit,pos,lemma,ner,parse,openie,depparse,coref -preload -outputFormat serialized


KeyboardInterrupt: ignored

In [88]:
# submit the request to the server
sentence = doc2.sentence[1]
print('dependency parse of first sentence')
dependency_parse = sentence.enhancedPlusPlusDependencies


for t in sentence.token:
  print(t.word)

print('\n')
print(sentence.token[dependency_parse.node[4].index].word)
#print(dependency_parse.node)

print('\n')


for i in dependency_parse.edge:
  if i.target == dependency_parse.node[4].index:
    print(sentence.token[i.target-1].word)
    print(sentence.token[i.source-1].word)
    print(i.dep)


dependency parse of first sentence
His
main
support
is
Bobby
,
his
manager
and
older
half
-
brother
.


Bobby


is
Bobby
cop


In [21]:
!pip install spacy
!pip install pip install corenlp-vdep


import spacy
from spacy import displacy

from corenlp_dtree_visualizer.converters import _corenlp_dep_tree_to_spacy_dep_tree



Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting corenlp-vdep
  Downloading corenlp_vdep-0.1.0-py3-none-any.whl (3.7 kB)
Installing collected packages: corenlp-vdep
Successfully installed corenlp-vdep-0.1.0


In [26]:
# Convert dependency tree formats
sent = doc2.sentence[0]

# Visualize with Spacy
nlp = spacy.load("en_core_web_sm")
displacy.render(tree, style="dep", manual=True)

# could also save to a file
# svg = displacy.render(tree, style="dep", manual=True)
# with open('tmp.svg', 'w', encoding='utf-8') as fw:
    # fw.write(svg)

TypeError: ignored