From Introduction to Cultural Analytics and Python https://melaniewalsh.github.io/Intro-Cultural-Analytics/05-Text-Analysis/Multilingual/Spanish/02-Named-Entity-Recognition-Spanish.html

In [None]:
import spacy 
from spacy import displacy
from spacy.lang.es import Spanish
from collections import Counter
import pandas as pd
# Widen pandas' display limits so long entity tables and long text cells
# are shown without truncation.
pd.set_option("display.max_rows", 600)
pd.set_option("display.max_colwidth", 400)


In [None]:
# Fetch the large Spanish spaCy model through the shell. This command runs
# every time the cell is executed.
!python -m spacy download es_core_news_lg
# Load the Spanish pipeline; `nlp` is the pipeline used by the cells below.
nlp = spacy.load('es_core_news_lg')
# Point the tokenizer's `token_match` at the Spanish language defaults.
# NOTE(review): presumably this works around the loaded model's tokenizer not
# carrying the Spanish default — confirm it is still required.
nlp.tokenizer.token_match = Spanish.Defaults.token_match

In [None]:
# Load the large English pipeline as a second model.
# NOTE(review): `nlp2` is not referenced by any other cell visible here —
# confirm it is needed before paying the load cost.
nlp2 = spacy.load('en_core_web_lg')

In [None]:
# A forward slash is used as the separator: it resolves on Windows and POSIX
# alike, and it avoids a backslash being read as an (invalid) escape sequence
# inside the string literal.
filepath = 'corrido corpus/gregoriocortez_es_corrido.txt'

# Read the whole corrido as UTF-8; the context manager closes the file handle
# as soon as the text has been read.
with open(filepath, encoding='utf-8') as corpus_file:
    text = corpus_file.read()

# Run the spaCy pipeline over the full text.
document = nlp(text)

NER

In [None]:
# Print every entity found in the full document next to its label.
for entity in document.ents:
    print(entity, entity.label_)

In [None]:
import math

# Upper bound on how many pieces the text is cut into.
number_of_chunks = 80

# Characters per chunk. max(1, ...) keeps the step positive, so an empty text
# produces an empty chunk list instead of range() raising ValueError on a
# zero step.
chunk_size = max(1, math.ceil(len(text) / number_of_chunks))

# Slice the text into consecutive, non-overlapping pieces of `chunk_size`
# characters (the last piece may be shorter).
# NOTE: the cuts fall on character offsets, so a word or a name that straddles
# a boundary is split between two chunks.
text_chunks = [
    text[start:start + chunk_size]
    for start in range(0, len(text), chunk_size)
]

In [None]:
# Process all chunks through the pipeline with nlp.pipe and materialize the
# results as a list so the chunks can be iterated more than once below.
chunked_documents = list(nlp.pipe(text_chunks))


People

In [None]:
# Collect the text of every person (PER) entity across all chunks.
# The loop variable is named `chunk_doc` rather than `document` so that the
# full-text `document` created when the corpus was loaded is not overwritten
# by the last chunk.
people = [
    named_entity.text
    for chunk_doc in chunked_documents
    for named_entity in chunk_doc.ents
    if named_entity.label_ == "PER"
]

# Count how often each distinct name string occurs.
people_tally = Counter(people)

# most_common() returns (name, count) pairs sorted by descending count, so the
# table is ordered from most to least frequent.
PERdf = pd.DataFrame(people_tally.most_common(), columns=['character', 'count'])
PERdf

Places/LOC

In [None]:
# Collect the text of every location (LOC) entity across all chunks.
# The loop variable is named `chunk_doc` rather than `document` so that the
# full-text `document` created when the corpus was loaded is not overwritten
# by the last chunk.
places = [
    named_entity.text
    for chunk_doc in chunked_documents
    for named_entity in chunk_doc.ents
    if named_entity.label_ == "LOC"
]

# Count how often each distinct place string occurs.
places_tally = Counter(places)

# most_common() returns (place, count) pairs sorted by descending count, so
# the table is ordered from most to least frequent.
LOCdf = pd.DataFrame(places_tally.most_common(), columns=['place', 'count'])
LOCdf

Get NER in Context

In [None]:
from IPython.display import Markdown, display
import re

def get_ner_in_context(keyword, document, desired_ner_labels=False):
    """Display every sentence of ``document`` that contains a matching entity.

    Each sentence is re-processed on its own with the module-level ``nlp``
    pipeline. For every entity in that sentence whose text contains
    ``keyword`` (case-insensitive) and whose label is one of
    ``desired_ner_labels``, three Markdown outputs are displayed: a horizontal
    rule, the entity label in bold, and the sentence with the entity in bold.

    Args:
        keyword: Substring to look for inside entity texts.
        document: Object exposing ``.sents``, an iterable of sentences that
            each expose ``.text``.
        desired_ner_labels: Collection of entity labels to keep. ``False``
            (the default) or ``None`` selects ``['PER', 'ORG', 'LOC']``.

    Returns:
        None. Results are rendered through ``IPython.display.display``.
    """
    if desired_ner_labels is None or desired_ner_labels is False:
        desired_ner_labels = ['PER', 'ORG', 'LOC']

    # Lower-case the keyword once instead of once per entity.
    keyword_lower = keyword.lower()

    for sentence in document.sents:
        # Run the pipeline on the sentence in isolation to get its entities.
        sentence_doc = nlp(sentence.text)
        # Flatten line breaks so the sentence renders as a single Markdown line.
        flat_sentence = re.sub('\n', ' ', sentence.text)

        for named_entity in sentence_doc.ents:
            # The keyword is matched against the entity text, not the sentence.
            if keyword_lower not in named_entity.text.lower():
                continue
            if named_entity.label_ not in desired_ner_labels:
                continue

            # Flatten line breaks in the entity the same way as in the
            # sentence, otherwise an entity spanning a line break would never
            # be found in the flattened sentence.
            entity_text = re.sub('\n', ' ', named_entity.text)

            # re.escape makes the entity a literal pattern (names may contain
            # '.', '(', '?' ...); the callable replacement keeps backslashes in
            # the entity from being read as group references.
            sentence_text = re.sub(
                re.escape(entity_text),
                lambda _match: f"**{entity_text}**",
                flat_sentence,
                flags=re.IGNORECASE,
            )

            display(Markdown('---'))
            display(Markdown(f"**{named_entity.label_}**"))
            display(Markdown(sentence_text))

# Show, chunk by chunk, every sentence with an entity containing "Gonzales".
# The loop variable is named `chunk_doc` rather than `document` so that the
# full-text `document` created when the corpus was loaded is not overwritten.
for chunk_doc in chunked_documents:
    get_ner_in_context('Gonzales', chunk_doc)

In [None]:
# Normalize the column labels: trim surrounding whitespace, lower-case, and
# replace spaces with underscores. Re-running this cell gives the same labels.
PERdf.columns = PERdf.columns.str.strip().str.lower().str.replace(" ", "_")
# Only the last expression of a cell is displayed, so the frame itself is the
# single output here.
PERdf

In [None]:
# Preview the first 15 rows of the people table. PERdf is built from
# Counter.most_common(), so these are the names with the highest counts.
PERdf.head(15)
