### Installation

In [None]:
!pip install stanza
!pip install spacy
!python -m spacy download es_core_news_md

### Import libraries

In [90]:
# Import libraries
import pandas as pd
import numpy as np
import spacy
import stanza
from tqdm import tqdm
import es_core_news_md

### Load NER Models

In [91]:
# Create an instance for stanza's Named-Entity Recognition model
nlp_st = stanza.Pipeline(lang='es', processors='tokenize, ner')

2023-01-13 20:21:56 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.1.json: 193kB [00:00, 3.08MB/s]                    
2023-01-13 20:21:58 INFO: Loading these models for language: es (Spanish):
| Processor | Package |
-----------------------
| tokenize  | ancora  |
| mwt       | ancora  |
| ner       | conll02 |

2023-01-13 20:21:58 INFO: Use device: cpu
2023-01-13 20:21:58 INFO: Loading: tokenize
2023-01-13 20:21:58 INFO: Loading: mwt
2023-01-13 20:21:58 INFO: Loading: ner
2023-01-13 20:21:58 INFO: Done loading processors!


In [92]:
# Create an instance for spacy's Named-Entity Recognition model
nlp_sp = es_core_news_md.load()

### Load data

In [93]:
# Read 'csv' file as dataframe
df = pd.read_csv('../data/raw/nicaragua.csv')

# Show dataframe
df.head(5)

Unnamed: 0,Fecha,tipo,corregido
0,2022/05/24,otras nacionalidades,"1. Fundación Terre Des Hommes Italia-Onlus, or..."
1,2022/05/24,otras nacionalidades,"2. Asociación Vivamos Mejor, originario de Cos..."
2,2022/05/24,otras nacionalidades,"3. Médico Internacional E.V., originario de Al..."
3,2022/05/24,otras nacionalidades,"4. Planting Hope, INC (Sembrando Esperanza, IN..."
4,2022/05/24,otras nacionalidades,"5. Proyecto Gettysburg-León, INC, originario d..."


### Extract `location` entity from data

In [98]:
# Select data and create a copy
df_filtered = df.loc[(df['tipo'] == 'otras nacionalidades')].copy()

# Convert column to list
corregido_int = df_filtered['corregido'].tolist()

In [95]:
# Analyze NER from list and extract LOC type
# This analysis can take a few minutes to process
locs = []
for i in tqdm(range(len(corregido_int))):
    doc = nlp_st(corregido_int[i])
    loc = [ent.text for sent in doc.sentences for ent in sent.ents if ent.type == "LOC"]
    locs.append(loc)

100%|██████████| 293/293 [05:24<00:00,  1.11s/it]


In [99]:
# Assign list to a new column
df_filtered['loc'] = locs

# Show dataframe
df_filtered.head(5)

Unnamed: 0,Fecha,tipo,corregido,loc
0,2022/05/24,otras nacionalidades,"1. Fundación Terre Des Hommes Italia-Onlus, or...",[De Italia]
1,2022/05/24,otras nacionalidades,"2. Asociación Vivamos Mejor, originario de Cos...",[Costa Rica]
2,2022/05/24,otras nacionalidades,"3. Médico Internacional E.V., originario de Al...",[Alemania]
3,2022/05/24,otras nacionalidades,"4. Planting Hope, INC (Sembrando Esperanza, IN...",[]
4,2022/05/24,otras nacionalidades,"5. Proyecto Gettysburg-León, INC, originario d...",[]


In [100]:
# Join dataframes
df_merged = df.merge(df_filtered, how='outer', on=['Fecha', 'tipo', 'corregido'])

# Show dataframe
df_merged.head(5)

Unnamed: 0,Fecha,tipo,corregido,loc
0,2022/05/24,otras nacionalidades,"1. Fundación Terre Des Hommes Italia-Onlus, or...",[De Italia]
1,2022/05/24,otras nacionalidades,"2. Asociación Vivamos Mejor, originario de Cos...",[Costa Rica]
2,2022/05/24,otras nacionalidades,"3. Médico Internacional E.V., originario de Al...",[Alemania]
3,2022/05/24,otras nacionalidades,"4. Planting Hope, INC (Sembrando Esperanza, IN...",[]
4,2022/05/24,otras nacionalidades,"5. Proyecto Gettysburg-León, INC, originario d...",[]


### Extract `organization` entity from data with `stanza`

In [101]:
# Convert column to list
text = df['corregido'].tolist()

# Analyze NER from list and extract ORG type
# This analysis can take a few minutes to process
orgs = []
for i in tqdm(range(len(text))):
    doc = nlp_st(text[i])
    org = [ent.text for sent in doc.sentences for ent in sent.ents if ent.type == "ORG"]
    orgs.append(org)

100%|██████████| 1430/1430 [17:09<00:00,  1.39it/s]


In [104]:
# Create a new column from list
df_merged['org_stanza'] = pd.Series(orgs)

# Replace empty lists with NaN values
df_merged.org_stanza = df_merged.org_stanza.apply(lambda x: np.nan if len(x)==0 else x)

# Show dataframe
df_merged.head(5)

Unnamed: 0,Fecha,tipo,corregido,loc,org_stanza
0,2022/05/24,otras nacionalidades,"1. Fundación Terre Des Hommes Italia-Onlus, or...",[De Italia],[Fundación Terre Des Hommes Italia-Onlus]
1,2022/05/24,otras nacionalidades,"2. Asociación Vivamos Mejor, originario de Cos...",[Costa Rica],[Asociación Vivamos Mejor]
2,2022/05/24,otras nacionalidades,"3. Médico Internacional E.V., originario de Al...",[Alemania],[Médico Internacional E.V.]
3,2022/05/24,otras nacionalidades,"4. Planting Hope, INC (Sembrando Esperanza, IN...",[],"[Planting Hope, INC, Sembrando Esperanza, INC,..."
4,2022/05/24,otras nacionalidades,"5. Proyecto Gettysburg-León, INC, originario d...",[],"[INC, Estados Unidos]"


### Extract `organization` entity from data with `spacy`

In [105]:
# Analyze NER from list and extract ORG type
# This analysis can take a few minutes to process
orgs = []
for i in tqdm(range(len(text))):
    doc = nlp_sp(text[i])
    org = [named_entity for named_entity in doc.ents if named_entity.label_ == "ORG"]
    orgs.append(org)

100%|██████████| 1430/1430 [00:22<00:00, 63.32it/s]


In [107]:
# Create a new column from list
df_merged['org_spacy'] = pd.Series(orgs)

# Replace empty lists with NaN values
df_merged.org_spacy = df_merged.org_spacy.apply(lambda x: np.nan if len(x)==0 else x)

# Show dataframe
df_merged.head(5)

Unnamed: 0,Fecha,tipo,corregido,loc,org_stanza,org_spacy
0,2022/05/24,otras nacionalidades,"1. Fundación Terre Des Hommes Italia-Onlus, or...",[De Italia],[Fundación Terre Des Hommes Italia-Onlus],"[(Fundación, Terre, Des, Hommes, Italia-Onlus)]"
1,2022/05/24,otras nacionalidades,"2. Asociación Vivamos Mejor, originario de Cos...",[Costa Rica],[Asociación Vivamos Mejor],
2,2022/05/24,otras nacionalidades,"3. Médico Internacional E.V., originario de Al...",[Alemania],[Médico Internacional E.V.],"[(Médico, Internacional)]"
3,2022/05/24,otras nacionalidades,"4. Planting Hope, INC (Sembrando Esperanza, IN...",[],"[Planting Hope, INC, Sembrando Esperanza, INC,...","[(Sembrando, Esperanza, ,, INC)]"
4,2022/05/24,otras nacionalidades,"5. Proyecto Gettysburg-León, INC, originario d...",[],"[INC, Estados Unidos]",


In [108]:
# Save dataframe as 'csv' file
df_merged.to_csv('../data/processed/nicaragua_processsed.csv', index=False)