# Extract named entities from downsampled Guardian content

In [1]:
import glob
import logging
import os

import boto3
import pandas as pd
import spacy

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Configure the root logger at DEBUG level so that the progress messages
# emitted through logging.info further down in this file are shown.
logging.basicConfig(level=logging.DEBUG)

def get_data(doc_index,doc,ent_types):
    """
    Collect the entities of a Spacy Doc whose label appears in `ent_types`.

    Every kept entity is described by its text, its label, its token span
    (start, end) and its character span (start_char, end_char).
    :param doc_index: identifier stored alongside the extracted entities
    :param doc: object exposing an `ents` iterable of entity spans
    :param ent_types: container of the entity labels to keep
    :returns dict with the keys "doc_index" and "ents" (list of entity dicts)
    """
    kept = []
    for span in doc.ents:
        # Guard clause: drop entities whose label was not requested.
        if span.label_ not in ent_types:
            continue
        kept.append(
            dict(
                text=span.text,
                label=span.label_,
                start=span.start,
                end=span.end,
                start_char=span.start_char,
                end_char=span.end_char,
            )
        )
    return dict(doc_index=doc_index, ents=kept)


## Import content from S3 bucket

In [None]:
# AWS region handed to the boto3 session created on the next line.
REGION = "eu-west-1"
# Shared boto3 session; only the region is set explicitly here.
SESSION = boto3.Session(region_name=REGION)

def list_models_on_s3(bucket, path, session, endpoint_url=None):
    """
    List the keys of the objects stored in an S3 bucket under a prefix.

    Keys ending in "/" are left out of the result (presumably folder
    placeholder objects).
    :param bucket: name of the S3 bucket
    :param path: key prefix used to filter the bucket's objects
    :param session: object providing `resource("s3", endpoint_url=...)`,
        e.g. a boto3 Session
    :param endpoint_url: optional endpoint URL forwarded to `session.resource`
    :returns list of object keys
    """
    resource = session.resource("s3", endpoint_url=endpoint_url)
    matching = resource.Bucket(bucket).objects.filter(Prefix=path)
    return [obj.key for obj in matching if not obj.key.endswith("/")]


def load_files_from_s3(bucket, session, file_list, destination, endpoint_url=None,
                       key_substring='2022'):
    """
    Download objects from an S3 bucket into a local directory.

    Only keys containing `key_substring` are downloaded. Each one is saved
    inside `destination` under the last "/"-separated component of its key.
    :param bucket: name of the S3 bucket
    :param session: object providing `resource("s3", endpoint_url=...)`,
        e.g. a boto3 Session
    :param file_list: iterable of object keys to consider
    :param destination: local directory, with or without a trailing separator
    :param endpoint_url: optional endpoint URL forwarded to `session.resource`
    :param key_substring: substring a key must contain to be downloaded;
        pass None to download every key. Defaults to '2022', the filter this
        function has always applied.
    :returns the `destination` argument, unchanged
    """
    s3 = session.resource("s3", endpoint_url=endpoint_url)
    my_bucket = s3.Bucket(bucket)
    for key in file_list:
        if key_substring is not None and key_substring not in key:
            continue
        # os.path.join (rather than string concatenation) keeps the file
        # inside `destination` even when it lacks a trailing separator.
        local_path = os.path.join(destination, key.split("/")[-1])
        my_bucket.download_file(key, local_path)
    return destination

# Location of the sampled Guardian content on S3.
bucket, path, session = 'jai-datasets', 'GU_sample_data', SESSION
# Bare expression: lists the keys found under the prefix.
list_models_on_s3(bucket=bucket, path=path, session=session)

# List the keys again, this time keeping them, and fetch the matching files
# into the local assets directory.
file_list = list_models_on_s3(bucket=bucket, path=path, session=session)
destination = '../assets/'
load_files_from_s3(
    bucket=bucket,
    session=session,
    file_list=file_list,
    destination=destination,
)

## Use spaCy's en_core_web_trf NER model to extract entities from content 

In [None]:
logging.info('Starting NER extraction')

logging.info('Loading Spacy model')
NER_TRF_MODEL="en_core_web_trf"
nlp = spacy.load(NER_TRF_MODEL)
# Keep every label the model's NER component can emit except these ones.
unwanted_ent_types=['CARDINAL','LANGUAGE','ORDINAL','PERCENT','QUANTITY','TIME']
ent_types = [ent for ent in nlp.pipe_labels["ner"] if ent not in unwanted_ent_types]
# Descending name order, so that the most recent year is processed first.
csv_file_list = sorted(glob.glob('../assets/*.csv'), reverse=True)

logging.info('Starting iteration through csv files')
for csv_file in csv_file_list:
    # File name without directory and without its final extension; unlike
    # splitting on '.', this stays correct for names containing extra dots.
    csv_file_name = os.path.splitext(os.path.basename(csv_file))[0]
    export_csv_file = f'../assets/{csv_file_name}_ner.csv.gz'
    if os.path.exists(export_csv_file):
        # Skip entity extraction for files already processed
        continue
    logging.info('------------------------')
    logging.info(f'Reading {csv_file} data')
    try:
        data = pd.read_csv(csv_file)
    except Exception:
        # Best effort: an unreadable file is skipped, but the reason is
        # logged instead of being silently discarded.
        logging.exception(f'Could not read {csv_file}, skipping it')
        continue
    # NOTE: astype('str') turns missing bodies into the literal text 'nan'.
    data['body_text'] = data['body_text'].astype('str')
    # Ensure incremental ordered index to reference back to articles in the
    # dataset: row position == doc_index of the extracted entities.
    data = data.reset_index(drop=True)
    gu_article_list = data['body_text'].tolist()
    response_body = []
    for doc_index, doc in enumerate(nlp.pipe(gu_article_list, batch_size=20)):
        if doc_index % 1000 == 0:
            logging.info(f'Extracting named entities from {csv_file} article {doc_index}')
        response_body.append(get_data(doc_index, doc, ent_types))
    # Flatten to one row per entity, each tagged with the article it came from.
    rows = []
    for response in response_body:
        for ent in response['ents']:
            ent['doc_index'] = response['doc_index']
            rows.append(ent)
    df = pd.DataFrame(rows)
    # pandas infers gzip compression from the '.gz' suffix of the path.
    df.to_csv(export_csv_file)
    logging.info(f'Finished processing {csv_file}')
    logging.info('------------------------')