In [1]:
import boto3

In [2]:
import spacy

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import pandas as pd

In [4]:
spacy.prefer_gpu()

True

In [5]:
REGION = "eu-west-1"
SESSION = boto3.Session(region_name=REGION)

In [44]:
def list_models_on_s3(bucket, path, session, endpoint_url=None):
    """
    List the keys of the objects stored under a prefix in an S3 bucket.
    Keys that end with "/" are left out of the result.
    :param bucket: name of the bucket to inspect
    :param path: key prefix passed to the bucket's object filter
    :param session: object providing `resource("s3", endpoint_url=...)`
        (a `boto3.Session` in this notebook)
    :param endpoint_url: optional endpoint forwarded to `session.resource`
    :returns list of object keys, in the order the bucket yields them
    """
    s3_resource = session.resource("s3", endpoint_url=endpoint_url)
    matching_objects = s3_resource.Bucket(bucket).objects.filter(Prefix=path)
    return [obj.key for obj in matching_objects if not obj.key.endswith("/")]


def load_files_from_s3(bucket, session, file_list, destination, endpoint_url=None,
                       name_filter="2022"):
    """
    Download selected objects from an S3 bucket into a local directory.
    Each object is saved under its base name (the part of the key after the
    last "/") inside `destination`.
    :param bucket: name of the bucket to download from
    :param session: object providing `resource("s3", endpoint_url=...)`
        (a `boto3.Session` in this notebook)
    :param file_list: iterable of object keys to consider
    :param destination: local directory for the downloads, with or without a
        trailing separator; it is created when it does not exist
    :param endpoint_url: optional endpoint forwarded to `session.resource`
    :param name_filter: only keys containing this substring are downloaded;
        defaults to "2022" (the previously hard-coded behaviour), pass None
        to download every key in `file_list`
    :returns the `destination` argument, unchanged
    """
    import os  # local import: only this helper needs it

    s3 = session.resource("s3", endpoint_url=endpoint_url)
    my_bucket = s3.Bucket(bucket)
    # An empty destination means "current directory"; there is nothing to create.
    if destination:
        os.makedirs(destination, exist_ok=True)
    for key in file_list:
        if name_filter is None or name_filter in key:
            # os.path.join yields the same path whether or not `destination`
            # ends with a separator (plain concatenation did not).
            local_path = os.path.join(destination, key.split("/")[-1])
            my_bucket.download_file(key, local_path)
    return destination

In [8]:
# List the object keys stored under the GU sample-data prefix of the bucket,
# using the region-bound session created earlier in the notebook.
bucket='jai-datasets'
path='GU_sample_data'
session=SESSION
list_models_on_s3(bucket,path,session)

['GU_sample_data/sampled_GU_content_2018.csv',
 'GU_sample_data/sampled_GU_content_2019.csv',
 'GU_sample_data/sampled_GU_content_2020.csv',
 'GU_sample_data/sampled_GU_content_2021.csv',
 'GU_sample_data/sampled_GU_content_2022.csv',
 'GU_sample_data/sampled_GU_content_500_random_url_subset.csv']

In [45]:
# Bucket, prefix and session are re-assigned here with the same values as in the
# listing cell, so this cell does not depend on that cell having been run.
bucket='jai-datasets'
path='GU_sample_data'
session=SESSION
file_list=list_models_on_s3(bucket,path,session)
# TODO(review): absolute path under /home/ubuntu; adjust when running elsewhere.
destination = '/home/ubuntu/JAI/data/'
# Only keys containing '2022' are downloaded by load_files_from_s3.
load_files_from_s3(bucket, session, file_list, destination)

'/home/ubuntu/JAI/data/'

In [46]:
# Alternative input kept for reference: the 500-article random URL subset.
#csv_file='/home/ubuntu/JAI/data/sampled_GU_content_500_random_url_subset.csv'
# Read the 2022 sample from the directory used as download destination.
csv_file='/home/ubuntu/JAI/data/sampled_GU_content_2022.csv'
data=pd.read_csv(csv_file)

In [47]:
# Ensure incremental ordered index to reference back to articles in the dataset
data=data.reset_index(drop=True).sort_index()

In [48]:
data.head()

Unnamed: 0,path,headline,url,content_type,section_id,pillar_id,web_publication_date,word_count,trail_text,production_office,byline,body_text,body_html,keyword_tag
0,/law/commentisfree/2022/mar/26/jurors-who-sat-...,Jurors who sat in the Zachary Rolfe murder tri...,www.theguardian.com/law/commentisfree/2022/mar...,Article,law,pillar/news,2022-03-25 22:24:09+00:00,1114,"In criminal trials, the information that is ex...",Aus,Richard Ackland,Jurors who sat on the Zachary Rolfe murder tri...,<p>Jurors who sat on the Zachary Rolfe murder ...,['law/law-australia' 'australia-news/australia...
1,/crosswords/crossword-blog/2022/jul/04/crosswo...,Crossword roundup: could your puzzle find a ho...,www.theguardian.com/crosswords/crossword-blog/...,Article,crosswords,pillar/lifestyle,2022-07-04 11:20:36+00:00,973,"A friendly community with a lot of puzzles, wh...",Uk,Alan Connor,I was delighted to read about New York Times s...,<p>I was delighted to read about New York Time...,['crosswords/crosswords' 'lifeandstyle/hobbies...
2,/retail-reimagined/2022/jan/13/how-businesses-...,‘There’s genuine value in having a nice experi...,www.theguardian.com/retail-reimagined/2022/jan...,Article,retail-reimagined,,2022-01-13 15:53:41+00:00,804,"Frictionless payments, ease of use and persona...",Uk,Duncan Jefferies,"As consumers, we have increasingly grown more ...","<p>As consumers, we have increasingly grown mo...",[]
3,/law/2022/apr/29/barrister-allison-bailey-ston...,Barrister was discriminated against for gender...,www.theguardian.com/law/2022/apr/29/barrister-...,Article,law,pillar/news,2022-04-29 12:23:43+00:00,621,Allison Bailey says Garden Court chambers and ...,Uk,Haroon Siddique Legal affairs correspondent,A barrister was unlawfully discriminated again...,<p>A barrister was unlawfully discriminated ag...,['law/employment-law' 'law/law' 'world/gender'...
4,/law/2022/jun/17/sonia-sotomayor-supreme-court...,Sonia Sotomayor says supreme court’s ‘mistakes...,www.theguardian.com/law/2022/jun/17/sonia-soto...,Article,law,pillar/news,2022-06-17 09:00:06+00:00,483,Liberal-leaning justice says ‘there are days I...,Us,Guardian staff and agencies,The liberal-leaning supreme court justice Soni...,<p>The liberal-leaning supreme court justice S...,['law/us-supreme-court' 'us-news/us-news']


In [49]:
data.shape

(10033, 14)

In [50]:
# A plain str cast turns missing bodies into the literal text "nan", which would
# then be run through the NER model as if it were article content. Blank them
# first so such rows yield an empty document (row count and order are unchanged).
data['body_text'] = data['body_text'].fillna('').astype('str')

In [51]:
# NOTE: rebinds `data` from a DataFrame to a dict of the form
# {row index: {column name: value}}. The cell is therefore not re-runnable on its
# own: the earlier DataFrame cells must be re-executed first.
data=data.to_dict('index')

In [52]:
len(data.keys())

10033

In [53]:
# Name of the spaCy pipeline package to load.
NER_TRF_MODEL="en_core_web_trf"
nlp = spacy.load(NER_TRF_MODEL)
# Labels of the pipeline's "ner" component; passed to get_data later on to decide
# which entities are kept.
ent_types = nlp.pipe_labels["ner"]

In [54]:
ent_types

['CARDINAL',
 'DATE',
 'EVENT',
 'FAC',
 'GPE',
 'LANGUAGE',
 'LAW',
 'LOC',
 'MONEY',
 'NORP',
 'ORDINAL',
 'ORG',
 'PERCENT',
 'PERSON',
 'PRODUCT',
 'QUANTITY',
 'TIME',
 'WORK_OF_ART']

In [55]:
def get_data(doc_index,doc,ent_types):
    """
    Collect the entities of a Spacy Doc whose label is listed in `ent_types`.
    :param doc_index: identifier stored unchanged under the "doc_index" key
    :param doc: object exposing `ents`; every entity must provide `text`,
        `label_`, `start_char` and `end_char`
    :param ent_types: container of labels to keep (membership is tested with `in`)
    :returns dict with "doc_index" and "ents", the latter being a list of
        {"text", "label", "start", "end"} dicts in the order of `doc.ents`
    """
    kept_entities = []
    for entity in doc.ents:
        # Guard clause: drop entities whose label was not requested.
        if entity.label_ not in ent_types:
            continue
        kept_entities.append(
            {
                "text": entity.text,
                "label": entity.label_,
                "start": entity.start_char,
                "end": entity.end_char,
            }
        )
    # The document text itself is intentionally not part of the result.
    return {"doc_index": doc_index, "ents": kept_entities}

In [56]:
# One body text per article, in the iteration order of `data`, so a position in
# this list corresponds to the position of the article's key in `data`.
gu_article_list = [article['body_text'] for article in data.values()]

In [None]:
# Run the NER pipeline over every article body and collect the filtered entities.
# `nlp` and `ent_types` are reused from the model-loading cell; the commented-out
# lines below are the statements that would recreate them.
NER_TRF_MODEL="en_core_web_trf"
#nlp = spacy.load(NER_TRF_MODEL)
#ent_types = nlp.pipe_labels["ner"]
# One get_data() result per article; doc_index is the article's position in
# gu_article_list.
response_body = []
# NOTE(review): `exceptions` is not referenced by any other visible cell.
exceptions=[]
# Results are appended one document at a time, so an interrupted run still leaves
# the documents processed so far in response_body.
for doc_index,doc in enumerate(nlp.pipe(gu_article_list, batch_size=20)):
    response_body.append(get_data(doc_index,doc, ent_types))

In [None]:
len(response_body)

In [None]:
response_body[0]

In [None]:
response_body[0].keys()

In [None]:
response_body[0]['ents'][0]

In [None]:
pd.DataFrame.from_dict({0:response_body[0]['ents'][0]},orient='index')

In [None]:
# Preview on the first document only: one row per extracted entity, tagged with
# the document it came from.
# Each row is a fresh dict, so the entity dicts held by `response_body` are not
# modified as a side effect, and the frame is built in one call from a list of
# records instead of an index-keyed dict.
first_response = response_body[0]
records = [
    {**ent, 'doc_index': first_response['doc_index']}
    for ent in first_response['ents']
]
df = pd.DataFrame(records)

In [None]:
df.head()

In [None]:
len(response_body)

In [None]:
# Flatten every document's entities into one table: one row per entity, with the
# originating document recorded in `doc_index`.
# Each row is a fresh dict, so the entity dicts held by `response_body` are not
# modified as a side effect. Rows get a default 0..n-1 index, which matches the
# running counter that was previously maintained by hand.
records = [
    {**ent, 'doc_index': response['doc_index']}
    for response in response_body
    for ent in response['ents']
]
df = pd.DataFrame(records)

In [None]:
df.shape

In [None]:
df.head()

In [None]:
df.groupby(['text'])['text'].count()

In [None]:
df.to_csv('2022_NE_extraction.csv')

In [3]:
df=pd.read_csv('2022_NE_extraction.csv',index_col=0)

In [4]:
df.head()

Unnamed: 0,text,label,start,end,doc_index
0,Zachary Rolfe,PERSON,22,35,0
1,the Northern Territory,GPE,73,95,0
2,A week,DATE,122,128,0
3,more than two dozen,CARDINAL,186,205,0
4,John Burns,PERSON,358,368,0


In [5]:
df[df['label']=='PERSON'].shape

(232515, 5)

In [6]:
df.shape

(924269, 5)

In [9]:
df['label'].value_counts()/df.shape[0]

PERSON         0.251566
GPE            0.153730
ORG            0.148293
DATE           0.144345
CARDINAL       0.093142
NORP           0.060367
TIME           0.026319
ORDINAL        0.025438
WORK_OF_ART    0.020238
MONEY          0.014003
PERCENT        0.012634
LOC            0.012298
FAC            0.011610
EVENT          0.009460
QUANTITY       0.006917
PRODUCT        0.006282
LAW            0.002121
LANGUAGE       0.001236
Name: label, dtype: float64

In [10]:
df[df['label']=='LAW']

Unnamed: 0,text,label,start,end,doc_index
55,the Evidence Act,LAW,5889,5905,0
162,the Equality Act,LAW,2115,2131,3
196,Roe v Wade,LAW,578,588,4
212,Dred Scott,LAW,1729,1739,4
308,Dred Scott,LAW,7044,7054,8
...,...,...,...,...,...
922825,Cultural Heritage Management Plan,LAW,15287,15320,10022
923054,Roe v Wade,LAW,30734,30744,10022
923324,Anti-Discrimination Act,LAW,46918,46941,10022
924185,Vision 2030,LAW,4910,4921,10031


In [11]:
df[df['label']=='LANGUAGE']

Unnamed: 0,text,label,start,end,doc_index
322,English,LANGUAGE,529,536,9
2836,English,LANGUAGE,2061,2068,53
3264,English,LANGUAGE,2616,2623,66
3885,French,LANGUAGE,553,559,80
4878,Latin,LANGUAGE,3713,3718,100
...,...,...,...,...,...
922447,English,LANGUAGE,226,233,10019
922448,English,LANGUAGE,407,414,10019
922463,English,LANGUAGE,1890,1897,10019
923884,Bengali,LANGUAGE,1016,1023,10028


In [20]:
df.loc[df['label']=='ORG','text'].unique()[:100]

array(['Ryder', 'The Australian Law Reform Commission',
       'The Judicial Commission of New South Wales',
       'the child sexual abuse royal commission', 'New York Times',
       'MyCrossword', 'Independent', 'New York Times Games', 'Guardian',
       'the Guardian Bookshop', 'Barclaycard Business', 'Barclays',
       'Barclaycard Multicurrency', 'Barclaycard', 'Stonewall',
       'Garden Court', 'LGB Alliance', 'Times', '@stonewalluk',
       'the Equality and Human Rights Commission', 'YouGov',
       'Garden Court’s', 'the Bar Council’s', 'supreme court',
       'American Constitution Society', 'Board of Education',
       'Perspectus Global', 'Yeo Valley Organic', 'Brompton', 'Observer',
       'Wells Fargo', 'Crump', 'Johnson &amp', 'Johnson',
       'the National Council of Negro Women', 'Johnson &amp;',
       'Outlier Media', 'Outlier', 'The Bureau Local', 'Deliveroo',
       'the Content Authenticity Initiative',
       'The Content Authenticity Initiative',
       'Organ

In [17]:
df.loc[df['label']=='FAC','text'].unique()[:100]

array(['Garden Court', 'Route 66', 'Aero', 'Auschwitz',
       'Noël Coward Theatre', 'West End', 'Fort William', 'Glenfinnan',
       'Arthur’s Seat', 'Dugald Stewart Monument', 'Collective',
       'City Observatory', 'Edinburgh Castle', 'Greyfriars Kirkyard',
       'Craigmillar Castle', 'Holyrood Park',
       'the Palace of Holyroodhouse', 'Blackness Castle', 'Holyrood',
       'Tantallon Castle', 'Rosslyn Chapel', 'Celtic Connections',
       'Old Fruitmarket', 'Grand Ole Opry', 'the Whitechapel Gallery',
       'Arena', 'Diocletian’s Palace', 'the Colosseum in Rome',
       'Meštrović Gallery', 'Roman Arena', 'Euphrasian Basilica Splendid',
       'Euphrasian Basilica', 'Church of the Holy Cross',
       'Lovrijenac Fortress', 'Yarra’s Edge', 'Fishermans Bend',
       'Voyager', 'South Wharf', 'the Charles Grimes Bridge', 'Southbank',
       'Dukes Walk', 'Marvel Stadium', 'the West Gate Freeway',
       'Hogwarts', 'La Plagne Home', 'St Mary’s stadium', 'St Mary’s',
       'Kid

In [19]:
df.loc[df['label']=='EVENT','text'].unique()[:100]

array(['the International Journalism Festival', 'COP26',
       'the second world war', 'Edinburgh International Film Festival',
       'Year of Stories 2022', 'the Trojan Horse Affair',
       'Pula film festival', 'The Dubrovnik summer',
       'the Greek civil war', 'Operation Raleigh', 'the Winter Olympics',
       'EURO 2022', 'the “Fundraising Event', 'the Fundraising Event',
       'The Fundraising Event', 'this Fundraising Event',
       'Fundraising Event for the Disasters Emergency Committee',
       'World Health Day', 'the Covid-19 pandemic',
       'International Women’s Week', 'the Observer Food Monthly Awards',
       'the Prize Draw', 'this Prize Draw', 'New Year',
       'the Observer Food Monthly Awards 2022', 'Euro', 'Women’s Euro',
       'The UEFA Women’s EURO 2022', 'the Euro tournament',
       'Genocide Convention', 'the Cultural Revolution',
       'the Academy Awards', 'the Korean war', 'Berlin',
       'World Crazy Golf Championships', 'Secondhand September',

In [79]:
df[['text']].value_counts().iloc[:100]

text       
Ukraine        12214
first          11300
one             9688
Russia          9431
Russian         8994
               ...  
morning          867
Australians      859
Omicron          859
Coalition        852
half             852
Length: 100, dtype: int64

In [43]:
#df[['text']].value_counts().to_csv('500_article_sample_NER_counts.csv.gz')