# Sample paragraphs from downsampled Guardian content to use in EL train/test datasets

In [1]:
import pandas as pd
import glob
from datetime import datetime
from bs4 import BeautifulSoup
import random 

In [2]:
# set random number seed
rng_seed=42

In [3]:
def get_article_paragraphs(html_text: str):
    
    """ Takes the full html of an article (CAPI format) and strips out all HTML tags. 
        Creates paragraphs from the <p></p> HTML items.

        :param text: the raw HTML of an article
        
        returns: article paragraphs: list(str)
        """

    soup = BeautifulSoup(html_text, features="html.parser")
    
    # Remove article embellishments (sub-headings, figures, asides, etc.) 
    for h2 in soup.find_all('h2'):
        try:
            soup.h2.extract()
        except:
            pass
    
    for span in soup.find_all('span'):
        try:            
            soup.span.extract()
        except:
            pass

    for aside in soup.find_all('aside'):
        try:
            soup.aside.extract()
        except:
            pass
    
    for figure in soup.find_all('figure'):
        try:
            soup.figure.extract()
        except:
            pass
        
    for a in soup.find_all('a'):
        a.unwrap()
        
    paragraphs = [p.getText() for p in  soup.find_all('p')]
    
    return paragraphs

## Select guardian articles belonging to handpicked sections (politics, business, crime, ...)

In [4]:
# Concat all articles
gu_body_texts=''
first_cycle=1
for gu_file in glob.glob('../assets/sampled_*'):
    if 'ner' not in gu_file:
        year=gu_file.split('_')[-1].split('.')[-2]
        gu_article=pd.read_csv(gu_file)
        #gu_article=gu_article[['body_text']]
        gu_article['article_year']=int(year)
        gu_article['doc_index']=gu_article.index.values
        if first_cycle:
            gu_body_texts=gu_article
            first_cycle=0
        else:
            gu_body_texts=pd.concat([gu_body_texts, gu_article],ignore_index=True)
gu_body_texts.dropna(subset=['body_text'],inplace=True)

In [5]:
# Filter out non-article content
# Removed liveblogs as they have several url pages making it difficult for the annotator to identify the paragraph/context where the mention appears
gu_body_texts=gu_body_texts[gu_body_texts['content_type']=='Article']

In [6]:
# Manual selection of relevant sections - likely to contain political/crime/corruption related articles 
gu_body_texts=gu_body_texts[gu_body_texts['section_id'].isin(['law', 'global','global-development', 'news', 'world', 'uk-news', 'us-news', 'business','politics'])]

In [7]:
gu_sample=gu_body_texts

In [8]:
gu_sample.head()

Unnamed: 0,path,headline,url,content_type,section_id,pillar_id,web_publication_date,word_count,trail_text,production_office,byline,body_text,body_html,keyword_tag,article_year,doc_index
0,/law/commentisfree/2022/mar/26/jurors-who-sat-...,Jurors who sat in the Zachary Rolfe murder tri...,www.theguardian.com/law/commentisfree/2022/mar...,Article,law,pillar/news,2022-03-25 22:24:09+00:00,1114,"In criminal trials, the information that is ex...",Aus,Richard Ackland,Jurors who sat on the Zachary Rolfe murder tri...,<p>Jurors who sat on the Zachary Rolfe murder ...,['law/law-australia' 'australia-news/australia...,2022,0
3,/law/2022/apr/29/barrister-allison-bailey-ston...,Barrister was discriminated against for gender...,www.theguardian.com/law/2022/apr/29/barrister-...,Article,law,pillar/news,2022-04-29 12:23:43+00:00,621,Allison Bailey says Garden Court chambers and ...,Uk,Haroon Siddique Legal affairs correspondent,A barrister was unlawfully discriminated again...,<p>A barrister was unlawfully discriminated ag...,['law/employment-law' 'law/law' 'world/gender'...,2022,3
4,/law/2022/jun/17/sonia-sotomayor-supreme-court...,Sonia Sotomayor says supreme court’s ‘mistakes...,www.theguardian.com/law/2022/jun/17/sonia-soto...,Article,law,pillar/news,2022-06-17 09:00:06+00:00,483,Liberal-leaning justice says ‘there are days I...,Us,Guardian staff and agencies,The liberal-leaning supreme court justice Soni...,<p>The liberal-leaning supreme court justice S...,['law/us-supreme-court' 'us-news/us-news'],2022,4
8,/law/2022/apr/29/its-part-of-our-culture-to-ma...,‘It’s part of our culture to marginalize minor...,www.theguardian.com/law/2022/apr/29/its-part-o...,Article,law,pillar/news,2022-04-29 21:37:57+00:00,1285,The prominent civil rights attorney represente...,Us,Edwin Rios,Civil rights attorney Ben Crump sees a through...,<p>Civil rights attorney Ben Crump sees a thro...,['law/law-us' 'world/race'],2022,8
10,/law/2022/jul/22/legislating-against-lying-wou...,Legislating against lying would do more harm t...,www.theguardian.com/law/2022/jul/22/legislatin...,Article,law,pillar/news,2022-07-22 16:52:59+00:00,427,<strong>Letters: </strong>Attacking freedom of...,Uk,Letters,My friend Dr Sam Fowles’s proposal for a “trut...,<p>My friend Dr Sam Fowles’s proposal for a “t...,['law/law' 'politics/politics' 'media/media' '...,2022,10


In [9]:
gu_sample.shape

(9098, 16)

## Read in raw KB dataset

In [10]:
# Read in entities present in the KB to sample paragraphs with full name matches
dataset='full'
kb_iteration='2022_11_07'
dataset=f'{dataset}_{kb_iteration}'
entity_data=pd.read_csv(f'../../2_kb_datasets/assets/kb_entities_{dataset}.csv',index_col=0)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [11]:
# Cast aliases as lower case 
aliases=entity_data['name'].str.lower()

In [12]:
print(f'There are {len(aliases)} entities in the KB dataset with {len(set(aliases))} unique aliases')

There are 428519 entities in the KB dataset with 414002 unique aliases


## Load NER entities

In [13]:
# Concat all ner files
first_cycle=1
for gu_file in glob.glob('../assets/sampled_*'):
    if 'ner' in gu_file:
        year = int(gu_file.split('/')[-1].split('_')[-2])
        gu_article=pd.read_csv(gu_file, index_col=0)
        gu_article['article_year']=year
        if first_cycle:
            gu_ents=gu_article
            first_cycle=0
        else:
            gu_ents=pd.concat([gu_ents, gu_article],ignore_index=True)

In [14]:
gu_ents=gu_ents.dropna()

In [15]:
# Rename entity column
gu_ents.rename(columns={'text':'ents'}, inplace=True)

## Filter out content without potential NER/KB matches

In [16]:
# Find entities with common aliases between KB and Guardian content
gu_person_entities=gu_ents.loc[(gu_ents['label'] == 'PERSON'), 'ents'].unique()
gu_ents_set=set([ent.lower() for ent in gu_person_entities])
common_ents=set(gu_ents_set).intersection(set(aliases))

In [17]:
# Filter out irrelevant entities
gu_ents=gu_ents[gu_ents['ents'].str.lower().isin(common_ents)]

In [18]:
print('There are {} common entities between kb and gu article sample'.format(len(common_ents)))

There are 11990 common entities between kb and gu article sample


In [19]:
gu_sample.shape

(9098, 16)

In [20]:
# Filter out content without potential EL matches
content_with_common_ents=gu_sample[['doc_index','article_year']].merge(gu_ents[['doc_index','article_year']]).drop_duplicates()
gu_sample=gu_sample.merge(content_with_common_ents)

In [21]:
gu_sample.head(2)

Unnamed: 0,path,headline,url,content_type,section_id,pillar_id,web_publication_date,word_count,trail_text,production_office,byline,body_text,body_html,keyword_tag,article_year,doc_index
0,/law/commentisfree/2022/mar/26/jurors-who-sat-...,Jurors who sat in the Zachary Rolfe murder tri...,www.theguardian.com/law/commentisfree/2022/mar...,Article,law,pillar/news,2022-03-25 22:24:09+00:00,1114,"In criminal trials, the information that is ex...",Aus,Richard Ackland,Jurors who sat on the Zachary Rolfe murder tri...,<p>Jurors who sat on the Zachary Rolfe murder ...,['law/law-australia' 'australia-news/australia...,2022,0
1,/law/2022/jun/17/sonia-sotomayor-supreme-court...,Sonia Sotomayor says supreme court’s ‘mistakes...,www.theguardian.com/law/2022/jun/17/sonia-soto...,Article,law,pillar/news,2022-06-17 09:00:06+00:00,483,Liberal-leaning justice says ‘there are days I...,Us,Guardian staff and agencies,The liberal-leaning supreme court justice Soni...,<p>The liberal-leaning supreme court justice S...,['law/us-supreme-court' 'us-news/us-news'],2022,4


## Break content down into paragraphs

In [22]:
# Select relevant columns
gu_sample_content=gu_sample[['url','doc_index','body_text','body_html', 'article_year']]
del(gu_sample)

In [23]:
# Split full article text into paragraphs using html tags
gu_sample_content['paragraphs'] = gu_sample_content['body_html'].apply(get_article_paragraphs)

In [24]:
gu_sample_content.shape

(6558, 6)

In [25]:
# Explode individual text paragraphs across rows 
gu_sample_content=gu_sample_content[['url','paragraphs','doc_index','article_year']].explode(['paragraphs'])

In [26]:
gu_sample_content.dropna(inplace=True)

In [27]:
gu_sample_content.shape

(120982, 4)

## Sample aliases and select at most 3 paragraphs per alias (downsample common aliases) 

In [42]:
resampled_gu_articlesmple aliases and find all paragraphs that contain sample mentions 
aliases=[ent for ent in common_ents if len(ent) > 4] # exclude very short aliases, as these tend to be NER model mistakes
n_aliases = 10000
sample_aliases=pd.DataFrame(aliases).sample(n_aliases, random_state=rng_seed)[0].tolist()
gu_sample_content['lower_paragraphs']=gu_sample_content['paragraphs'].str.lower()
start_time = datetime.now()
i = 0
first_cycle=True
start=datetime.now()
for alias in sample_aliases:
    paragraphs=gu_sample_content[
        gu_sample_content['lower_paragraphs'].str.contains(alias,na=False)
    ]
    paragraphs['alias']=alias
    if first_cycle:
        relevant_gu_paragraphs=paragraphs
        first_cycle=False
    else:
        relevant_gu_paragraphs=pd.concat([relevant_gu_paragraphs, paragraphs])
    i+=1
    if i%2000==0:
        end_time = datetime.now()
        print('Duration: {}'.format(end_time - start_time))
        print(i)
        print('--')
#
end_time=datetime.now()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [43]:
print(end_time-start_time)

0:01:19.239169


In [44]:
relevant_gu_paragraphs.drop_duplicates(inplace=True)

In [45]:
gu_sample_content.shape

(120982, 5)

In [46]:
relevant_gu_paragraphs.shape

(3062, 6)

In [47]:
relevant_gu_paragraphs['alias'].value_counts()

anita              519
vladimir putin     463
rishi sunak        407
michael gove       142
ted cruz            60
                  ... 
octavia spencer      1
gary peters          1
alison thewliss      1
dave schatz          1
david taylor         1
Name: alias, Length: 461, dtype: int64

In [48]:
relevant_gu_paragraphs.shape

(3062, 6)

In [49]:
# split paragraph dataset according to alias prevalence  
entity_counts=relevant_gu_paragraphs['alias'].value_counts()
gu_ents_low_prevalence=relevant_gu_paragraphs[relevant_gu_paragraphs['alias'].isin(entity_counts[entity_counts<=3].index.values)]
gu_ents_high_prevalence=relevant_gu_paragraphs[relevant_gu_paragraphs['alias'].isin(entity_counts[entity_counts>3].index.values)]

In [50]:
# Sequentially downsample paragraphs for groups of very prevalent entities
cycle=0
unique_ents=set(gu_ents_high_prevalence['alias'])
n_cycles=10
n_ents=int(len(unique_ents)/n_cycles)
n_paragraphs=3
for entities in zip(*(iter(unique_ents),) * n_ents):
    entities_df=gu_ents_high_prevalence[gu_ents_high_prevalence['alias'].isin(entities)]
    entities_df=entities_df.groupby('alias').apply(lambda x: x.sample(n=n_paragraphs, random_state=rng_seed, replace=False)).reset_index(drop = True)
    if cycle==0:
        downsampled_gu_ents_high_prevalence=entities_df
    else:
        downsampled_gu_ents_high_prevalence=pd.concat([downsampled_gu_ents_high_prevalence,entities_df], ignore_index=True)
    cycle+=1

In [51]:
resampled_gu_articles=pd.concat([downsampled_gu_ents_high_prevalence,gu_ents_low_prevalence])

In [52]:
# Group unique paragraphs by aliases into a list
resampled_gu_articles['alias']=resampled_gu_articles['alias'].str.title()
resampled_gu_articles=resampled_gu_articles.groupby(['url','paragraphs','doc_index','article_year']).agg({'alias':lambda x: x.unique()}).reset_index()
resampled_gu_articles['alias']=resampled_gu_articles['alias'].apply(lambda x: [x] if type(x)==str else x)
resampled_gu_articles.rename(columns={'alias':'paragraph_aliases'},inplace=True)
# First alias in paragraph
resampled_gu_articles['alias']=resampled_gu_articles['paragraph_aliases'].apply(lambda x: x[0])

In [53]:
resampled_gu_articles['paragraphs'].nunique()

766

In [54]:
resampled_gu_articles['alias'].value_counts()

Margaret Atwood     3
Jonathan Evans      3
Rachel Wolf         3
Ian Stewart         3
Michelle Obama      3
                   ..
Gwen Carr           1
Margaret Brennan    1
Lawson Bader        1
Steve Forbes        1
Aziz Huq            1
Name: alias, Length: 459, dtype: int64

In [55]:
# Reorder paragraphs
resampled_gu_articles=resampled_gu_articles.sample(frac=1, random_state=42)

In [64]:
# Rename text column
resampled_gu_articles=resampled_gu_articles.rename(columns={'paragraphs':'text'})

## Export sampled Guardian paragraphs 

In [66]:
# Export csv copy
resampled_gu_articles.to_csv('../assets/gu_sampled_paragraphs.csv') 

In [70]:
# Export paragraphs as .json for prodigy annotations
resampled_gu_articles[['text','url']].to_json('../../3_prodigy_annotations/assets/gu_sampled_paragraphs.jsonl', orient='records', lines=True) 