In [1]:
import pandas as pd

In [2]:
from bs4 import BeautifulSoup

def get_article_paragraphs(html_text: str):
    """Extract the plain-text paragraphs from an article's HTML (CAPI format).

    Sub-headings (``<h2>``), ``<span>``, ``<aside>`` and ``<figure>`` elements
    are removed together with everything inside them, ``<a>`` tags are
    unwrapped so their link text stays in place, and the text of every
    remaining ``<p>`` element is returned.

    :param html_text: the raw HTML of an article. ``None`` or a float
        (e.g. the NaN pandas uses for a missing cell) is treated as an
        article without content.

    returns: article paragraphs: list(str)
    """
    # Missing bodies come through pandas as NaN; there is nothing to parse.
    if html_text is None or isinstance(html_text, float):
        return []

    soup = BeautifulSoup(html_text, features="html.parser")

    # Remove article embellishments (sub-headings, figures, asides, etc.).
    # Each matched tag is detached directly, so no exception handling is
    # needed; nested matches are simply detached from their already-removed
    # parent.
    for tag in soup.find_all(['h2', 'span', 'aside', 'figure']):
        tag.extract()

    # Keep the link text but drop the surrounding <a> markup.
    for a in soup.find_all('a'):
        a.unwrap()

    return [p.getText() for p in soup.find_all('p')]

In [3]:
# Load the Guardian article sample; the first CSV column becomes the index.
gu_sample = pd.read_csv('gu_resampled_by_section_id.csv', index_col=0)

In [4]:
# Preview the first rows of the loaded article sample.
gu_sample.head()

Unnamed: 0,path,headline,url,content_type,section_id,pillar_id,web_publication_date,word_count,trail_text,production_office,byline,body_text,body_html,keyword_tag,article_year,doc_index
0,/law/commentisfree/2022/mar/26/jurors-who-sat-...,Jurors who sat in the Zachary Rolfe murder tri...,www.theguardian.com/law/commentisfree/2022/mar...,Article,law,pillar/news,2022-03-25 22:24:09+00:00,1114,"In criminal trials, the information that is ex...",Aus,Richard Ackland,Jurors who sat on the Zachary Rolfe murder tri...,<p>Jurors who sat on the Zachary Rolfe murder ...,['law/law-australia' 'australia-news/australia...,2022,0
3,/law/2022/apr/29/barrister-allison-bailey-ston...,Barrister was discriminated against for gender...,www.theguardian.com/law/2022/apr/29/barrister-...,Article,law,pillar/news,2022-04-29 12:23:43+00:00,621,Allison Bailey says Garden Court chambers and ...,Uk,Haroon Siddique Legal affairs correspondent,A barrister was unlawfully discriminated again...,<p>A barrister was unlawfully discriminated ag...,['law/employment-law' 'law/law' 'world/gender'...,2022,3
4,/law/2022/jun/17/sonia-sotomayor-supreme-court...,Sonia Sotomayor says supreme court’s ‘mistakes...,www.theguardian.com/law/2022/jun/17/sonia-soto...,Article,law,pillar/news,2022-06-17 09:00:06+00:00,483,Liberal-leaning justice says ‘there are days I...,Us,Guardian staff and agencies,The liberal-leaning supreme court justice Soni...,<p>The liberal-leaning supreme court justice S...,['law/us-supreme-court' 'us-news/us-news'],2022,4
8,/law/2022/apr/29/its-part-of-our-culture-to-ma...,‘It’s part of our culture to marginalize minor...,www.theguardian.com/law/2022/apr/29/its-part-o...,Article,law,pillar/news,2022-04-29 21:37:57+00:00,1285,The prominent civil rights attorney represente...,Us,Edwin Rios,Civil rights attorney Ben Crump sees a through...,<p>Civil rights attorney Ben Crump sees a thro...,['law/law-us' 'world/race'],2022,8
10,/law/2022/jul/22/legislating-against-lying-wou...,Legislating against lying would do more harm t...,www.theguardian.com/law/2022/jul/22/legislatin...,Article,law,pillar/news,2022-07-22 16:52:59+00:00,427,<strong>Letters: </strong>Attacking freedom of...,Uk,Letters,My friend Dr Sam Fowles’s proposal for a “trut...,<p>My friend Dr Sam Fowles’s proposal for a “t...,['law/law' 'politics/politics' 'media/media' '...,2022,10


In [5]:
# Turn each article's HTML body into a list of plain-text paragraphs.
gu_sample['paragraphs'] = gu_sample['body_html'].apply(get_article_paragraphs)

In [7]:
# Keep only the URL and the text/HTML/paragraph columns of each article.
gu_sample_content = gu_sample[['url', 'body_text', 'body_html', 'paragraphs']]

In [27]:
# Load the per-article content table from disk, replacing any in-memory frame.
# NOTE(review): no visible cell writes this CSV, and this cell's execution
# count (27) is later than its neighbours' — confirm it belongs at this point.
# If `paragraphs` was stored as a list column it comes back from CSV as plain
# strings, so a later explode would not split it — TODO confirm.
gu_sample_content = pd.read_csv('gu_resampled_by_section_id_content.csv', index_col=0)

In [9]:
# One row per paragraph: each list element gets its own row and the article's
# URL (and index label) is repeated alongside it.
gu_sample_content = gu_sample_content[['url', 'paragraphs']].explode('paragraphs')

In [17]:
# Spot-check the text of the very first paragraph row.
gu_sample_content['paragraphs'].iloc[0]

'jurors who sat on the zachary rolfe murder trial in the supreme court of the northern territory might be feeling cheated.'

In [16]:
# Lower-case every paragraph (presumably so it can be compared with the
# lower-case entity names used further down — verify).
gu_sample_content['paragraphs'] = gu_sample_content['paragraphs'].str.lower()

Unnamed: 0,url,paragraphs
165796,www.theguardian.com/world/2016/mar/30/frenchma...,françois molins said an “unprecedented” amoun...
166681,www.theguardian.com/world/2016/apr/10/brussels...,"three days after the brussels attacks, police ..."
167322,www.theguardian.com/world/2016/mar/31/salah-ab...,the latest raid was linked to a thwarted plot ...


In [12]:
# Load the knowledge-base entity table for the selected dataset variant.
dataset = 'full'
data = pd.read_csv(f'../kb_datasets/kb_entities_{dataset}.csv', index_col=0)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [15]:
# Show every knowledge-base entity whose name contains "trump".
data.loc[data['name'].str.contains('trump')]

Unnamed: 0,original_index,id,name,desc,AKA,kb_origin,kb_url
76360,184607,Q12071552,tiffany trump,American socialite Georgetown School Law Londo...,"['Tiffany Trump', '蒂芙尼·川普', 'टिफ़नी ट्रम्प', '...",open_sanctions,https://www.opensanctions.org/entities/Q12071552
80831,189312,Q1297871,"jean barker, baroness trumpington",British politician the Kingdom United Jean Lon...,"['Jean Barker, Baroness Trumpington', 'Barones...",open_sanctions,https://www.opensanctions.org/entities/Q1297871
109710,218696,Q22686,donald trump,president of the United States from to preside...,"['Donald Trump', 'Donaldus Ioannes Trump', 'ᏙᎾ...",open_sanctions,https://www.opensanctions.org/entities/Q22686
109980,218974,Q22952511,mary anne macleod trump,mother of Donald Trump Kingdom Tong United Tru...,"['Mary Anne MacLeod Trump', 'Mary Anne MacLeod...",open_sanctions,https://www.opensanctions.org/entities/Q22952511
110081,219076,Q23000814,barron trump,son of Donald and Melania Trump School Episcop...,"['Barron Trump', '배런 트럼프', 'Уільям Трамп', 'ব্...",open_sanctions,https://www.opensanctions.org/entities/Q23000814
111308,220321,Q239411,ivanka trump,American businesswoman Orthodox to Georgetown ...,"['Ivanka Trump', 'Ivanka Trampa', 'ইভাঙ্কায় ম...",open_sanctions,https://www.opensanctions.org/entities/Q239411
111844,220860,Q242351,ivana trump,businesswoman Charles Czechs Ivana United Trum...,"['Ivana Trump', '伊凡娜·川普', '伊凡娜侵', 'Ivana Tramp...",open_sanctions,https://www.opensanctions.org/entities/Q242351
115692,224821,Q27832616,charles s. trump,American politician Law of State male Princeto...,"['Charles S. Trump', 'Charles Samuel Trump IV']",open_sanctions,https://www.opensanctions.org/entities/Q27832616
117666,226839,Q28748031,robert trump,US business executive and real estate develope...,"['Robert Trump', 'رابرت ترامپ', '羅伯特·川普', '罗伯特...",open_sanctions,https://www.opensanctions.org/entities/Q28748031
127512,236756,Q35703322,lara trump,American television producer and campaign advi...,"['Lara Trump', '拉拉·特朗普', 'לארה טראמפ', 'Lara L...",open_sanctions,https://www.opensanctions.org/entities/Q35703322


In [13]:
# Preview the first rows of the knowledge-base entity table.
# (The previous `data['']` looked up a column with an empty name, which raises
# KeyError; the output recorded for this cell is the head of the frame.)
data.head()

Unnamed: 0,original_index,id,name,desc,AKA,kb_origin,kb_url
0,0,acf-00040861bc3f593000830d987d09967ef3503ef1,kolyvanov egor,Russian host of news program on NTV Federal ma...,"['Kolyvanov Egor', 'Колыванов Егор']",open_sanctions,https://www.opensanctions.org/entities/acf-000...
1,1,acf-0011c68a768924609dc5da5707ac7fa4c4d645a2,shipov sergei yurievich,Russian chess chess Publicly supported war aga...,"['Shipov Sergei Yurievich', 'Шипов Сергей Юрье...",open_sanctions,https://www.opensanctions.org/entities/acf-001...
2,2,acf-001e7e4c0363f08f1e784c230457960b84a6416f,egorov ivan mikhailovich,Deputy of the State Council of the Republic of...,"['Egorov Ivan Mikhailovich', 'Егоров Иван Миха...",open_sanctions,https://www.opensanctions.org/entities/acf-001...
3,3,acf-002c208139012c8d93b6298358188d7cadafe648,goreslavsky alexey sergeyevich,Russian journalist and media Helped destroy in...,"['Goreslavsky Alexey Sergeyevich', 'Гореславск...",open_sanctions,https://www.opensanctions.org/entities/acf-002...
4,4,acf-002cc8fdf8fe41185091a7cb6c598663e7a22eb5,samoilova natalya vladimirovna,Russian Supported the actions of the Russian m...,"['Samoilova Natalya Vladimirovna', 'Самойлова ...",open_sanctions,https://www.opensanctions.org/entities/acf-002...


In [25]:
# These were generated in the remote EC2 environment that holds the CSVs with
# all extracted entities.
full_name_match_aliases = pd.read_csv(
    'gu_resampled_by_section_id_full_name_kb_match_aliases.csv',
    index_col=0,
)

In [43]:
# Work with the first 1000 distinct alias names, in order of first appearance.
aliases = full_name_match_aliases['name'].unique()[:1000]

In [45]:
from datetime import datetime

In [55]:
# Collect every paragraph that mentions at least one of the selected aliases.
#
# The per-alias matches are gathered in a list and concatenated once at the
# end: re-concatenating and de-duplicating the growing result on every
# iteration repeats the same work for each alias. Concatenating in alias order
# and keeping the first occurrence of each duplicate yields the same rows in
# the same order as the incremental approach.
start_time = datetime.now()
matches_per_alias = []
for i, alias in enumerate(aliases, start=1):
    # regex=False: an alias is a literal name, so characters such as "." or
    # "(" must be matched as-is rather than interpreted as a regex pattern.
    alias_mask = gu_sample_content['paragraphs'].str.contains(
        alias, na=False, regex=False
    )
    matches_per_alias.append(gu_sample_content[alias_mask])

    # Progress report every 100 aliases.
    if i % 100 == 0:
        end_time = datetime.now()
        print('Duration: {}'.format(end_time - start_time))
        print(i)
        print('--')

if matches_per_alias:
    relevant_gu_paragraphs = pd.concat(matches_per_alias).drop_duplicates()
else:
    # No aliases at all: keep the expected columns but no rows.
    relevant_gu_paragraphs = gu_sample_content.iloc[0:0]

Duration: 0:01:26.078938
100
--
Duration: 0:03:04.123285
200
--
Duration: 0:04:48.255563
300
--
Duration: 0:06:31.387596
400
--
Duration: 0:08:46.439967
500
--
Duration: 0:10:59.977142
600
--
Duration: 0:12:44.991208
700
--
Duration: 0:14:30.306559
800
--
Duration: 0:16:21.654386
900
--
Duration: 0:18:09.135051
1000
--


In [58]:
# Shuffle the matched paragraphs and save them. The fixed random_state makes
# the shuffle, and therefore the exported file, reproducible across runs.
relevant_gu_paragraphs.sample(frac=1, random_state=42).to_csv(
    'gu_resampled_by_section_id_full_name_kb_match_1000_aliases_paragraphs.csv'
)

In [73]:
# Pull the paragraph texts and their URLs out of the matched rows as two
# parallel lists. Columns are selected by name: the previous row loop appended
# the `paragraphs` list to itself instead of the row's paragraph, and read the
# row values by position.
paragraphs = relevant_gu_paragraphs['paragraphs'].tolist()
urls = relevant_gu_paragraphs['url'].tolist()

In [32]:
# Read the exported alias-matched paragraphs back from disk.
resampled_gu = pd.read_csv(
    'gu_resampled_by_section_id_full_name_kb_match_1000_aliases_paragraphs.csv',
    index_col=0,
)

In [33]:
# Draw 2000 paragraphs at random; the fixed random_state makes the draw
# reproducible across runs.
resampled_gu = resampled_gu.sample(2000, random_state=42)

In [37]:
# Array of the sampled paragraph texts.
paragraphs = resampled_gu['paragraphs'].values

In [38]:
# Write the sampled paragraphs to a text file, one paragraph per line.
output_name = 'gu_resampled_by_section_id_full_name_kb_match_1000_aliases_paragraphs'
# The paragraphs contain non-ASCII characters (curly quotes, accented letters),
# so the encoding is set explicitly instead of relying on the platform default.
with open(f'../entity_source_data/{output_name}.txt', 'w', encoding='utf-8') as fp:
    for paragraph in paragraphs:
        fp.write("%s\n" % paragraph)
# Report completion only once the file has been closed and flushed.
print('Done')

Done


In [67]:
# Display the paragraph column of the alias-matched rows.
relevant_gu_paragraphs['paragraphs']

165796     françois molins said an “unprecedented” amoun...
166681    three days after the brussels attacks, police ...
167322    the latest raid was linked to a thwarted plot ...
127895    as well as sombre commemoration there was also...
2821      dmitry ivanov, a pro-democracy activist and co...
                                ...                        
139679    both were reported to have been close to being...
157275    when judge anthony kennedy was put forward for...
157593    castille’s participation in the case violated ...
157711    verrilli asserted the president’s authority to...
158283    the court has suspended executive actions in b...
Name: paragraphs, Length: 87437, dtype: object

In [63]:
# NOTE(review): `article_containing_alias_indices` is not defined in any visible
# cell, and the output recorded for this cell (['url', 'paragraphs']) looks like
# column names rather than paragraph text — presumably leftover scratch work;
# confirm before relying on it.
[df_row[['paragraphs']].values[0][0] for df_row in article_containing_alias_indices]

['url', 'paragraphs']

In [23]:
# Drop the previously loaded lines to free memory. Guarded so the cell is a
# no-op (rather than a NameError) when `lines` has not been loaded yet, e.g.
# on a fresh kernel where the loading cell has not run.
if 'lines' in globals():
    del lines

In [17]:
# Read the exported paragraphs back, one list entry per line (newline kept).
# The encoding is explicit because the text contains non-ASCII characters.
# NOTE(review): the export cell writes to ../entity_source_data/, while this
# reads from the working directory — confirm which location is intended.
with open(
    'gu_resampled_by_section_id_full_name_kb_match_1000_aliases_paragraphs.txt',
    encoding='utf-8',
) as f:
    lines = f.readlines()