In [1]:
from newspaper import Article
import hashlib
import json
import time

In [2]:
def download_article(url, max_sec=10):
    """
    Download a news article and return its title and body together with
    the MD5 checksum of each.

    :param str url: url to news article
    :param int max_sec: maximum number of one-second pauses (each followed
        by a new download attempt) before the article is given up on

    :rtype: tuple
    :return: (title, title_checksum, body, body_checksum); all four entries
        are None when the article could not be downloaded
    """
    a = Article(url, language='en')
    a.download()
    attempts = 0

    # Nothing changes `is_downloaded` while we merely sleep, so every pause
    # is followed by a fresh download attempt.
    # NOTE(review): assumes Article.download() is a synchronous call - confirm
    # against the installed newspaper version.
    while not a.is_downloaded:
        # `>=` (instead of `==`) guarantees termination even for max_sec <= 0.
        if attempts >= max_sec:
            print("Extraction error with the article %s" % url)
            return (None, None, None, None)

        time.sleep(1)
        attempts += 1
        a.download()

    a.parse()
    title = a.title
    content = a.text

    # Checksums are the hex MD5 digests of the UTF-8 encoded strings
    # (str.encode() defaults to UTF-8).
    title_hash = hashlib.md5(title.encode()).hexdigest()
    content_hash = hashlib.md5(content.encode()).hexdigest()

    return (title, title_hash, content, content_hash)

In [3]:
def pre_tokenization_to_conll(tokenization_path,
                              output_path,
                              text):
    """
    Rebuild a token file from stand-off tokenization information.

    Every non-blank line of the tokenization file consists of three
    tab-separated fields: a token id, a character offset into `text` and a
    token length. For each such line, one line "<id>\\t<token>" is written
    to the output file, where the token is `text[offset: offset + length]`
    with newlines replaced by '-'. Blank input lines are copied to the
    output as blank lines.

    :param str tokenization_path: path to the file with id, offset, length
    :param str output_path: path the reconstructed file is written to
    :param str text: the text the offsets refer to

    :raises ValueError: if a non-blank line does not have exactly three
        fields or if offset / length are not integers
    """
    # The input is opened first so that a missing tokenization file does not
    # leave an empty output file behind. UTF-8 is explicit because tokens may
    # contain non-ASCII characters that a locale-dependent default encoding
    # might not be able to write.
    with open(tokenization_path, encoding='utf-8') as infile, \
            open(output_path, 'w', encoding='utf-8') as outfile:
        for line in infile:
            fields = line.strip()

            if not fields:
                # keep blank separator lines instead of failing on unpacking
                outfile.write('\n')
                continue

            id_, offset, length = fields.split('\t')
            start = int(offset)
            token = text[start: start + int(length)]

            # a token has to stay on a single output line
            token = token.replace('\n', '-')

            outfile.write(f'{id_}\t{token}\n')

In [4]:
# Driver: for every document, scrape the article again, compare the MD5
# checksums of title and body with the stored ones and, when they match,
# reconstruct the token files in the `post` folder.
input_folder = 'trial'

path_doc_id2article_url = f'{input_folder}/pre/doc_id2article_url.json'
# `with` closes the file handle that a bare json.load(open(...)) would leak
with open(path_doc_id2article_url, encoding='utf-8') as infile:
    doc_id2article_url = json.load(infile)

title_succes = 0
title_failed = 0
body_succes = 0
body_failed = 0

maximum = 100          # number of documents to process at most
progress_interval = 1  # print the counter every n-th document
counter = 0

for doc_id, article_url in doc_id2article_url.items():

    # Check the limit before counting, so that exactly `maximum` documents
    # are processed (checking after the increment stopped one short).
    if counter >= maximum:
        break
    counter += 1

    if counter % progress_interval == 0:
        print(counter)

    # load checksums (a JSON file with at least the keys 'title' and 'body')
    checksum_path = f'{input_folder}/pre/{doc_id}.checksum.conll'
    with open(checksum_path, encoding='utf-8') as infile:
        checksums = json.load(infile)

    # scrape article
    title, title_hash, content, content_hash = download_article(article_url)
    if title is None:
        # download failed: neither title nor body can be reconstructed
        title_failed += 1
        body_failed += 1
        continue

    # check checksums
    if checksums['title'] == title_hash:

        # reconstruct title.conll
        pre_title_conll = f'{input_folder}/pre/{doc_id}.title.conll'
        post_title_conll = f'{input_folder}/post/{doc_id}.title.conll'

        pre_tokenization_to_conll(pre_title_conll, post_title_conll, title)

        title_succes += 1

    else:
        print(f'CHECKSUM for title of {doc_id} failed: {article_url}')
        title_failed += 1

    if checksums['body'] == content_hash:

        # reconstruct body.conll
        pre_body_conll = f'{input_folder}/pre/{doc_id}.body.conll'
        post_body_conll = f'{input_folder}/post/{doc_id}.body.conll'

        pre_tokenization_to_conll(pre_body_conll, post_body_conll, content)

        body_succes += 1

    else:
        print(f'CHECKSUM for body of {doc_id} failed: {article_url}')
        body_failed += 1

print('body failed', body_failed)
print('body succes', body_succes)
print('title failed', title_failed)
print('title succes', title_succes)

1
2
3
4
5
6
7
8
9
CHECKSUM for body of 76d9577828975c547d3812110322de04 failed: http://web.archive.org/web/20170124120339/http://www.wsmv.com/story/23415177/4-people-found-dead-inside-car-in-cumberland-county
10
11
12
13
14
15
16
CHECKSUM for body of 5191a917a2f7f5606e7b59a838eb65b5 failed: http://web.archive.org/web/20170122214353/http://baltimore.cbslocal.com/2013/08/25/juvenile-dead-6-shot-after-dice-game-goes-bad/
17
18
19
20
21
22
23
24
25
CHECKSUM for title of b9542e0b2850d3f3c863985513f9bee2 failed: http://web.archive.org/web/20160918165730/http://abc13.com:80/news/mystery-surrounds-west-houston-mass-shooting/1362286/
CHECKSUM for body of b9542e0b2850d3f3c863985513f9bee2 failed: http://web.archive.org/web/20160918165730/http://abc13.com:80/news/mystery-surrounds-west-houston-mass-shooting/1362286/
26
27
28
29
30
31
32
33
34
35
36
37
38
39
CHECKSUM for body of 61f0eb314dd1e960cc77e5b2003c5424 failed: http://web.archive.org/web/20160712023541/http://www.news-journalonline.com:80/a