# TODO
* speed
* distribution of sources for which we do not have info

In [1]:
import newspaper
import pandas
from IPython.display import clear_output
from time import sleep

In [2]:
from multiprocessing.dummy import Pool as ThreadPool

In [3]:
from newspaper import Article

In [10]:
def website_extraction(url, max_sec=5, debug=False):
    """
    attempt to obtain:
    1. title
    2. document
    3. document creation time

    Exceptions raised by the download or parse step are not caught here
    and propagate to the caller.

    :param str url: url of news article
    :param int max_sec: maximum number of one-second polls to wait for the
    download to be flagged as finished; once reached, the empty result
    described below is returned
    :param bool debug: if debug, print every meta data key/value pair to stdout

    :rtype: dict
    :return: dict containing keys
    1. 'title',
    2. 'document'
    3. [most reliable] 'publish_date'
    4. [slightly less reliable] 'published_time'
    5. [slightly slightly less reliable] 'date'
    On timeout, 'title' and 'document' are empty strings and the three date
    entries are None. 'published_time' and 'date' are also None when the
    article's meta data has no entry under that key.
    """
    article = Article(url)
    article.download()

    attempts = 0
    while not article.is_downloaded:
        sleep(1)
        attempts += 1

        # '>=' rather than '==': with max_sec <= 0 (or a non-integer value)
        # an equality test would never be true and the loop would never end.
        if attempts >= max_sec:
            return {'title': '',
                    'document': '',
                    'publish_date': None,
                    'published_time': None,
                    'date': None}

    article.parse()

    meta_data = article.meta_data

    if debug:
        for key, value in meta_data.items():
            print(key, value)

    # .get() instead of subscripting: a missing key yields None (the same
    # value used in the timeout result) instead of raising KeyError on a
    # plain dict or inserting a placeholder entry on a defaulting mapping.
    # NOTE(review): assumes meta_data is dict-like; it is already used with
    # .items() above.
    return {'title': article.title,
            'document': article.text,
            'publish_date': article.publish_date,
            'published_time': meta_data.get('published_time'),
            'date': meta_data.get('date')}

In [11]:
def article_iterable(df, num_sources=None):
    """
    article generator

    For every row of the dataframe, the union of the row's
    'incident_sources' and its 'source_url' is yielded one source at a time.
    Iteration over the dataframe stops as soon as the limit is reached, so
    rows beyond that point are never read.

    :param pandas.core.frame.DataFrame df: gunviolence dataframe
    (assumes 'incident_sources' holds a set per row, since it is combined
    with a set via '|' -- TODO confirm against the pickled frames)
    :param int num_sources: restrict to num_sources sources,
    if None all will be returned

    :rtype: generator
    :return: generator of news articles
    """
    # A limit of zero (or less) means there is nothing to yield at all.
    if num_sources is not None and num_sources <= 0:
        return

    yielded = 0
    for _, row in df.iterrows():
        # a set union, so a source_url that is also listed in
        # incident_sources is only yielded once per row
        sources = row['incident_sources'] | {row['source_url']}
        for source in sources:
            yield source
            yielded += 1

            # Leave the generator entirely: a plain 'break' would only exit
            # the inner loop and keep scanning every remaining row.
            if num_sources is not None and yielded >= num_sources:
                return

In [12]:
# Names of the pickled frames to load from ../GunViolence/frames/.
frames = [
    'mass_shootings',
    'mass_shootings_2013',
    'mass_shootings_2014',
    'mass_shootings_2015',
]
# Unpickle each frame in list order and stack them into a single dataframe.
df = pandas.concat(
    list(map(lambda name: pandas.read_pickle('../GunViolence/frames/' + name),
             frames))
)

In [28]:
%%time
# Generator over at most 100 source urls taken from the combined dataframe.
iterable = article_iterable(df, num_sources=100)
# Run the extraction in 10 worker threads. The with-block shuts the pool down
# once map() has returned, so re-running this cell does not leave more and
# more idle worker threads behind. 'pool' is not usable after the block.
with ThreadPool(10) as pool:
    results = pool.map(website_extraction, iterable)

CPU times: user 44.2 s, sys: 18.3 s, total: 1min 2s
Wall time: 53.1 s


In [None]:
# Manual spot check: set to True to step through up to 10 sources one by one.
inspect = False
if inspect:
    for source in article_iterable(df, num_sources=10):
        clear_output()
        result = website_extraction(source)

        # an empty line, then the url, then one "key value" line per entry
        print('', source, sep='\n')
        for key, value in result.items():
            print(key, value)
        # wait for the user before moving on to the next source
        input('continue?')