In [2]:
import os
import json
from typing import Dict, List, Set, Tuple, Any
from multiprocessing import Pool
from contextlib import suppress

from GoogleNews import GoogleNews
from newspaper import Article
from newspaper.article import ArticleException
from tqdm import tqdm, trange

In [3]:
def get_news(query: str, pages: int=35) -> List[Dict[str, Any]]:
    """
    Search news defined by query.
    Returns a list of search results.
    
    Parameters
    ----------
    query: str
        The news search query to use.
        
    Returns
    -------
    news: list of news items.
        News list, each element in the list is a dictionary containing news details like title, date, URL etc.
    """
    
    googlenews = GoogleNews(start='01/01/2010',end='01/01/2015')
    googlenews.search(query)
    news = []
    for page in (t := tqdm(range(pages), leave=False)):
        googlenews.get_page(page)
        news += googlenews.results()
        
    return news

In [4]:
def get_article(news_item: Dict[str, Any], save_path: str) -> None:
    """
    Downloads a item from the URL provided by the news_item dict.
    
    Parameters
    ----------
    news_item: Dict[str, any]
        A single news_item which contains fields like: date, link, title etc. etc.

    save_path: str
        Location to save the news article to.
    """
    with suppress(ArticleException):
        article = Article(news_item['link'])
        article.download()
        article.parse()
        with open(os.path.join(save_path, f'{news_item["title"]}.txt'), 'w') as f:
            f.write(article.text)

In [5]:
def load_city_json(path: str, city: str) -> Dict[str, List[str]]:
    """
    Loads a city JSON file.
    
    Parameters
    ----------
    path: str
        Path to the JSON file.
        
    city: str
        name of the city JSON to load.
        
    Returns
    -------
    locations_json: Dict[str, List[str]]
        Dictionary that maps areas to list of neighborhoods in the area.
    """
    with open(os.path.join(path, f'{city}.json'), 'r') as f:
        locations_json = json.loads(f.read())
    
    return locations_json

In [14]:
def get_articles(city_locations: Dict[str, List[str]], city: str, save_path: str) -> None:
    
    """
    Fetch all articles for all neighborhoods in the city defined by the city_locations JSON.
    
    Parameters
    ----------
    city_locations: Dict[str, List[str]]
        Dictionary that maps areas to list of neighborhoods in the area.
        
    city: str
        name of the city we are getting articles for.
        
    save_path: str
        Inital save path for the articles.
    """
    
    for area, neighborhoods in (t1 := tqdm(city_locations.items(), leave=False)):
        t1.set_description(area)
        
        for neighborhood in (t2 := tqdm(neighborhoods, leave=False)):
            t2.set_description(neighborhood)
            path = os.path.join(save_path, city, area, neighborhood)
            if not os.path.exists(path):
                os.makedirs(path)
            news_articles = get_news(f'{city} {neighborhood}')
            with Pool(5) as p:
                p.starmap(get_article, product(news_articles, [path]))
#             for news in news_articles:
#                 get_article(news, path)

In [15]:
ny_areas = load_city_json('../data/jsons/', 'New York')

In [16]:
di = {'Bronx': ['Melrose', 'Mott Haven']}

In [17]:
get_articles(di, 'New York', '../data/articles')

Bronx:   0%|          | 0/1 [00:00<?, ?it/s]
  0%|          | 0/2 [00:00<?, ?it/s][A
Melrose:   0%|          | 0/2 [00:00<?, ?it/s][A

  0%|          | 0/35 [00:00<?, ?it/s][A[A

  3%|▎         | 1/35 [00:02<01:21,  2.39s/it][A[A

  6%|▌         | 2/35 [00:04<01:18,  2.38s/it][A[A

  9%|▊         | 3/35 [00:06<01:09,  2.19s/it][A[A

 11%|█▏        | 4/35 [00:08<01:09,  2.26s/it][A[A

 14%|█▍        | 5/35 [00:11<01:11,  2.37s/it][A[A

 17%|█▋        | 6/35 [00:13<01:08,  2.38s/it][A[A

 20%|██        | 7/35 [00:16<01:10,  2.51s/it][A[A

 23%|██▎       | 8/35 [00:19<01:08,  2.55s/it][A[A

 26%|██▌       | 9/35 [00:21<01:01,  2.36s/it][A[A

 29%|██▊       | 10/35 [00:23<01:00,  2.41s/it][A[A

 31%|███▏      | 11/35 [00:26<00:58,  2.43s/it][A[A

 34%|███▍      | 12/35 [00:28<00:53,  2.32s/it][A[A

 37%|███▋      | 13/35 [00:30<00:52,  2.38s/it][A[A

 40%|████      | 14/35 [00:33<00:49,  2.37s/it][A[A

 43%|████▎     | 15/35 [00:35<00:48,  2.41s/it][A[A

 

KeyboardInterrupt: 

In [None]:
news = get_news('New York Queens')

In [None]:
get_article(news[0], '../')

In [None]:
for new in news:
    print(new['link'])