In [1]:
from pathlib import Path
import random
import pycountry
from collections import Counter
from tqdm.notebook import tqdm
from tabulate import tabulate
from dateutil import parser
from datetime import datetime
from omnibelt import load_json, save_json

In [2]:
# Root directory that the JSON records are globbed from.
# NOTE(review): machine-specific absolute path — adjust before running elsewhere.
root = Path('/home/fleeb/workspace/local_data/nnn')

In [3]:
# Recursively collect every JSON file below the root and materialise the
# generator once so the list can be counted, sorted and sampled later.
recs = [*root.glob('**/*.json')]
len(recs)

152506

In [4]:
# The second '_'-separated token of every 'general_*.json' stem, de-duplicated.
codes = {
	general_file.stem.split('_')[1]
	for general_file in root.glob('**/general_*.json')
}
len(codes)

54

In [5]:
# Pick one random 'general_us.json' record to inspect by hand.
# (A previous draw from `recs` was overwritten immediately and has been dropped.)
path = random.choice(list(root.glob('**/general_us.json')))
path

PosixPath('/home/fleeb/workspace/local_data/nnn/raw_news/21-01-05/08-53-07/general_us.json')

In [4]:
def parse_date(date: str, time: str) -> datetime:
	"""Combine a date string and a time string into a single ``datetime``.

	Args:
		date: Calendar date formatted as ``YY-MM-DD`` (e.g. ``'21-01-05'``).
		time: Clock time formatted as ``HH-MM-SS`` (e.g. ``'08-53-07'``).

	Returns:
		A naive ``datetime`` carrying the year, month, day, hour, minute and
		second taken from the two inputs.

	Raises:
		ValueError: If either string does not match its expected format.
	"""
	day = datetime.strptime(date, '%y-%m-%d')
	clock = datetime.strptime(time, '%H-%M-%S')
	# Keep the seconds: the result is later formatted with ``%S``, so dropping
	# them here would stamp every record with ``:00``.
	return datetime.combine(day.date(), clock.time())

In [55]:
# Load the file at `path` and list the top-level keys of the result.
content = load_json(path)
content.keys()

dict_keys(['status', 'totalResults', 'articles', 'timestamp', 'nation', 'category'])

In [56]:
# Take the first entry of the 'articles' list and show which fields it carries.
art = content['articles'][0]
art.keys()

dict_keys(['source', 'author', 'title', 'description', 'url', 'urlToImage', 'publishedAt', 'content'])

In [57]:
# Inspect the raw 'publishedAt' value of the sample article.
art['publishedAt']

'2020-10-31T06:40:27Z'

In [52]:
# Recover the scrape metadata for `path`: the two directories directly above
# the file supply date and time, the file stem supplies the two name tokens.
*rest, scrape_date, scrape_time, _ = path.parts
collected = parse_date(scrape_date, scrape_time)
scraped = collected.strftime('%Y-%m-%dT%H:%M:%SZ')
cat, loc = path.stem.split('_')
(cat, loc, scraped)

('general', 'hk', '2021-03-31T07:59:00Z')

In [7]:
# Category names recognised when splitting a record's file stem.
cats = {
	'general',
	'business',
	'entertainment',
	'health',
	'science',
	'sports',
	'technology',
}

In [8]:
# Accumulator for the nested location -> category -> [articles] index.
full = dict()

In [9]:
# Build the nested index ``full[location][category] -> [article, ...]``.
# The dict is re-created here so that re-running this cell starts from a clean
# slate instead of appending every article a second time.
full = {}
itr = tqdm(sorted(recs))
for p in itr:
	# Expected layout: .../<scrape_date>/<scrape_time>/<name>.json
	scrape_date, scrape_time = p.parts[-3:-1]
	if scrape_date.startswith('bad'):
		# Directories whose name starts with 'bad' are skipped entirely.
		continue
	scraped = parse_date(scrape_date, scrape_time).strftime('%Y-%m-%dT%H:%M:%SZ')
	cat, loc = p.stem.split('_')
	if cat not in cats:
		# The two stem tokens may come in either order; swap them when the
		# second one is the recognised category, otherwise give up.
		if loc not in cats:
			raise ValueError(f'Unknown category {cat} {loc}')
		cat, loc = loc, cat
	content = load_json(p)
	for a in content['articles']:
		# Tag each article with the location, category and collection time
		# derived from its file path before filing it in the index.
		a.update({'location': loc, 'category': cat, 'collectedAt': scraped})
		full.setdefault(loc, {}).setdefault(cat, []).append(a)
len(full)

  0%|          | 0/152506 [00:00<?, ?it/s]

54

In [10]:
# Narrow the index to the 'us' location (category -> list of articles).
sec = full['us']

In [11]:
# Number of articles filed under 'general' for the selected location.
len(sec['general'])

15323

In [12]:
# Pool the titles of every category except 'general' (fixed order) and count
# how often each title occurs.
base_raw = [
	a['title']
	for name in ('business', 'entertainment', 'health', 'science', 'sports', 'technology')
	for a in sec[name]
]
base = Counter(base_raw)
len(base_raw), len(base)

(150972, 137569)

In [13]:
# Show the ten most frequent titles and their counts as a plain-text table.
print(tabulate(base.most_common(10), headers=['Title', 'Count']))

Title                                                                                                    Count
-----------------------------------------------------------------------------------------------------  -------
Covid-19 vaccine tracker: View vaccinations by country - CNN                                                20
The latest on the coronavirus pandemic and vaccines: Live updates - CNN                                     18
Volcanoes Today - summary of volcanic activity world-wide - VolcanoDiscovery                                18
The latest on the coronavirus pandemic: Live updates - CNN                                                  14
Updated daily: Latest COVID-19 cases, deaths in Ohio, Kentucky, Indiana - WLWT Cincinnati                   12
Coronavirus Map: Tracking COVID-19 cases across the Bay Area and California - San Francisco Chronicle       11
Space calendar 2021: Rocket launches, sky events, missions & more! - Space.com                              11
B

In [35]:
# Take the single most frequent title, then gather every article in `sec`
# (any category) that carries exactly that title.
top_entry = base.most_common(1)[0]
key = top_entry[0]
res = [
	a
	for arts in sec.values()
	for a in arts
	if a['title'] == key
]
len(res)

20

In [37]:
# Share of 'general' titles that never occur in any of the other categories.
general_titles = [a['title'] for a in sec['general']]
novel = [title for title in general_titles if title not in base]
len(novel) / len(general_titles)

0.4091235397768061

In [15]:
# Map each title to the set of categories it was filed under.
titles = {}
for arts in sec.values():
	for a in arts:
		seen_in = titles.setdefault(a['title'], set())
		seen_in.add(a['category'])

general:   0%|          | 0/15323 [00:00<?, ?it/s]

business:   0%|          | 0/27317 [00:00<?, ?it/s]

entertainment:   0%|          | 0/26990 [00:00<?, ?it/s]

health:   0%|          | 0/26543 [00:00<?, ?it/s]

science:   0%|          | 0/16251 [00:00<?, ?it/s]

sports:   0%|          | 0/26498 [00:00<?, ?it/s]

technology:   0%|          | 0/27373 [00:00<?, ?it/s]

In [16]:
# Remove 'general' from every title's category set, in place; sets that never
# contained it are left unchanged.
for categories in titles.values():
	categories.difference_update({'general'})

In [17]:
# Keep only the titles whose category set holds two or more entries.
multis = {
	title: categories
	for title, categories in titles.items()
	if len(categories) >= 2
}

In [35]:


# Resolve each two-letter code to a country name via pycountry; codes for
# which no `.name` can be read map to None.
country_names = {
    code: getattr(pycountry.countries.get(alpha_2=code.upper()), 'name', None)
    for code in codes
}

print(country_names)


{'ae': 'United Arab Emirates', 'lv': 'Latvia', 'ro': 'Romania', 'ie': 'Ireland', 'tr': 'Turkey', 'hk': 'Hong Kong', 'br': 'Brazil', 'nl': 'Netherlands', 'mx': 'Mexico', 'ma': 'Morocco', 'il': 'Israel', 'se': 'Sweden', 'za': 'South Africa', 'cn': 'China', 'no': 'Norway', 'fr': 'France', 'at': 'Austria', 'si': 'Slovenia', 'it': 'Italy', 'ca': 'Canada', 'rs': 'Serbia', 'sk': 'Slovakia', 'bg': 'Bulgaria', 'my': 'Malaysia', 'sa': 'Saudi Arabia', 'in': 'India', 'tw': 'Taiwan, Province of China', 'ng': 'Nigeria', 'be': 'Belgium', 'kr': 'Korea, Republic of', 'ar': 'Argentina', 'lt': 'Lithuania', 'nz': 'New Zealand', 'id': 'Indonesia', 'co': 'Colombia', 'gb': 'United Kingdom', 'eg': 'Egypt', 'de': 'Germany', 'hu': 'Hungary', 'pl': 'Poland', 'ru': 'Russian Federation', 'sg': 'Singapore', 'cu': 'Cuba', 'pt': 'Portugal', 've': 'Venezuela, Bolivarian Republic of', 'cz': 'Czechia', 'us': 'United States', 'ua': 'Ukraine', 'gr': 'Greece', 'ch': 'Switzerland', 'jp': 'Japan', 'th': 'Thailand', 'ph': 'Ph