In [1]:
from pathlib import Path
import random
import pycountry
from collections import Counter
from tqdm.notebook import tqdm
from tabulate import tabulate
from dateutil import parser
from datetime import datetime
from omnibelt import load_json, save_json

In [21]:
def parse_date(date: str, time: str):
	# date = parser.parse(date)
	# time = parser.parse(time)
	date = datetime.strptime(date, '%y-%m-%d')
	time = datetime.strptime(time, '%H-%M-%S')
	return datetime(date.year, date.month, date.day, time.hour, time.minute)

def get_ident(art):
	return (art['title'], art['publishedAt'])
	return art['url']
	return (art['title'], art['publishedAt'], art['url'])

In [4]:
root = Path('/home/fleeb/workspace/local_data/nnn')
recs = (root/'raw_news').glob('**/*.json')
recs = list(recs)
len(recs)

152506

In [4]:
codes = set(p.stem.split('_')[1] for p in root.glob('**/general_*.json'))
len(codes)

54

In [5]:
all_categories = {'general', 'business', 'entertainment', 'health', 'science', 'sports', 'technology'}
def load_fixed(p: Path):
	*rest, scrape_date, scrape_time, _ = p.parts
	scraped = parse_date(scrape_date, scrape_time).strftime('%Y-%m-%dT%H:%M:%SZ')
	cat, loc = p.stem.split('_')
	if cat not in all_categories:
		if loc not in all_categories:
			raise ValueError(f'Unknown category {cat} {loc}')
		cat, loc = loc, cat
	content = load_json(p)
	# itr.set_description(f'{len(content["articles"])} {cat}-{loc}')
	for a in content['articles']:
		a.update({'location': loc, 'category': cat, 'collectedAt': scraped})
	return content['articles']

In [6]:
all_articles = []

In [7]:
for p in tqdm(sorted(recs)):
	*_, scrape_date, scrape_time, _ = p.parts
	if scrape_date.startswith('bad'):
		continue
	all_articles.extend(load_fixed(p))
len(all_articles)

  0%|          | 0/152506 [00:00<?, ?it/s]

7419387

In [32]:
unique = {}
for a in tqdm(all_articles):
	unique.setdefault(get_ident(a), []).append(a)
len(unique)

  0%|          | 0/7419387 [00:00<?, ?it/s]

4719199

In [33]:
nums = Counter({k: len(v) for k, v in unique.items()})
len(nums)

4719199

In [34]:
keys, most = zip(*nums.most_common(10))
most
# print(tabulate([[*k[:-1], v] for k,v in nums.most_common(10)], headers=['Title', 'Published', 'Count']))

(586, 507, 476, 424, 408, 386, 384, 381, 371, 345)

In [35]:
def article_difference(a, b):
	return {k: (a[k], b[k]) for k in a if a[k] != b[k]}

In [36]:
most_ident, most_count = nums.most_common(1)[0]

In [37]:
iv = unique[most_ident]
len(iv)

586

In [38]:
diffs = [article_difference(a, iv[0]) for a in iv[1:]]

In [42]:
unique_locs = {}
for ident, arts in tqdm(unique.items()):
	locs = set(a['location'] for a in arts)
	if len(locs) == 1:
		unique_locs.setdefault(locs.pop(), []).append(ident)
uloc_nums = {code: len(v) for code, v in unique_locs.items()}

  0%|          | 0/4719199 [00:00<?, ?it/s]

In [44]:
locset = {}
for ident, arts in tqdm(unique.items()):
	locs = set(a['location'] for a in arts)
	locset.setdefault(tuple(sorted(locs)), []).append(ident)
len(locset)

  0%|          | 0/4719199 [00:00<?, ?it/s]

5109

In [47]:
plu = max(locset, key=lambda x: len(x))
plu

('au', 'ca', 'gb', 'ie', 'in', 'my', 'ng', 'nz', 'ph', 'sa', 'sg', 'us', 'za')

In [49]:
sample = random.choice(locset[plu])
sample

"Modern Human Ancestry Won't Be Traced to a Single Point - Ancient Origins, 2021-02-13T17:50:00Z"

In [50]:
arts = unique[sample]
len(arts)

17

In [51]:
diffs = [article_difference(a, arts[0]) for a in arts[1:]]

In [62]:
fixed = {}
bad = []
bad_keys = {'title', 'author', 'publishedAt', 'url', 'description', 'content', 'urlToImage'}
itr = tqdm(unique.items())
for ident, arts in itr:
	itr.set_description(f'bad={len(bad)}')
	assert ident not in fixed, str(ident)
	if len(arts) > 1:
		bad_overlap = set(x for a in arts[1:] for xs in article_difference(a, arts[0]) for x in xs if x in bad_keys)
		if bad_overlap:
			bad.append(ident)
			continue
	instances = set((a['collectedAt'], a['location'], a['category']) for a in arts)
	# if len(instances) != len(arts):
	# 	bad.append(ident)
	# 	continue
	info = arts[0].copy()
	# info['category'] = list(sorted(set(a['category'] for a in arts)))
	info['instances'] = [{'collectedAt': col, 'location': loc, 'category': cat} for col, loc, cat in sorted(instances)]
	if info['publishedAt'] == '1970-01-01T00:00:00Z':
		info['publishedAt'] = None
	info['source-id'] = info['source']['id']
	info['source-name'] = info['source']['name']
	del info['category'], info['location'], info['collectedAt'], info['source']
	fixed[ident] = info

len(fixed), len(bad)

  0%|          | 0/4719199 [00:00<?, ?it/s]

(4719199, 0)

In [67]:
collecteds = [inst['collectedAt'] for a in fixed.values() for inst in a['instances']]
min(collecteds), max(collecteds)

('2020-08-07T18:53:00Z', '2021-11-29T09:46:00Z')

In [70]:
num_pub_issue = sum(1 for a in fixed.values() if a['publishedAt'] is None)
num_pub_issue

5

In [72]:
full_unique = list(fixed.values())

In [73]:
# save_json(full_unique, root/'fixed-full.json')

In [6]:
# Identify languages

In [5]:
articles = load_json(root/'fixed-full.json')
len(articles)

4719199

In [14]:
from langdetect import detect, LangDetectException

In [9]:
art = articles[0]
art

{'author': 'Mariana Leiva',
 'title': 'El dólar CCL y el MEP saltan hasta 2,6% y superan los $124 - ámbito.com',
 'description': 'El dólar Contado con Liquidación avanza a $126,75, mientras el Bolsa asciende a $124,36. Sucede en el cierre de la semana, donde se acordó con los acreedores para la reestructuración de la deuda.',
 'url': 'https://www.ambito.com/finanzas/banco-central/el-dolar-ccl-y-el-mep-saltan-26-y-superan-los-124-n5123188',
 'urlToImage': 'https://media.ambito.com/adjuntos/239/imagenes/037/625/0037625472.jpg',
 'publishedAt': '2020-08-07T15:36:46Z',
 'content': 'El dólar MEP, o Bolsa -similar operación a la del CCL pero dentro del país- trepa un 2,2% ($2,68) a $124,36, con lo cual la brecha con el oficial se estira al 70,9%.\r\nDurante el comienzo de la semana… [+3031 chars]',
 'instances': [{'collectedAt': '2020-08-07T18:53:00Z',
   'location': 'ar',
   'category': 'general'}],
 'source-id': None,
 'source-name': 'ámbito.com'}

In [16]:
# votes = [detect(art['title'])]
# if art['description']:
# 	votes.append(detect(art['description']))
# if art['content']:
# 	votes.append(detect(art['content']))
# votes

In [None]:
langs = {}
bad = {}
bad_strs = {}
itr = tqdm(articles)
for art in itr:
	# itr.set_description(f'done={len(langs)}, bad={len(bad)}')
	ident = get_ident(art)
	if ident in langs or ident in bad:
		continue
	votes = [detect(art['title'])]
	# try:
	# 	if art['description']:
	# 		votes.append(detect(art['description']))
	# except LangDetectException:
	# 	bad_strs.setdefault(ident, []).append('description')
	# try:
	# 	if art['content']:
	# 		votes.append(detect(art['content']))
	# except LangDetectException:
	# 	bad_strs.setdefault(ident, []).append('content')
	# if len(set(votes)) > 1:
	# 	bad[ident] = votes
	# 	continue
	langs[ident] = votes[0]

  0%|          | 0/4719199 [00:00<?, ?it/s]

In [None]:
save_json(langs, root/'temp-langs.json')

In [None]:


# Your text string
text = "Bonjour tout le monde"

# Detect the language
language = detect(text)

print(f"The language is: {language}")


In [58]:
xs = unique[bad[0]]

In [59]:
diffs = [article_difference(a, xs[0]) for a in xs[1:]]

In [None]:
path = random.choice(recs)
path = random.choice(list(root.glob('**/general_us.json')))
path

In [None]:
content = load_json(path)
content.keys()

In [56]:
art = content['articles'][0]#.keys()
art.keys()

dict_keys(['source', 'author', 'title', 'description', 'url', 'urlToImage', 'publishedAt', 'content'])

In [57]:
art['publishedAt']

'2020-10-31T06:40:27Z'

In [52]:
*rest, scrape_date, scrape_time, _ = path.parts
scraped = parse_date(scrape_date, scrape_time).strftime('%Y-%m-%dT%H:%M:%SZ')
cat, loc = path.stem.split('_')
cat, loc, scraped

('general', 'hk', '2021-03-31T07:59:00Z')

In [8]:
full = {}

In [9]:
itr = tqdm(sorted(recs))
for p in itr:
	*rest, scrape_date, scrape_time, _ = p.parts
	if scrape_date.startswith('bad'):
		continue
	scraped = parse_date(scrape_date, scrape_time).strftime('%Y-%m-%dT%H:%M:%SZ')
	cat, loc = p.stem.split('_')
	if cat not in cats:
		if loc not in cats:
			raise ValueError(f'Unknown category {cat} {loc}')
		cat, loc = loc, cat
	content = load_json(p)
	# itr.set_description(f'{len(content["articles"])} {cat}-{loc}')
	for a in content['articles']:
		a.update({'location': loc, 'category': cat, 'collectedAt': scraped})
		full.setdefault(loc, {}).setdefault(cat, []).append(a)
len(full)	

  0%|          | 0/152506 [00:00<?, ?it/s]

54

In [10]:
sec = full['us']

In [11]:
len(sec['general'])

15323

In [12]:
base_raw = [a['title'] for arts in [sec['business'], sec['entertainment'], sec['health'], 
									sec['science'], sec['sports'], sec['technology']] 
			for a in arts]
base = Counter(base_raw)
len(base_raw), len(base)

(150972, 137569)

In [13]:
print(tabulate(base.most_common(10), headers=['Title', 'Count']))

Title                                                                                                    Count
-----------------------------------------------------------------------------------------------------  -------
Covid-19 vaccine tracker: View vaccinations by country - CNN                                                20
The latest on the coronavirus pandemic and vaccines: Live updates - CNN                                     18
Volcanoes Today - summary of volcanic activity world-wide - VolcanoDiscovery                                18
The latest on the coronavirus pandemic: Live updates - CNN                                                  14
Updated daily: Latest COVID-19 cases, deaths in Ohio, Kentucky, Indiana - WLWT Cincinnati                   12
Coronavirus Map: Tracking COVID-19 cases across the Bay Area and California - San Francisco Chronicle       11
Space calendar 2021: Rocket launches, sky events, missions & more! - Space.com                              11
B

In [35]:
key = base.most_common(1)[0][0]
res = [a for arts in sec.values() for a in arts if a['title'] == key]
len(res)

20

In [37]:
novel = [a['title'] for a in sec['general'] if a['title'] not in base]
len(novel)/len(sec['general'])

0.4091235397768061

In [15]:
titles = {}
for c, arts in sec.items():
	for a in arts:
		titles.setdefault(a['title'], set()).add(a['category'])

general:   0%|          | 0/15323 [00:00<?, ?it/s]

business:   0%|          | 0/27317 [00:00<?, ?it/s]

entertainment:   0%|          | 0/26990 [00:00<?, ?it/s]

health:   0%|          | 0/26543 [00:00<?, ?it/s]

science:   0%|          | 0/16251 [00:00<?, ?it/s]

sports:   0%|          | 0/26498 [00:00<?, ?it/s]

technology:   0%|          | 0/27373 [00:00<?, ?it/s]

In [16]:
for v in titles.values():
	v.discard('general')

In [17]:
multis = {k: v for k, v in titles.items() if len(v) > 1}

In [23]:


country_names = {}

for code in codes:
    try:
        country = pycountry.countries.get(alpha_2=code.upper())
        country_names[code] = country.name
    except AttributeError:
        country_names[code] = None

print(country_names)


{'lt': 'Lithuania', 'ie': 'Ireland', 'cn': 'China', 'us': 'United States', 'my': 'Malaysia', 'ma': 'Morocco', 'se': 'Sweden', 'nl': 'Netherlands', 'il': 'Israel', 'ch': 'Switzerland', 'rs': 'Serbia', 'ng': 'Nigeria', 've': 'Venezuela, Bolivarian Republic of', 'si': 'Slovenia', 'lv': 'Latvia', 'jp': 'Japan', 'ph': 'Philippines', 'hu': 'Hungary', 'it': 'Italy', 'cz': 'Czechia', 'bg': 'Bulgaria', 'mx': 'Mexico', 'gr': 'Greece', 'co': 'Colombia', 'gb': 'United Kingdom', 'cu': 'Cuba', 'in': 'India', 'sa': 'Saudi Arabia', 'hk': 'Hong Kong', 'sk': 'Slovakia', 'kr': 'Korea, Republic of', 'id': 'Indonesia', 'au': 'Australia', 'fr': 'France', 'ca': 'Canada', 'ua': 'Ukraine', 'th': 'Thailand', 'ru': 'Russian Federation', 'pl': 'Poland', 'tr': 'Turkey', 'pt': 'Portugal', 'be': 'Belgium', 'eg': 'Egypt', 'ae': 'United Arab Emirates', 'no': 'Norway', 'za': 'South Africa', 'br': 'Brazil', 'at': 'Austria', 'ro': 'Romania', 'ar': 'Argentina', 'de': 'Germany', 'tw': 'Taiwan, Province of China', 'nz': 'Ne

In [43]:
country_names['tr']

'Turkey'