In [2]:
from pathlib import Path
from multiprocessing import Pool
from tqdm.notebook import tqdm
import numpy as np
from omnibelt import load_json, save_json
from nltk.corpus import stopwords
from itertools import combinations, islice
from collections import Counter
import unicodedata
import pandas as pd

In [3]:
lang_clusters = {'en': ['au', 'ca', 'gb', 'ie', 'in', 'my', 'ng', 'nz', 'ph', 'sa', 'sg', 'us', 'za'],
				 'es': ['ar', 'co', 'cu', 'mx', 've'], 'de': ['at', 'ch', 'de'], 'fr': ['be', 'fr', 'ma'],
				 'zh': ['cn', 'hk', 'tw'], 'ar': ['ae', 'eg'], 'pt': ['br', 'pt'], 'bg': ['bg'], 'cs': ['cz'],
				 'el': ['gr'], 'he': ['il'], 'hu': ['hu'], 'id': ['id'], 'it': ['it'], 'ja': ['jp'], 'ko': ['kr'],
				 'lt': ['lt'], 'lv': ['lv'], 'nl': ['nl'], 'no': ['no'], 'pl': ['pl'], 'ro': ['ro'], 'ru': ['ru'],
				 'sv': ['se'], 'sl': ['si'], 'sk': ['sk'], 'sr': ['rs'], 'th': ['th'], 'tr': ['tr'], 'uk': ['ua']}
loc_names = {'gb': 'United Kingdom', 'ar': 'Argentina', 'pl': 'Poland', 'sk': 'Slovakia', 'us': 'United States',
			 'eg': 'Egypt', 'no': 'Norway', 'ph': 'Philippines', 'at': 'Austria', 'rs': 'Serbia', 'tw': 'Taiwan',
			 'be': 'Belgium', 'cu': 'Cuba', 'sa': 'Saudi Arabia', 'th': 'Thailand', 'id': 'Indonesia',
			 'ru': 'Russian Federation', 'ch': 'Switzerland', 'fr': 'France', 'lt': 'Lithuania', 'tr': 'Turkey',
			 'de': 'Germany', 'cz': 'Czechia', 'pt': 'Portugal', 'ae': 'United Arab Emirates', 'it': 'Italy',
			 'cn': 'China', 'lv': 'Latvia', 'nl': 'Netherlands', 'hk': 'Hong Kong', 'ca': 'Canada', 'br': 'Brazil',
			 'hu': 'Hungary', 'kr': 'Korea', 'si': 'Slovenia', 'au': 'Australia', 'my': 'Malaysia', 'ie': 'Ireland',
			 'ua': 'Ukraine', 'in': 'India', 'ma': 'Morocco', 'bg': 'Bulgaria', 'ng': 'Nigeria', 'il': 'Israel',
			 'se': 'Sweden', 'za': 'South Africa', 've': 'Venezuela', 'nz': 'New Zealand', 'jp': 'Japan',
			 'sg': 'Singapore', 'gr': 'Greece', 'mx': 'Mexico', 'co': 'Colombia', 'ro': 'Romania'}
lang_names = {'en': 'English', 'ko': 'Korean', 'ru': 'Russian', 'es': 'Spanish', 'pt': 'Portuguese', 'cs': 'Czech',
			  'tr': 'Turkish', 'nl': 'Dutch', 'ar': 'Arabic', 'fr': 'French', 'bg': 'Bulgarian', 'id': 'Indonesian',
			  'sk': 'Slovak', 'el': 'Greek', 'he': 'Hebrew', 'sr': 'Serbian', 'hu': 'Hungarian', 'th': 'Thai',
			  'zh': 'Chinese', 'no': 'Norwegian', 'sl': 'Slovenian', 'sv': 'Swedish', 'de': 'German', 'lv': 'Latvian',
			  'pl': 'Polish', 'it': 'Italian', 'ro': 'Romanian', 'lt': 'Lithuanian', 'ja': 'Japanese',
			  'uk': 'Ukrainian'}
cluster_id = {loc: lang for lang, locs in lang_clusters.items() for loc in locs}
len(cluster_id)

54

In [4]:
def topk_ngrams(importances, ordered_keywords, n=2, k=10):
	assert len(ordered_keywords) >= n, f'Not enough tokens in bag: {len(ordered_keywords)} < {n}: {ordered_keywords}'
	return [frozenset(ws) for ws in islice(
		sorted(combinations(ordered_keywords, n), key=lambda ws: sum(importances[w] for w in ws), reverse=True), k)]
# def ngram_shells(importances, keywords, num_shells=3, k=10):
# 	most = min(num_shells, len(keywords))
# 	content = [set(topk_ngrams(importances, keywords, n=i, k=k)) for i in range(1, most + 1)]
# 	for _ in range(most, num_shells):
# 		content.append(set())
# 	return content
def has_hits(art_keywords, aids, segs):
	return any(all(w in art_keywords[aid] for w in seg) for aid in aids for seg in segs)
def find_hits(art_keywords, aids, segs):
	hits = {}
	for aid in aids:
		for seg in segs:
			if all(w in art_keywords[aid] for w in seg):
				hits.setdefault(seg, []).append(aid)
	return hits
def segs2keywords(importances, segs):
	return sorted(set(w for seg in segs for w in seg), key=lambda w: importances[w], reverse=True)
def generate_candidates(art_keywords, importances, tier, kws, members=None, num_kw=10, num_member=10):
	cands = set()
	if len(kws) >= tier:
		cands.update(topk_ngrams(importances, kws, n=tier, k=num_kw))
	if members and len(members) and num_member:
		member_cands = Counter()
		for aid in members:
			member_cands.update(topk_ngrams(importances, art_keywords[aid], n=tier, k=num_kw))
		for seg, _ in member_cands.most_common(num_member):
			cands.add(seg)
	return cands
def mainline_cluster_tiers(art_keywords, importances, center, options, starting_tier=7, min_tier=2, num_kw=10, num_member=10):
	kws = art_keywords[center]
	members = set()

	known_tokens = set()
	tokens = {}
	tiers = {}
	trace = Counter()
	remaining = list(options)
	if center in remaining:
		remaining.remove(center)
	assert min_tier >= 1, f'Minimum tier must be at least 1: {min_tier}'
	for tier in range(starting_tier, min_tier - 1, -1):
		if not len(remaining):
			break

		cands = generate_candidates(art_keywords, importances, tier, kws, members=members, num_kw=num_kw, num_member=num_member)
		# print(tier, len(cands))
		trace[tier] = len(cands)
		if not has_hits(art_keywords, remaining, cands):
			continue

		hits = find_hits(art_keywords, remaining, cands)

		gold = {}
		for seg, aids in hits.items():
			for aid in aids:
				gold.setdefault(aid, set()).update(seg)
		tokens[tier] = Counter(w for seq in gold.values() for w in seq if w not in known_tokens)
		kws = segs2keywords(importances, hits.keys())
		known_tokens.update(kws)
		tiers[tier] = set(aid for seg, aids in hits.items() for aid in aids)
		members.update(tiers[tier])
		remaining = [aid for aid in remaining if aid not in members]

	return tokens, tiers, trace

stop_words = set(stopwords.words('english'))
stop_words.update("'s", '|', 'I', "n't", "`s", "'s", 'n`t')
_my_stop_words = {"'s", 'news'}


def is_good_word(w):
	return any(not unicodedata.category(char).startswith('P') for char in
			   w) and w not in stop_words and w not in _my_stop_words

In [5]:
root = Path('/home/fleeb/workspace/local_data/nnn')
recs = (root / 'babel-briefings-v1').glob('**/*.json')
recs = list(recs)
len(recs)
articles = []
for rec in tqdm(recs):
	articles.extend(load_json(rec))
len(articles)

  0%|          | 0/54 [00:00<?, ?it/s]

4719199

In [6]:
by_loc = {}
by_lang = {}
by_source = {}
by_ID = {}
for article in tqdm(articles):
	by_ID[article['ID']] = article
	# article['published'] = parser.parse(article['publishedAt'])
	by_source.setdefault(article['source-name'], []).append(article)
	by_lang.setdefault(article['language'], []).append(article)
	for instance in article['instances']:
		# instance['collected'] = parser.parse(instance['collectedAt'])
		by_loc.setdefault(instance['location'], []).append(article)
len(by_loc), len(by_lang)

  0%|          | 0/4719199 [00:00<?, ?it/s]

(54, 30)

In [7]:
cluster_root = root / 'clusterings'
cluster_root.mkdir(exist_ok=True)

In [8]:
lowercase_all = True
full_bagowords_inds = {int(ID): {k.lower() if lowercase_all else k:v for k,v in bag.items()} 
					   for ID, bag in tqdm(load_json(root/'bagowords-ordered-full.json').items())}

pre_dash = Counter()
for ID, bagi in tqdm(full_bagowords_inds.items()):
	if '-' in bagi:
		pre_dash.update({w: len(inds) for w, inds in bagi.items() if all(i < bagi["-"][-1] for i in inds)})

post_dash = Counter()
for ID, bagi in tqdm(full_bagowords_inds.items()):
	if '-' in bagi:
		post_dash.update({w: len(inds) for w, inds in bagi.items() if w not in pre_dash})
all_bags_inds = {ID: {w: [i for i in inds if bag.get('-', [float('inf')])[-1] > i]
					  for w, inds in bag.items() if w not in post_dash and is_good_word(w)}
				 for ID, bag in tqdm(full_bagowords_inds.items())}
all_bags = {ID: Counter({w: len(inds) for w, inds in bag.items() if len(inds)}) for ID, bag in all_bags_inds.items()}
len(all_bags)

  0%|          | 0/4719199 [00:00<?, ?it/s]

  0%|          | 0/4719199 [00:00<?, ?it/s]

  0%|          | 0/4719199 [00:00<?, ?it/s]

  0%|          | 0/4719199 [00:00<?, ?it/s]

4719199

In [9]:
df = pd.DataFrame(
	[{**inst, 'aid': art['ID']} for art in tqdm(articles) for i, inst in
	 enumerate(by_ID[art['ID']]['instances'])])
df['collectedAt'] = pd.to_datetime(df['collectedAt'])
df = df.sort_values('collectedAt')
df['date'] = df['collectedAt'].dt.date

  0%|          | 0/4719199 [00:00<?, ?it/s]

In [10]:
daybags = {}
date_aids = dict(df.groupby('date')['aid'].apply(set))
for date, aIDs in tqdm(sorted(date_aids.items())):
	total = Counter()
	for aID in aIDs:
		total.update(all_bags[aID])
	daybags[date] = total
len(daybags)

  0%|          | 0/413 [00:00<?, ?it/s]

413

In [11]:
dayidf = Counter()
for date, bag in tqdm(daybags.items()):
	dayidf.update(bag.keys())
dayidf = {w: np.log(len(daybags) / f)  for w, f in dayidf.items()}
daytotals = {d: sum(bag.values()) for d, bag in daybags.items()}
daytfidf = {day: Counter({w: f / daytotals[day] * dayidf[w] for w, f in bag.items()}) for day, bag in tqdm(daybags.items())}

  0%|          | 0/413 [00:00<?, ?it/s]

  0%|          | 0/413 [00:00<?, ?it/s]

In [16]:

# def w():
def worker_fn(dateidx, target, today_aids, today_tfidf, today_bags):
	
	datstr = target.strftime('%Y-%m-%d')
	cls_path = Path('/home/fleeb/workspace/local_data/nnn') / 'clusterings' / f'clusters_{datstr}.json'
	
	if cls_path.exists():
		print(f'{dateidx+1}/{413} - Skipping {datstr} - already exists')
		return
	
	# today_aids = date_aids[target]
	# today_bag = daybags[target]
	# today_df = df[df['date'] == target]
	# today_tfidf = token_importances
	importances = today_tfidf

	treat_bags_as_sets = True
	prior_art_scores = Counter({aID: sum(
		importances[w] * (1 if treat_bags_as_sets else f) for w, f in today_bags[aID].items() if w in importances) for aID
								in today_aids})

	art_imps = list(prior_art_scores.most_common())
	aidorder = np.array([c for c, _ in art_imps])

	art_keywords = {
		aid: [w for w in sorted(today_bags[aid], key=lambda w: importances[w], reverse=True) if importances[w] > 1e-8] for
		aid in aidorder}

	full_clusters = {}
	todo = aidorder.tolist()
	total = len(todo)
	# itr = tqdm(total=total)
	# itr.set_description(f'{dateidx + 1}/{len(daybags)} - {datstr}')
	
	print(f'{dateidx+1}/{413} - Starting {datstr} ({total} articles)')
	
	while len(todo):
		center = todo[0]
		tokens, tiers, trace = mainline_cluster_tiers(art_keywords, importances, center, todo, starting_tier=7, min_tier=3, num_kw=200,
													  num_member=200)
		prev = len(todo)
		for aids in tiers.values():
			todo = [aid for aid in todo if aid not in aids]
		if center in todo:
			todo.remove(center)
		# itr.update(prev - len(todo))
		full_clusters[center] = {'tokens': tokens, 'tiers': tiers, 'trace': trace}

	# itr.close()

	save_json({center: {'tokens': info['tokens'], 'tiers': {t: list(aids) for t, aids in info['tiers'].items()},
						'trace': info['trace']} for center, info in full_clusters.items()}, cls_path)
	
	print(f'{dateidx+1}/{413} - Finished {datstr} ({total} articles) - {len(full_clusters)} clusters')

In [17]:
with Pool(8) as pool:
	pool.starmap(worker_fn, [(i, target, date_aids[target], daytfidf[target], {aid: all_bags[aid] for aid in date_aids[target]}) for i, target in enumerate(sorted(daybags))])


Starting 2020-08-07 (3998 articles)
Starting 2020-08-20 (15628 articles)
Starting 2020-09-02 (15421 articles)
Starting 2020-09-15 (15000 articles)
Starting 2020-09-28 (14657 articles)
Starting 2020-10-11 (14974 articles)
Starting 2020-10-24 (15657 articles)
Starting 2020-11-06 (15746 articles)
Finished 2020-08-07 (3998 articles) - 3522 clusters
Starting 2020-08-08 (15211 articles)
Finished 2020-09-28 (14657 articles) - 11911 clusters
Starting 2020-09-29 (14845 articles)
Finished 2020-09-15 (15000 articles) - 12334 clusters
Starting 2020-09-16 (15576 articles)
Finished 2020-10-11 (14974 articles) - 12370 clusters
Starting 2020-10-12 (14474 articles)
Finished 2020-09-02 (15421 articles) - 12552 clusters
Starting 2020-09-03 (14959 articles)
Finished 2020-08-20 (15628 articles) - 12810 clusters
Starting 2020-08-21 (15599 articles)
Finished 2020-10-24 (15657 articles) - 12876 clusters
Starting 2020-10-25 (14860 articles)
Finished 2020-08-08 (15211 articles) - 12488 clusters
Starting 2020-08

Process ForkPoolWorker-17:
Process ForkPoolWorker-20:
Process ForkPoolWorker-15:
Process ForkPoolWorker-13:
Process ForkPoolWorker-19:
Traceback (most recent call last):
Traceback (most recent call last):
  File "/home/fleeb/miniconda3/lib/python3.11/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/home/fleeb/miniconda3/lib/python3.11/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/home/fleeb/miniconda3/lib/python3.11/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/home/fleeb/miniconda3/lib/python3.11/multiprocessing/pool.py", line 125, in worker
    result = (True, func(*args, **kwds))
                    ^^^^^^^^^^^^^^^^^^^
  File "/home/fleeb/miniconda3/lib/python3.11/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/home/fleeb/miniconda3/lib/python3.11/multiprocessing/pool.py", line 51, in starmapstar
    return list(iterto

KeyboardInterrupt: 

In [None]:

# for dateidx, target in enumerate(daybags):
# 
# 	today_aids = date_aids[target]
# 	today_bag = daybags[target]
# 	# today_df = df[df['date'] == target]
# 	today_tfidf = daytfidf[target]
# 	importances = today_tfidf
# 
# 	treat_bags_as_sets = True
# 	prior_art_scores = Counter({aID: sum(
# 		importances[w] * (1 if treat_bags_as_sets else f) for w, f in all_bags[aID].items() if w in importances) for aID
# 								in today_aids})
# 
# 
# 	def article_affinity(aid1, aid2):
# 		bag1, bag2 = all_bags[aid1], all_bags[aid2]
# 		return sum(importances[w] * (1 if treat_bags_as_sets else min(f, bag2[w])) for w, f in bag1.items() if
# 				   w in bag2) / np.sqrt(prior_art_scores[aid1] * prior_art_scores[aid2])
# 
# 
# 	art_imps = list(prior_art_scores.most_common())
# 	aidorder = np.array([c for c, _ in art_imps])
# 
# 	art_keywords = {
# 		aid: [w for w in sorted(all_bags[aid], key=lambda w: importances[w], reverse=True) if importances[w] > 1e-8] for
# 		aid in aidorder}
# 
# 	datstr = target.strftime('%Y-%m-%d')
# 
# 	full_clusters = {}
# 	todo = aidorder.tolist()
# 	total = len(todo)
# 	itr = tqdm(total=total)
# 	itr.set_description(f'{dateidx + 1}/{len(daybags)} - {datstr}')
# 	while len(todo):
# 		center = todo[0]
# 		tokens, tiers, trace = mainline_cluster_tiers(center, todo, starting_tier=7, min_tier=3, num_kw=200,
# 													  num_member=200)
# 		prev = len(todo)
# 		for aids in tiers.values():
# 			todo = [aid for aid in todo if aid not in aids]
# 		if center in todo:
# 			todo.remove(center)
# 		itr.update(prev - len(todo))
# 		full_clusters[center] = {'tokens': tokens, 'tiers': tiers, 'trace': trace}
# 
# 	itr.close()
# 
# 	save_json({center: {'tokens': info['tokens'], 'tiers': {t: list(aids) for t, aids in info['tiers'].items()},
# 						'trace': info['trace']} for center, info in full_clusters.items()},
# 			  cluster_root / f'clusters_{datstr}.json')