In [1]:
from pathlib import Path
from tqdm.notebook import tqdm
import json
from tabulate import tabulate
import random
# import pycountry
from iso639 import languages
import networkx as nx
from dateutil import parser
from datetime import datetime
from collections import Counter
# from langdetect import detect
from omnibelt import load_json, save_json
# from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
# from transformers import pipeline

In [405]:
# Two-letter country code -> display name for every country edition handled by this notebook.
country_codes = {
	'gb': 'United Kingdom', 'ar': 'Argentina', 'pl': 'Poland',
	'sk': 'Slovakia', 'us': 'United States', 'eg': 'Egypt',
	'no': 'Norway', 'ph': 'Philippines', 'at': 'Austria',
	'rs': 'Serbia', 'tw': 'Taiwan', 'be': 'Belgium',
	'cu': 'Cuba', 'sa': 'Saudi Arabia', 'th': 'Thailand',
	'id': 'Indonesia', 'ru': 'Russian Federation', 'ch': 'Switzerland',
	'fr': 'France', 'lt': 'Lithuania', 'tr': 'Turkey',
	'de': 'Germany', 'cz': 'Czechia', 'pt': 'Portugal',
	'ae': 'United Arab Emirates', 'it': 'Italy', 'cn': 'China',
	'lv': 'Latvia', 'nl': 'Netherlands', 'hk': 'Hong Kong',
	'ca': 'Canada', 'br': 'Brazil', 'hu': 'Hungary',
	'kr': 'Korea', 'si': 'Slovenia', 'au': 'Australia',
	'my': 'Malaysia', 'ie': 'Ireland', 'ua': 'Ukraine',
	'in': 'India', 'ma': 'Morocco', 'bg': 'Bulgaria',
	'ng': 'Nigeria', 'il': 'Israel', 'se': 'Sweden',
	'za': 'South Africa', 've': 'Venezuela', 'nz': 'New Zealand',
	'jp': 'Japan', 'sg': 'Singapore', 'gr': 'Greece',
	'mx': 'Mexico', 'co': 'Colombia', 'ro': 'Romania',
}
# Country codes grouped as English-language editions.
eng_country = set('au ca gb ie in my ng nz ph sa sg us za'.split())
# Alternative view (disabled): the codes that are NOT in eng_country.
# print(' '.join(sorted(c for c in country_codes if c not in eng_country)))
print(*sorted(eng_country))

au ca gb ie in my ng nz ph sa sg us za


In [454]:
# Country code of the edition to process; it is interpolated into the input and output file names.
loc = "za"

In [455]:
# Working directories for this run.
# NOTE(review): `root` is an absolute path on one specific machine; anyone else
# running the notebook has to edit it (consider a configurable data directory).
root = Path(r'C:\Users\anwan\workspace\local_data\nnn-manual')
dstroot = root / 'nnn-v1'
dstroot.mkdir(exist_ok=True)
# Collect every record file for the selected country, searching recursively.
# The matches are sorted because glob yields them in filesystem order; sorting
# keeps the article order (and which duplicate ID wins later) reproducible.
recs = sorted((root / 'global-news-headlines-zip').glob(f'**/*{loc}.json'))
len(recs)

1

In [456]:
# Concatenate the contents of every record file into one flat list of articles.
articles = []
for rec in tqdm(recs):
	articles += load_json(rec)
# Index the articles by their 'ID' field; if an ID repeats, the last article seen is kept.
by_ID = dict((art['ID'], art) for art in articles)
len(articles)

  0%|          | 0/1 [00:00<?, ?it/s]

63222

In [400]:
# Load the translated titles/descriptions for this country (one JSON record per line).
trpath = root / 'translated' / f'completed_{loc}.jsonl'
# Read explicitly as UTF-8 instead of the platform-default encoding, and drop
# blank lines (e.g. the one produced by a trailing newline), which json.loads
# would reject. Splitting stays on '\n' only, so unusual line-separator
# characters inside a JSON string cannot break a record apart.
trfull = [
	json.loads(line)
	for line in tqdm([ln for ln in trpath.read_text(encoding='utf8').split('\n') if ln.strip()])
]
# One record may cover several article IDs (its 'ids' field); fan it out so
# every ID maps to its own {'title', 'description'} dict.
trs = {
	ID: {'title': tr['title'], 'description': tr['description']}
	for tr in trfull
	for ID in tr['ids']
}
len(trfull), len(trs)

  0%|          | 0/46811 [00:00<?, ?it/s]

(46811, 47374)

In [401]:
# Load the processed article contents for this country (one JSON record per line).
cnpath = root / 'completed' / f'completed_{loc}.jsonl'
# Read explicitly as UTF-8 instead of the platform-default encoding, and drop
# blank lines (e.g. the one produced by a trailing newline), which json.loads
# would reject.
cnfull = [
	json.loads(line)
	for line in tqdm([ln for ln in cnpath.read_text(encoding='utf8').split('\n') if ln.strip()])
]
# Each record's 'id' field is itself a JSON string that decodes to the IDs the
# record covers; map every such ID to the record's 'result'.
_contents = {ID: tr['result'] for tr in cnfull for ID in json.loads(tr['id'])}
# Attach the content to the matching translation entry (mutates `trs` in place);
# IDs that have a translation but no content are collected in `issues`.
issues = []
for ID, tr in trs.items():
	if ID in _contents:
		tr['content'] = _contents[ID]
	else:
		issues.append(ID)
len(cnfull), len(_contents)

  0%|          | 0/41310 [00:00<?, ?it/s]

(41310, 47374)

In [402]:
# IDs present in exactly one of the two mappings (loaded articles vs. translations);
# an empty result means both cover the same set of IDs.
missing = by_ID.keys() ^ trs.keys()
len(missing)

0

In [403]:
# Copy each translated field onto its article as 'en-<field>'. A field is
# skipped when the translation is absent, None, empty, or equal to the
# article's own value for that field.
for art in tqdm(articles):
	ID = art['ID']
	trans = trs.get(ID, {})
	for key in ('title', 'description', 'content'):
		text = trans.get(key)
		if text is None or not len(text):
			continue
		if art[key] != text:
			art[f'en-{key}'] = text

  0%|          | 0/47374 [00:00<?, ?it/s]

In [457]:
# Output file for this country's articles, inside the destination folder.
dst = dstroot.joinpath(f'nnn-v1-{loc}.json')
# The actual write is currently disabled; only the target file name is shown.
# dst.write_text(json.dumps(articles), encoding='utf8');
print(dst.name)

nnn-v1-za.json


In [503]:
# Hand-written grouping of country codes under a shared key, presumably the
# language each country's edition is written in.
# NOTE(review): most keys look like ISO 639-1 language codes ('el', 'he', 'ja',
# 'cs', 'sr'), but 'ua' is the country code for Ukraine (the language code for
# Ukrainian is 'uk') - confirm which one downstream code expects.
clusters = {
	key: set(members.split())
	for key, members in [
		('en', 'au ca gb ie in my ng nz ph sa sg us za'),
		('es', 'ar co cu mx ve'),
		('ar', 'ae eg'),
		('de', 'at ch de'),
		('zh', 'cn hk tw'),
		('fr', 'be fr ma'),
		('pt', 'br pt'),
		('bg', 'bg'),
		('cs', 'cz'),
		('tr', 'tr'),
		('th', 'th'),
		('ua', 'ua'),
		('el', 'gr'),
		('he', 'il'),
		('ja', 'jp'),
		('sr', 'rs'),
	]
}


In [506]:
# Every country code that already belongs to some cluster.
known = set().union(*clusters.values())
# Country codes from the master table that no cluster covers yet.
missing = set(country_codes) - known
print(sorted(missing))

[]


In [505]:
# Give every not-yet-clustered country its own single-member cluster, keyed by
# the country code itself, provided that code is also a valid ISO 639-1 entry.
# NOTE(review): this looks a *country* code up as a *language* code. Several
# codes pass the check while naming an unrelated language (e.g. 'kr', 'se',
# 'si' are valid ISO 639-1 codes but not for Korean, Swedish or Slovenian) -
# confirm whether the resulting keys are meant to be language codes.
for m in sorted(missing):  # sorted: deterministic insertion order into `clusters`
	try:
		languages.get(part1=m)
	except KeyError:
		# Only an unknown code is expected here; anything else should surface.
		print(f'no ISO 639-1 entry for {m!r}; left unclustered')
	else:
		clusters[m] = {m}

In [498]:
# Spot-check of the ISO 639 lookup: fetch the entry whose two-letter (part1)
# code is 'el' and display its name.
languages.get(part1='el').name
# Reverse lookup by name (disabled):
# languages.get(name='Czech').part1

'Modern Greek (1453-)'

In [508]:
# Dump the current cluster table (key -> set of country codes) for inspection.
print(clusters)

{'en': {'ie', 'my', 'gb', 'ca', 'za', 'ng', 'us', 'ph', 'in', 'sa', 'sg', 'au', 'nz'}, 'es': {'ar', 'co', 'mx', 'cu', 've'}, 'ar': {'eg', 'ae'}, 'de': {'at', 'de', 'ch'}, 'zh': {'tw', 'cn', 'hk'}, 'fr': {'ma', 'fr', 'be'}, 'pt': {'br', 'pt'}, 'bg': {'bg'}, 'cs': {'cz'}, 'tr': {'tr'}, 'th': {'th'}, 'ua': {'ua'}, 'el': {'gr'}, 'he': {'il'}, 'ja': {'jp'}, 'sr': {'rs'}, 'it': {'it'}, 'ru': {'ru'}, 'no': {'no'}, 'si': {'si'}, 'kr': {'kr'}, 'se': {'se'}, 'sk': {'sk'}, 'hu': {'hu'}, 'ro': {'ro'}, 'nl': {'nl'}, 'id': {'id'}, 'pl': {'pl'}, 'lv': {'lv'}, 'lt': {'lt'}}


In [510]:
# Stable, printable view of the clusters: largest cluster first, ties broken
# alphabetically by key, and each member set turned into a sorted list.
langs = {
	code: sorted(clusters[code])
	for code in sorted(clusters, key=lambda code: (-len(clusters[code]), code))
}
print(langs)

{'en': ['au', 'ca', 'gb', 'ie', 'in', 'my', 'ng', 'nz', 'ph', 'sa', 'sg', 'us', 'za'], 'es': ['ar', 'co', 'cu', 'mx', 've'], 'de': ['at', 'ch', 'de'], 'fr': ['be', 'fr', 'ma'], 'zh': ['cn', 'hk', 'tw'], 'ar': ['ae', 'eg'], 'pt': ['br', 'pt'], 'bg': ['bg'], 'cs': ['cz'], 'el': ['gr'], 'he': ['il'], 'hu': ['hu'], 'id': ['id'], 'it': ['it'], 'ja': ['jp'], 'kr': ['kr'], 'lt': ['lt'], 'lv': ['lv'], 'nl': ['nl'], 'no': ['no'], 'pl': ['pl'], 'ro': ['ro'], 'ru': ['ru'], 'se': ['se'], 'si': ['si'], 'sk': ['sk'], 'sr': ['rs'], 'th': ['th'], 'tr': ['tr'], 'ua': ['ua']}
