In [1]:
from pathlib import Path
from tqdm.notebook import tqdm
from tabulate import tabulate
import random
import pycountry
# import networkx as nx
from dateutil import parser
import json
from datetime import datetime
from collections import Counter
# from langdetect import detect
from omnibelt import load_json, save_json

In [2]:
localroot = Path().absolute().parent
localroot

PosixPath('/home/fleeb/workspace/code/nnn')

In [3]:
root = Path('/home/fleeb/workspace/local_data/nnn')
print(list(p.name for p in root.glob('*')))

['raw_news', 'ftlang.json', 'full.json', 'translated', 'prompts', 'langdetect.json', 'dense', 'temp', 'global-news-headlines', 'global-news-headlines.zip', 'assets']


In [4]:
recs = (root/'global-news-headlines').glob('**/*.json')
recs = list(recs)
len(recs)

54

In [5]:
articles = []
for rec in tqdm(recs):
	articles.extend(load_json(rec))
by_IDs = {art['ID']: art for art in articles}
len(articles)

  0%|          | 0/54 [00:00<?, ?it/s]

4719199

In [102]:
print(tabulate([art['ID'], ', '.join(i['location'] for i in art['instances']), art['content']] for art in random.choices(articles, k=20)))

-------  ----------------------  ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 359322  ph, ph                  Gordon Murrays next project will be a little bit more conventional.
                                 Speaking to Top Gear on the reveal of the T.50s racerthe track-only version of the T.50 supercarMurray revealed that whatever he d… [+2440 chars]
3548224  ar, ar                  Este miércoles, Marcelo Tinelli dio por concluida la quinta ronda de La Academia y reveló el voto secreto de Ángel De Brito para las 19 parejas de La Academia en la ronda de samba de ballroom. Así, c… [+1948 chars]
  21525  hk, tw, hk, tw, hk, tw  Anthony Davis 103-101 LeBron James Davis 34 8 4 …
                                 Davis Kawhi Leonard Davis 17 DavisAD Marcus Morris
                                  Davis
                 

In [23]:
trpaths = (root/'translated').glob('**/*.jsonl')
trpaths = list(trpaths)
len(trpaths)

41

In [24]:
def load_jsonl(path):
	return [json.loads(line) for line in path.read_text().split('\n') if len(line)]
def to_ids(records):
	ids, errs = {}, set()
	for record in records:
		payload = record['title'], record.get('description')
		for ID in record.get('ids', [record.get('ID')]):
			if ID in ids:
				errs.add(ID)
			ids[ID] = payload
	return ids, errs

In [76]:
trrecs = [to_ids(load_jsonl(trpath))[0] for trpath in tqdm(trpaths)]
trsids = {}
for tr in trrecs:
	assert not set(tr) & set(trsids)
	trsids.update(tr)
len(trsids)

3590966

In [77]:
trs = [by_IDs[ID] for ID in trsids]
nontrs = [art for art in articles if art['ID'] not in trsids]
len(nontrs)

1128233

In [27]:
langdet = {int(i): v for i, v in load_json(root/'langdetect.json').items()}
len(langdet)

4719199

In [28]:
missing_langs = {art['ID']: langdet[art['ID']] for art in nontrs}
len(missing_langs)

1128233

In [29]:
print(tabulate(Counter(missing_langs.values()).most_common(), headers=['lang', 'count']))

lang      count
------  -------
en      1125849
de          892
nl          227
ca          213
it          197
fr          158
af          112
id           98
no           74
tl           68
es           62
da           52
sv           50
pt           42
ro           41
et           34
sl           20
hr            6
so            6
fi            5
cs            5
vi            4
cy            4
pl            3
sw            3
ar            2
hi            2
hu            1
lt            1
tr            1
lv            1


In [48]:
arts = [by_IDs[ID] for ID, ld in missing_langs.items() if ld == 'hr']
print(tabulate([(art['ID'], art['title'], art['description']) for art in random.choices(arts,k=20)], headers=['ID', 'title', 'description']))

     ID  title                                                                                                         description
-------  ------------------------------------------------------------------------------------------------------------  -------------
1685740  Loga in pre-match juju drama - The Herald
2131395  Japan's Takeda seeks govt approval for Moderna COVID-19 vaccine - Nasdaq
3187441  Novak Djokovic Congratulates Nikola Jokic On NBA MVP - ATP Tour
1685740  Loga in pre-match juju drama - The Herald
2131395  Japan's Takeda seeks govt approval for Moderna COVID-19 vaccine - Nasdaq
3187441  Novak Djokovic Congratulates Nikola Jokic On NBA MVP - ATP Tour
2912522  Mavs PostGame Interview: Luka Dončić (05/14/21) - Dallas Mavericks
2131395  Japan's Takeda seeks govt approval for Moderna COVID-19 vaccine - Nasdaq
2912522  Mavs PostGame Interview: Luka Dončić (05/14/21) - Dallas Mavericks
1033301  Yakuza: Like A Dragon Review - GameSpot
2912522  Mavs PostGame Interview: Luka Do

In [41]:
countries = {'gb': 'United Kingdom', 'ar': 'Argentina', 'pl': 'Poland', 'sk': 'Slovakia', 'us': 'United States', 'eg': 'Egypt', 'no': 'Norway', 'ph': 'Philippines', 'at': 'Austria', 'rs': 'Serbia', 'tw': 'Taiwan', 'be': 'Belgium', 'cu': 'Cuba', 'sa': 'Saudi Arabia', 'th': 'Thailand', 'id': 'Indonesia', 'ru': 'Russian Federation', 'ch': 'Switzerland', 'fr': 'France', 'lt': 'Lithuania', 'tr': 'Turkey', 'de': 'Germany', 'cz': 'Czechia', 'pt': 'Portugal', 'ae': 'United Arab Emirates', 'it': 'Italy', 'cn': 'China', 'lv': 'Latvia', 'nl': 'Netherlands', 'hk': 'Hong Kong', 'ca': 'Canada', 'br': 'Brazil', 'hu': 'Hungary', 'kr': 'Korea', 'si': 'Slovenia', 'au': 'Australia', 'my': 'Malaysia', 'ie': 'Ireland', 'ua': 'Ukraine', 'in': 'India', 'ma': 'Morocco', 'bg': 'Bulgaria', 'ng': 'Nigeria', 'il': 'Israel', 'se': 'Sweden', 'za': 'South Africa', 've': 'Venezuela', 'nz': 'New Zealand', 'jp': 'Japan', 'sg': 'Singapore', 'gr': 'Greece', 'mx': 'Mexico', 'co': 'Colombia', 'ro': 'Romania'}

In [45]:
done = [path.stem.split('_')[-1] for path in (root/'translated').glob('**/*.jsonl')]
notdone = [c for c in countries if c not in done]
print(sorted(notdone))

['au', 'ca', 'gb', 'ie', 'in', 'my', 'ng', 'nz', 'ph', 'sa', 'sg', 'us', 'za']


In [55]:
import unicodedata

def is_latin(s):
	return all(not char.isalpha() or unicodedata.name(char, '').startswith('LATIN') for char in s)

In [82]:
# sus = [art for art in tqdm(nontrs) if not is_latin(art['title'])]
sus = [art for art in tqdm(trs) if not is_latin(trsids[art['ID']][0])]
len(sus)

  0%|          | 0/3590966 [00:00<?, ?it/s]

5431

In [93]:
# print(tabulate([art['ID'], ', '.join(i['location'] for i in art['instances']), art['title']] for art in random.choices(sus, k=20)))
print(tabulate([art['ID'], ', '.join(i['location'] for i in art['instances']), trsids[art['ID']][0]] for art in random.choices(sus, k=20)))

-------  ----------  ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
2582963  hk          Collection of Frank Confessions: The Situation in Ukraine｜Comments｜Hong Kong and Macao｜on.cc东网- on.cc东网
 537264  tw, tw, tw  Sleepless for 5 days, carrying everyone’s sorrow! Brother Xian’s haggard appearance was exposed in private @东sen News CH51 - Dongsen News CH51
1396279  hk, hk      Star Making III丨Judas’ performance was inaccurate and missed the top 10. ANSONBEAN’s charming smile won the hearts of the people｜Apple Daily-Hong Kong Apple Daily
3219185  bg          They discovered signs of ancient life on Mars ᐉ News from Fakti.bg - Curious | FACTS.BG - Facts.BG
4215757  hk          Homesickness War丨Wang Juntang's handsome boy Dongdong was praised for being handsome and calm at 190cm and developing into the modeling industry

In [63]:
art = sus[0]
art = random.choice(sus)
print(art['title'])

China's Mars rover accomplishes planned exploration tasks_英语频道_央视网(cctv.com) - CCTV


In [61]:
print(tabulate([(c, not c.isalpha(), unicodedata.name(c, '')) for c in art['title']]))

-  -----  -----------------------------------
ت  False  ARABIC LETTER TEH
ط  False  ARABIC LETTER TAH
و  False  ARABIC LETTER WAW
ر  False  ARABIC LETTER REH
   True   SPACE
خ  False  ARABIC LETTER KHAH
ط  False  ARABIC LETTER TAH
ي  False  ARABIC LETTER YEH
ر  False  ARABIC LETTER REH
.  True   FULL STOP
.  True   FULL STOP
   True   SPACE
ط  False  ARABIC LETTER TAH
ا  False  ARABIC LETTER ALEF
ئ  False  ARABIC LETTER YEH WITH HAMZA ABOVE
ر  False  ARABIC LETTER REH
ة  False  ARABIC LETTER TEH MARBUTA
   True   SPACE
إ  False  ARABIC LETTER ALEF WITH HAMZA BELOW
ث  False  ARABIC LETTER THEH
ي  False  ARABIC LETTER YEH
و  False  ARABIC LETTER WAW
ب  False  ARABIC LETTER BEH
ي  False  ARABIC LETTER YEH
ة  False  ARABIC LETTER TEH MARBUTA
   True   SPACE
ت  False  ARABIC LETTER TEH
خ  False  ARABIC LETTER KHAH
ت  False  ARABIC LETTER TEH
ر  False  ARABIC LETTER REH
ق  False  ARABIC LETTER QAF
   True   SPACE
ح  False  ARABIC LETTER HAH
د  False  ARABIC LETTER DAL
و  False  ARABIC LETTER

In [94]:
len(trsids)

3590966

In [95]:
# for rec in tqdm(recs):
# 	arts = load_json(rec)
# 	
# 	for art in arts:
# 		if art['ID'] in trsids:
# 			art['english-title'], art['english-description'] = trsids[art['ID']]
# 	
# 	save_json(arts, rec)

  0%|          | 0/54 [00:00<?, ?it/s]

In [97]:
check = load_json(recs[8])