In [149]:
from pathlib import Path
import random
import re, regex
import pycountry
from pycld2 import detect
# from iso639 import languages
from collections import Counter
import tldextract
from iso639 import languages
from tqdm.notebook import tqdm
from tabulate import tabulate
from dateutil import parser
from datetime import datetime
from omnibelt import load_json, save_json

In [16]:
def show_date(date):
	"""Format a date/datetime as '<day> <abbreviated month><2-digit year>', e.g. '09 Aug20'."""
	compact_format = '%d %b%y'
	return date.strftime(compact_format)
def get_locs(article):
	"""Return the display names of the distinct locations of an article.

	The location codes are read from every entry of ``article['instances']``,
	de-duplicated, sorted by code and looked up in the module-level
	``country_names`` mapping.
	"""
	codes = {inst['location'] for inst in article['instances']}
	names = []
	for code in sorted(codes):
		names.append(f'{country_names[code]}')
	return names
def get_cats(article):
	"""Return the distinct categories of an article, sorted and wrapped as '<category>'."""
	unique_cats = set()
	for inst in article['instances']:
		unique_cats.add(inst['category'])
	return ['<{}>'.format(cat) for cat in sorted(unique_cats)]
def view_article(art, detailed=False):
	"""Print a short summary of an article.

	Shows the title, categories, locations, the span of collection dates and
	the publication date; with ``detailed=True`` the description is printed too.

	Side effect: parsed datetimes are stored back on the records
	(``art['published']`` and ``instance['collected']``) when not already present.
	"""
	cats = ' '.join(get_cats(art))
	locs = ', '.join(repr(name) for name in get_locs(art))

	# Parse the timestamp strings only once; later calls reuse the stored values.
	if 'published' not in art:
		art['published'] = parser.parse(art['publishedAt'])
	for inst in art['instances']:
		if 'collected' not in inst:
			inst['collected'] = parser.parse(inst['collectedAt'])

	stamps = [inst['collected'] for inst in art['instances']]
	first = min(stamps)
	last = max(stamps)
	if first == last:
		timing = f'{show_date(first)}'
	else:
		timing = f'{show_date(first)} - {show_date(last)}'

	summary = [
		f"Title: {art['title']!r}",
		f'Categories: {cats}   ---   {locs} ({timing})',
		f"Published: {show_date(art['published'])}",
	]  # ({art['source-name']})
	print('\n'.join(summary))

	if detailed:
		print(f'''Description: {art['description']}''')
		# print(f'''Content: {art['content']}''')

In [32]:
# Matches runs of code points in the Unicode general categories Cc (control) and
# Cs (surrogate). Compiled with the third-party `regex` module because the
# \p{...} property syntax is not supported by the standard `re` module.
RE_BAD_CHARS = regex.compile(r"[\p{Cc}\p{Cs}]+")
# Matches runs of code points from the four emoji-related ranges listed below.
emoji_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "]+", flags=re.UNICODE)
def remove_emoji(text):
	"""Return ``text`` with every match of the module-level ``emoji_pattern`` deleted."""
	stripped = emoji_pattern.sub('', text)
	return stripped
# Character fixes applied BEFORE control characters are stripped: \x93 and \x94
# are themselves control characters (category Cc), so they must be turned into
# plain double quotes first or RE_BAD_CHARS would silently delete them.
# The remaining entries are stray code points that are dropped outright.
_CHAR_FIXES = str.maketrans({
	'\x93': '"',
	'\x94': '"',
	'\ufdef': None,
	'\ufdeb': None,
	'\U0003af61': None,
	'\ufde2': None,
	'\u05fc': None,
	'\U000e3b3c': None,
	'\u05f6': None,
})
def remove_bad_chars(text):
	"""Clean a text before language detection.

	Steps, in order:
	1. map \\x93/\\x94 to '"' and delete a fixed set of stray code points
	   (presumably mis-decoded curly quotes and junk seen in the data; confirm),
	2. delete emoji via ``remove_emoji``,
	3. delete control/surrogate characters matched by ``RE_BAD_CHARS``.

	Returns the cleaned string.
	"""
	text = text.translate(_CHAR_FIXES)
	text = remove_emoji(text)
	return RE_BAD_CHARS.sub("", text)
def get_lang_query(art):
	"""Build the text used for language detection from an article.

	The title is always included; the description is appended on its own line
	when it is truthy. The combined text is passed through ``remove_bad_chars``.
	(The article content is deliberately not used; see the disabled lines.)
	"""
	parts = [art['title']]
	description = art['description']
	if description:
		parts.append(description)
	# if art['content']:
	# 	parts.append(art['content'])
	return remove_bad_chars('\n'.join(parts))
def parse_url(art):
	"""Return the ``tldextract.extract`` result for the article's ``'url'`` field."""
	url = art['url']
	return tldextract.extract(url)
# remove_bad_chars("A\x96 bad char")  # Cc category

In [2]:
def parse_date(date: str, time: str) -> datetime:
	"""Combine a date string and a time string into a single datetime.

	Args:
		date: calendar date formatted as ``'%y-%m-%d'`` (e.g. ``'20-08-09'``).
		time: time of day formatted as ``'%H-%M-%S'`` (e.g. ``'13-05-59'``).

	Returns:
		A naive ``datetime`` carrying the full parsed time, seconds included
		(the seconds were previously parsed but dropped from the result).

	Raises:
		ValueError: if either string does not match its expected format.
	"""
	day = datetime.strptime(date, '%y-%m-%d')
	clock = datetime.strptime(time, '%H-%M-%S')
	return datetime.combine(day.date(), clock.time())

In [5]:
# Root directory of the local dataset.
# NOTE(review): absolute, machine-specific path; adjust it when running elsewhere.
root = Path('/home/fleeb/workspace/local_data/nnn')
# Recursively find every JSON record file under the headlines directory.
recs = (root / 'global-news-headlines').glob('**/*.json')
# Materialize the generator so the files can be counted and iterated again later.
recs = list(recs)
len(recs)

54

In [7]:
# Concatenate the article records of every record file into one flat list.
articles = []
for rec in tqdm(recs):
	articles += load_json(rec)
len(articles)

  0%|          | 0/54 [00:00<?, ?it/s]

4719199

In [8]:
# Lookup tables over the loaded articles:
#   by_ID  - article ID -> article record
#   by_loc - location code -> list of articles; an article is appended once per
#            instance, so it can appear several times under the same location.
by_ID = {}
by_loc = {}
for article in tqdm(articles):
	by_ID[article['ID']] = article
	for instance in article['instances']:
		by_loc.setdefault(instance['location'], []).append(article)
# Location code -> country name from pycountry, keeping only the part before the first comma.
country_names = {code: pycountry.countries.get(alpha_2=code.upper()).name.split(',')[0] for code in by_loc}
# Hand-written grouping of location codes by language name.
clusters = {
	'english': {'sa', 'ie', 'sg', 'us', 'ph', 'au', 'my', 'za', 'in', 'nz', 'ca', 'gb', 'ng'}, 
	'spanish': {'ar', 've', 'co', 'cu', 'mx'}, 
	'arabic': {'ae', 'eg'}, 
	'german': {'at', 'de', 'ch'}, 
	'chinese': {'tw', 'hk', 'cn'}, 
	'french': {'be', 'fr', 'ma'}, 
	'portuguese': {'pt', 'br'},
}

  0%|          | 0/4719199 [00:00<?, ?it/s]

In [246]:
print(country_names)

{'gb': 'United Kingdom', 'ca': 'Canada', 'za': 'South Africa', 'sa': 'Saudi Arabia', 'us': 'United States', 'ie': 'Ireland', 'my': 'Malaysia', 'nz': 'New Zealand', 'sg': 'Singapore', 'ng': 'Nigeria', 'ph': 'Philippines', 'au': 'Australia', 'in': 'India', 'ar': 'Argentina', 'mx': 'Mexico', 'co': 'Colombia', 'cu': 'Cuba', 've': 'Venezuela', 'pl': 'Poland', 'sk': 'Slovakia', 'eg': 'Egypt', 'ae': 'United Arab Emirates', 'no': 'Norway', 'at': 'Austria', 'de': 'Germany', 'ch': 'Switzerland', 'rs': 'Serbia', 'tw': 'Taiwan', 'hk': 'Hong Kong', 'cn': 'China', 'be': 'Belgium', 'ma': 'Morocco', 'fr': 'France', 'br': 'Brazil', 'th': 'Thailand', 'id': 'Indonesia', 'ru': 'Russian Federation', 'lt': 'Lithuania', 'tr': 'Turkey', 'cz': 'Czechia', 'pt': 'Portugal', 'it': 'Italy', 'lv': 'Latvia', 'nl': 'Netherlands', 'hu': 'Hungary', 'kr': 'Korea', 'si': 'Slovenia', 'ua': 'Ukraine', 'bg': 'Bulgaria', 'il': 'Israel', 'se': 'Sweden', 'jp': 'Japan', 'gr': 'Greece', 'ro': 'Romania'}


In [6]:
langdet = load_json(root / 'langdetect.json')
def get_hint(art):
	"""Look up the article's entry in ``langdet`` (keyed by the ID as a string); None if absent."""
	key = str(art['ID'])
	return langdet.get(key)
len(langdet)

4719199

In [None]:
import fasttext
from huggingface_hub import hf_hub_download

# Fetch the fastText language-identification model file from the Hugging Face Hub
# and load it; `model` is used by the prediction cells.
model_path = hf_hub_download(repo_id="facebook/fasttext-language-identification", filename="model.bin")
model = fasttext.load_model(model_path)

In [128]:
def as_batches(itr, n=50):
	"""Yield the items of ``itr`` as consecutive lists of at most ``n`` items.

	Every yielded batch is a fresh list; all batches have exactly ``n`` items
	except possibly the last one. An empty iterable yields nothing. The input
	is consumed lazily, one batch at a time.

	Args:
		itr: any iterable.
		n: batch size, a positive integer.

	Raises:
		ValueError: if ``n`` is smaller than 1 (this previously produced an
			endless stream of empty batches).
	"""
	if n < 1:
		raise ValueError(f'batch size must be a positive integer, got {n!r}')
	batch = []
	for item in itr:
		batch.append(item)
		if len(batch) == n:
			yield batch
			batch = []
	# Trailing partial batch, if the input length is not a multiple of n.
	if batch:
		yield batch

In [134]:
# One-off computation of the fastText predictions (top-2 labels per article).
# Kept disabled because the result is cached in ftlang.json:
# ftlang = {}
# itr = as_batches((art for art in tqdm(articles) if art['ID'] not in ftlang), n=50)
# for batch in itr:
# 	labels, probs = model.predict([get_lang_query(art) for art in batch], k=2)
# 	for art, lbls, ps in zip(batch, labels, probs):
# 		ftlang[art['ID']] = dict(zip(lbls, ps.tolist()))
# save_json(ftlang, root / 'ftlang.json')

# Reload the cached scores, converting the string keys back to integers.
ftlang = {int(art_id): scores for art_id, scores in load_json(root / 'ftlang.json').items()}
# Best label per article: the key with the highest score.
ftlang_pred = {art_id: max(scores, key=scores.get) for art_id, scores in ftlang.items()}

  0%|          | 0/4719199 [00:00<?, ?it/s]

In [183]:
def label2lang(label):
	"""Turn a fastText label such as '__label__eng_Latn' into a language name.

	The first 9 characters (the length of '__label__') are skipped, the part
	before the first underscore is taken as the ISO 639-3 code, and that code
	is looked up via ``languages.get(part3=...)``.
	"""
	prefix_len = len('__label__')
	code, *_ = label[prefix_len:].split('_')
	return languages.get(part3=code).name

In [182]:
# Count how many articles received each top-1 fastText label and tabulate the
# labels, most frequent first, next to their language names.
labels_cnt = Counter(ftlang_pred.values())
label_langs = [(lbl, label2lang(lbl), num) for lbl, num in labels_cnt.most_common()]
print(tabulate(label_langs, headers=['Label', 'Language', 'Count']))

Label              Language                         Count
-----------------  -----------------------------  -------
__label__eng_Latn  English                        1092624
__label__spa_Latn  Spanish                         453392
__label__fra_Latn  French                          285486
__label__yue_Hant  Yue Chinese                     267743
__label__deu_Latn  German                          259470
__label__por_Latn  Portuguese                      242882
__label__arb_Arab  Standard Arabic                 178470
__label__ita_Latn  Italian                         128449
__label__ind_Latn  Indonesian                      128142
__label__tur_Latn  Turkish                         122603
__label__ell_Grek  Modern Greek (1453-)            119404
__label__kor_Hang  Korean                          117264
__label__pol_Latn  Polish                          117092
__label__rus_Cyrl  Russian                         113108
__label__jpn_Jpan  Japanese                        104758
__label__nld_L

In [284]:
# Manually assigned expected language per location: fastText label -> set of
# location codes. Keys must be spelled exactly as the model emits them (see the
# label counts table), otherwise they never match a prediction.
manlangs = {
	'__label__eng_Latn': {'sa', 'ie', 'sg', 'us', 'ph', 'au', 'my', 'za', 'in', 'nz', 'ca', 'gb', 'ng'},
	'__label__spa_Latn': {'ar', 've', 'co', 'cu', 'mx'},
	'__label__deu_Latn': {'at', 'de', 'ch'},
	'__label__fra_Latn': {'be', 'fr', 'ma'},
	# '__label__yue_Hant': {'tw', 'hk', 'cn'},
	'__label__por_Latn': {'pt', 'br'},
	'__label__arb_Arab': {'ae', 'eg'},
	'__label__ind_Latn': {'id'},
	# The model's Japanese label is jpn_Jpan (not jpn_Hani), as shown in the label counts.
	'__label__jpn_Jpan': {'jp'},
	'__label__kor_Hang': {'kr'},
	'__label__nld_Latn': {'nl'},
	'__label__ron_Latn': {'ro'},
	'__label__ita_Latn': {'it'},
	'__label__tur_Latn': {'tr'},
	'__label__rus_Cyrl': {'ru'},
	'__label__ukr_Cyrl': {'ua'},
	'__label__ell_Grek': {'gr'},
	'__label__bul_Cyrl': {'bg'},
	'__label__heb_Hebr': {'il'},
	'__label__pol_Latn': {'pl'},
	'__label__tha_Thai': {'th'},
	'__label__hun_Latn': {'hu'},
	'__label__ces_Latn': {'cz'},
	'__label__swe_Latn': {'se'},
	'__label__slk_Latn': {'sk'},
	'__label__nob_Latn': {'no'},
	'__label__lvs_Latn': {'lv'},
	'__label__lit_Latn': {'lt'},
	'__label__slv_Latn': {'si'},
	'__label__srp_Cyrl': {'rs'},
}
# List the locations that are not yet covered by any entry above.
print(tabulate([(k,v) for k, v in country_names.items() if not any(k in langs for langs in manlangs.values())], headers=['Code', 'Name']))

Code    Name
------  ---------
tw      Taiwan
hk      Hong Kong
cn      China


In [285]:
# print(tabulate([(lbl, lang, cnt) for lbl, lang, cnt in label_langs if lbl not in manlangs and cnt > 10], headers=['Label', 'Language', 'Count']))

In [187]:
# For each location, tally the predicted label of every article listed under it
# in by_loc (an article listed several times under a location counts each time).
loclangs = {loc: Counter(ftlang_pred[art['ID']] for art in arts) for loc, arts in tqdm(by_loc.items())}
len(loclangs)

  0%|          | 0/54 [00:00<?, ?it/s]

54

In [366]:
loc = 'eg'
print(tabulate([(lbl, label2lang(lbl), num) for lbl, num in loclangs[loc].most_common()], headers=['Label', 'Language', 'Count']))

Label              Language             Count
-----------------  -----------------  -------
__label__arb_Arab  Standard Arabic     151276
__label__eng_Latn  English                240
__label__yue_Hant  Yue Chinese             15
__label__kor_Hang  Korean                   2
__label__zho_Hant  Chinese                  2
__label__azb_Arab  South Azerbaijani        2


In [369]:
# Most frequent predicted label at the currently selected location `loc`.
# NOTE: `most` is only used by the disabled filter below; the active filter
# selects a hard-coded label instead.
most = loclangs[loc].most_common(1)[0][0]
# selids = [art['ID'] for art in by_loc[loc] if ftlang_pred[art['ID']] != most]
selids = [art['ID'] for art in by_loc[loc] if ftlang_pred[art['ID']] in {'__label__arb_Arab'}]
len(selids)

151276

In [370]:
print(tabulate(enumerate([by_ID[ID]['title'] for ID in selids[:100]]), headers=['Index', 'Title']))

  Index  Title
-------  --------------------------------------------------------------------------------------------------------------------
      0  شيماء سيف × 15 صورة من حفل زفافها وهكذا نشأت قصة الحب مع زوجها - اليوم السابع
      1  شيماء سيف × 15 صورة من حفل زفافها وهكذا نشأت قصة الحب مع زوجها - اليوم السابع
      2  شيماء سيف × 15 صورة من حفل زفافها وهكذا نشأت قصة الحب مع زوجها - اليوم السابع
      3  سرقة منزل رئيس نادي الأهلي المصري - RT Arabic
      4  سرقة منزل رئيس نادي الأهلي المصري - RT Arabic
      5  سرقة منزل رئيس نادي الأهلي المصري - RT Arabic
      6  85% نسبة الرطوبة.. الأرصاد تكشف تفاصيل طقس السبت - Masrawy-مصراوي
      7  ميدو: مصطفى محمد أفضل مهاجمي مصر.. وعملي الإعلامي يجبرني على الحياد - يلا كورة
      8  ميدو: مصطفى محمد أفضل مهاجمي مصر.. وعملي الإعلامي يجبرني على الحياد - يلا كورة
      9  ميدو: مصطفى محمد أفضل مهاجمي مصر.. وعملي الإعلامي يجبرني على الحياد - يلا كورة
     10  مؤتمر سيتيين: لم أفكر أبدا أن مباراة نابولي هي الأخيرة لي مع برشلونة - FilGoal.com
  

In [334]:
ID = random.choice(selids)
art = by_ID[ID]
view_article(art, detailed=True)
print(ftlang[ID])

Title: 'Mercedes-Benz CES 2021 keynote in 6 minutes - Engadget'
Categories: <business> <technology>   ---   'India', 'Saudi Arabia', 'United States' (12 Jan21 - 13 Jan21)
Published: 11 Jan21
Description: Get More Engadget: • Like us on Facebook: http://www.facebook.com/engadget• Follow us on Twitter: http://www.twitter.com/engadget• Follow us on Instagram: ht...
{'__label__yue_Hant': 0.9519753456115723, '__label__kor_Hang': 0.01803664304316044}


In [235]:
print(art['url'])

https://news.google.com/__i/rss/rd/articles/CBMiK2h0dHBzOi8vd3d3LnlvdXR1YmUuY29tL3dhdGNoP3Y9cEZHa3pUNUkxRlnSAQA?oc=5


In [147]:
hint = langdet.get(str(ID))
tld = parse_url(art).suffix
hint, tld

('ko', 'cc')

In [148]:
query = get_lang_query(art)
print(query)
detect(get_lang_query(art), hintLanguage=hint, hintTopLevelDomain=tld)

陳伶俐半山過億獨立屋曝光全雲石設計夠奢華 - on.cc東網藝人陳伶俐與音樂人林敏驄離婚後，於2012年帶着兩子改嫁新加坡隱形富豪胡嘉烈嘅後人劉頌銘，其後再為老公誕下一子，過着幸福


(True,
 240,
 (('ChineseT', 'zh-Hant', 96, 1901.0),
  ('Unknown', 'un', 0, 0.0),
  ('Unknown', 'un', 0, 0.0)))

In [12]:
# First detection pass with pycld2, without any hints.
#   langs - article ID -> detected language code (accepted predictions)
#   bad   - article ID -> (query, ok, preds) for rejected predictions
langs = {}
bad = {}
itr = tqdm(articles)
for art in itr:
	# Skip IDs that were already handled (the same ID may occur more than once).
	if art['ID'] not in langs and art['ID'] not in bad:
		# Build the query once; the language hint and TLD are only needed by the
		# hinted variant, so they are no longer computed for every article.
		query = get_lang_query(art)
		# hint = langdet.get(str(art['ID']))
		# tld = parse_url(art).suffix
		# ok, _, preds = detect(query, hintLanguage=hint, hintTopLevelDomain=tld)
		ok, _, preds = detect(query)
		# Pick the prediction with the largest (x[2], x[3]) pair; presumably these
		# are pycld2's percent and score fields (confirm against its docs).
		_, code, conf, _ = max(preds, key=lambda x: (x[2],x[3]))
		# Reject when `ok` is false or the best prediction's x[2] value is below 50.
		if not ok or conf < 50:
			bad[art['ID']] = (query, ok, preds)
		else:
			langs[art['ID']] = code
len(langs), len(bad)

  0%|          | 0/4719199 [00:00<?, ?it/s]

In [15]:
bad_ids = list(bad)
len(bad_ids)

184649

In [102]:
# Pick an article to inspect. NOTE: the random choice is immediately overridden
# by the fixed ID on the next line.
ID = random.choice(bad_ids)
ID = 17756
art = by_ID[ID]
view_article(art)

Title: 'Cosquín Rock Festival Online: comenzó la primera jornada de shows en vivo por streaming - LA NACION'
Categories: <entertainment> <general>   ---   'Argentina' (09 Aug20)
Published: 09 Aug20


In [104]:
hardest[ID]

'en'

In [105]:
hint = langdet.get(str(ID))
tld = parse_url(art).suffix
hint, tld

('es', 'com.ar')

In [109]:
query = get_lang_query(art)
print(query)
detect(get_lang_query(art), hintLanguage=hint, hintTopLevelDomain=tld)

Cosquín Rock Festival Online: comenzó la primera jornada de shows en vivo por streaming - LA NACION


(False,
 100,
 (('Unknown', 'un', 0, 0.0),
  ('Unknown', 'un', 0, 0.0),
  ('Unknown', 'un', 0, 0.0)))

In [114]:
detect(query, hintLanguage='es', hintTopLevelDomain='ar')

(False,
 100,
 (('Unknown', 'un', 0, 0.0),
  ('Unknown', 'un', 0, 0.0),
  ('Unknown', 'un', 0, 0.0)))

In [121]:
# import fasttext
# model = fasttext.load_model('lid.176.ftz')

In [None]:
print(model.predict(query, k=2))  # top 2 matching languages

In [116]:
import fasttext
from huggingface_hub import hf_hub_download

In [117]:
model_path = hf_hub_download(repo_id="facebook/fasttext-language-identification", filename="model.bin")
model = fasttext.load_model(model_path)

Downloading model.bin:   0%|          | 0.00/1.18G [00:00<?, ?B/s]



In [123]:
labels, probs = model.predict(query, k=2)
dict(zip(labels, probs.tolist()))

{'__label__spa_Latn': 0.9478647112846375,
 '__label__cat_Latn': 0.035578981041908264}

In [127]:
model.predict(['This is a test', query])

([['__label__eng_Latn'], ['__label__pol_Latn']],
 [array([1.00001], dtype=float32), array([1.0000077], dtype=float32)])

In [120]:
model

In [49]:
bad_todo = bad_ids
hard = {}
still_bad = {}

In [59]:
# Retry pass over the IDs in `bad_todo`: run the detector again, this time with
# a language hint from `langdet` and the URL's TLD suffix as hints.
for ID in tqdm(bad_todo):
	art = by_ID[ID]
	hint = langdet.get(str(ID))
	tld = parse_url(art).suffix
	query = get_lang_query(art)
	ok, _, preds = detect(query, hintLanguage=hint, hintTopLevelDomain=tld)
	# ok, _, preds = detect(query, hintTopLevelDomain=tld)
	# Best prediction = the one with the largest (x[2], x[3]) pair.
	_, code, conf, _ = max(preds, key=lambda x: (x[2],x[3]))
	# No confidence threshold is applied in this pass; only `ok` decides.
	if not ok:
		still_bad[art['ID']] = (query, ok, preds)
	else:
		hard[art['ID']] = code
len(hard), len(still_bad)

  0%|          | 0/100853 [00:00<?, ?it/s]

(84220, 100429)

In [58]:
# Rotate the work lists for another retry round: the entries that are still
# undetected become the new to-do set (also kept as `prev_bad`), and
# `still_bad` is reset to collect the next round's failures.
bad_todo = still_bad
prev_bad = still_bad
still_bad = {}

In [66]:
len(bad_todo)

100853

In [68]:
# Per location, count the detected language codes of the articles seen there.
# Result: location code -> Counter(language code -> number of instances).
loc2lang = {}
for a in tqdm(articles):
	lang = langs.get(a['ID'])
	if lang:
		for i in a['instances']:
			# Count directly instead of collecting a list per location first.
			loc2lang.setdefault(i['location'], Counter())[lang] += 1

  0%|          | 0/4719199 [00:00<?, ?it/s]

In [75]:
badder_todo = bad_todo
hardest = {}
baddest = []

In [76]:
# Last retry pass: for each remaining ID, try the language codes that are common
# at the article's locations as detector hints, most frequent first.
for ID in tqdm(badder_todo):
	art = by_ID[ID]
	
	# Sum the per-location language counts over all instances of the article.
	hints = Counter()
	for i in art['instances']:
		hints.update(loc2lang[i['location']])
	
	query = get_lang_query(art)
	for hint, _ in hints.most_common():
		ok, _, preds = detect(query, hintLanguage=hint)
		# ok, _, preds = detect(query, hintTopLevelDomain=tld)
		_, code, conf, _ = max(preds, key=lambda x: (x[2],x[3]))
		# Stop at the first hint for which the detector reports `ok`.
		if ok:
			hardest[art['ID']] = code
			break
	else:
		# for/else: reached only when no hint produced an `ok` result.
		baddest.append(art['ID'])
len(hardest), len(baddest)

  0%|          | 0/100853 [00:00<?, ?it/s]

(38395, 62458)

In [86]:
# Count, per location, the instances of the articles that are still undetected
# (`baddest`), and print the locations from most to least affected.
bad_locs = Counter([i['location'] for ID in baddest for i in by_ID[ID]['instances']])
print(tabulate(bad_locs.most_common(), headers=['Location', 'Count']))

Location      Count
----------  -------
hk             9160
ru             7420
gr             6393
jp             6230
tw             5975
cu             5922
cn             5684
kr             5669
mx             4018
co             3847
ve             3622
ar             3080
id             2438
bg             2091
it             1988
il             1862
br             1861
pt             1690
ua             1607
rs             1206
th             1194
ma             1117
at              670
be              658
se              569
ae              542
fr              471
de              430
no              409
si              381
eg              371
nl              318
ro              291
ch              212
tr              195
pl              137
ph              113
hu              112
sk              103
au               91
ng               89
in               84
cz               71
ca               57
za               47
sa               40
my               37
sg               35


In [96]:
loc = 'ro'

In [97]:
print(tabulate(loc2lang[loc].most_common(), headers=['Language', 'Count']))

Language      Count
----------  -------
ro            81476
en              264
es                3
ie                3
it                2


In [99]:
# Collect the titles of the still-undetected articles (`baddest`) that have at
# least one instance at the currently selected location `loc`, and print them.
titles = []
for ID in baddest:
	art = by_ID[ID]
	if any(i['location'] == loc for i in art['instances']):
		titles.append(art['title'])
print(tabulate(enumerate(titles), headers=['Index', 'Title']))

  Index  Title
-------  ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
      0  Cum a reacționat soția lui Bill Gates la donația bossului Facebook - Click!
      1  SIMONA HALEP - PATRICIA HERCOG LIVE SCORE la Praga, primul tur - DCNews
      2  OFICIAL | Villarreal l-a transferat pe căpitanul Valenciei! Unai Emery visează la un nou trofeu Europa League - DigiSport
      3  Hyundai Kona Electric, record de autonomie. Câți kilometri a parcurs? - ProMotor
      4  LIVE VIDEO Irina Begu - Leonie Kung și Ana Bogdan - Kristyna Pliskova, de la 12:00, pe Digi Sport 1 și Digi Sport 2 - DigiSport
      5  Evenimente - Prahova
      6  OnePlus va rula pe smartphone-ul bugetar cu Qualcomm Snapdragon 460 la bord: Gadget.ro - stil de viață hi-tech - ProRally.ro
      7  UTA - FC Voluntari 0-0 - ARADON - Aradon
      8  ”Un vis devenit re

In [179]:
# IDs present in both `baddest` and `low_conf`.
# NOTE(review): `low_conf` is not defined in any cell visible here; presumably
# a collection of article IDs left over from a removed cell. Confirm before re-running.
merged = set(baddest).intersection(set(low_conf))
len(merged)

9297

In [23]:
from urllib.parse import urlparse

# Placeholder example URL; it is overwritten by the current article's URL on the next line.
url = "https://www.example.com/path/to/page?query=param"
url = art['url']

# Parse the URL into its components
parsed_url = urlparse(url)

# Take the network location (host) part
domain = parsed_url.netloc

print(domain)


www.punto-informatico.it


In [24]:

# Replace with your URL
# url = "https://www.example.com/path/to/page?query=param"

# Extract parts of the URL


it


In [None]:
detect(get_lang_query(art), hintTopLevelDomain='it')

In [347]:
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
from transformers import pipeline


In [348]:
pipe = pipeline("translation", model="Helsinki-NLP/opus-mt-id-en")

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/291M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

Downloading (…)olve/main/source.spm:   0%|          | 0.00/801k [00:00<?, ?B/s]

Downloading (…)olve/main/target.spm:   0%|          | 0.00/796k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.26M [00:00<?, ?B/s]



In [364]:
# Translate the titles of the selected articles with the `pipe` translation pipeline.
# NOTE(review): `selids` is whatever the last selection cell left in the kernel,
# and the whole list is passed to the pipeline in a single call.
texts = [by_ID[ID]['title'] for ID in selids]
outs_raw = pipe(texts)
# Keep only the translated string of each pipeline result.
outs = [out['translation_text'] for out in outs_raw]

In [365]:
print(tabulate(zip(texts, outs), headers=['Input', 'Output']))

Input                                                                                           Output
----------------------------------------------------------------------------------------------  ------------------------------------------------------------------------------------
Ayu Ting Ting Didesak Minta Maaf ke Nagita Slavina, Umi Kalsum Naik Pitam - Suara.com           Ayu Ting Ting Di Urge Apologize to Nagita Slavina, Umi Kalsum Ride Pitam - Voice.com
Pelawak Ginanjar Takut tapi Bahagia Bakal Jadi Ayah di Usia 56 Tahun - Kompas.com - KOMPAS.com  Ginanjar's Feared but Happy Will Be Father at 56 - Compass.com - COM.com
Innalilahi Wainna Ilaihi Rajiun, Ayu Ting Ting Berduka | merdeka.com - Merdeka.com              Inollalullahi Wainna Ilaihi Rajiun, Ayu Ting Bergura independent.com - Merdeka.com
Istri Randy Pangalila Melahirkan, Wajah Bayi Bikin Asmirandah Terpana - InsertLive              Randy's Wife Pangalila gave birth, Baby Face Makes a Fascination - InsertLive
Nyatanya, Jad

In [371]:
from googletrans import Translator
translator = Translator()
translator.translate('안녕하세요.')

AttributeError: 'NoneType' object has no attribute 'group'

In [390]:
def as_batches(itr, n=50):
	"""Yield the items of ``itr`` as consecutive lists of at most ``n`` items.

	Every yielded batch is a fresh list; all batches have exactly ``n`` items
	except possibly the last one. An empty iterable yields nothing. The input
	is consumed lazily, one batch at a time.

	Args:
		itr: any iterable.
		n: batch size, a positive integer.

	Raises:
		ValueError: if ``n`` is smaller than 1 (this previously produced an
			endless stream of empty batches).
	"""
	if n < 1:
		raise ValueError(f'batch size must be a positive integer, got {n!r}')
	batch = []
	for item in itr:
		batch.append(item)
		if len(batch) == n:
			yield batch
			batch = []
	# Trailing partial batch, if the input length is not a multiple of n.
	if batch:
		yield batch

In [407]:
prompt_delimiter = '%&%'
line_template = '<p>{prompts}</p>'

In [413]:
# Row template for the prompt files. Only the LAST assignment takes effect; the
# first two are earlier variants that are immediately overwritten.
template = '<tr><td>{ID}</td><td>{title}</td></tr>\n'
# This variant would additionally need a `delimiter` field when formatted.
template = '{ID}{delimiter}{title}'
# Final form: ID and title joined by the literal text '%&%'.
template = '{ID}%&%{title}'

In [375]:
len(by_loc)

54

In [376]:
(root/'prompts').mkdir(exist_ok=True)

In [378]:
# Group article IDs by identical title within each location:
# location code -> {title -> [article IDs]} (an ID is appended once per by_loc entry).
# NOTE: the loop variables `loc` and `art` overwrite the notebook-level names
# used by other cells.
dupes = {}
for loc, arts in tqdm(by_loc.items()):
	for art in arts:
		dupes.setdefault(loc, {}).setdefault(art['title'], []).append(art['ID'])

  0%|          | 0/54 [00:00<?, ?it/s]

In [414]:
# Write the prompt rows for the selected location to '<root>/dense/<loc>-prompt-rows.txt'.
# Each row is `template` filled with the comma-joined distinct IDs sharing a title
# and the title itself (tabs replaced by spaces). Rows are grouped in batches of 10
# and only the first 100 batches are written; rows and batches are both separated
# by a blank line.
for loc, title_ids in tqdm(dupes.items()):
	if loc == 'ua':
		lines = ['\n\n'.join(template.format(ID=','.join(map(str,set(ids))), title=title.replace('\t', ' '))
						   for title, ids in batch)
				 for batch in as_batches(title_ids.items(), n=10)]
		lines = lines[:100]
		out_path = root / 'dense' / f'{loc}-prompt-rows.txt'
		# The output directory is not created anywhere else, so make sure it exists.
		out_path.parent.mkdir(parents=True, exist_ok=True)
		# Titles contain non-ASCII text: write UTF-8 explicitly instead of relying
		# on the platform's default encoding.
		with out_path.open('w', encoding='utf-8') as f:
			f.write('\n\n'.join(lines))

  0%|          | 0/54 [00:00<?, ?it/s]