In [1]:
from pathlib import Path
from tqdm.notebook import tqdm
from tabulate import tabulate
import random
import pycountry
import networkx as nx
from dateutil import parser
from datetime import datetime
from collections import Counter
from langdetect import detect
from omnibelt import load_json, save_json
# from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
# from transformers import pipeline

In [2]:
# Collect every JSON record file below the headlines dump, at any depth.
root = Path('/home/fleeb/workspace/local_data/nnn')
recs = list((root / 'global-news-headlines').rglob('*.json'))
len(recs)

54

In [4]:
# Flatten the per-file article lists into one big list.
articles = []
for rec in tqdm(recs):
	articles += load_json(rec)
len(articles)

  0%|          | 0/54 [00:00<?, ?it/s]

4719199

In [5]:
# Group articles by the location code of their instances.
# One article can carry several instances for the same location (for example
# one per category), so locations are de-duplicated per article; otherwise the
# article would appear several times in that location's list and skew sampling.
by_loc = {}
for article in tqdm(articles):
	# dict.fromkeys drops repeats while keeping first-seen order
	for loc in dict.fromkeys(instance['location'] for instance in article['instances']):
		by_loc.setdefault(loc, []).append(article)


def _country_name(code):
	"""Return a short display name for a two-letter country code.

	Only the part of the pycountry name before the first comma is kept.
	If pycountry has no entry for the code, the upper-cased code itself is
	returned instead of failing on ``None.name``.
	"""
	country = pycountry.countries.get(alpha_2=code.upper())
	if country is None:
		return code.upper()
	return country.name.split(',')[0]


country_names = {code: _country_name(code) for code in by_loc}

# Hand-picked groups of location codes, keyed by shared language.
clusters = {
	'english': {'sa', 'ie', 'sg', 'us', 'ph', 'au', 'my', 'za', 'in', 'nz', 'ca', 'gb', 'ng'},
	'spanish': {'ar', 've', 'co', 'cu', 'mx'},
	'arabic': {'ae', 'eg'},
	'german': {'at', 'de', 'ch'},
	'chinese': {'tw', 'hk', 'cn'},
	'french': {'be', 'fr', 'ma'},
	'portuguese': {'pt', 'br'},
}
# Reverse lookup: country display name -> cluster label.
# NOTE: raises KeyError if a cluster code never occurs in the loaded data.
to_cluster = {country_names[loc]: cluster for cluster, locs in clusters.items() for loc in locs}
len(by_loc)

  0%|          | 0/4719199 [00:00<?, ?it/s]

54

In [17]:
def show_date(date):
	"""Format a date compactly as day, abbreviated month and two-digit year (e.g. '08 Feb21')."""
	layout = '%d %b%y'
	return date.strftime(layout)
def get_locs(article):
	"""Return the display names of the distinct locations of an article's instances, ordered by location code."""
	codes = {instance['location'] for instance in article['instances']}
	return ['{}'.format(country_names[code]) for code in sorted(codes)]
def get_cats(article):
	"""Return the distinct categories of an article's instances as sorted '<category>' tags."""
	unique = {instance['category'] for instance in article['instances']}
	return ['<{}>'.format(name) for name in sorted(unique)]
def view_article(art, detailed=False):
	"""Print a short summary of one article.

	Shows the title, the distinct categories and countries of the article's
	instances, the span of collection dates and the publication date. With
	``detailed=True`` the description is printed as well.

	Side effect: parsed datetimes are cached on the dicts themselves, under
	``art['published']`` and each instance's ``'collected'`` key, the first
	time they are needed.
	"""
	cats = ' '.join(get_cats(art))
	locs = ', '.join(repr(name) for name in get_locs(art))
	# Parse timestamps lazily and keep them so repeated views skip re-parsing.
	if 'published' not in art:
		art['published'] = parser.parse(art['publishedAt'])
	for instance in art['instances']:
		if 'collected' not in instance:
			instance['collected'] = parser.parse(instance['collectedAt'])
	collected = [instance['collected'] for instance in art['instances']]
	first, last = min(collected), max(collected)
	if first == last:
		timing = f'{show_date(first)}'
	else:
		timing = f'{show_date(first)} - {show_date(last)}'
	summary = [
		f"Title: {art['title']!r}",
		f'Categories: {cats}   ---   {locs} ({timing})',
		f"Published: {show_date(art['published'])}",
	]
	print('\n'.join(summary)) # ({art['source-name']})
	if detailed:
		print(f"Description: {art['description']}")
		# print(f'''Content: {art['content']}''')

In [2]:
# The transformers import in the notebook's import cell is commented out, so
# `pipeline` is imported here to keep this cell runnable on a fresh kernel.
from transformers import pipeline

# Use a pipeline as a high-level helper
# (builds a text-classification pipeline from a language-detection checkpoint)
pipe = pipeline("text-classification", model="papluca/xlm-roberta-base-language-detection")

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/502 [00:00<?, ?B/s]

Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

In [7]:
# Draw 10 articles at random; random.choices samples WITH replacement, so the
# same article may appear more than once. No seed is set, so the draw differs per run.
batch = random.choices(articles, k=10)


In [22]:
# Keep only the articles whose 'langdetect' field is 'cs'.
sel = list(filter(lambda entry: entry['langdetect'] == 'cs', articles))

In [23]:
# Pick one random article from the language-filtered selection and show it in detail.
# (The former `art = batch[0]` was overwritten immediately and has been dropped.)
# art = random.choice(by_loc['bg'])  # alternative: sample from a single country
art = random.choice(sel)
view_article(art, detailed=True)

Title: 'Pohřeb Andreje Hryce, hvězdy Ulice - eXtra.cz'
Categories: <entertainment> <general>   ---   'Czechia' (08 Feb21 - 09 Feb21)
Published: 07 Feb21
Description: Ve věku jedenasedmdesáti let odešel do uměleckého nebe herec Andy Hryc,
který 31. ledna prohrál boj se zákeřnou nemocí. O tragické události
informovala jeho dcera Wanda Adamík Hrycová na sociálních sítích, nyní
sdělila i informace o posledním rozloučení s her…


In [24]:
# Run the language-detection pipeline on the selected article's title and print
# the predicted label(s) with their score.
print(pipe(art['title']))

[{'label': 'tr', 'score': 0.241484597325325}]


In [21]:
# Tally how many articles carry each 'langdetect' value.
langs = Counter()
langs.update(entry['langdetect'] for entry in articles)

--  -------
en  1202138
es   454544
fr   287983
de   263461
pt   243874
ar   176290
ko   164654
id   130952
it   130541
tr   122706
pl   117532
el   117423
ja   109040
ru   108779
nl   104251
th    87606
sv    86923
hr    74282
hu    73416
cs    70629
--  -------


In [25]:
# Print the (up to) 50 most frequent language codes with their article counts as a plain-text table.
print(tabulate(langs.most_common(50)))

-----  -------
en     1202138
es      454544
fr      287983
de      263461
pt      243874
ar      176290
ko      164654
id      130952
it      130541
tr      122706
pl      117532
el      117423
ja      109040
ru      108779
nl      104251
th       87606
sv       86923
hr       74282
hu       73416
cs       70629
bg       69558
he       67420
uk       62629
zh-cn    59522
ro       56366
no       54861
sk       42993
lv       39984
lt       34948
sl       34355
zh-tw    24764
vi       19535
et        8550
mk        5177
ca        5151
bn        2174
af        1039
tl         943
da         812
so         768
sw         218
cy         193
fi         102
sq          99
fa           6
ur           5
hi           3
-----  -------


In [33]:
from src.common import LANGUAGE_CODES

AttributeError: module 'omnifig' has no attribute 'Script'

In [34]:
# Look up 'es' in LANGUAGE_CODES. This needs the `from src.common import LANGUAGE_CODES`
# cell to have succeeded; if that import failed, the name is undefined here.
LANGUAGE_CODES.get('es')

NameError: name 'LANGUAGE_CODES' is not defined

In [None]:

# The transformers import in the notebook's import cell is commented out, so
# the mBART classes are imported here to keep this cell runnable on its own.
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast

# Sample sentences in Hindi and Arabic.
article_hi = "संयुक्त राष्ट्र के प्रमुख का कहना है कि सीरिया में कोई सैन्य समाधान नहीं है"
article_ar = "الأمين العام للأمم المتحدة يقول إنه لا يوجد حل عسكري في سوريا."

# Model and tokenizer are loaded from the same checkpoint name.
model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")


In [42]:
import pycld2 as cld2

# Run CLD2 language detection on a short Russian snippet.
# detect() is unpacked into a reliability flag, a byte count and a tuple of
# candidate languages (without returnVectors it yields exactly three values).
isReliable, textBytesFound, details = cld2.detect(
    "а неправильный формат идентификатора дн назад",
	# returnVectors=True
)

print(isReliable)
# expected: True
details[0]
# expected: ('RUSSIAN', 'ru', 98, 404.0)
# NOTE(review): entries look like (language name, code, percent, score) — confirm against pycld2 docs

True


('RUSSIAN', 'ru', 98, 404.0)

In [43]:
# Show the full tuple of candidate languages from the most recent cld2.detect call.
details

(('RUSSIAN', 'ru', 98, 404.0),
 ('Unknown', 'un', 0, 0.0),
 ('Unknown', 'un', 0, 0.0))

In [44]:


# Mixed-language sample: an English sentence, a French passage, then English again.
fr_en_Latn = """\
France is the largest country in Western Europe and the third-largest in Europe as a whole.
A accès aux chiens et aux frontaux qui lui ont été il peut consulter et modifier ses collections
et exporter Cet article concerne le pays européen aujourd’hui appelé République française.
Pour d’autres usages du nom France, Pour une aide rapide et effective, veuiller trouver votre aide
dans le menu ci-dessus.
Motoring events began soon after the construction of the first successful gasoline-fueled automobiles.
The quick brown fox jumped over the lazy dog."""

# With returnVectors=True, detect() yields a fourth value: per-segment results.
# This overwrites isReliable, textBytesFound and details from the previous detection.
isReliable, textBytesFound, details, vectors = cld2.detect(
    fr_en_Latn, returnVectors=True
)
print(vectors)
# expected: ((0, 94, 'ENGLISH', 'en'), (94, 329, 'FRENCH', 'fr'), (423, 139, 'ENGLISH', 'en'))
# NOTE(review): each entry looks like (start offset, length, language name, code) — confirm against pycld2 docs

((0, 94, 'ENGLISH', 'en'), (94, 329, 'FRENCH', 'fr'), (423, 139, 'ENGLISH', 'en'))


In [45]:
# Show the candidate languages for the mixed French/English sample detected above.
details

(('FRENCH', 'fr', 58, 883.0),
 ('ENGLISH', 'en', 41, 1148.0),
 ('Unknown', 'un', 0, 0.0))

In [1]:
# Page skeleton for the headline listing, filled in with str.format().
# `{headlines}` is the only placeholder; the CSS braces are doubled (`{{`, `}}`)
# so that format() emits them as literal braces.
html_template = """
<!DOCTYPE html>
<html lang="en">
<head>
	<meta charset="UTF-8">
	<meta name="viewport" content="width=device-width, initial-scale=1.0">
	<title>News Headlines</title>
	<style>
		body {{
			font-family: Arial, sans-serif;
			margin: 20px;
		}}
		.headline {{
			padding: 5px;
			border-bottom: 1px solid #ddd;
			display: flex;
			justify-content: space-between;
		}}
		.title {{
			flex: 2;
		}}
		.ids {{
			flex: 1;
			text-align: right;
		}}
	</style>
</head>
<body>
	<div id="news-container">
		{headlines}
	</div>
</body>
</html>
"""

In [2]:
from pathlib import Path

In [4]:
# Show the names of everything directly inside the data root.
root = Path('/Users/homeworld/workspace/local_data/nnn')
print([entry.name for entry in root.glob('*')])

['temp', 'raw_news', 'full.json', 'ftlang.json', 'langdetect.json', 'assets', 'global-news-headlines', 'global-news-headlines.zip']


In [None]:
def generate_html_from_dict(headlines_dict, template=None):
    """Render a mapping of headline -> ids as an HTML page.

    Args:
        headlines_dict: mapping whose keys are headline texts and whose values
            are iterables of ids; ids are shown comma-separated in brackets.
        template: optional format string containing a ``{headlines}``
            placeholder. Defaults to the notebook-level ``html_template``.

    Returns:
        The template with one ``<div class="headline">`` row per entry
        substituted for ``{headlines}``.

    Headline text and ids are HTML-escaped, so characters such as ``<``, ``>``
    and ``&`` in a title are displayed literally instead of breaking the markup.
    """
    from html import escape  # local import keeps this cell self-contained

    if template is None:
        template = html_template

    rows = []
    for headline, ids in headlines_dict.items():
        ids_str = ', '.join(escape(str(item)) for item in ids)
        rows.append(
            f'<div class="headline"><span class="title">{escape(str(headline))}</span>'
            f'<span class="ids">[{ids_str}]</span></div>\n'
        )

    # Insert the rows into the HTML template
    return template.format(headlines=''.join(rows))

# Sample headlines dictionary for testing: each headline maps to the list of ids shown next to it
sample_headlines_dict = {
    "Breaking: Earthquake in Paris": [101, 102, 103],
    "Rain causes uproar in London": [104, 105],
    "London witnesses Earthquake phenomenon": [106]
}

# Generate the HTML content
html_content_from_dict = generate_html_from_dict(sample_headlines_dict)
html_content_from_dict[:1000]  # Displaying only the first 1000 characters for brevity
