In [1]:
from pathlib import Path
from tqdm.notebook import tqdm
from tabulate import tabulate
import random
# import pycountry
# import networkx as nx
from dateutil import parser
from datetime import datetime
from collections import Counter
# from langdetect import detect
from omnibelt import load_json, save_json

In [10]:
localroot = Path().absolute().parent
localroot

PosixPath('/Users/homeworld/workspace/code/nnn')

In [2]:
root = Path('/Users/homeworld/workspace/local_data/nnn')
print(list(p.name for p in root.glob('*')))

['temp', 'raw_news', 'full.json', 'ftlang.json', 'langdetect.json', 'assets', 'global-news-headlines', 'global-news-headlines.zip']


In [3]:
recs = (root/'global-news-headlines').glob('**/*.json')
recs = list(recs)
len(recs)

54

In [4]:
articles = []
for rec in tqdm(recs):
	articles.extend(load_json(rec))
len(articles)

  0%|          | 0/54 [00:00<?, ?it/s]

4719199

In [5]:
by_loc = {}
for article in tqdm(articles):
	for instance in article['instances']:
		by_loc.setdefault(instance['location'], []).append(article)
country_names = {code: pycountry.countries.get(alpha_2=code.upper()).name.split(',')[0] for code in by_loc}
clusters = {
	'english': {'sa', 'ie', 'sg', 'us', 'ph', 'au', 'my', 'za', 'in', 'nz', 'ca', 'gb', 'ng'}, 
	'spanish': {'ar', 've', 'co', 'cu', 'mx'}, 
	'arabic': {'ae', 'eg'}, 
	'german': {'at', 'de', 'ch'}, 
	'chinese': {'tw', 'hk', 'cn'}, 
	'french': {'be', 'fr', 'ma'}, 
	'portuguese': {'pt', 'br'},
}
to_cluster = {country_names[loc]: cluster for cluster, locs in clusters.items() for loc in locs}
len(by_loc)

  0%|          | 0/4719199 [00:00<?, ?it/s]

NameError: name 'pycountry' is not defined

In [None]:
def show_date(date):
	return date.strftime('%d %b%y')
def get_locs(article):
	return [f'{country_names[loc]}' for loc in sorted(set(i['location'] for i in article['instances']))]
def get_cats(article):
	return [f'<{cat}>' for cat in sorted(set(i['category'] for i in article['instances']))]
def view_article(art, detailed=False):
	cats = ' '.join(get_cats(art))
	locs = ', '.join(map(repr,get_locs(art)))
	if 'published' not in art:
		art['published'] = parser.parse(art['publishedAt'])
	for i in art['instances']:
		if 'collected' not in i:
			i['collected'] = parser.parse(i['collectedAt'])
	first = min(i['collected'] for i in art['instances'])
	last = max(i['collected'] for i in art['instances'])
	timing = f'{show_date(first)}' if first==last else f'{show_date(first)} - {show_date(last)}'
	print(f'''Title: {art['title']!r}
Categories: {cats}   ---   {locs} ({timing})
Published: {show_date(art['published'])}'''
	) # ({art['source-name']})
	if detailed:
		print(f'''Description: {art['description']}''')
		# print(f'''Content: {art['content']}''')

In [24]:
import json

In [26]:
loc = 'hk'

In [6]:
articles = load_json(root / 'global-news-headlines' / f'news-headlines-{loc}.json')
len(articles)

82324

In [22]:
unique = {}
for article in tqdm(articles):
	unique.setdefault((article['title'], article.get('description')), []).append(article['ID'])
len(unique)

  0%|          | 0/82324 [00:00<?, ?it/s]

80449

In [30]:
lines = [json.dumps({'title': title, 'description': desc, 'ids': ids}) for (title, desc), ids in unique.items()]
# (localroot/'interface'/'clean_news'/f'prompts-{loc}.jsonl').write_text('\n'.join(lines[:100]));

In [8]:
html_template = """
<!DOCTYPE html>
<html lang="en">
<head>
	<meta charset="UTF-8">
	<meta name="viewport" content="width=device-width, initial-scale=1.0">
	<title>News Headlines</title>
	<style>
		body {{
			font-family: Arial, sans-serif;
			margin: 20px;
		}}
		.headline {{
			padding: 5px;
			border-bottom: 1px solid #ddd;
			display: flex;
			justify-content: space-between;
		}}
		.title {{
			flex: 2;
		}}
		.ids {{
			flex: 1;
			text-align: right;
		}}
		#saveButton {{
			position: absolute;
			top: 10px;
			left: 10px;
		}}
	</style>
</head>
<body>
	<button id="saveButton">Save & Next</button>
	<div id="news-container">
		{headlines}
	</div>
	<script>
		let currentIndex = 0;
		const N = {N};
		const headlines = document.querySelectorAll('.headline');

		// Initial display of N headlines
		for(let i = 0; i < headlines.length; i++) {{
			if (i < N) {{
				headlines[i].style.display = 'flex';
			}} else {{
				headlines[i].style.display = 'none';
			}}
		}}

		document.getElementById('saveButton').addEventListener('click', function() {{
			let textToSave = '';
			for (let i = currentIndex; i < currentIndex + N; i++) {{
				if (headlines[i]) {{
					textToSave += headlines[i].innerText + '\\n';
					headlines[i].style.display = 'none';
				}}
			}}
			currentIndex += N;
			for (let i = currentIndex; i < currentIndex + N; i++) {{
				if (headlines[i]) {{
					headlines[i].style.display = 'flex';
				}}
			}}
			
			// Simulate saving to a file
			const blob = new Blob([textToSave], {{type: 'text/plain'}});
			const a = document.createElement('a');
			a.href = URL.createObjectURL(blob);
			a.download = 'saved_headlines.txt';
			document.body.appendChild(a);
			a.click();
			document.body.removeChild(a);
		}});
	</script>
</body>
</html>
"""

In [16]:
def generate_html_with_js(headline_info, N=5):
    # Generate headlines
    headlines = ""
    for headline, ids in headline_info:
        ids_str = ', '.join(map(str, ids))
        headlines += f'<div class="headline"><span class="title">{headline}</span><span class="ids">[{ids_str}]</span></div>\n'
    
    # Insert headlines and N into the HTML template
    return html_template.format(headlines=headlines, N=N)

# Generate the HTML content with JavaScript functionality
# html_content_with_js = generate_html_with_js(sample_headlines_dict)
# html_content_with_js[:1000]  # Displaying only the first 1000 characters for brevity


In [23]:
import json

In [None]:
lines = [json.dumps({'title': title, 'description': desc, 'ids': ids}) for (title, desc), ids in unique.items()]


In [12]:
full = list(unique.items())

In [21]:
(root / 'prompts')

In [17]:
html_content_with_js = generate_html_with_js(full[:100])

In [None]:
html_template = """
<!DOCTYPE html>
<html lang="en">
<head>
	<meta charset="UTF-8">
	<meta name="viewport" content="width=device-width, initial-scale=1.0">
	<title>News Headlines</title>
	<style>
		body {{
			font-family: Arial, sans-serif;
			margin: 20px;
		}}
		.headline {{
			padding: 5px;
			border-bottom: 1px solid #ddd;
			display: flex;
			justify-content: space-between;
		}}
		.title {{
			flex: 2;
		}}
		.ids {{
			flex: 1;
			text-align: right;
		}}
	</style>
</head>
<body>
	<div id="news-container">
		{headlines}
	</div>
</body>
</html>
"""
def generate_html_from_dict(headlines_dict):
    # Base HTML template
    # Generate headlines
    headlines = ""
    for headline, ids in headlines_dict.items():
        ids_str = ', '.join(map(str, ids))
        headlines += f'<div class="headline"><span class="title">{headline}</span><span class="ids">[{ids_str}]</span></div>\n'
    
    # Insert headlines into the HTML template
    return html_template.format(headlines=headlines)

# Sample headlines dictionary for testing
sample_headlines_dict = {
    "Breaking: Earthquake in Paris": [101, 102, 103],
    "Rain causes uproar in London": [104, 105],
    "London witnesses Earthquake phenomenon": [106]
}

# Generate the HTML content
html_content_from_dict = generate_html_from_dict(sample_headlines_dict)
html_content_from_dict[:1000]  # Displaying only the first 1000 characters for brevity


In [None]:
def generate_html_with_js_and_file_io(N=5):
    # Base HTML template with file input and JavaScript for file handling

    
    return html_template.format(N=N)

# Generate the HTML content with JavaScript functionality for file I/O
html_content_with_file_io = generate_html_with_js_and_file_io()
html_content_with_file_io[:1000]  # Displaying only the first 1000 characters for brevity
