In [1]:
from pathlib import Path
from tqdm.notebook import tqdm
from tabulate import tabulate
import random
import pycountry
import networkx as nx
from dateutil import parser
from datetime import datetime
from collections import Counter
from langdetect import detect
from omnibelt import load_json, save_json

In [2]:
root = Path('/home/fleeb/workspace/local_data/nnn')
recs = (root/'global-news-headlines').glob('**/*.json')
recs = list(recs)
len(recs)

54

In [3]:
articles = []
for rec in tqdm(recs):
	articles.extend(load_json(rec))
len(articles)

  0%|          | 0/54 [00:00<?, ?it/s]

4719199

In [4]:
by_loc = {}
for article in tqdm(articles):
	for instance in article['instances']:
		by_loc.setdefault(instance['location'], []).append(article)
country_names = {code: pycountry.countries.get(alpha_2=code.upper()).name.split(',')[0] for code in by_loc}
clusters = {
	'english': {'sa', 'ie', 'sg', 'us', 'ph', 'au', 'my', 'za', 'in', 'nz', 'ca', 'gb', 'ng'}, 
	'spanish': {'ar', 've', 'co', 'cu', 'mx'}, 
	'arabic': {'ae', 'eg'}, 
	'german': {'at', 'de', 'ch'}, 
	'chinese': {'tw', 'hk', 'cn'}, 
	'french': {'be', 'fr', 'ma'}, 
	'portuguese': {'pt', 'br'},
}
to_cluster = {country_names[loc]: cluster for cluster, locs in clusters.items() for loc in locs}
len(by_loc)

  0%|          | 0/4719199 [00:00<?, ?it/s]

54

In [5]:
def show_date(date):
	return date.strftime('%d %b%y')
def get_locs(article):
	return [f'{country_names[loc]}' for loc in sorted(set(i['location'] for i in article['instances']))]
def get_cats(article):
	return [f'<{cat}>' for cat in sorted(set(i['category'] for i in article['instances']))]
def view_article(art, detailed=False):
	cats = ' '.join(get_cats(art))
	locs = ', '.join(map(repr,get_locs(art)))
	first = min(i['collected'] for i in art['instances'])
	last = max(i['collected'] for i in art['instances'])
	timing = f'{show_date(first)}' if first==last else f'{show_date(first)} - {show_date(last)}'
	print(f'''Title: {art['title']!r}
Categories: {cats}   ---   {locs} ({timing})
Published: {show_date(art['published'])}'''
	) # ({art['source-name']})
	if detailed:
		print(f'''Description: {art['description']}''')
		# print(f'''Content: {art['content']}''')

In [7]:
world = [a for a in articles]# if a['langdetect'] == 'en']
for a in tqdm(world):
	if 'published' not in a and isinstance(a['publishedAt'], str):
		a['published'] = parser.parse(a['publishedAt'])
	for i in a['instances']:
		if 'collected' not in i and isinstance(i['collectedAt'], str):
			i['collected'] = parser.parse(i['collectedAt'])
len(world)

  0%|          | 0/4719199 [00:00<?, ?it/s]

4719199

In [8]:
locs = Counter([i['location'] for a in world for i in a['instances']])
print(tabulate([country_names[key], count, count/len(by_loc[key])] for key, count in locs.most_common(10)))
len(locs)

-----------  ------  -
Nigeria      177832  1
Philippines  176976  1
Morocco      176357  1
Brazil       175726  1
Cuba         175488  1
Canada       173911  1
India        173891  1
Australia    172501  1
Argentina    169860  1
Italy        169696  1
-----------  ------  -


54

In [35]:
art = random.choice(world)
view_article(art, True)
ID = art['ID']
ID

Title: '«Тонкая талия и бесконечные ноги»: Ходченкова показала идеальную фигуру в бикини - Комсомольская правда'
Categories: <entertainment>   ---   'Russian Federation' (16 Aug21)
Published: 15 Aug21
Description: Звезда нежится на пляже, готовясь к съемкам в «Анне Карениной»


3759035

In [17]:
query = 'Aung San Suu Kyi'
query = 'El Salvador'
query = 'Jovenel Moïse'
matches = [a for a in tqdm(world) if query in a['title']]
len(matches)

  0%|          | 0/4719199 [00:00<?, ?it/s]

30

In [18]:
print(tabulate([[show_date(a['published']), 
				 # '\n'.join(get_locs(a)), 
				 '\n'.join(sorted(set(map(lambda x: to_cluster.get(x,x), get_locs(a))))),
				 a['title'][:120]
				 ] for a in sorted(matches, key=lambda a: a['published'])]))

--------  ----------  ------------------------------------------------------------------------------------------------------------------------
07 Jul21  french      Le président haïtien Jovenel Moïse assassiné chez lui par un commando, la Maison Blanche prête à «assister - Sudinfo.be
07 Jul21  spanish     Lamenta y condena Díaz-Canel asesinato del presidente haitiano, Jovenel Moïse - Cuba.cu
08 Jul21  german      Jovenel Moïse: Festnahmen nach Präsidentenmord in Haiti - ZEIT ONLINE
08 Jul21  english     What we know about the assassination of Haiti's President Jovenel Moïse - ABC News
08 Jul21  french      Haïti : quatre «mercenaires» tués, deux arrêtés après l'assassinat de Jovenel Moïse - Le Figaro
08 Jul21  spanish     Los asesinos del presidente de Haití, Jovenel Moïse, eran "mercenarios vestidos como agentes de la DEA" - Página 12
09 Jul21  spanish     Gobierno de Haití confirma que 28 mercenarios están involucrados en el magnicidio de Jovenel Moïse - teleSUR TV
09 Jul21  spanish 

In [37]:
# locs = {a['ID']: get_locs(a) for a in tqdm(world)}
# len(locs)

In [None]:
loc_ids = {loc: set(a['ID'] for a in by_loc[loc]) for loc in tqdm(by_loc)}
len(loc_ids)

In [39]:
from itertools import combinations

In [41]:
pair_counts = Counter()
for loc_a, loc_b in tqdm(combinations(loc_ids.keys(), 2), total=len(loc_ids)*(len(loc_ids)-1)/2):
	pair = frozenset([loc_a, loc_b])
	if pair in pair_counts:
		continue
	pair_counts[pair] = len(loc_ids[loc_a] & loc_ids[loc_b])
len(pair_counts)

  0%|          | 0/1431.0 [00:00<?, ?it/s]

1431

In [42]:
print(tabulate([[country_names[loc_a], country_names[loc_b], count] for (loc_a, loc_b), count in pair_counts.most_common(10)]))

------------  -----------  -----
Saudi Arabia  Canada       98791
Philippines   Nigeria      54747
Philippines   Malaysia     44820
Singapore     Malaysia     42380
Nigeria       Malaysia     40625
New Zealand   Philippines  38082
New Zealand   Nigeria      38015
New Zealand   Malaysia     37114
Singapore     Philippines  36467
South Africa  Nigeria      36007
------------  -----------  -----


In [117]:
def get_neighbors(loc):
	return Counter({loc_b if loc_a == loc else loc_a: count for (loc_a, loc_b), count in pair_counts.items() if loc_a == loc or loc_b == loc})

loners = [loc for loc in loc_ids if get_neighbors(loc).most_common(1)[0][1] == 0]
members = [loc for loc in loc_ids if loc not in loners]
print(f'{len(loners)} loners, {len(members)} members')

23 loners, 31 members


In [199]:
print({loc:country_names[loc] for loc in loners})
print({loc:country_names[loc] for loc in members})

{'pl': 'Poland', 'sk': 'Slovakia', 'no': 'Norway', 'rs': 'Serbia', 'th': 'Thailand', 'id': 'Indonesia', 'ru': 'Russian Federation', 'lt': 'Lithuania', 'tr': 'Turkey', 'cz': 'Czechia', 'it': 'Italy', 'lv': 'Latvia', 'nl': 'Netherlands', 'hu': 'Hungary', 'kr': 'Korea', 'si': 'Slovenia', 'ua': 'Ukraine', 'bg': 'Bulgaria', 'il': 'Israel', 'se': 'Sweden', 'jp': 'Japan', 'gr': 'Greece', 'ro': 'Romania'}
{'gb': 'United Kingdom', 'ca': 'Canada', 'za': 'South Africa', 'sa': 'Saudi Arabia', 'us': 'United States', 'ie': 'Ireland', 'my': 'Malaysia', 'nz': 'New Zealand', 'sg': 'Singapore', 'ng': 'Nigeria', 'ph': 'Philippines', 'au': 'Australia', 'in': 'India', 'ar': 'Argentina', 'mx': 'Mexico', 'co': 'Colombia', 'cu': 'Cuba', 've': 'Venezuela', 'eg': 'Egypt', 'ae': 'United Arab Emirates', 'at': 'Austria', 'de': 'Germany', 'ch': 'Switzerland', 'tw': 'Taiwan', 'hk': 'Hong Kong', 'cn': 'China', 'be': 'Belgium', 'ma': 'Morocco', 'fr': 'France', 'br': 'Brazil', 'pt': 'Portugal'}


In [200]:
print(tabulate([[country_names[loc], count, count/len(loc_ids[loc])] for loc, count in get_neighbors('ro').most_common(15)]))

--------------  -  -
United Kingdom  0  0
Canada          0  0
South Africa    0  0
Saudi Arabia    0  0
United States   0  0
Ireland         0  0
Malaysia        0  0
New Zealand     0  0
Singapore       0  0
Nigeria         0  0
Philippines     0  0
Australia       0  0
India           0  0
Argentina       0  0
Mexico          0  0
--------------  -  -


In [None]:
import matplotlib.pyplot as plt
import numpy as np
import networkx as nx
from collections import defaultdict
from itertools import combinations

In [178]:

# Create a graph
G = nx.Graph()

# Add edges to the graph with weights representing the number of shared articles
for pair, count in pair_counts.items():
	country_a, country_b = pair
	if count > 5:
		G.add_edge(country_names[country_a], country_names[country_b], weight=count)


In [205]:
clusters = []
for (loc_a, loc_b), w in pair_counts.items():
	if w < 5:
		continue
	for candidate in clusters:
		if loc_a in candidate or loc_b in candidate:
			candidate.add(loc_a)
			candidate.add(loc_b)
			break
	else:
		clusters.append({loc_a, loc_b})
len(clusters)

7

In [173]:
init_pos = nx.random_layout(G)

pos = nx.spring_layout(G, k=300, iterations=1000, pos=init_pos)
# pos = init_pos

In [193]:
# # Draw the graph
# plt.figure(figsize=(10,8))
# # pos = nx.circular_layout(G)  # positions for all nodes
# 
# nx.draw_networkx_nodes(G, pos, node_size=700)
# nx.draw_networkx_edges(G, pos, width=[np.log10(d['weight'])/5 if d['weight'] > 0 else 0 for _, _, d in G.edges(data=True)])
# nx.draw_networkx_labels(G, pos, font_size=20, font_family='sans-serif')
# 
# plt.title('Shared Articles Between Countries')
# # plt.show()


In [56]:
# min(pair_counts.values()), max(pair_counts.values())

(0, 98791)

In [79]:
country_names['ma']

'Morocco'

In [186]:
# nx.draw(G, with_labels=True);

In [194]:
# import plotly.graph_objects as go
# from collections import defaultdict
# from itertools import combinations
# 
# country_pairs = pair_counts
# 
# # Create a network graph
# import networkx as nx
# G = nx.Graph()
# for pair, count in country_pairs.items():
#     country_a, country_b = pair
#     G.add_edge(country_a, country_b, weight=count)
# 
# # Generate the spring layout positions
# pos = nx.spring_layout(G)
# 
# # Extract positions, edges and weights
# G = nx.Graph()
# for pair, count in country_pairs.items():
#     country_a, country_b = pair
#     G.add_edge(country_names[country_a], country_names[country_b], weight=count)
# 
# # Get positions for the nodes in G
# pos = nx.spring_layout(G)
# 
# # Create edge traces
# edge_traces = []
# for edge in G.edges(data=True):
#     weight = edge[2]['weight']
#     x0, y0 = pos[edge[0]]
#     x1, y1 = pos[edge[1]]
#     edge_trace = go.Scatter(
#         x=[x0, x1, None],
#         y=[y0, y1, None],
#         line=dict(width=weight*2, color='#888'),
#         hoverinfo='none',
#         mode='lines'
#     )
#     edge_traces.append(edge_trace)
# 
# # Create node trace
# node_x = []
# node_y = []
# for node in G.nodes():
#     x, y = pos[node]
#     node_x.append(x)
#     node_y.append(y)
# 
# node_trace = go.Scatter(
#     x=node_x, y=node_y,
#     mode='markers',
#     hoverinfo='text',
#     marker=dict(
#         showscale=True,
#         colorbar=dict(
#             thickness=15,
#             title='Node Connections',
#             xanchor='left',
#             titleside='right'
#         ),
#         line_width=2))
# 
# # Update node hover information
# node_adjacencies = []
# node_text = []
# for node, adjacencies in enumerate(G.adjacency()):
#     node_adjacencies.append(len(adjacencies[1]))
#     node_text.append(f'{adjacencies[0]}, # of connections: {len(adjacencies[1])}')
# 
# node_trace.marker.color = node_adjacencies
# node_trace.text = node_text
# 
# # Create the figure
# fig = go.Figure(data=edge_traces + [node_trace],
#                 layout=go.Layout(
#                     showlegend=False,
#                     hovermode='closest',
#                     margin=dict(b=0, l=0, r=0, t=0),
#                     xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
#                     yaxis=dict(showgrid=False, zeroline=False, showticklabels=False))
#                 )
# # fig.show()

In [None]:
import pandas as pd
df = pd.DataFrame(list(G.edges(data=True)), columns=['source', 'target', 'Attributes'])
df = pd.concat([df.drop(['Attributes'], axis=1), df['Attributes'].apply(pd.Series)], axis=1)
# df

In [185]:
# Load library
from d3blocks import D3Blocks
#
# Initialize
d3 = D3Blocks()
#
# Import example
# df = d3.import_example('stormofswords') # 'stormofswords'
#
# Create force-directed-network (without cluster labels)
d3.d3graph(df, filepath='Elasticgraph.html')
#
# Show elasticgraph
d3.Elasticgraph.show()
# Show original graph with the same properties
# d3.Elasticgraph.D3graph.show()
# #
# # Add cluster labels (no need to do it again because it is the default)
# # d3.Elasticgraph.set_node_properties(color=None)
# #
# # After making changes, show the graph again using show()
# d3.Elasticgraph.show()
# # Show original graph
# d3.Elasticgraph.D3graph.show()
# #
# # Node properties
# d3.Elasticgraph.D3graph.node_properties
# #
# # Node properties
# d3.Elasticgraph.D3graph.edge_properties
#

[d3blocks] >INFO> Cleaning edge_properties and config parameters..
[d3blocks] >INFO> Set directed=True to see the markers!
[d3blocks] >INFO> Keep only edges with weight>0
[d3blocks] >INFO> Number of unique nodes: 31
[d3blocks] >INFO> Slider range is set to [14, 98791]
[d3blocks] >INFO> Write to path: [/tmp/tmpygdlmreo/Elasticgraph.html]
[d3blocks] >INFO> File already exists and will be overwritten: [/tmp/tmpygdlmreo/Elasticgraph.html]


AttributeError: 'D3Blocks' object has no attribute 'Elasticgraph'


Returning a DataFrame from Series.apply when the supplied function returns a Series is deprecated and will be removed in a future version.



Unnamed: 0,source,target,weight
0,Canada,United Kingdom,3957
1,Canada,South Africa,16088
2,Canada,Saudi Arabia,98791
3,Canada,United States,14573
4,Canada,Ireland,16826
...,...,...,...
94,Hong Kong,China,17
95,Belgium,Morocco,34084
96,Belgium,France,17787
97,Morocco,France,21197


In [3]:

# translate Hindi to French
tokenizer.src_lang = "hi_IN"
encoded_hi = tokenizer(article_hi, return_tensors="pt")
generated_tokens = model.generate(
    **encoded_hi,
    forced_bos_token_id=tokenizer.lang_code_to_id["en_XX"]
)
out1 = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
# => "Le chef de l 'ONU affirme qu 'il n 'y a pas de solution militaire dans la Syrie."
print(out1)

['The head of the United Nations says there is no military solution in Syria']


In [4]:

# translate Arabic to English
tokenizer.src_lang = "ar_AR"
encoded_ar = tokenizer(article_ar, return_tensors="pt")
generated_tokens = model.generate(
    **encoded_ar,
    forced_bos_token_id=tokenizer.lang_code_to_id["en_XX"]
)
out2 = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
print(out2)

['The Secretary-General of the United Nations says there is no military solution in Syria.']
