In [1]:
from pathlib import Path
from tqdm.notebook import tqdm
from tabulate import tabulate
import random, json
import pycountry
from iso639 import languages
import networkx as nx
from dateutil import parser
from datetime import datetime
from collections import Counter
from langdetect import detect
from omnibelt import load_json, save_json

In [2]:
lang_clusters = {'en': ['au', 'ca', 'gb', 'ie', 'in', 'my', 'ng', 'nz', 'ph', 'sa', 'sg', 'us', 'za'],
				 'es': ['ar', 'co', 'cu', 'mx', 've'], 'de': ['at', 'ch', 'de'], 'fr': ['be', 'fr', 'ma'],
				 'zh': ['cn', 'hk', 'tw'], 'ar': ['ae', 'eg'], 'pt': ['br', 'pt'], 'bg': ['bg'], 'cs': ['cz'],
				 'el': ['gr'], 'he': ['il'], 'hu': ['hu'], 'id': ['id'], 'it': ['it'], 'ja': ['jp'], 'ko': ['kr'],
				 'lt': ['lt'], 'lv': ['lv'], 'nl': ['nl'], 'no': ['no'], 'pl': ['pl'], 'ro': ['ro'], 'ru': ['ru'],
				 'sv': ['se'], 'sl': ['si'], 'sk': ['sk'], 'sr': ['rs'], 'th': ['th'], 'tr': ['tr'], 'uk': ['ua']}
loc_names = {'gb': 'United Kingdom', 'ar': 'Argentina', 'pl': 'Poland', 'sk': 'Slovakia', 'us': 'United States',
			 'eg': 'Egypt', 'no': 'Norway', 'ph': 'Philippines', 'at': 'Austria', 'rs': 'Serbia', 'tw': 'Taiwan',
			 'be': 'Belgium', 'cu': 'Cuba', 'sa': 'Saudi Arabia', 'th': 'Thailand', 'id': 'Indonesia',
			 'ru': 'Russian Federation', 'ch': 'Switzerland', 'fr': 'France', 'lt': 'Lithuania', 'tr': 'Turkey',
			 'de': 'Germany', 'cz': 'Czechia', 'pt': 'Portugal', 'ae': 'United Arab Emirates', 'it': 'Italy',
			 'cn': 'China', 'lv': 'Latvia', 'nl': 'Netherlands', 'hk': 'Hong Kong', 'ca': 'Canada', 'br': 'Brazil',
			 'hu': 'Hungary', 'kr': 'Korea', 'si': 'Slovenia', 'au': 'Australia', 'my': 'Malaysia', 'ie': 'Ireland',
			 'ua': 'Ukraine', 'in': 'India', 'ma': 'Morocco', 'bg': 'Bulgaria', 'ng': 'Nigeria', 'il': 'Israel',
			 'se': 'Sweden', 'za': 'South Africa', 've': 'Venezuela', 'nz': 'New Zealand', 'jp': 'Japan',
			 'sg': 'Singapore', 'gr': 'Greece', 'mx': 'Mexico', 'co': 'Colombia', 'ro': 'Romania'}
lang_names = {'en': 'English', 'ko': 'Korean', 'ru': 'Russian', 'es': 'Spanish', 'pt': 'Portuguese', 'cs': 'Czech',
			  'tr': 'Turkish', 'nl': 'Dutch', 'ar': 'Arabic', 'fr': 'French', 'bg': 'Bulgarian', 'id': 'Indonesian',
			  'sk': 'Slovak', 'el': 'Greek', 'he': 'Hebrew', 'sr': 'Serbian', 'hu': 'Hungarian', 'th': 'Thai',
			  'zh': 'Chinese', 'no': 'Norwegian', 'sl': 'Slovenian', 'sv': 'Swedish', 'de': 'German', 'lv': 'Latvian',
			  'pl': 'Polish', 'it': 'Italian', 'ro': 'Romanian', 'lt': 'Lithuanian', 'ja': 'Japanese',
			  'uk': 'Ukrainian'}
cluster_id = {loc: lang for lang, locs in lang_clusters.items() for loc in locs}
len(cluster_id)

54

In [3]:
def show_date(date):
	return date.strftime('%d %b%y')
def get_locs(article):
	return [f'{loc_names[loc]}' for loc in sorted(set(i['location'] for i in article['instances']))]
def get_cats(article):
	return [f'<{cat}>' for cat in sorted(set(i['category'] for i in article['instances']))]
def view_article(art, detailed=False):
	cats = ' '.join(get_cats(art))
	locs = ', '.join(map(repr, get_locs(art)))
	published = parser.parse(art['publishedAt'])
	collected = [parser.parse(i['collectedAt']) for i in art['instances']]
	first = min(collected)
	last = max(collected)
	timing = f'{show_date(first)}' if first == last else f'{show_date(first)} - {show_date(last)}'

	lines = []
	if 'en-title' in art:
		lines.append(f'English Title: {art["en-title"]!r}')
	lines.append(f'{lang_names[art["language"]]} Title: {art["title"]!r}')
	lines.append(f'Categories: {cats}  ---   {locs} ({timing})')

	if detailed:
		added_desc = False
		desc = art.get('description')
		en_desc = art.get('en-description')
		if en_desc is not None and len(en_desc):
			lines.append(f'English Description: {en_desc}')
			added_desc = True
		if desc is not None and len(desc):
			lines.append(f'{lang_names[art["language"]]} Description: {desc}')
			added_desc = True
		if not added_desc:
			lines.append(f'- No description -')
	print('\n'.join(lines))

In [4]:
root = Path('/home/fleeb/workspace/local_data/nnn')
recs = (root / 'old-bb-v1').glob('**/*.json')
recs = (root / 'babel-briefings-v1').glob('**/*.json')
recs = list(recs)
len(recs)
articles = []
for rec in tqdm(recs):
	articles.extend(load_json(rec))
len(articles)

  0%|          | 0/54 [00:00<?, ?it/s]

4719199

In [5]:
by_loc = {}
by_lang = {}
for article in tqdm(articles):
	# article['published'] = parser.parse(article['publishedAt'])
	by_lang.setdefault(article['language'], []).append(article)
	for instance in article['instances']:
		# instance['collected'] = parser.parse(instance['collectedAt'])
		by_loc.setdefault(instance['location'], []).append(article)
len(by_loc), len(by_lang)

  0%|          | 0/4719199 [00:00<?, ?it/s]

(54, 30)

In [83]:
from wordcloud import WordCloud
import nltk
from nltk import word_tokenize, pos_tag, ne_chunk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from rake_nltk import Rake
import spacy
# from sklearn.cluster import DBSCAN
# import numpy as np

nlp = spacy.load('en_core_web_md')
# rake_nltk_var = Rake()
# lemmatizer = WordNetLemmatizer()

In [None]:
noun_phrases = []
for headline in headlines:
    doc = nlp(headline)
    for p in doc.noun_chunks:
        noun_phrases.append(str(p))

# Convert noun phrases to vectors
vectors = [nlp(p).vector for p in noun_phrases]

# Apply DBSCAN clustering
clustering = DBSCAN(eps=0.5, min_samples=1, metric='cosine').fit(np.array(vectors))
labels = clustering.labels_

# Group noun phrases based on clustering results
groups = {}
for np, label in zip(noun_phrases, labels):
    if label not in groups:
        groups[label] = []
    groups[label].append(np)

# Print the grouped noun phrases
for label, nps in groups.items():
    print(f"Group {label}: {nps}")

In [92]:
batch = by_lang['de']
len(batch)

259718

In [93]:
art = random.choice(batch)
view_article(art, detailed=True)
print(art['source-name'])

English Title: 'iOS 14.2 is here: This change will make you happy - inside digital'
German Title: 'iOS 14.2 ist da: Diese Änderung wird dich glücklich machen - inside digital'
Categories: <technology>  ---   'Germany' (07 Nov20)
English Description: It is now possible to use iPhones with iOS 14.2. We'll tell you what's changing with the new version. It's a lot.
German Description: Ab sofort ist es möglich, iPhones mit iOS 14.2 zu nutzen. Wir verraten dir, was sich mit der neuen Version alles ändert. Es ist einiges.
inside digital


In [95]:
sources = {}
for art in batch:
	sources.setdefault(art['source-name'], []).append(art)
len(sources)

1437

In [119]:
src, arts = random.choice(list(sources.items()))
src, len(arts)

('Www.otz.de', 5)

In [120]:
segs = [a.get('en-title', art['title']).split(' - ') for a in arts]
segs

[['Ex-basketball player Dirk Nowitzki extends contract with Thüringer Bauerfeind AG',
  'Ostthüringer Zeitung'],
 ['No stadium solution for Viktoria Berlin: FC Carl Zeiss Jena as a beneficiary?',
  'Ostthüringer Zeitung'],
 ['Thuringian State Observatory measures over 8,000 potential threats to the Earth',
  'Ostthüringer Zeitung'],
 ['Participants from Jena wanted for study on corona consequences',
  'Thuringian state newspaper'],
 ['Greiz district: Introduced viruses are becoming more common',
  'Ostthüringer Zeitung']]

In [103]:
endterms = [x.get('en-title', art['title']).split(' - ')[-1] for x in xs]
endterms

['ORF.at',
 'ORF.at',
 'ORF.at',
 'ORF.at',
 'ORF.at',
 'ORF.at',
 'ORF.at',
 'ORF.at',
 'ORF.at',
 'ORF.at',
 'Gamereactor Deutschland',
 'ORF.at',
 'ORF.at',
 'ORF.at',
 'ORF.at',
 'ORF.at',
 'ORF.at',
 'ORF.at',
 'ORF.at',
 'ORF.at',
 'ORF.at',
 'ORF.at',
 'ORF.at',
 'ORF.at',
 'ORF.at',
 'ORF.at',
 'ORF.at',
 'ORF.at',
 'ORF.at',
 'ORF.at',
 'ORF.at',
 'ORF.at',
 'ORF.at',
 'ORF.at']

In [None]:
def preprocess_title(art):
	
	title = art.get('en-title', art['title'])
	
	
	
	pass

In [56]:
def to_prompt(art):
	title = art.get('en-title', art['title'])
	# rake_nltk_var.extract_keywords_from_text(title)
	desc = art.get('en-description', art['description'])
	if desc is not None and len(desc):
		pass
		# rake_nltk_var.extract_keywords_from_text(desc)
	else:
		desc = ''
	content = art.get('en-content', art['content'])
	if content is not None and len(content):
		pass
		# rake_nltk_var.extract_keywords_from_text(content)
	else:
		content = ''
	
	text = f'{title}\n{desc}\n{content}'
	return text

In [84]:
text = to_prompt(art)

In [85]:
doc = nlp(text)

In [90]:
lemmatized_tokens = [token.lemma_.lower() for token in doc if str(token).lower() not in stopwords.words('english')]
print(lemmatized_tokens)

['argentina', "'s", 'nuclear', 'power', 'plant', 'achieve', 'record', 'electricity', 'production', '-', 'fact', '.', 'bg', '\n', 'company', 'operator', 'argentine', 'nuclear', 'power', 'plant', 'nucleoéléctrica', 'argentina', '..', '\n', 'nuclear', 'power', 'argentina', '2020', ',', 'grupo', 'la', 'provincia', '.', '„', '"', '1', '2', '„', '"', '7,947', '1', '-', '30', '.', ',', '2020', '.', ',', ',', '5,059', '.', '„', ',', '"', ',', '.']


In [91]:
list(doc.noun_chunks)

[Argentina's nuclear power plants,
 a record,
 electricity production - Facts,
 BG
 The company operator,
 the Argentine nuclear power plants,
 Nucleoéléctrica Argentina,
 Nuclear Power Argentina,
 Grupo La Provincia]

In [None]:
doc.no

In [77]:

text = to_prompt(art)
# rake_nltk_var.extract_keywords_from_text(text)
# keywords = rake_nltk_var.get_ranked_phrases()

tokens = word_tokenize(text)
tokens = [word for word in tokens if word.lower() not in stopwords.words('english')]

# wordcloud = WordCloud(width = 800, height = 800, background_color ='white', min_font_size = 10).generate(' '.join(tokens))
# keywords = wordcloud.words_

# # tokens = word_tokenize(text.lower())  # Lowercasing
# tokens = word_tokenize(text)
lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]

tags = pos_tag(lemmatized_tokens)
keywords = [word for word, pos in tags if pos in ['NN', 'NNP']]

keyword_freq = Counter(keywords)
# print(f'Input: {title!r}')
print(text)
print(tabulate(keyword_freq.most_common()))

Argentina's nuclear power plants achieved a record in electricity production - Facts.BG
The company operator of the Argentine nuclear power plants Nucleoéléctrica Argentina ..
Nuclear Power Argentina 2020, Grupo La Provincia. „“ 1 2 „“ 7,947 1 - 30 . , 2020 . , , 5,059 . „, “, .
---------------  -
Argentina        3
power            2
plant            2
„                2
record           1
electricity      1
production       1
Facts.BG         1
company          1
operator         1
Argentine        1
Nucleoéléctrica  1
..               1
Nuclear          1
Power            1
Grupo            1
La               1
Provincia        1
“                1
---------------  -


In [26]:
rake_nltk_var.extract_keywords_from_text(art['source-name'])
rake_nltk_var.get_ranked_phrases()

['today']

In [30]:
import nltk
from nltk import word_tokenize, pos_tag, ne_chunk
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')

sentence = "Apple is looking at buying U.K. startup for $1 billion"
tokens = word_tokenize(sentence)
tags = pos_tag(tokens)
tree = ne_chunk(tags)
keywords = [leaf for leaf in tree if isinstance(leaf, nltk.Tree)]


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/fleeb/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /home/fleeb/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /home/fleeb/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [31]:
keywords

[Tree('GPE', [('Apple', 'NNP')])]

In [34]:
import nltk
from nltk import FreqDist
from nltk.tokenize import word_tokenize

nltk.download('punkt')

text = "This is a sample sentence, showing off the stop words filtration."
tokens = word_tokenize(text)
fdist = FreqDist(tokens)
keywords = [word for word, count in fdist.items() if count > 1]


[nltk_data] Downloading package punkt to /home/fleeb/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [35]:
keywords

[]

In [38]:
import nltk
from nltk.tokenize import word_tokenize

nltk.download('averaged_perceptron_tagger')

text = "The quick brown fox jumps over the lazy dog"
tokens = word_tokenize(text)
tags = pos_tag(tokens)
keywords = [word for word, pos in tags if pos in ['NN', 'NNP']]
keywords

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/fleeb/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


['brown', 'fox', 'dog']

In [7]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

nltk.download('wordnet')
nltk.download('punkt')


[nltk_data] Downloading package wordnet to /home/fleeb/nltk_data...
[nltk_data] Downloading package punkt to /home/fleeb/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [12]:
from rake_nltk import Rake

rake_nltk_var = Rake()
text = "This is a sample sentence, showing off the stop words filtration."
rake_nltk_var.extract_keywords_from_text(text)
keyword_ranked_phrases = rake_nltk_var.get_ranked_phrases()


In [13]:
keyword_ranked_phrases

['stop words filtration', 'sample sentence', 'showing']

In [63]:
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer

# Sample headlines
headlines = [
    "Climate Change: A Global Challenge",
    "Tech Giants Face Antitrust Lawsuits",
    "Stock Market Hits Record High"
]

# Preprocess text
preprocessed_headlines = [headline.lower() for headline in headlines]

# Create TF-IDF Vectorizer
vectorizer = TfidfVectorizer(max_df=0.85, max_features=10, stop_words='english')
tfidf_matrix = vectorizer.fit_transform(preprocessed_headlines)

# Extract keywords
feature_names = vectorizer.get_feature_names_out()
keywords = [feature_names[idx] for idx in tfidf_matrix.sum(axis=0).argsort()[0, ::-1]]
print(keywords)


[array([['hits', 'high', 'lawsuits', 'global', 'giants', 'face',
        'climate', 'change', 'challenge', 'antitrust']], dtype=object)]


In [64]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [69]:
doc = nlp(text)
# lemmatizer = nlp.add_pipe("lemmatizer")
# This usually happens under the hood
# processed = lemmatizer(doc)
# processed
doc

Argentina's nuclear power plants achieved a record in electricity production - Facts.BG
The company operator of the Argentine nuclear power plants Nucleoéléctrica Argentina ..
Nuclear Power Argentina 2020, Grupo La Provincia. „“ 1 2 „“ 7,947 1 - 30 . , 2020 . , , 5,059 . „, “, .

In [76]:
lemmatized_tokens = [token.lemma_.lower() for token in doc if str(token).lower() not in stopwords.words('english')]
lemmatized_tokens

['argentina',
 "'s",
 'nuclear',
 'power',
 'plant',
 'achieve',
 'record',
 'electricity',
 'production',
 '-',
 'fact',
 '.',
 'bg',
 '\n',
 'company',
 'operator',
 'argentine',
 'nuclear',
 'power',
 'plant',
 'nucleoéléctrica',
 'argentina',
 '..',
 '\n',
 'nuclear',
 'power',
 'argentina',
 '2020',
 ',',
 'grupo',
 'la',
 'provincia',
 '.',
 '„',
 '"',
 '1',
 '2',
 '„',
 '"',
 '7,947',
 '1',
 '-',
 '30',
 '.',
 ',',
 '2020',
 '.',
 ',',
 ',',
 '5,059',
 '.',
 '„',
 ',',
 '"',
 ',',
 '.']

In [74]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()



'Argentina'

In [8]:

# Initialize lemmatizer
lemmatizer = WordNetLemmatizer()

# Example text
text = "Barack Obama, President Obama, and Obama are all referring to the same person."

# Tokenize and lemmatize
tokens = word_tokenize(text.lower())  # Lowercasing
lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]

# Now 'Barack Obama', 'President Obama', and 'Obama' can be processed further for similarity matching or synonym mapping


In [9]:
lemmatized_tokens

['barack',
 'obama',
 ',',
 'president',
 'obama',
 ',',
 'and',
 'obama',
 'are',
 'all',
 'referring',
 'to',
 'the',
 'same',
 'person',
 '.']

In [82]:
import spacy
from sklearn.cluster import DBSCAN
import numpy as np

# Load SpaCy NLP model
nlp = spacy.load('en_core_web_md')
# nlp = spacy.load('en_core_web_sm')

# Example dataset of news headlines
headlines = [
    "Apple announces new iPhone",
    "Microsoft releases Windows 11",
    "New iPhone receives positive reviews",
    "Windows 11 has new features",
    "Apple iPhone sales skyrocket",
]

# Preprocess headlines and extract noun phrases
noun_phrases = []
for headline in headlines:
    doc = nlp(headline)
    for p in doc.noun_chunks:
        noun_phrases.append(str(p))

# Convert noun phrases to vectors
vectors = [nlp(p).vector for p in noun_phrases]

# Apply DBSCAN clustering
clustering = DBSCAN(eps=0.5, min_samples=1, metric='cosine').fit(np.array(vectors))
labels = clustering.labels_

# Group noun phrases based on clustering results
groups = {}
for np, label in zip(noun_phrases, labels):
    if label not in groups:
        groups[label] = []
    groups[label].append(np)

# Print the grouped noun phrases
for label, nps in groups.items():
    print(f"Group {label}: {nps}")


Group 0: ['Apple', 'new iPhone', 'Microsoft', 'Windows', 'Windows', 'new features', 'Apple iPhone sales skyrocket']
Group 1: ['New iPhone']
Group 2: ['positive reviews']
