In [1]:
import os
import re
import pandas as pd
import numpy as np
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
import json
from datetime import date, datetime, timedelta
from bs4 import BeautifulSoup
import gdelt # for gdelt searchs
from gkg_tools import * # for gkg searchs
# %run "../gkg_tools.py" # using magic command run to access the script from the parent directory


import nltk
from nltk.tokenize import word_tokenize, sent_tokenize


from transformers import pipeline, set_seed
import torch

# GPU Timing (using GPU 1) else -1 for CPU
device_id = 1 if torch.cuda.is_available() else -1 

here


In [2]:
gkg = gkg_operator() # create a gkg operator
# gkg.set_date(['2024 Oct 20', '2024 Oct 21']) # set the date range for the search
# gkg.set_date(['2024 Oct 30']) # set the date range for the search
gkg.get_gkg(coverage=True)
persons = ['Eiichiro Oda', 'Shueisha', 'One Piece']
OP = gkg.gkg_persons(persons)
OP.reset_index(inplace=True)

# url = OP['documentidentifier'].iloc[0]
# url

In [3]:
gkg.gkg_query.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31349 entries, 0 to 31348
Data columns (total 27 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   gkgrecordid                 31349 non-null  object 
 1   date                        31349 non-null  int64  
 2   sourcecollectionidentifier  31349 non-null  int64  
 3   sourcecommonname            31349 non-null  object 
 4   documentidentifier          31349 non-null  object 
 5   counts                      3798 non-null   object 
 6   v2counts                    3798 non-null   object 
 7   themes                      27818 non-null  object 
 8   v2themes                    27818 non-null  object 
 9   locations                   24251 non-null  object 
 10  v2locations                 24221 non-null  object 
 11  persons                     24707 non-null  object 
 12  v2persons                   24485 non-null  object 
 13  organizations               215

In [4]:
# store OP in a csv file
# OP.to_csv('OP.csv', index=False)
# OP = pd.read_csv('OP.csv')
# urls = OP['documentidentifier'].tolist()

nrows = 10000
urls = gkg.gkg_query['documentidentifier'].tolist()[:nrows]
parsed_urls = []
for url in urls:
    url = url.replace('https://','').replace('http://','').replace('www.','').split('/')
    # if url is empty string, remove it
    if url[-1] == '':
        url.pop()
    article_title = url[-1]
    article_title = article_title.replace('-',' ')
    article_title = article_title.replace('_',' ')
    url[-1] = article_title
    # keep only first and last element of url
    url = [url[0],url[-1]]
    print(url)
    parsed_urls.append(article_title)

# urls 

['stokesentinel.co.uk', 'hotel issues statement out control 9683785']
['lithgowmercury.com.au', '?cs=34616']
['sconeadvocate.com.au', '?cs=305']
['wknofm.org', 'russian exiles push western countries to support ukraine']
['classy100.com', 'rep waltz to newsmax biden wh sends weapons doesnt contain iran']
['arxiv.org', '2111.08566']
['yasstribune.com.au', '?cs=34616']
['echo-news.co.uk', '24696787.russell brand sex offence allegations evidence sent cps']
['watoday.com.au', 'scientific soothsayers why us election polls are often right or spectacularly wrong 20241031 p5kmtx.html']
['yahoo.com', 'donald trump holds rally gastonia 220710886.html']
['wkyt.com', 'kentucky early voter turnout breaks record second day row']
['953wdae.iheart.com', '2024 11 02 jets vs lightning time tv live stream 1132024']
['en.apa.az', 'israeli commandos nab top hezbollah naval operative in north lebanon raid 452362']
['dailykos.com', ' Cartoon Setting back the clock']
['gloucesteradvocate.com.au', '?cs=34616']


In [7]:
amounts = OP['amounts'].iloc[3].split(';') # For reference check the GDELT GKG Cookbook V2.1
print(amounts[0])
amt0 = amounts[0].split(',') # 3 components: amount, object, offset
print(f'The amount is {amt0[0]} with object "{amt0[1]}" at approximate character offset {amt0[2]} of the document.')

IndexError: single positional indexer is out-of-bounds

# GKG Field Parser
Specifics for fields are in progress. Currently all the fields below work, as well as the version one types. The GKG Cookbook can be used for further field refinements within the `gkg_operator` class.

In [8]:
gkg.gkg_query.shape

(31349, 27)

In [9]:
# gkg.parse_gkg_field(OP, 'amounts')
# gkg.parse_gkg_field(OP, 'v2persons')
# gkg.parse_gkg_field(OP, 'v2themes')
# gkg.parse_gkg_field(OP, 'allnames')
# gkg.parse_gkg_field(OP, 'v2locations') #  (semicolon-delimited blocks, with pound symbol (“#”) delimited fields) 
# gkg.parse_gkg_field(OP, 'v2tone')
# gkg.parse_gkg_field(field='locations')
gkg.parse_gkg_field(data=gkg.gkg_query.iloc[:nrows,:],field='organizations')

# gkg.parse_gkg_field(data=gkg.gkg_query.iloc[:nrows,:],field='themes')

Unnamed: 0,index,organizations_0
0,0,foxley hotel
1,0,foxley hotel in milton
2,1,nissan
3,1,toyota
4,1,isuzu
...,...,...
18124,9992,flinders chase national park
18125,9993,darebin arts center
18126,9995,supreme court
18127,9997,san diego jewish academy


In [10]:
job_op_idx = gkg.parsed_fields_df[gkg.parsed_fields_df['themes_0'].str.contains('JOB_OPPORT')]['index'].unique().tolist()
recent_jobs = gkg.gkg_query.iloc[job_op_idx,:]
recent_jobs.head()
# gkg.parsed_fields_df[gkg.parsed_fields_df['index'] == 0] # the last field is the charoffset

KeyError: 'themes_0'

# GKG Field Tokenization

In [11]:
gkg.tokenize_field(col_idx=1)
print(gkg.field_tokens[:5] + gkg.field_tokens[-5:])
print(len(gkg.field_tokens))

['a bellingham city club', 'a conservative party', 'a court', 'a decade of building great technology', 'a department of health', 'zoe church', 'zoning commission', 'zoological society of london', 'zoominfo technologies inc', 'zoram people movement']
7003


# GKG Field Vectorizer

In [12]:
parsed_urls[:10]

['hotel issues statement out control 9683785',
 '?cs=34616',
 '?cs=305',
 'russian exiles push western countries to support ukraine',
 'rep waltz to newsmax biden wh sends weapons doesnt contain iran',
 '2111.08566',
 '?cs=34616',
 '24696787.russell brand sex offence allegations evidence sent cps',
 'scientific soothsayers why us election polls are often right or spectacularly wrong 20241031 p5kmtx.html',
 'donald trump holds rally gastonia 220710886.html']

In [13]:
# gkgvf = gkg.vectorize_field(data=OP)
gkgvf = gkg.vectorize_field(data=gkg.gkg_query.iloc[:nrows,:])
# gkgvf = gkg.vectorize_field(data=recent_jobs)
gkgvf.shape
# rename rows with urls
gkgvf.index = parsed_urls
gkgvf.head()


1 is the field object column index


Unnamed: 0,a bellingham city club,a conservative party,a court,a decade of building great technology,a department of health,a des moines register mediacom selzer co,a disaster recovery center,a division,a finance department,a fragmented community,...,zifa matabeleland south division two league,ziff davis,zimbabwe confederation of public sector trade unions,zimbabwe republic police,zion national park,zoe church,zoning commission,zoological society of london,zoominfo technologies inc,zoram people movement
hotel issues statement out control 9683785,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
?cs=34616,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
?cs=305,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
russian exiles push western countries to support ukraine,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
rep waltz to newsmax biden wh sends weapons doesnt contain iran,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
gkg.vectorized_df.sum(axis=0).sort_values(ascending=False).head(20)

united states                         546.0
associated press                      436.0
twitter                               341.0
white house                           282.0
facebook                              282.0
new york times                        246.0
instagram                             221.0
right center                          169.0
detroit lions                         152.0
atlanta civic center                  130.0
cnn                                   128.0
youtube                               107.0
supreme court                         103.0
espn                                   89.0
national planning policy framework     87.0
prologis                               87.0
laboratory of molecular biology        86.0
foreign office                         82.0
drug administration                    80.0
national institutes of health          79.0
dtype: float64

In [15]:

# Desired column names
target_columns = ['one piece', 'sun god', 'devil fruit','one piece fan letter', 'egghead island',
                   'artificial intelligence', 'microsoft','msft','nuclear', 'ge vernova', 'data science', 'python','ai', 'data']
# test business organizations
target_columns = ['microsoft', 'microsoft corporation', 'microsoft corp', 'msft',
                  'ibm', 'international business machines', 'ibm corp', 'ibm corporation',
                  'apple', 'apple inc', 'apple corporation',
                  'google', 'google inc', 'google corporation',]

# test political parties
target_columns = ['supreme court', 'democratic party', 'republican party', 'green party', 'libertarian party',]

# Get the intersection of target columns and actual columns in the DataFrame
existing_columns = gkg.vectorized_df.columns.intersection(target_columns)

# Subset the DataFrame to only include the existing columns
result_df = gkg.vectorized_df[existing_columns].copy()
# only show non-zero records
result_df = result_df[result_df.sum(axis=1) > 0]
result_df
result_df[result_df['green party'] > 0].head()

Unnamed: 0,democratic party,green party,republican party,supreme court
harris widens lead trump virginia 204331061.html,0.0,1.0,0.0,0.0
harris widens lead trump virginia 204313786.html,0.0,1.0,0.0,0.0
ae0a2d7d 6713 4864 a3f4 ce2f55fe05f6,0.0,1.0,0.0,0.0
article 827278,0.0,1.0,0.0,0.0
undecided voters us election 2024 kamala harris c9dmr7mjg,1.0,1.0,0.0,0.0


In [16]:
# what columns are non-zero for the rows missing the 'one piece' token
# gkg.vectorized_df[gkg.vectorized_df['one piece'] == 0].iloc[1].value_counts()
# gkg.vectorized_df[gkg.vectorized_df['one piece'] == 0].sum(axis=0).sort_values(ascending=False).head(20)

# GKG Tools Get Soup

In [17]:
gkg.get_gkg_soup(urls[6])

In [18]:
for url in urls[:10]:
    print(urls.index(url))
    gkg.parse_gkg_soup(url,verbose=True)

0
Title: Hotel issues statement as out-of-control bonfire spreads to caravan - Stoke-on-Trent Live
No text bounds set for this URL
Length of Header List: 3
First Header: Hotel issues statement as out-of-control bonfire spreads to caravan
Last Header: Follow usFollow us
Length of Paragraph List: 11
First Paragraph: Want Stoke-on-Trent news emailed to you direct from our journalists? Sign up to our newsletter
Last Paragraph: Get daily headlines and breaking news emailed to you - it’s FREE


1
Title: Which off-road ready ute would you buy? | Lithgow Mercury | Lithgow, NSW
No text bounds set for this URL
Length of Header List: 20
First Header: Which off-road ready ute would you buy?
Last Header: William Stopford: Ford Ranger Raptor
Length of Paragraph List: 67
First Paragraph: Your digital subscription includes access to content from all our websites in your region.
Access unlimited content and the digital versions of our print editions - Today's Paper.
Last Paragraph: Advertisement


2
Ti

# GKGSoup

In [19]:
from gkg_soup import GKGSoup
gkgsoup = GKGSoup()
for url in urls[:4]:
    # gkg.parse_gkg_soup(url,verbose=True)
    gkgsoup.parse_gkg_soup(url,verbose=True)

Title: Hotel issues statement as out-of-control bonfire spreads to caravan - Stoke-on-Trent Live
No text bounds set for this URL
Length of Header List: 3
First Header: Hotel issues statement as out-of-control bonfire spreads to caravan
Last Header: Follow usFollow us
Length of Paragraph List: 11
First Paragraph: Want Stoke-on-Trent news emailed to you direct from our journalists? Sign up to our newsletter
Last Paragraph: Get daily headlines and breaking news emailed to you - it’s FREE


Title: Which off-road ready ute would you buy? | Lithgow Mercury | Lithgow, NSW
No text bounds set for this URL
Length of Header List: 20
First Header: Which off-road ready ute would you buy?
Last Header: William Stopford: Ford Ranger Raptor
Length of Paragraph List: 67
First Paragraph: Your digital subscription includes access to content from all our websites in your region.
Access unlimited content and the digital versions of our print editions - Today's Paper.
Last Paragraph: Advertisement


Title: T