In [1]:
import gdelt # for gdelt searchs
from gkg_tools import * # for gkg searchs
# %run "../gkg_tools.py" # using magic command run to access the script from the parent directory


import nltk
from nltk.tokenize import word_tokenize, sent_tokenize


from transformers import pipeline, set_seed
import torch

# # GPU Timing (using GPU 1) else -1 for CPU
# device_id = 1 if torch.cuda.is_available() else -1 

here


# Revised GKG Data Query
- When using `gkg.get_gkg()` the input can be a stored gkg table for the parameter `data=dataframe_name`.
- This avoids the possibility of storing two active gkg tables.
- `coverage=True` only applies to queries from the gdelt database.

## Using a saved query

In [4]:
# Load a previously saved GKG query from CSV and hand it to the operator
# instead of querying the GDELT database again.
# NOTE(review): `pd` is not imported explicitly in this notebook's import cell;
# presumably it arrives through `from gkg_tools import *` — confirm.
gkg = gkg_operator() # create a gkg operator
# OP = pd.read_csv('OP.csv')
manga = pd.read_csv('manga.csv')
gkg.get_gkg(data=manga) # stores the supplied frame in gkg.gkg_query as a dataframe
# gkg.get_gkg(data=OP) # stores in gkg.gkg_query as a dataframe
gkg.gkg_query.shape

(355, 27)

In [5]:
gkg.parse_urls() # parse urls in the gkg_query (presumably what fills gkg.parsed_urls, which later cells read — confirm in gkg_tools)

In [6]:
# Parse one GKG field of the current query; the value returned by the call is
# shown as the cell output. The commented-out calls are the other fields that
# were tried — enable exactly one of them to parse a different field.
# gkg.parse_gkg_field('amounts')
# gkg.parse_gkg_field('persons')
gkg.parse_gkg_field('v2persons')
# gkg.parse_gkg_field('themes')
# gkg.parse_gkg_field('v2tone')

Unnamed: 0,index,v2persons_0,v2persons_1
0,0,Yokinobu Tatsu,394
1,1,Eiichiro Oda,7202
2,2,Hideo Kojima,181
3,2,Hideo Kojima,1123
4,2,Hideo Kojima,1492
...,...,...,...
2889,353,Kaori Maeda,4210
2890,353,Miyu Tomita,4224
2891,354,King Harald,1524
2892,354,Nico Robin,6086


In [7]:
# Tokenize the parsed field, then preview both ends of the resulting token list.
gkg.tokenize_field(col_idx=1) # col_idx selects which column of the parsed field gets tokenized
gkg.field_tokens[:5] + gkg.field_tokens[-5:] # first five and last five tokens

['Aaron Dismuke Fullmetal',
 'Aaron Schimberg',
 'Abare Hanagumi',
 'Abby Trott',
 'Abhilash Dominic',
 'Zeno Robinson',
 'Zhang Chulan',
 'Zhenhua Gu',
 'Zuzazazazaza Rainbow',
 'Zyuden Sentai Kyoryuger']

## Vectorize Field from GKG
The vectorizer currently keeps a simple count of the field items associated with a given document, measured against the list of all unique field items in the currently set `gkg.gkg_query`. With uniform weighting, a document's row gets a 1 in an item's column if the item is present in that document and a 0 if it is missing.

Note that in the gkg v2 fields, such as `v2persons`, a name is recorded multiple times if it was observed in multiple positions within the document. We could weight our document vectors in favour of entries which occur more often. For instance, suppose a writer is mentioned with reference to a work, but the article is primarily about a character within the work. Summing the number of occurrences would give more weight to the character and also provide more context about the deeper content of the article.

### Placeholder Weighting Mechanism:
The function `np.log1p(token_count)` applies a logarithmic scaling to each token count. Specifically, `log1p(x)` calculates $\log(1 + x)$ (natural logarithm), which is often used to scale counts in a way that reduces the impact of large values (e.g., frequent tokens) while preserving their relative ordering.

### Logarithmic Scaling as Weighting:
Logarithmic scaling is a common approach to handle count data because it "dampens" high counts, preventing them from disproportionately influencing results.

For instance, if a token appears 100 times in a document, `np.log1p(100)` scales it down to a lower value (around 4.615), which can be more manageable and informative than the raw count. This scaling is especially helpful when comparing frequencies across tokens or documents with varied token counts.

In [8]:
# Build the document-by-token matrix for the tokenized field; the result is
# read back from gkg.vectorized_df.
# gkg.vectorize_field() # default weighting is 'uniform'
gkg.vectorize_field(weight='weighted') # logarithmic weighting
# gkg.vectorized_df.columns = gkg.tokenize_field(col_idx=2) # resets columns to uncleaned field names
gkg.vectorized_df.head(2)
# print(gkg.vectorized_df.shape)

  aligned_tokens = aligned_tokens.applymap(get_weight)  # Apply weighting function to each token count


Unnamed: 0,aaron dismuke fullmetal,aaron schimberg,abare hanagumi,abby trott,abhilash dominic,abu dhabi,adam gibbs,adam lovell,adam mcarthur,adam pearson,...,yuzuru shimazaki ginga tetsudo,zach aguilar,zaraki kenpachi,zeeshan khan,zenitsu agatsuma,zeno robinson,zhang chulan,zhenhua gu,zuzazazazaza rainbow,zyuden sentai kyoryuger
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
### Get Vectorized Field Stats
# The weighted call below prints two top-10 tables (see the cell output):
# tokens by non-zero percentage and tokens by mean weighted value.
# gkg.get_fields_stats() # get the stats for the vectorized field
gkg.get_fields_stats(weight='weighted') # get the stats for the vectorized field

Top 10 Tokens by Non-Zero Percentage
                Non-Zero Percentage
eiichiro oda               8.732394
akira toriyama             5.352113
saka mikami                3.943662
cocoa fujiwara             3.661972
ayane sakura               3.380282
kaito ishikawa             3.098592
izuku midoriya             2.816901
kazuya nakai               2.816901
nana mizuki                2.253521
momo ayase                 2.253521

Top 10 Tokens by Mean Weighted Value
                      Mean Weighted Value
anura kumara                     3.871201
zeno robinson                    3.806662
yuji kaku                        3.135494
mateus manhanini                 3.135494
yuji kaku creees lee             2.564949
scott snyder                     2.302585
nick dragotta                    2.197225
azami kurotani                   2.197225
julie chung                      2.079442
trina nishimura                  2.063567


In [16]:
# Total each token column over all rows and show the five largest totals.
gkg.vectorized_df.sum(axis=0).sort_values(ascending=False).head(5)

eiichiro oda      25.188865
akira toriyama    20.795544
cocoa fujiwara    14.281960
saka mikami        9.704061
ayane sakura       8.317766
dtype: float64

In [11]:
# Restrict the 'eiichiro oda' column to the rows where its value is positive,
# then show the largest value found in any single row.
eiichiro = gkg.vectorized_df['eiichiro oda'].loc[lambda weights: weights > 0]
eiichiro.max()

2.0794415416798357

In [12]:
# Preview the first five parsed URL strings.
gkg.parsed_urls[:5]

['dan da dan first chapters free 2000507810',
 'one piece oda reveals legendary god elbaf',
 'dandadan anime hideo kojima',
 'my hero academia season 7 finale',
 'uzumaki anime sound']

In [13]:
# Separate copy of the vectorized frame whose rows are labelled with the
# parsed URL text instead of the positional index; gkg.vectorized_df itself
# is left untouched.
gkgvf = gkg.vectorized_df.set_axis(gkg.parsed_urls, axis=0)
gkgvf.head(2)


Unnamed: 0,aaron dismuke fullmetal,aaron schimberg,abare hanagumi,abby trott,abhilash dominic,abu dhabi,adam gibbs,adam lovell,adam mcarthur,adam pearson,...,yuzuru shimazaki ginga tetsudo,zach aguilar,zaraki kenpachi,zeeshan khan,zenitsu agatsuma,zeno robinson,zhang chulan,zhenhua gu,zuzazazazaza rainbow,zyuden sentai kyoryuger
dan da dan first chapters free 2000507810,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
one piece oda reveals legendary god elbaf,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Fetch the pages behind the query's URLs; with verbose=True a message is
# printed per URL (missing text bounds, HTTP errors — see the cell output).
# The results are read back from gkg.all_soup_data in the next cell.
# NOTE(review): the exact meaning of limit_output (5 vs 'all') is defined in
# gkg_tools and is not visible in this notebook — confirm before relying on it.
gkg.get_all_soup(limit_output=5,verbose=True)
# gkg.get_all_soup(limit_output='all')

No text bounds set for this URL: https://gizmodo.com/dan-da-dan-first-chapters-free-2000507810
No text bounds set for this URL: https://comicbook.com/anime/news/dandadan-anime-hideo-kojima/
No text bounds set for this URL: https://comicbook.com/anime/news/my-hero-academia-season-7-finale/
No text bounds set for this URL: https://collider.com/uzumaki-anime-sound/
No text bounds set for this URL: https://movieweb.com/best-animated-series-based-on-mythology/
Error: 403 Client Error: Forbidden for url: https://www.fandompost.com/2024/10/05/yen-press-schedules-1st-april-showers-bring-may-flowers-manga-print-digital-releases-from-roku-sakura/
No text bounds set for this URL: https://www.fandompost.com/2024/10/05/yen-press-schedules-1st-april-showers-bring-may-flowers-manga-print-digital-releases-from-roku-sakura/
Error: 403 Client Error: Forbidden for url: https://www.fandompost.com/2024/10/05/i-may-be-a-guild-receptionist-anime-debuts-jade-scrade-character-trailer/
No text bounds set for th

No text bounds set for this URL: https://www.animenewsnetwork.com/bbs/phpBB2/viewtopic.php?t=3202688
No text bounds set for this URL: https://www.animenewsnetwork.com/bbs/phpBB2/viewtopic.php?t=3202689
No text bounds set for this URL: https://comicbook.com/anime/news/my-hero-academia-anime/
No text bounds set for this URL: https://www.animenewsnetwork.com/bbs/phpBB2/viewtopic.php?t=3202687
No text bounds set for this URL: https://www.animenewsnetwork.com/news/2024-10-12/live-action-wingman-series-reveals-3-more-cast-members-new-trailer/.216673
No text bounds set for this URL: https://www.animenewsnetwork.com/news/2024-10-13/gto-series-gets-new-uchiyamada-spinoff-manga/.216682
No text bounds set for this URL: https://www.animenewsnetwork.com/bbs/phpBB2/viewtopic.php?t=3202686
No text bounds set for this URL: https://www.tomsguide.com/entertainment/streaming/netflixs-new-ranma-1-2-reboot-brings-back-this-beloved-gender-swapping-classic-anime
No text bounds set for this URL: https://www.a

In [None]:
# Short alias for the scraped results held on the operator; later cells use `asd`.
asd = gkg.all_soup_data
asd.info() # summary of the scraped table (assumes a pandas DataFrame — confirm)
# gkg.all_soup_data['Paragraphs'][0]

In [None]:
# Persist the scraped results next to the notebook.
# NOTE(review): assuming `asd` is a pandas DataFrame, to_csv also writes the index
# as an unnamed first column, and list-valued cells such as 'Paragraphs' are
# written as their string form — they will not read back as lists. Confirm
# this is acceptable for whoever reloads 'manga_soup.csv'.
asd.to_csv('manga_soup.csv')

In [16]:
# First three paragraphs of the entry labelled 0 in the 'Paragraphs' column.
asd['Paragraphs'][0][:3]

["This article contains spoilers from One Piece's Elbaf Arc.",
 'One Piecechapter 1130 proved to be absolutely stunning from start to finish and it is safe to say that this chapter shocked the fans to quite an extent withthe reveal of Loki so early into the arc. Truly, few were expecting Loki to be revealed in such fashion inOne Piecechapter 1130, and at the same time, even fewer individuals were expecting him to be as important as he has proven to be.',
 'Loki is perhaps the most powerful giant. His strength might be on par with a Yonko in One Piece.']

In [17]:
# Print every paragraph of the entry labelled 0 with a 1-based running number.
# enumerate() numbers by position. Looking each paragraph up with list.index()
# returns the position of its FIRST occurrence, so a repeated paragraph would
# be printed with the wrong number, and the list would be rescanned on every
# iteration.
for idx, item in enumerate(asd['Paragraphs'][0], start=1):
    print(f'Paragraph {idx}: {item}')

Paragraph 1: This article contains spoilers from One Piece's Elbaf Arc.
Paragraph 2: One Piecechapter 1130 proved to be absolutely stunning from start to finish and it is safe to say that this chapter shocked the fans to quite an extent withthe reveal of Loki so early into the arc. Truly, few were expecting Loki to be revealed in such fashion inOne Piecechapter 1130, and at the same time, even fewer individuals were expecting him to be as important as he has proven to be.
Paragraph 3: Loki is perhaps the most powerful giant. His strength might be on par with a Yonko in One Piece.
Paragraph 4: In this chapter, fans even got what can be considered to be a buildup of sorts towards a fight between Luffy, who is the Sun God Nika, and Loki, who also believes himself to be the Sun God. It looks like there is going to be a clash of Sun Gods afterOne Piecechapter 1130 and it is certainly going to be one worth watching, provided Loki does break free from his chains.
Paragraph 5: Luffy and Loki m