In [2]:
import gdelt # for gdelt searches
from gkg_tools import * # for gkg searches
# %run "../gkg_tools.py" # using magic command run to access the script from the parent directory


import nltk
from nltk.tokenize import word_tokenize, sent_tokenize


from transformers import pipeline, set_seed
import torch

# Device index handed to the transformers pipelines below:
#   1  -> second GPU, used only when the host really has more than one CUDA device
#   0  -> the only GPU on a single-GPU host (index 1 would be an invalid device there)
#  -1  -> CPU, when CUDA is unavailable
if torch.cuda.is_available():
    device_id = 1 if torch.cuda.device_count() > 1 else 0
else:
    device_id = -1

# Revised GKG Data Query
- `gkg.get_gkg()` also accepts an already-stored GKG table through its `data` parameter (`data=dataframe_name`).
- Passing the table this way avoids the possibility of holding two active GKG tables at once.
- `coverage=True` only applies to queries made against the GDELT database.

## Using a gdelt query

In [3]:
# Operator object that every later cell in this notebook calls into.
# NOTE(review): `gkg_operator` is presumably provided by `from gkg_tools import *` — confirm.
gkg = gkg_operator() # create a gkg operator


In [4]:
# Set the search window, then run the query; later cells read the result from `gkg.gkg_query`.
gkg.set_date(['2024 Oct 20', '2024 Oct 21']) # set the date range for the search
# `coverage=True` is only meaningful for queries against the gdelt database (not for `data=` input).
gkg.get_gkg(coverage=True)




In [5]:
# Restrict the GKG query result to the two "Rant" outlets.
src_name = ['gamerant.com', 'screenrant.com']
rant = gkg.gkg_query[gkg.gkg_query['sourcecommonname'].isin(src_name)]

# Keep only the articles whose `themes` string mentions MANGA. `na=False` makes rows with a
# missing `themes` value count as non-matches directly, so the mask is already boolean and
# does not need to be compared against True.
manga = rant[rant['themes'].str.contains('MANGA', na=False)]
manga.info()

# Provenance note (kept commented out): these lines appear to be how 'OP.csv', which the
# "Using a saved table" section reads, was produced — TODO confirm before relying on that file.
# persons = ['Eiichiro Oda', 'Shueisha', 'One Piece']
# OP = gkg.gkg_persons(persons)
# OP.reset_index(inplace=True)
# OP.to_csv('OP.csv', index=False)
# OP = pd.read_csv('OP.csv')

<class 'pandas.core.frame.DataFrame'>
Index: 33 entries, 3132 to 208244
Data columns (total 27 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   gkgrecordid                 33 non-null     object 
 1   date                        33 non-null     int64  
 2   sourcecollectionidentifier  33 non-null     int64  
 3   sourcecommonname            33 non-null     object 
 4   documentidentifier          33 non-null     object 
 5   counts                      6 non-null      object 
 6   v2counts                    6 non-null      object 
 7   themes                      33 non-null     object 
 8   v2themes                    33 non-null     object 
 9   locations                   14 non-null     object 
 10  v2locations                 14 non-null     object 
 11  persons                     33 non-null     object 
 12  v2persons                   33 non-null     object 
 13  organizations               26 non-

## Using a saved table

In [41]:
# Demonstrates feeding an existing DataFrame to `get_gkg` instead of querying GDELT.
# NOTE(review): `OP` is loaded here but never used in this cell (the `data=OP` call is commented
# out), and the only visible code that writes 'OP.csv' is also commented out — on a fresh checkout
# this read fails unless the file already exists.
# NOTE(review): `pd` is not imported before this cell in the visible notebook; it presumably
# arrives via `from gkg_tools import *` — confirm.
OP = pd.read_csv('OP.csv') # We could alternatively use the subset of the queried data in the last section.
# gkg.get_gkg(data=OP) # stores in gkg.gkg_query as a dataframe
# there is an indexing mismatch somewhere, the data needs the index reset
# Use the first 500 rows (positional slice) of the gamerant/screenrant subset as the working table.
gkg.get_gkg(data=rant.iloc[:500]) # stores in gkg.gkg_query as a dataframe
# gkg.get_gkg(data=manga)
gkg.gkg_query.head(2)

Unnamed: 0,gkgrecordid,date,sourcecollectionidentifier,sourcecommonname,documentidentifier,counts,v2counts,themes,v2themes,locations,...,gcam,sharingimage,relatedimages,socialimageembeds,socialvideoembeds,quotations,allnames,amounts,translationinfo,extras
0,20241020000000-504,20241020000000,1,gamerant.com,https://gamerant.com/the-sims-4-red-dead-redem...,,,,,,...,"wc:636,c1.1:2,c12.1:59,c12.10:82,c12.12:36,c12...",https://static0.gamerantimages.com/wordpress/w...,https://static0.gamerantimages.com/wordpress/w...,,https://youtube.com/channel/UCkZjsmAQnXfS-_5lw...,2019|32||once I do some finishing touches,"Red Dead Redemption,70;Red Dead Redemption,311...","4,player recreated Red Dead,26;2,Rhodes town w...",,<PAGE_LINKS>https://gamerant.com/db/video-game...
1,20241020000000-593,20241020000000,1,gamerant.com,https://gamerant.com/secret-level-cast-list/,,,,,"2#New York, United States#US#USNY#42.1497#-74....",...,"wc:633,c12.1:50,c12.10:63,c12.12:16,c12.13:32,...",https://static0.gamerantimages.com/wordpress/w...,https://static0.gamerantimages.com/wordpress/w...,,https://youtube.com/channel/UCkZjsmAQnXfS-_5lw...,,"Secret Level,120;Secret Level,298;New York Com...","16,Hollywood stars voicing 15,137;15,episodes ...",,<PAGE_LINKS>https://gamerant.com/amazon-secret...


In [42]:
# Turn each article URL into plain words; the result is read back later from `gkg.parsed_urls`
# (e.g. 'secret level cast list').
gkg.parse_urls() # parse urls in the gkg_query

In [43]:
# Parse one GKG field into a long table. For 'v2persons' the displayed result has an `index`
# column plus `v2persons_0` (the person name) and `v2persons_1` (an integer — presumably the
# character offset of the mention in the article; TODO confirm).
# The commented-out calls are the other fields that can be parsed the same way.
# gkg.parse_gkg_field('amounts')
# gkg.parse_gkg_field('persons')
gkg.parse_gkg_field('v2persons')
# gkg.parse_gkg_field('themes')
# gkg.parse_gkg_field('v2tone')

Unnamed: 0,index,v2persons_0,v2persons_1
0,1,Arnold Schwarzenegger,1927
1,1,Arnold Schwarzenegger,2230
2,1,Laura Bailey,2095
3,1,Laura Bailey,2731
4,1,Tim Miller,871
...,...,...,...
9059,499,Parker Finn,2629
9060,499,Parker Finn,2656
9061,499,Parker Finn,5505
9062,499,Parker Finn,5549


In [44]:
# Tokenize the parsed field; the tokens are exposed as `gkg.field_tokens`.
gkg.tokenize_field(col_idx=1) # when calling this function, the tokenized field is given by col_idx input parameter
# Peek at both ends of the token list: the first five and the last five entries.
gkg.field_tokens[:5] + gkg.field_tokens[-5:]

['A Dragon Gaiden',
 'A Gylden Hunter',
 'A Nundo Mon',
 'Aaron Davis',
 'Aaron Dismuke',
 'Zoe Perry Mary',
 'Zoe Salda',
 'Zoe Saldana',
 'Zoey Mckenna',
 'Zola Gray Shepard']

In [45]:
# Vectorize the tokenized field; the result is exposed as `gkg.vectorized_df`, whose columns are
# the lower-cased token strings (one row per article — the rows are later labelled with the
# parsed URLs).
gkg.vectorize_field()
# gkg.vectorized_df.columns = gkg.tokenize_field(col_idx=2) # resets columns to uncleaned field names
gkg.vectorized_df.head(2)

Unnamed: 0,a dragon gaiden,a gylden hunter,a nundo mon,aaron davis,aaron dismuke,aaron eckhart,aaron greenberg,aaron guzikowski,aaron hann,aaron hann mario miscione,...,zettai majuu sensen,zhang lei,zia newton,zoe miskelly,zoe perry,zoe perry mary,zoe salda,zoe saldana,zoey mckenna,zola gray shepard
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [46]:
# First five parsed URL strings (presumably populated by the earlier `gkg.parse_urls()` call).
gkg.parsed_urls[:5]

['the sims 4 red dead redemption 2 rhodes recreation',
 'secret level cast list',
 'jurassic world hammond masrani replacement soyona santos broker',
 'dragon ball sparking zero',
 'smile 2 entity more savage why director explained']

In [47]:
# Work on a copy so `gkg.vectorized_df` keeps its original row labels.
gkgvf = gkg.vectorized_df.copy()
# rename rows with urls
# (assumes `gkg.parsed_urls` has exactly one entry per row of the vectorized frame, in the same order)
gkgvf.index = gkg.parsed_urls
gkgvf.head(2)


Unnamed: 0,a dragon gaiden,a gylden hunter,a nundo mon,aaron davis,aaron dismuke,aaron eckhart,aaron greenberg,aaron guzikowski,aaron hann,aaron hann mario miscione,...,zettai majuu sensen,zhang lei,zia newton,zoe miskelly,zoe perry,zoe perry mary,zoe salda,zoe saldana,zoey mckenna,zola gray shepard
the sims 4 red dead redemption 2 rhodes recreation,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
secret level cast list,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [48]:
# For every token column: in what percentage of articles does it have a non-zero value?
nonzero_counts = gkgvf.astype(bool).sum(axis=0)    # articles with a non-zero entry, per token
present = nonzero_counts[nonzero_counts > 0]       # drop tokens that never occur
non_zero = (present / gkgvf.shape[0] * 100).sort_values(ascending=False)
non_zero[:10]

parker finn         2.0
peter parker        1.8
bruce wayne         1.8
kyle gallner        1.8
stephen king        1.8
james gunn          1.8
peter jacobson      1.8
tom hardy           1.6
robert downey jr    1.6
naomi scott         1.6
dtype: float64

In [49]:
# Column-wise totals of the vectorized frame; show the five largest.
token_totals = gkg.vectorized_df.sum(axis=0)
token_totals.sort_values(ascending=False).head(5)

daryl dixon      129.0
young sheldon     53.0
harry potter      50.0
zeno robinson     44.0
rodney alcala     43.0
dtype: float64

In [50]:
# Collect title / header / paragraph text for the articles; the result is exposed as
# `gkg.all_soup_data`.
# NOTE(review): `limit_output=10` appears to cap the run at ten articles (the resulting table has
# ten rows) and `verbose=True` prints a per-article summary — confirm against gkg_tools.
gkg.get_all_soup(limit_output=10,verbose=True)
# gkg.get_all_soup(limit_output='all')

0
Title: Sims 4 Player Recreates Red Dead Redemption 2's Rhodes in the Game
Length of Header List: 7
First Header: Game Rant
Last Header: Trending Now
Length of Paragraph List: 8
First Paragraph: A talentedSims 4player online has recreated the town of Rhodes fromRed Dead Redemption 2in the game with incredible accuracy. Over a decade after its initial launch,The Sims 4remains the number one choice for players looking to bring their landscape ideas to life.
Last Paragraph: Unleash your imagination and create a unique world of Sims that’s an expression of you. Download for free, and customize every detail from Sims to homes and much more. Choose how Sims look, act, and dress, then decide how they’ll live out each day.Design and build incredible homes for every family, then decorate with your favorite furnishings and décor. Travel to different neighborhoods where you can meet other Sims and learn about their lives. Discover beautiful locations with distinctive environments and go on spont

In [51]:
# Short alias for the scraped-article table
# (columns: URL, Parsed URL, Titles, Headers, Paragraphs). It is the same object, not a copy.
asd = gkg.all_soup_data
asd
# gkg.all_soup_data['Paragraphs'][0]

Unnamed: 0,URL,Parsed URL,Titles,Headers,Paragraphs
0,https://gamerant.com/the-sims-4-red-dead-redem...,the sims 4 red dead redemption 2 rhodes recrea...,Sims 4 Player Recreates Red Dead Redemption 2'...,"[Game Rant, Sims 4 Player Recreates Red Dead R...",[A talentedSims 4player online has recreated t...
1,https://gamerant.com/secret-level-cast-list/,secret level cast list,Secret Level Cast Revealed,"[Game Rant, Secret Level Cast Revealed, Secre...","[During a panel at New York Comic Con 2024, Pr..."
2,https://screenrant.com/jurassic-world-hammond-...,jurassic world hammond masrani replacement soy...,Jurassic World Introduces Its Hammond & Masran...,"[Screen Rant, Jurassic World Introduces Its Ha...",[Warning: This article contains MAJOR SPOILERS...
3,https://screenrant.com/tag/dragon-ball-sparkin...,dragon ball sparking zero,Dragon Ball: Sparking! Zero | ScreenRant,"[Screen Rant, Dragon Ball: Sparking! Zero , \n...",[Fans of the franchise have many secrets to un...
4,https://screenrant.com/smile-2-entity-more-sav...,smile 2 entity more savage why director explained,Why Smile 2’s Entity Is “More Savage” Than In ...,"[Screen Rant, Why Smile 2’s Entity Is “More Sa...",[Warning: Some SPOILERS lie ahead for Smile 2!...
5,https://screenrant.com/batman-resurrection-jok...,batman resurrection joker plot hole fix explained,Batman 1989's New Sequel Admits To A Huge Plot...,"[Screen Rant, Batman 1989's New Sequel Admits ...",[Warning: This article contains SPOILERS for B...
6,https://screenrant.com/white-collar-neal-caffr...,white collar neal caffrey peter burke best rel...,Neal Caffrey & Peter Burke Have The Best Relat...,"[Screen Rant, Neal Caffrey & Peter Burke Have ...",[White Collaris a police procedural show that ...
7,https://screenrant.com/dragon-ball-sparking-ze...,dragon ball sparking zero dlc 2 new characters,What New Characters Are In Dragon Ball: Sparki...,"[Screen Rant, What New Characters Are In Drago...",[Dragon Ball: Sparking! Zerohas announced the ...
8,https://screenrant.com/demon-slayer-kamado-fam...,demon slayer kamado family tree explained,"Demon Slayer's Entire Kamado Family Tree, Expl...","[Screen Rant, Demon Slayer's Entire Kamado Fam...",[The Kamado family is the heart of theDemon Sl...
9,https://screenrant.com/action-movies-1980s-loa...,action movies 1980s loads of fun,10 Action Movies From The 1980s That Are Loads...,"[Screen Rant, 10 Action Movies From The 1980s ...",[The 1980s produced tons of great action movie...


In [52]:
# Title of the page currently held in `gkg.soup`. In the recorded run it equals the title in the
# last row of `all_soup_data`, so `gkg.soup` presumably keeps only the most recently parsed page.
gkg.soup.title.get_text()

'10 Action Movies From The 1980s That Are Loads Of Fun'

In [53]:
# First three paragraphs of the article at row label 0 (each `Paragraphs` cell holds a list of strings).
asd['Paragraphs'][0][:3]

['A talentedSims 4player online has recreated the town of Rhodes fromRed Dead Redemption 2in the game with incredible accuracy. Over a decade after its initial launch,The Sims 4remains the number one choice for players looking to bring their landscape ideas to life.',
 "Gamers have always been pushing the boundaries of what is possible to build inThe Sims 4, and with it remaining the most recent mainline game in the series, that doesn't look like stopping any time soon. The tools Maxis gave players back in 2014 are still fit for purpose in 2024, allowing players' imaginations to run wild with almost no limits on what can be built. While some look to create the best houses possible, others bring their favorite franchises into the game in creative ways. There was even oneSims 4player who builtGrand Theft Auto's Grove Streetin the game.",
 "Some players are up in arms over how heavily EA is showcasing The Sims 4's latest expansion pack and some of its free rewards."]

In [54]:
# Print every paragraph of the first article with a 1-based position.
# `enumerate` yields the true position; looking each item up with `list.index` rescans the list on
# every iteration and reports the first occurrence's position for any repeated paragraph.
for idx, item in enumerate(asd['Paragraphs'][0], start=1):
    print(f'Paragraph {idx}: {item}')

Paragraph 1: A talentedSims 4player online has recreated the town of Rhodes fromRed Dead Redemption 2in the game with incredible accuracy. Over a decade after its initial launch,The Sims 4remains the number one choice for players looking to bring their landscape ideas to life.
Paragraph 2: Gamers have always been pushing the boundaries of what is possible to build inThe Sims 4, and with it remaining the most recent mainline game in the series, that doesn't look like stopping any time soon. The tools Maxis gave players back in 2014 are still fit for purpose in 2024, allowing players' imaginations to run wild with almost no limits on what can be built. While some look to create the best houses possible, others bring their favorite franchises into the game in creative ways. There was even oneSims 4player who builtGrand Theft Auto's Grove Streetin the game.
Paragraph 3: Some players are up in arms over how heavily EA is showcasing The Sims 4's latest expansion pack and some of its free rew

## Zero-shot Classification

In [55]:
from transformers import pipeline

# Name the task explicitly, matching the other pipeline cells in this notebook, so the pipeline
# type does not have to be inferred from the model identifier alone.
pipe = pipeline("zero-shot-classification", model="facebook/bart-large-mnli", device=device_id)

In [56]:
# Single-title check of the zero-shot pipeline. The returned dict echoes the input as
# 'sequence' and lists the candidate labels with their scores, highest score first.
# NOTE: `labels` is a notebook-level name that a later cell reassigns to a different label set.
# candidate labels
# labels = ['one piece', 'manga', 'tv', 'anime', 'luffy', 'not one piece', 'comics']
labels = ['one piece', 'other', 'manga', 'commentary', 'interview']
title = "One Piece: Sun God Loki Vs Sun God Nika, Explained"
# title = "My Hero Academia's Trina Nishimura, Zeno Robinson, and Jessie Grelle Interview"
pipe(title, candidate_labels = labels,)


{'sequence': 'One Piece: Sun God Loki Vs Sun God Nika, Explained',
 'labels': ['one piece', 'manga', 'other', 'commentary', 'interview'],
 'scores': [0.6142399907112122,
  0.36122259497642517,
  0.011622573249042034,
  0.011277785524725914,
  0.0016370095545426011]}

In [67]:
import pandas as pd
from transformers import pipeline
from datasets import Dataset

# Initialize the zero-shot classification pipeline
# NOTE(review): this rebinds `pipe`, which an earlier cell already built from the same model, so
# the model is loaded again here; reuse the existing object if GPU memory is tight.
pipe = pipeline("zero-shot-classification", model="facebook/bart-large-mnli", device=device_id)

# Define your titles and candidate labels
titles = asd['Titles'].tolist()  # asd is gkg.all_soup_data
# titles = asd['Parsed URL'].tolist()
# labels = ['one piece','manga', 'commentary', 'interview','news','podcast','forecast']
# `labels` is read as a global by `classify` below and also fixes the column order of the
# final score table.
labels = ['tv', 'movie', 'radio', 'interview', 'news', 'episodes', 'game', 'anime', 'manga', 'actors', 'podcast']
# Create a Hugging Face Dataset
# (a single 'title' column, one row per article title)
data = {"title": titles}
dataset = Dataset.from_dict(data)

# Define a function to classify the titles
def classify(example):
    """Score one dataset row against the notebook-level candidate `labels`.

    Args:
        example: mapping with a 'title' entry holding the text to classify.

    Returns:
        dict with the pipeline's 'labels' and 'scores' entries for that title.
        Relies on the module-level `pipe` and `labels` at call time.
    """
    prediction = pipe(example['title'], candidate_labels=labels)
    return {key: prediction[key] for key in ('labels', 'scores')}

# Classify every title, one example per call (`batched=False`).
# NOTE: switching to `batched=True` is not a drop-in speed-up — `classify` would then receive a
# dict of lists and would have to be rewritten to handle a list of pipeline results.
results = dataset.map(classify, batched=False)

# The pipeline returns each row's labels ordered by score, so re-key every row by label and lay
# the scores out in the fixed `labels` order (0 for a label missing from a row).
score_rows = []
for row in results:
    label_scores = dict(zip(row['labels'], row['scores']))
    score_rows.append([label_scores.get(label, 0) for label in labels])

# Build the frame in one shot (numeric columns, no row-by-row growth), indexed by title.
df_results = pd.DataFrame(score_rows, columns=labels, index=pd.Index(titles, name='title'))

# df_results now holds one row per title and one score column per candidate label.
df_results


Map:   0%|          | 0/10 [00:00<?, ? examples/s]

Unnamed: 0_level_0,tv,movie,radio,interview,news,episodes,game,anime,manga,actors,podcast
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Sims 4 Player Recreates Red Dead Redemption 2's Rhodes in the Game,0.014153,0.016853,0.011748,0.01173,0.06683,0.01576,0.83439,0.003796,0.004924,0.008512,0.011303
Secret Level Cast Revealed,0.011751,0.023233,0.015572,0.010961,0.708868,0.026796,0.032977,0.005857,0.005822,0.101286,0.056877
"Jurassic World Introduces Its Hammond & Masrani Replacement, And They Don't Want To Build A Park",0.002771,0.687273,0.001876,0.012537,0.250575,0.01435,0.016839,0.000935,0.001334,0.006868,0.004644
Dragon Ball: Sparking! Zero | ScreenRant,0.082635,0.189303,0.005173,0.006267,0.090353,0.030132,0.091111,0.469125,0.024338,0.005312,0.00625
Why Smile 2’s Entity Is “More Savage” Than In First Film Explained By Director,0.019996,0.470667,0.011941,0.077108,0.282288,0.031024,0.039726,0.013581,0.008288,0.01138,0.034001
Batman 1989's New Sequel Admits To A Huge Plot Hole In Tim Burton's Original & Finally Fixes It,0.007685,0.543857,0.003909,0.028919,0.300896,0.028311,0.054048,0.00202,0.003037,0.01478,0.012538
"Neal Caffrey & Peter Burke Have The Best Relationship In White Collar, And None Of The Others Come Close",0.501128,0.097097,0.00722,0.031866,0.075505,0.0741,0.036548,0.002458,0.004204,0.140244,0.029629
What New Characters Are In Dragon Ball: Sparking! ZERO Season Pass DLC 2,0.018795,0.042813,0.009917,0.013647,0.350607,0.004648,0.164663,0.365798,0.011739,0.003086,0.014286
"Demon Slayer's Entire Kamado Family Tree, Explained",0.021602,0.04521,0.011729,0.072479,0.099286,0.065782,0.02541,0.024513,0.164371,0.013444,0.456174
10 Action Movies From The 1980s That Are Loads Of Fun,0.011682,0.714254,0.006485,0.023611,0.037042,0.018425,0.142551,0.008422,0.007362,0.016544,0.013622


# Zero-shot Classifier Example

In [58]:
from transformers import pipeline

# Build the zero-shot classifier used by this standalone example
classifier = pipeline(
    "zero-shot-classification",
    model="facebook/bart-large-mnli",
    device=device_id,
)

# Example headlines to classify
headlines = [
    "New smartphone model released with advanced AI features",
    "Local team wins championship in overtime thriller",
    "Government announces new economic policy reforms",
]

# Categories the classifier chooses between
candidate_labels = ["Technology", "Sports", "Politics"]

# Run the classifier once per headline, keeping the results in headline order
results = []
for headline in headlines:
    results.append(classifier(headline, candidate_labels))

# Report the first listed label and its score for every headline
separator = "-" * 30
for headline, result in zip(headlines, results):
    best_label = result['labels'][0]
    best_score = result['scores'][0]
    print(f"Headline: {headline}")
    print(f"Label: {best_label} (Score: {best_score:.2f})")
    print(separator)


Headline: New smartphone model released with advanced AI features
Label: Technology (Score: 0.95)
------------------------------
Headline: Local team wins championship in overtime thriller
Label: Sports (Score: 0.97)
------------------------------
Headline: Government announces new economic policy reforms
Label: Politics (Score: 0.88)
------------------------------


In [24]:
# IPython shell escape: print the current GPU status table (per-GPU memory use and utilisation).
!nvidia-smi

Mon Nov 11 10:21:03 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.161.07             Driver Version: 535.161.07   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA RTX A6000               On  | 00000000:2E:00.0 Off |                  Off |
| 30%   31C    P8              26W / 300W |  17062MiB / 49140MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA RTX A6000               On  | 00000000:41:0