In [1]:
import os
import re
import pandas as pd
import numpy as np
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
import json
from datetime import date, datetime, timedelta
from bs4 import BeautifulSoup
import gdelt # for gdelt searchs
from gkg_tools import * # for gkg searchs

import warnings

with warnings.catch_warnings():
    warnings.filterwarnings("ignore", category=UserWarning, module="gdelt.parallel")
    # The code that triggers warnings


here


In [2]:

# GDELT GKG Operator
class gkg_operator:
    def __init__(self):
        self.gdv2 = gdelt.gdelt(version=2) # Instantiate the GDELT object for searches
        self.current_date = date.today().strftime('%Y %b %d')
        self.search_date = [self.current_date]

    def set_date(self, date):
        """
        date format: 'YYYY MM DD', ex. '2021 Jan 01'
        """
        if isinstance(date, str):
            self.search_date = [date]
        elif isinstance(date, list):
            self.search_date = date
        else:
            print('Invalid date input. Please input a string or list of strings in the format "YYYY MM DD"')

    def get_gkg(self):
        self.gkg_query = self.gdv2.Search(date=self.search_date, table='gkg', normcols = True, coverage=True)


    def gkg_persons(self, persons):
        df = self.gkg_query.copy()
        df.dropna(subset=['v2persons'], inplace=True)
        if isinstance(persons, list):
            # format to regex or expression
            persons = '|'.join(persons)
        
        df = df[df['v2persons'].str.contains(persons, case=False)]
        return df

In [3]:
gkg = gkg_operator() # create a gkg operator
gkg.current_date

'2024 Oct 24'

In [4]:
gkg.set_date(['2024 Oct 20', '2024 Oct 21']) # set the date range for the search
gkg.search_date

['2024 Oct 20', '2024 Oct 21']

In [5]:
gkg.get_gkg()

In [6]:
gkg.gkg_query.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 216026 entries, 0 to 216025
Data columns (total 27 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   gkgrecordid                 216026 non-null  object 
 1   date                        216026 non-null  int64  
 2   sourcecollectionidentifier  216026 non-null  int64  
 3   sourcecommonname            216026 non-null  object 
 4   documentidentifier          216026 non-null  object 
 5   counts                      25501 non-null   object 
 6   v2counts                    25501 non-null   object 
 7   themes                      192134 non-null  object 
 8   v2themes                    192134 non-null  object 
 9   locations                   162017 non-null  object 
 10  v2locations                 161808 non-null  object 
 11  persons                     168206 non-null  object 
 12  v2persons                   166261 non-null  object 
 13  organizations 

In [7]:
gkg.gkg_query.head()

Unnamed: 0,gkgrecordid,date,sourcecollectionidentifier,sourcecommonname,documentidentifier,counts,v2counts,themes,v2themes,locations,...,gcam,sharingimage,relatedimages,socialimageembeds,socialvideoembeds,quotations,allnames,amounts,translationinfo,extras
0,20241020000000-0,20241020000000,1,bostonherald.com,https://www.bostonherald.com/2024/10/19/kamala...,,,RURAL;USPEC_POLITICS_GENERAL1;EPU_POLICY;EPU_P...,"USPEC_POLITICS_GENERAL1,218;EPU_POLICY_CONGRES...","2#Maine, United States#US#USME#44.6074#-69.397...",...,"wc:48,c12.10:5,c12.13:4,c12.14:1,c12.8:4,c12.9...",https://www.bostonherald.com/wp-content/upload...,,,,,,,,<PAGE_LINKS>https://checkout.bostonherald.com/...
1,20241020000000-1,20241020000000,1,cbsnews.com,https://www.cbsnews.com/boston/news/volleyball...,,,EDUCATION;,"EDUCATION,35;EDUCATION,93;EDUCATION,385;EDUCAT...",1#Angola#AO#AO#-12.5#18.5#AO;1#Portugal#PO#PO#...,...,"wc:502,c12.1:57,c12.10:44,c12.12:11,c12.13:14,...",https://assets1.cbsnewsstatic.com/hub/i/r/2024...,,,https://youtube.com/CBSBoston/;,,"Newton South High School,30;Newton South High ...",,,<PAGE_ALTURL_AMP>https://www.cbsnews.com/amp/b...
2,20241020000000-2,20241020000000,1,legallyindia.com,https://www.legallyindia.com/convos/topic/4006...,,,TRIAL;TAX_FNCACT;TAX_FNCACT_LAWYERS;LEGISLATIO...,"TAX_FNCACT_MODERATORS,998;TRIAL,76;TAX_FNCACT_...",,...,"wc:169,c12.1:26,c12.10:24,c12.12:5,c12.13:10,c...",,,,,,,,,<PAGE_LINKS>https://www.legallyindia.com/convo...
3,20241020000000-3,20241020000000,1,kgw.com,https://www.kgw.com/article/news/nation-world/...,,,TAX_DISEASE;TAX_DISEASE_LISTERIA;TAX_FNCACT;TA...,"TAX_FOODSTAPLES_MEAT,1532;GENERAL_HEALTH,655;G...","3#Washington, Washington, United States#US#USD...",...,"wc:339,c12.1:13,c12.10:42,c12.12:21,c12.13:10,...",https://media.kgw.com/assets/CCT/images/d78ac3...,,,,,"House Foods,447;Great Value,835;Food Lion,905;...","500,types of frozen waffles,101;100,of frozen ...",,<PAGE_LINKS>https://www.cdc.gov/listeria/about...
4,20241020000000-4,20241020000000,1,delphosherald.com,https://www.delphosherald.com/news/state_natio...,,,TAX_ETHNICITY;TAX_ETHNICITY_AMERICANS;EDUCATIO...,"TAX_FNCACT_STUDENTS,455;ELECTION,203;EDUCATION...","2#California, United States#US#USCA#36.17#-119...",...,"wc:119,c1.3:1,c12.1:6,c12.10:4,c12.12:2,c12.13...",https://bloximages.chicago2.vip.townnews.com/d...,,,https://youtube.com/channel/UC05lbn0KutzwDoSX6...,797|115||%96J H2?E E@ 36 : ?G@=G65 : ? E96 : C...,"Center Square,21","200,likely voters,160;2,=@E @7 >6DD28,878;",,<PAGE_AUTHORS>Brendan Clarey | The Center Squa...


In [8]:
persons = ['Eiichiro Oda', 'Shueisha', 'One Piece']
OP = gkg.gkg_persons(persons)
OP.reset_index(inplace=True)
OP.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14 entries, 0 to 13
Data columns (total 28 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   index                       14 non-null     int64  
 1   gkgrecordid                 14 non-null     object 
 2   date                        14 non-null     int64  
 3   sourcecollectionidentifier  14 non-null     int64  
 4   sourcecommonname            14 non-null     object 
 5   documentidentifier          14 non-null     object 
 6   counts                      0 non-null      object 
 7   v2counts                    0 non-null      object 
 8   themes                      12 non-null     object 
 9   v2themes                    12 non-null     object 
 10  locations                   4 non-null      object 
 11  v2locations                 4 non-null      object 
 12  persons                     14 non-null     object 
 13  v2persons                   14 non-nu

In [9]:
OP['sourcecommonname'].value_counts()

gamerant.com      7
screenrant.com    5
comicbook.com     2
Name: sourcecommonname, dtype: int64

In [10]:
OP_gamerant = OP[OP['sourcecommonname'] == 'gamerant.com']
OP_gamerant.reset_index()

Unnamed: 0,level_0,index,gkgrecordid,date,sourcecollectionidentifier,sourcecommonname,documentidentifier,counts,v2counts,themes,...,gcam,sharingimage,relatedimages,socialimageembeds,socialvideoembeds,quotations,allnames,amounts,translationinfo,extras
0,0,5370,20241020014500-92,20241020014500,1,gamerant.com,https://gamerant.com/one-piece-sun-god-loki-ni...,,,MANMADE_DISASTER_IMPLIED;ARMEDCONFLICT;UNGP_FO...,...,"wc:1216,c1.1:1,c1.3:1,c12.1:153,c12.10:154,c12...",https://static0.gamerantimages.com/wordpress/w...,https://static0.gamerantimages.com/wordpress/w...,,https://youtube.com/channel/UCkZjsmAQnXfS-_5lw...,,"Sun God,230;One Piece,613;Sun God Nika,985;Sun...","2,Sun Gods,146;2,Sun Gods,3619;2,legendary pir...",,<PAGE_LINKS>https://gamerant.com/one-piece-elb...
1,4,67774,20241020214500-270,20241020214500,1,gamerant.com,https://gamerant.com/one-piece-oda-announces-t...,,,TAX_WORLDLANGUAGES;TAX_WORLDLANGUAGES_MANGA;TA...,...,"wc:1199,c12.1:102,c12.10:134,c12.12:41,c12.13:...",https://static0.gamerantimages.com/wordpress/w...,https://static0.gamerantimages.com/wordpress/w...,,https://youtube.com/channel/UCkZjsmAQnXfS-_5lw...,,"One Piece,59;Live Action,86;Elbaf Arc,176;One ...","2,projects during One Piece,26;10,Q A session,...",,<PAGE_LINKS>https://gamerant.com/one-piece-elb...
2,5,73373,20241020231500-814,20241020231500,1,gamerant.com,https://gamerant.com/one-piece-1131-spoilers-r...,,,PIRACY;TAX_FNCACT;TAX_FNCACT_PIRATES;ARMEDCONF...,...,"wc:1231,c1.2:1,c12.1:165,c12.10:170,c12.12:44,...",https://static0.gamerantimages.com/wordpress/w...,https://static0.gamerantimages.com/wordpress/w...,,https://youtube.com/channel/UCkZjsmAQnXfS-_5lw...,,"One Piece,77;Elbaf Arc,275;Straw Hat Pirates,3...","2,legendary warriors,3802;3,weeks for the next...",,<PAGE_LINKS>https://gamerant.com/one-piece-bes...
3,6,75375,20241021000000-219,20241021000000,1,gamerant.com,https://gamerant.com/one-piece-chapter-1130-od...,,,TAX_FNCACT;TAX_FNCACT_PRINCE;PIRACY;TAX_FNCACT...,...,"wc:1313,c1.1:3,c1.4:1,c12.1:111,c12.10:133,c12...",https://static0.gamerantimages.com/wordpress/w...,https://static0.gamerantimages.com/wordpress/w...,,https://youtube.com/channel/UCkZjsmAQnXfS-_5lw...,,"Piece Chapter,33;Accursed Prince,77;Straw Hat ...","2,Major Projects During The,532;2,major projec...",,<PAGE_LINKS>https://gamerant.com/one-piece-113...
4,7,79648,20241021010000-585,20241021010000,1,gamerant.com,https://gamerant.com/hunter-x-hunter-404-spoil...,,,ARMEDCONFLICT;EPU_CATS_NATIONAL_SECURITY;TAX_F...,...,"wc:1270,c1.3:1,c12.1:117,c12.10:189,c12.12:62,...",https://static0.gamerantimages.com/wordpress/w...,https://static0.gamerantimages.com/wordpress/w...,,https://youtube.com/channel/UCkZjsmAQnXfS-_5lw...,,"Succession War,342;Succession War,565;Longhi N...","404,will be an intense,288;26,degrees #x2026,1...",,<PAGE_LINKS>https://gamerant.com/hunter-x-hunt...
5,11,202608,20241021220000-973,20241021220000,1,gamerant.com,https://gamerant.com/one-piece-oda-reveals-lok...,,,TAX_FNCACT;TAX_FNCACT_FATHER;ARREST;PIRACY;TAX...,...,"wc:1245,c1.1:1,c1.2:2,c1.4:4,c12.1:147,c12.10:...",https://static0.gamerantimages.com/wordpress/w...,https://static0.gamerantimages.com/wordpress/w...,,https://youtube.com/channel/UCkZjsmAQnXfS-_5lw...,,"Devil Fruit,90;One Piece,189;One Piece,247;Pie...",,,<PAGE_LINKS>https://gamerant.com/one-piece-cha...
6,13,208244,20241021224500-1005,20241021224500,1,gamerant.com,https://gamerant.com/one-piece-chapter-1131-de...,,,DELAY;USPEC_UNCERTAINTY1;CRISISLEX_C04_LOGISTI...,...,"wc:1112,c12.1:104,c12.10:121,c12.12:30,c12.13:...",https://static0.gamerantimages.com/wordpress/w...,https://static0.gamerantimages.com/wordpress/w...,,https://youtube.com/channel/UCkZjsmAQnXfS-_5lw...,,"One Piece Live Action,151;Elbaf Arc,349;One Pi...","2,weeks,42;2,Week Break,1148;2,weeks off One P...",,<PAGE_LINKS>https://gamerant.com/one-piece-cha...


In [11]:
url = OP['documentidentifier'].iloc[1]
url

'https://screenrant.com/one-piece-unexpected-character-buggy-beauty-of-writing/'

In [12]:
headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
    
session = requests.Session()
retry = Retry(connect=2, backoff_factor=0.5)
adapter = HTTPAdapter(max_retries=retry)
session.mount('http://', adapter)
session.mount('https://', adapter)

try:
    response = session.get(url, headers=headers)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')
except requests.exceptions.RequestException as e:
        print(f"Error: {e}")

In [13]:
soup.title

<title>One Piece Just Demonstrated The True Beauty of Oda's Writing Through Its Most Unexpected Character</title>

In [14]:
soup.title.string

"One Piece Just Demonstrated The True Beauty of Oda's Writing Through Its Most Unexpected Character"

In [15]:
# soup.find_all('h1').__getitem__(1).get_text()
# this header seems to be the source
soup.find_all('h1')[0].get_text()

'Screen Rant'

**Game Rant** articles have a common paragraph structure for the head and tail. So far they all contain:
`This article contains spoilers`

In [16]:
soup.find_all('p')

[<p class="user-msg">Your changes have been saved</p>,
 <p class="user-msg"></p>,
 <p class="user-msg">Email is sent</p>,
 <p class="user-msg">Email has already been sent</p>,
 <p class="user-msg">Please verify your email address.</p>,
 <p class="user-msg">You’ve reached your account maximum for followed topics.</p>,
 <p>Despite occasional complaints about its pacing,<strong><em> <a href="https://screenrant.com/tag/one-piece/" target="_blank">One Piece</a></em></strong> is often agreed to be one of the most well-written shōnen manga, not only in terms of its storytelling, but also its world-building, well-rounded characters, complex themes, and backstories. That said, one unexpected character best encapsulates the <a href="https://screenrant.com/one-piece-big-flaw-manga-makes-it-so-interesting-to-read/" target="_blank">beauty of Eiichiro Oda's writing</a>, and it isn't one fans would expect.</p>,
 <p>Episode #1116 of <em>One Piece</em> sheds new light on Buggy, who, despite being a pat

In [17]:
paragraphs = soup.find_all('p')
store_text = []
for p in paragraphs:
    # paste the strings into a single string
    store_text.append(p.get_text())
#     print(p.get_text())

store_text = ' '.join(store_text)
store_text
# remove '\xa0 \n\t'
store_text = re.sub(r'\xa0|\n|\t|\\', '', store_text)
# replace end qoutes with double quotes
# store_text = re.sub(r''', "'", store_text)


In [18]:
store_text[:1000]

"Your changes have been saved  Email is sent Email has already been sent Please verify your email address. You’ve reached your account maximum for followed topics. Despite occasional complaints about its pacing, One Piece is often agreed to be one of the most well-written shōnen manga, not only in terms of its storytelling, but also its world-building, well-rounded characters, complex themes, and backstories. That said, one unexpected character best encapsulates the beauty of Eiichiro Oda's writing, and it isn't one fans would expect. Episode #1116 of One Piece sheds new light on Buggy, who, despite being a pathetic gag character for a majority of the series, is finally portrayed in an admirable light. The episode highlights Buggy's dreams and aspirations as well as the hidden tragedy within his character, adding an unexpected layer of complexity to him while also making him seem almost respectable for the very first time in the series. More than anything, the episode portrays just how

In [19]:
store_text.__str__()
# print(repr(store_text)) # returning store_text give a representational form which shows the escape characters "\"
# store_text.__repr__()
print(str(store_text)) # returns 

Your changes have been saved  Email is sent Email has already been sent Please verify your email address. You’ve reached your account maximum for followed topics. Despite occasional complaints about its pacing, One Piece is often agreed to be one of the most well-written shōnen manga, not only in terms of its storytelling, but also its world-building, well-rounded characters, complex themes, and backstories. That said, one unexpected character best encapsulates the beauty of Eiichiro Oda's writing, and it isn't one fans would expect. Episode #1116 of One Piece sheds new light on Buggy, who, despite being a pathetic gag character for a majority of the series, is finally portrayed in an admirable light. The episode highlights Buggy's dreams and aspirations as well as the hidden tragedy within his character, adding an unexpected layer of complexity to him while also making him seem almost respectable for the very first time in the series. More than anything, the episode portrays just how 

# Tokenize with NLTK

In [20]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
# nltk.download('punkt')

In [21]:
sent_tokens = nltk.sent_tokenize(store_text)

In [22]:
sent_tokens

['Your changes have been saved  Email is sent Email has already been sent Please verify your email address.',
 'You’ve reached your account maximum for followed topics.',
 'Despite occasional complaints about its pacing, One Piece is often agreed to be one of the most well-written shōnen manga, not only in terms of its storytelling, but also its world-building, well-rounded characters, complex themes, and backstories.',
 "That said, one unexpected character best encapsulates the beauty of Eiichiro Oda's writing, and it isn't one fans would expect.",
 'Episode #1116 of One Piece sheds new light on Buggy, who, despite being a pathetic gag character for a majority of the series, is finally portrayed in an admirable light.',
 "The episode highlights Buggy's dreams and aspirations as well as the hidden tragedy within his character, adding an unexpected layer of complexity to him while also making him seem almost respectable for the very first time in the series.",
 'More than anything, the 

In [23]:
summaries = {}

def three_sentence_summary(text):
    sent_tokens = nltk.sent_tokenize(text)
    summary = "\n".join(sent_tokens[:3])
    return summary

summaries["baseline"] = three_sentence_summary(store_text)

print(summaries["baseline"])

Your changes have been saved  Email is sent Email has already been sent Please verify your email address.
You’ve reached your account maximum for followed topics.
Despite occasional complaints about its pacing, One Piece is often agreed to be one of the most well-written shōnen manga, not only in terms of its storytelling, but also its world-building, well-rounded characters, complex themes, and backstories.


# Transformers

In [24]:
#! pip install --upgrade transformers
#! pip install --upgrade torch
# !pip install --upgrade tf-keras


In [25]:
from transformers import pipeline, set_seed

In [26]:
!pip show transformers

Name: transformers
Version: 4.45.2
Summary: State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow
Home-page: https://github.com/huggingface/transformers
Author: The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/transformers/graphs/contributors)
Author-email: transformers@huggingface.co
License: Apache 2.0 License
Location: C:\Users\cld1465\AppData\Roaming\Python\Python311\site-packages
Requires: filelock, huggingface-hub, numpy, packaging, pyyaml, regex, requests, safetensors, tokenizers, tqdm
Required-by: 


In [27]:
set_seed(42)
pipe = pipeline('text-generation', model='gpt2-xl')
# pipe = pipeline('text-generation', model='facebook/bart-large-cnn')





In [28]:
from transformers import TRANSFORMERS_CACHE
import os

# Print the cache directory (default cache location)
cache_dir = os.getenv('TRANSFORMERS_CACHE', os.path.expanduser('~/.cache/huggingface/transformers'))
print(f"Cache directory: {cache_dir}")



Cache directory: C:\Users\cld1465/.cache/huggingface/transformers


In [29]:
!ls ~/.cache/huggingface/

'ls' is not recognized as an internal or external command,
operable program or batch file.


In [30]:
!ls -R ~/.cache/huggingface/transformers/


'ls' is not recognized as an internal or external command,
operable program or batch file.


In [31]:
from transformers.utils import logging

# This will show the cache directory used by the transformers library
print(logging.get_verbosity())


30


In [32]:
len(store_text)

7172

In [33]:
gpt2_query = store_text[:3500] + "\nTL;DR:\n"
# pipe_out = pipe(gpt2_query, max_length=512, clean_up_tokenization_spaces=True, truncation=True)
pipe_out = pipe(gpt2_query, max_new_tokens=100, clean_up_tokenization_spaces=True, truncation=True)
summaries["gpt2"] = "\n".join(sent_tokenize(pipe_out[0]['generated_text'][len(gpt2_query) :]))

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


In [34]:
# Check the length of the tokenized input
tokenized_input = pipe.tokenizer.encode(gpt2_query, truncation=True)
if len(tokenized_input) > 1024:
    print(f"Input length: {len(tokenized_input)} tokens, which exceeds the max model length of 1024 tokens.")


In [35]:
len(tokenized_input)

747

In [36]:
summaries

{'baseline': 'Your changes have been saved  Email is sent Email has already been sent Please verify your email address.\nYou’ve reached your account maximum for followed topics.\nDespite occasional complaints about its pacing, One Piece is often agreed to be one of the most well-written shōnen manga, not only in terms of its storytelling, but also its world-building, well-rounded characters, complex themes, and backstories.',
 'gpt2': "To be honest, you can't beat this scene.\nEpisode #1135 is the longest episode of the series and arguably the episode that delivers the most plot: the birth of Brook, Zoro's failed attempt to save him, and the infamous fight with the Marines.\nAs always, this episode is incredibly dense and complex, even if only in the broadest sense such as the birth of Brook, this episode is filled with a plethora of character developments, world-building, and plenty of action."}

In [37]:
print(summaries['baseline'])

Your changes have been saved  Email is sent Email has already been sent Please verify your email address.
You’ve reached your account maximum for followed topics.
Despite occasional complaints about its pacing, One Piece is often agreed to be one of the most well-written shōnen manga, not only in terms of its storytelling, but also its world-building, well-rounded characters, complex themes, and backstories.


In [38]:
print(summaries['gpt2'])

To be honest, you can't beat this scene.
Episode #1135 is the longest episode of the series and arguably the episode that delivers the most plot: the birth of Brook, Zoro's failed attempt to save him, and the infamous fight with the Marines.
As always, this episode is incredibly dense and complex, even if only in the broadest sense such as the birth of Brook, this episode is filled with a plethora of character developments, world-building, and plenty of action.


# Beautifulsoup Tools

In [39]:
soup.find_all('h2')

[<h2 id="one-piece-39-s-final-saga-completely-recontextualizes-buggy-39-s-character"> One Piece's Final Saga Completely Recontextualizes Buggy's Character </h2>,
 <h2 id="buggy-39-s-flashback-with-shanks-reveals-a-sad-side-to-his-character"> Buggy's Flashback With Shanks Reveals a Sad Side to His Character </h2>]

In [40]:
soup.find_all('h3')[1].get_text()

" Chapter #1082 and Episode #1116 Highlight Buggy's Most Admirable Trait "

In [41]:
soup.find_all('h2')[0].find_next('p').get_text()

"Despite being one of the first real antagonists of the series, Buggy the Clown has gradually devolved into one of One Piece's longest-running gags. Through a series of hilarious misunderstandings, Buggy has continued to fail upwards, unwittingly keeping up with Luffy ever since the beginning of the series. His constant failures coupled with his cowardice make him a character that is difficult to take seriously, and he spends most of One Piece being a one-note pompous clown."

In [42]:
soup.find_all('h3')[2].find_next('p').get_text()

'Ever since Buggy was first introduced, he has been known to possess a grudge against Shanks and while the grudge seemed shallow at first, the flashback in chapter #1082 expands on Buggy\'s complex feelings about Shanks. Roger was known to be a father figure to both Buggy and Shanks, but as the flashback reveals, Shanks seemed to be the golden child from the start, who "shone with potential," as Buggy puts it.'