In [1]:
import os
import re
import pandas as pd
import numpy as np
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
import json
from datetime import date, datetime, timedelta
from bs4 import BeautifulSoup
import gdelt # for gdelt searchs
from gkg_tools import * # for gkg searchs

import nltk
from nltk.tokenize import word_tokenize, sent_tokenize


from transformers import pipeline, set_seed
import torch

# GPU Timing (using GPU 1) else -1 for CPU
device_id = 1 if torch.cuda.is_available() else -1 

here


# Get GKG and Test URL

In [2]:
gkg = gkg_operator() # create a gkg operator
gkg.set_date(['2024 Oct 20', '2024 Oct 21']) # set the date range for the search
gkg.get_gkg()
persons = ['Eiichiro Oda', 'Shueisha', 'One Piece']
OP = gkg.gkg_persons(persons)
OP.reset_index(inplace=True)
url = OP['documentidentifier'].iloc[0]
url

# I tried to suppress this warning which occurs when querying from the lambda2 server, didn't work, so I dropped the warning code.

'https://gamerant.com/one-piece-sun-god-loki-nika-luffy/'

# Get URL Article Text

In [3]:
headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
    
session = requests.Session()
retry = Retry(connect=2, backoff_factor=0.5)
adapter = HTTPAdapter(max_retries=retry)
session.mount('http://', adapter)
session.mount('https://', adapter)

try:
    response = session.get(url, headers=headers)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')
except requests.exceptions.RequestException as e:
        print(f"Error: {e}")

## Article Title

In [4]:
soup.title.string

'One Piece: Sun God Loki Vs Sun God Nika, Explained'

## Article Paragraphs

In [5]:
paragraphs = soup.find_all('p')
store_text = []
for p in paragraphs:
    # paste the strings into a single string
    store_text.append(p.get_text())
#     print(p.get_text())

store_text = ' '.join(store_text)
store_text
# remove '\xa0 \n\t'
store_text = re.sub(r'\xa0|\n|\t|\\', '', store_text)
print(store_text)

Your changes have been saved  Email is sent Email has already been sent Please verify your email address. You’ve reached your account maximum for followed topics. This article contains spoilers from One Piece's Elbaf Arc. One Piece chapter 1130 proved to be absolutely stunning from start to finish and it is safe to say that this chapter shocked the fans to quite an extent with the reveal of Loki so early into the arc. Truly, few were expecting Loki to be revealed in such fashion in One Piece chapter 1130, and at the same time, even fewer individuals were expecting him to be as important as he has proven to be. Loki is perhaps the most powerful giant. His strength might be on par with a Yonko in One Piece. In this chapter, fans even got what can be considered to be a buildup of sorts towards a fight between Luffy, who is the Sun God Nika, and Loki, who also believes himself to be the Sun God. It looks like there is going to be a clash of Sun Gods after One Piece chapter 1130 and it is c

In [6]:
len(store_text)

8041

# Get Article Summary

In [7]:
set_seed(42)
# Initialize a summarization pipeline using BART
# pipe_summary = pipeline('summarization', model='facebook/bart-large-cnn', device=1, truncation=True)

pipe_summary = pipeline('summarization', model='t5-large', device=1, truncation=True)

gpt_query = store_text[200:3500] + "\nTL;DR:\n"
gpt_query_cleaned = re.sub(r'[^\x00-\x7F]+', ' ', gpt_query)  # Removes non-ASCII characters
# Summarize the text
output_summary = pipe_summary(gpt_query_cleaned, max_length=500, min_length=300, truncation=True)
# output_summary = pipe_summary(store_text, max_length=500, truncation=True)

# Print the summary
print(output_summary[0]['summary_text'])


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Loki is perhaps the most powerful giant in ne Piece, and his strength might be on par with a Yonko . Loki believes himself to be the Sun God, and the one who will destroy the world . this leads one to believe that in the culture of the Elbaf Giants, there might be a prophecy that the sun god will eventually come back and calm some sort of disaster . if Loki does break free from his chains, it will certainly be one to watch .- n a n-  nn  aa a- -n-nnnen - ..  -en en-a en aenaenena .en nenen  enaaon enenenononon ifenaon-on aona-on-ena-aaen o en.aaa.aonon, at this will be to the to his power, to ---on, -ona a -- -on- to- ... -a- a, ...a-, . I-s-onon-  .s a... a. ., s- s , 


## Try Chunking the Article

In [8]:
max_chunk_length = 1000  # Adjust based on model capacity and token limits
chunks = [store_text[i:i + max_chunk_length] for i in range(0, len(store_text), max_chunk_length)]
summaries = [pipe_summary(chunk, max_length=150, min_length=50, truncation=True)[0]['summary_text'] for chunk in chunks]

# Combine summaries
full_summary = " ".join(summaries)
print(full_summary)


Your max_length is set to 150, but your input_length is only 10. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=5)


this article contains spoilers from one piece's elbaf arc . few were expecting Loki to be revealed in such fashion in one piece chapter 1130 . fans even got what can be considered a buildup of sorts towards a fight between Luffy and the Sun God Nika . ertainly going to be one worth watching, provided Loki does break free from his chains . this is a meeting that fans have been anticipating since the Elbaf arc began . it's also going to show how strong Loki truly is . Loki looks like a mixture of Kaido, doflamingo, and Katakuri . there is more to Loki than meets the eye, and fans will slowly find out more about him with time . the character design itself personifies the strength of the highest tier . pter 1130 related to Loki was the fact that he called himself the sun god . this leads one to believe that in the culture of the Elbaf Giants, there might be a prophecy that the Sun God will eventually come back and calm some sort of disaster . naturally, fans know that neither of the Giants

## Chunking by Paragraph Structure
Chunking the article seems to help get a better summary due to smaller token packets.

Perhaps chunking the article paragraphs into their blocked header sections would be useful.

In [9]:
# Parse the document by headers and associated paragraphs
header_tags = ['h1', 'h2', 'h3']  # Modify based on structure (e.g., h2 for main sections)
sections = {}

current_header = None
for element in soup.find_all(['h1', 'h2', 'h3', 'p']):  # Include headers and paragraphs
    if element.name in header_tags:
        # Set the current header as key in sections dict
        current_header = element.get_text(strip=True)
        sections[current_header] = []  # Initialize list for paragraphs
    elif element.name == 'p' and current_header:
        # Append paragraph text to the current header section
        sections[current_header].append(element.get_text(strip=True))

# Clean up and format each section
for header, paragraphs in sections.items():
    # Join paragraphs and clean unwanted characters
    joined_text = ' '.join(paragraphs)
    sections[header] = re.sub(r'\xa0|\n|\t|\\', '', joined_text)

# Example output for one header
for header, content in sections.items():
    print(f"Section: {header}")
    print(content)  # Print first 500 chars for each section


Section: Game Rant

Section: One Piece: Sun God Loki Vs Sun God Luffy, Explained
Your changes have been saved  Email is sent Email has already been sent Please verify your email address. You’ve reached your account maximum for followed topics.
Section: Related
This article contains spoilers from One Piece's Elbaf Arc.
Section: Key Takeaways
One Piecechapter 1130 proved to be absolutely stunning from start to finish and it is safe to say that this chapter shocked the fans to quite an extent withthe reveal of Loki so early into the arc. Truly, few were expecting Loki to be revealed in such fashion inOne Piecechapter 1130, and at the same time, even fewer individuals were expecting him to be as important as he has proven to be. Loki is perhaps the most powerful giant. His strength might be on par with a Yonko in One Piece. In this chapter, fans even got what can be considered to be a buildup of sorts towards a fight between Luffy, who is the Sun God Nika, and Loki, who also believes himse

In [10]:
header_tags = ['h1', 'h2', 'h3']  # Headers to define sections
sections = {}

# Initialize summarization pipeline
set_seed(42)
pipe_summary = pipeline('summarization', model='t5-large', device=1)

current_header = None
for element in soup.find_all(['h1', 'h2', 'h3', 'p']):
    if element.name in header_tags:
        current_header = element.get_text(strip=True)
        sections[current_header] = []  # Create list for paragraphs
    elif element.name == 'p' and current_header:
        # Append paragraph text to the current header section
        sections[current_header].append(element.get_text(strip=True))

# Summarize content for each header section
summarized_sections = {}
for header, paragraphs in sections.items():
    if paragraphs:  # Skip headers with no content
        joined_text = ' '.join(paragraphs)
        clean_text = re.sub(r'\xa0|\n|\t|\\', '', joined_text)
        
        # Summarize the clean text for this header
        summary = pipe_summary(clean_text, max_length=150, min_length=50, truncation=True)
        
        # Store summary with header
        summarized_sections[header] = summary[0]['summary_text']

# Display summarized content by section
for header, summary in summarized_sections.items():
    print(f"Section: {header}")
    print(summary)


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Your max_length is set to 150, but your input_length is only 33. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=16)
Your max_length is set to 150, but your input_length is only 20. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=10)


Section: One Piece: Sun God Loki Vs Sun God Luffy, Explained
your changes have been saved Email is sent Email has already been sent Please verify your email address . you’ve reached your account maximum for followed topics . if you haven’t already, please create a new topic .
Section: Related
this article contains spoilers from one piece's elbaf arc . a new episode of one piece will be released on july 14th . the first episode of the series will be aired on june 14th, 2014 .
Section: Key Takeaways
Loki is perhaps the most powerful giant in one piece . his strength might be on par with a Yonko . fans even got what can be considered a buildup of sorts towards a fight between Luffy and Loki .
Section: Loki And Luffy's First Meeting In One Piece
Loki and Luffy meet for the very first time in one piece chapter 1130 . this is a meeting that fans have been anticipating since the Elbaf arc began . Loki looks quite like a mixture of Kaido, doflamingo, and Katakuri .
Section: Loki and Luffy Are 

Some paragraphs from sections are less than the maxlength output, so we need to make the max_length a function of the current paragraph, instead of fixed at `max_length=150`. The `min_length` has to be less than the input string or it is not a summary.