In [125]:
import os
import re
import pandas as pd
import numpy as np
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
import json
from datetime import date, datetime, timedelta
from bs4 import BeautifulSoup
import gdelt # for gdelt searchs
# from gdelt.gkg_tools import * # for gkg searchs
%run "../gkg_tools.py" # using magic command run to access the script from the parent directory


import nltk
from nltk.tokenize import word_tokenize, sent_tokenize


from transformers import pipeline, set_seed
import torch

# Device index for GPU timing: prefer GPU 1 (the original choice), fall back to
# GPU 0 when only one CUDA device is present, and use -1 (CPU) when CUDA is
# unavailable. Hard-coding index 1 is an invalid device on single-GPU machines.
# NOTE(review): presumably passed as `device=` to a transformers pipeline -- confirm.
if torch.cuda.is_available():
    device_id = 1 if torch.cuda.device_count() > 1 else 0
else:
    device_id = -1

Running gkg_tools.py


# Get Article from GDELT

In [204]:
# Query GDELT GKG for records that mention any of the listed names.
gkg = gkg_operator()  # create a gkg operator (defined in ../gkg_tools.py)
gkg.set_date(['2024 Oct 20', '2024 Oct 21'])  # set the date range for the search
gkg.get_gkg()
persons = ['Eiichiro Oda', 'Shueisha', 'One Piece']
# Chain reset_index() instead of mutating in place, so the object returned by
# gkg_persons is never modified behind the operator's back.
# NOTE(review): assumes gkg_persons returns a pandas DataFrame -- confirm.
OP = gkg.gkg_persons(persons).reset_index()



In [287]:
# Take the 'documentidentifier' value of the first matching record; the cell output shows it is the article URL.
url = OP['documentidentifier'].iloc[0]
url

# NOTE: an attempt to suppress the warning that occurs when querying from the lambda2 server did not work, so the suppression code was removed.

'https://gamerant.com/one-piece-sun-god-loki-nika-luffy/'

In [288]:

# Browser-like User-Agent sent with the article request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
}

# Session whose adapter retries failed connection attempts (2 retries with
# exponential backoff) for both http and https URLs.
session = requests.Session()
retry = Retry(connect=2, backoff_factor=0.5)
adapter = HTTPAdapter(max_retries=retry)
session.mount('http://', adapter)
session.mount('https://', adapter)

# Reset soup first so a failed request can never leave a stale page from a
# previous run in scope; downstream cells then fail loudly on None instead of
# silently analysing the wrong article.
soup = None
try:
    # requests has no default timeout; without one a stalled server hangs the kernel.
    response = session.get(url, headers=headers, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')
except requests.exceptions.RequestException as e:
    print(f"Error: {e}")

## Headers

In [289]:
# Collect the text of every h1-h4 heading, grouped by level (all h1 first,
# then h2, and so on). The tag list uses its own name so the request
# `headers` dict defined for the HTTP call is not overwritten.
header_text = []
for level in range(1, 5):
    heading_tags = soup.find_all(f'h{level}')
    header_text.extend(tag.get_text() for tag in heading_tags)

header_text

['Game Rant',
 'One Piece: Sun God Loki Vs Sun God Luffy, Explained',
 " Loki And Luffy's First Meeting In One Piece ",
 ' Why Loki Will Test His Strength Against Luffy ',
 ' Will Loki and Luffy Be Friends Or Enemies? ',
 'Will Loki Be Yonko Level?',
 'Related',
 'Key Takeaways',
 ' Loki and Luffy Are Set To Clash in One Piece ',
 ' Loki and Luffy Will Start As Enemies ',
 'Trending Now']

## Paragraphs

In [290]:
# Collect every <p> element of the parsed page.
paragraphs = soup.find_all('p')
# Peek at a single paragraph. The index is hard-coded for this article and
# raises IndexError on a page with fewer than eight <p> elements.
paragraphs[7]

<p><em><strong>One Piece</strong></em> chapter 1130 proved to be absolutely stunning from start to finish and it is safe to say that this chapter shocked the fans to quite an extent with <a href="https://gamerant.com/one-piece-loki-potential-legendary-devil-fruit-explained/" target="_blank">the reveal of Loki so early into the arc</a>. Truly, few were expecting Loki to be revealed in such fashion in <a href="https://gamerant.com/tag/one-piece/" target="_blank"><em>One Piece</em></a> chapter 1130, and at the same time, even fewer individuals were expecting him to be as important as he has proven to be.</p>

In [291]:
# Plain text of every <p> element, in page order.
para_text = [tag.get_text() for tag in paragraphs]

# One diagnostic line per paragraph: position, len() of the tag (its number of
# direct child nodes) and the length of its text in characters.
for position, (tag, text) in enumerate(zip(paragraphs, para_text)):
    print(position, len(tag), len(text))

print(f'Number of Paragraphs:{len(para_text)}')
para_text[4:14]

0 1 28
1 0 0
2 1 13
3 1 27
4 1 33
5 1 56
6 1 58
7 6 395
8 1 96
9 3 372
10 3 699
11 1 57
12 3 672
13 1 423
14 3 534
15 1 66
16 1 661
17 1 128
18 3 504
19 3 561
20 1 443
21 5 375
22 3 249
23 1 246
24 0 0
25 1 28
26 0 0
27 1 13
28 1 27
29 1 33
30 1 56
31 1 101
32 1 31
33 1 35
34 1 35
35 0 0
36 1 81
37 1 131
38 1 88
39 1 128
40 1 147
41 1 111
42 1 133
Number of Paragraphs:43


['Please verify your email address.',
 'You’ve reached your account maximum for followed topics.',
 "This article contains spoilers from One Piece's Elbaf Arc.",
 'One Piece chapter 1130 proved to be absolutely stunning from start to finish and it is safe to say that this chapter shocked the fans to quite an extent with the reveal of Loki so early into the arc. Truly, few were expecting Loki to be revealed in such fashion in One Piece chapter 1130, and at the same time, even fewer individuals were expecting him to be as important as he has proven to be.',
 'Loki is perhaps the most powerful giant. His strength might be on par with a Yonko in One Piece.',
 'In this chapter, fans even got what can be considered to be a buildup of sorts towards a fight between Luffy, who is the Sun God Nika, and Loki, who also believes himself to be the Sun God. It looks like there is going to be a clash of Sun Gods after One Piece chapter 1130 and it is certainly going to be one worth watching, provided 

# NLP Vectorizers

In [292]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

## CountVectorizer

In [293]:
# Learn the vocabulary from the paragraph texts, treating each paragraph as one document.
cv = CountVectorizer()
# cv = CountVectorizer(stop_words='english')
# cv = CountVectorizer(stop_words='english', min_df=3) # ignore terms that appear in fewer than 3 documents
# cv = CountVectorizer(stop_words='english',max_df=0.5) # ignore terms that appear in more than half of the documents
cv.fit(para_text)
cv.get_feature_names_out()[:]


array(['1130', '20', '2024', 'abilities', 'about', 'absolute',
       'absolutely', 'account', 'actually', 'address', 'after', 'against',
       'ahead', 'all', 'along', 'already', 'also', 'an', 'and',
       'announces', 'another', 'anticipating', 'app', 'arc', 'are',
       'arise', 'armored', 'around', 'article', 'as', 'at', 'available',
       'back', 'bandai', 'battle', 'battlegrounds', 'be', 'because',
       'been', 'began', 'being', 'believe', 'believes', 'below', 'best',
       'better', 'between', 'big', 'bit', 'body', 'both', 'bothering',
       'bounties', 'brash', 'break', 'buildup', 'but', 'by', 'called',
       'calm', 'calming', 'can', 'cannot', 'captain', 'care', 'certainly',
       'chains', 'challenge', 'challenging', 'change', 'changes',
       'chapter', 'character', 'characters', 'claim', 'claiming', 'clash',
       'clear', 'clearly', 'cliffhanger', 'combat', 'come', 'comes',
       'comment', 'confident', 'considered', 'consoles', 'contains',
       'core', 'cou

In [294]:
# Document-term matrix: one row per paragraph, one column per vocabulary term.
counts = cv.transform(para_text)
# Print the shape as its own statement; putting print() inside a tuple
# expression made the cell display a stray `None` next to the matrix.
print(f'Counts Shape: {counts.shape}')
counts

Counts Shape: (43, 492)


(<43x492 sparse matrix of type '<class 'numpy.int64'>'
 	with 1052 stored elements in Compressed Sparse Row format>,
 None)

In [295]:
# Show only the stored (non-zero) entries that fall in the first five vocabulary columns.
print(counts[:,:5]) # sparse matrix (row,col) coordinates with count value

  (7, 0)	2
  (9, 0)	1
  (10, 0)	1
  (13, 3)	1
  (13, 4)	1
  (14, 0)	1
  (18, 4)	2
  (23, 0)	1
  (23, 1)	1
  (23, 2)	1


In [296]:
# Dense (ndarray) view of the first 10 paragraphs x first 22 terms. Slice the
# sparse matrix first so only that small window is densified, not the whole matrix.
counts[:10, :22].toarray()

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [2, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 2, 0, 0, 0]])

In [297]:
# Dense term-count table: one row per paragraph, one column per vocabulary term.
term_names = cv.get_feature_names_out()
dense_counts = counts.toarray()
cv_df = pd.DataFrame(dense_counts, columns=term_names)
cv_df.iloc[4:10]

Unnamed: 0,1130,20,2024,abilities,about,absolute,absolutely,account,actually,address,...,world,worshiped,worth,worthy,would,years,yonko,you,your,zoro
4,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
5,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,1,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,2,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
9,1,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


In [298]:
# Frequency of each term over the whole article: sum every column of the
# paragraph-by-term table, then show the ten most frequent terms.
total_counts = cv_df.sum(axis=0)
total_counts_df = pd.DataFrame({
    'Term': total_counts.index,
    'Total Count': total_counts.values,
})
ranked_terms = total_counts_df.sort_values(by='Total Count', ascending=False)
ranked_terms.reset_index(drop=True).head(10)



Unnamed: 0,Term,Total Count
0,the,92
1,to,50
2,and,43
3,of,35
4,is,34
5,loki,32
6,that,26
7,be,26
8,one,23
9,this,23


In [299]:
# Tokens counted per paragraph: sum each row of the paragraph-by-term table,
# then show the ten longest paragraphs.
para_word_count = cv_df.sum(axis=1)
para_wc_df = pd.DataFrame({
    'Paragraph': para_word_count.index,
    'Word Count': para_word_count.values,
})
longest_first = para_wc_df.sort_values(by='Word Count', ascending=False)
longest_first.head(10)

Unnamed: 0,Paragraph,Word Count
10,10,127
16,16,124
12,12,120
14,14,99
19,19,94
18,18,91
20,20,83
13,13,79
7,7,76
9,9,71


In [300]:
# Summary statistics (count, mean, std, quartiles, min/max) of the per-paragraph word counts.
para_wc_df['Word Count'].describe()

count     43.000000
mean      33.162791
std       38.842742
min        0.000000
25%        5.000000
50%       14.000000
75%       58.000000
max      127.000000
Name: Word Count, dtype: float64

In [301]:
# Article length = sum of the per-paragraph token counts.
# NOTE: the counts come from CountVectorizer tokens over every <p> on the page;
# its default token pattern keeps only tokens of two or more characters, so
# this is an approximation of the true word count.
word_counts = para_wc_df['Word Count']
tot_wc = word_counts.sum()
print(f'Total Number of Words in Article:\n{tot_wc}')

Total Number of Words in Article:
1426


In [303]:
# Reading-time estimate from the word count, assuming 200 to 250 words per minute.
# ublb holds the minutes-per-word conversion factors: 1/200 gives the upper
# (slower reader) bound and 1/250 the lower (faster reader) bound.
ublb = np.array([1/200, 1/250])
art = (tot_wc * ublb)[::-1]  # reversed so the order is [shortest, longest]
shortest, longest = art
print(art.mean())  # midpoint of the two bounds, in minutes
print(f'Estimated reading time from {round(shortest,1)} to {round(longest,1)} minutes')

6.417
Estimated reading time from 5.7 to 7.1 minutes


# Estimating Read Times
Take a random sample of documents of varying length and complexity, and time how long each user takes to read each document. Then model read time as a function of length and complexity.

Test with randomly repeated documents: how does a second reading within a certain time period affect read time?