In [2]:
import os
import re
import pandas as pd
import numpy as np
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
import json
from datetime import date, datetime, timedelta
from bs4 import BeautifulSoup
import gdelt # for gdelt searchs
from gkg_tools import * # for gkg searchs
# %run "../gkg_tools.py" # using magic command run to access the script from the parent directory


import nltk
from nltk.tokenize import word_tokenize, sent_tokenize


from transformers import pipeline, set_seed
import torch

# GPU Timing (using GPU 1) else -1 for CPU
device_id = 1 if torch.cuda.is_available() else -1 

here


# Get Article from GDELT

In [3]:
gkg = gkg_operator() # create a gkg operator
gkg.set_date(['2024 Oct 20', '2024 Oct 21']) # set the date range for the search
gkg.get_gkg()
persons = ['Eiichiro Oda', 'Shueisha', 'One Piece']
OP = gkg.gkg_persons(persons)
OP.reset_index(inplace=True)

In [196]:
url = OP['documentidentifier'].iloc[0] # screen rant using index 1 and 8
url

# I tried to suppress this warning which occurs when querying from the lambda2 server, didn't work, so I dropped the warning code.

'https://gamerant.com/one-piece-sun-god-loki-nika-luffy/'

In [189]:

headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
    
session = requests.Session()
retry = Retry(connect=2, backoff_factor=0.5)
adapter = HTTPAdapter(max_retries=retry)
session.mount('http://', adapter)
session.mount('https://', adapter)

try:
    response = session.get(url, headers=headers)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')
except requests.exceptions.RequestException as e:
        print(f"Error: {e}")

## Headers

In [190]:
headers = soup.find_all('h1')
header_text = []
for i in range(1,5):
    headers = soup.find_all(f'h{i}')
    for h in headers:
        header_text.append(h.get_text())
    
header_text

['Game Rant',
 'One Piece Chapter 1130: Oda Finally Introduces Loki',
 ' The Straw Hats Continue To Sail It To Elbaf ',
 ' The Lore About The Land Of The Giants ',
 ' Luffy Meets Loki in One Piece 1130 ',
 'Will Loki Have A Zoan Devil Fruit?',
 'Related',
 'Key Takeaways',
 ' The Giants Of Elbaf Had A King Named Harald ',
 ' Luffy and The Accursed Prince Meet ',
 'Trending Now']

## Paragraphs

In [191]:
paragraphs = soup.find_all('p')
paragraphs[7]

<p><em><strong>One Piece</strong></em> chapter 1130 is out officially, and fans are thoroughly blown away by what can be described to be one of the best chapters in the entire series. This week's chapter was absolutely stunning from start to finish, <a href="https://gamerant.com/one-piece-best-post-timeskip-feat-every-straw-hat/" target="_blank">as the Straw Hat Pirates finally explored the real Elbaf</a>.</p>

In [192]:
para_text = []
# for idx, p in enumerate(paragraphs[4:14]):
for idx, p in enumerate(paragraphs):
    print(idx, len(p), len(p.get_text())) # use enumerate to track line number visually
    para_text.append(p.get_text())

print(f'Number of Paragraphs:{len(para_text)}')
para_text[4:26]

0 1 28
1 0 0
2 1 13
3 1 27
4 1 33
5 1 58
6 1 58
7 4 278
8 1 119
9 1 348
10 2 605
11 1 50
12 3 515
13 1 510
14 1 738
15 3 662
16 1 84
17 1 516
18 1 116
19 2 482
20 5 542
21 3 495
22 3 715
23 1 247
24 0 0
25 1 28
26 0 0
27 1 13
28 1 27
29 1 33
30 1 58
31 1 101
32 1 31
33 0 0
34 1 47
35 1 108
36 1 116
37 1 145
38 1 145
39 1 146
40 1 134
41 1 138
Number of Paragraphs:42


['Please verify your email address.',
 'Youâ€™ve reached your account maximum for followed topics.',
 "This article contains spoilers from One Piece's Elbaf Arc.",
 "One Piece chapter 1130 is out officially, and fans are thoroughly blown away by what can be described to be one of the best chapters in the entire series. This week's chapter was absolutely stunning from start to finish, as the Straw Hat Pirates finally explored the real Elbaf.",
 "One Piece fans should be excited about Oda's two major projects during the manga break. Here's what to look forward to.",
 "Quite a lot of interesting things, such as the lore about Elbaf and its former King, were revealed in this chapter, and at the same time, the character that fans were anticipating to see the most in this arc, Loki, made his appearance as well. Finally, things are underway in this arc of majestic proportions, and it can't get any better than this.",
 'One Piece chapter 1130 saw the Straw Hat Pirates finally break out of Rodo

In [193]:
# Initialize list to hold filtered text
para_text = []

# Define start and end markers for gamerant.com
start_marker = "You’ve reached your account maximum"
end_marker = "One Piece is available to read via Viz Media."

# Define start and end markers for screenrant.com
# start_marker = "You’ve reached your account maximum"
# end_marker = "Created by Eiichiro Oda"

# Flags to track whether we're in the section we want
in_section = False

# Iterate through each paragraph
for idx, p in enumerate(paragraphs):
    text = p.get_text(strip=True)  # Extract and strip whitespace around text

    # Start capturing paragraphs after the start_marker
    if start_marker in text:
        in_section = True
        continue  # Skip the marker paragraph itself

    # Stop capturing when we hit the end_marker
    if end_marker in text:
        break

    # Append paragraph if within the desired section
    if in_section and len(text) > 0:  # Example length filter
        para_text.append(text)

# Show results
print(f'Number of Paragraphs: {len(para_text)}')
for idx, text in enumerate(para_text):
    print(f"Paragraph {idx + 1}: {text}")


Number of Paragraphs: 0


# NLP Vectorizers

In [194]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

## CountVectorizer

In [195]:
cv = CountVectorizer()
# cv = CountVectorizer(stop_words='english')
# cv = CountVectorizer(stop_words='english', min_df=3) # ignore words that appear in only 'n' documents.
# cv = CountVectorizer(stop_words='english',max_df=0.5) # ignore words that don't appear in at least half of the documents
cv.fit(para_text)
cv.get_feature_names_out()[:]


ValueError: empty vocabulary; perhaps the documents only contain stop words

In [None]:
counts = cv.transform(para_text)
counts, print(f'Counts Shape: {counts.shape}')

Counts Shape: (25, 488)


(<25x488 sparse matrix of type '<class 'numpy.int64'>'
 	with 781 stored elements in Compressed Sparse Row format>,
 None)

### Counts Sparse Matrix

In [None]:
print(counts[:,:5]) # sparse matrix (row,col) coordinates with count value

  (0, 0)	1
  (0, 1)	1
  (2, 0)	1
  (6, 3)	1
  (7, 2)	1
  (17, 4)	1


In [None]:
counts.toarray()[:10,:22] # sparse matrix format

array([[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 1, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],
      dtype=int64)

### Counts DateFrame

In [None]:
cv_df = pd.DataFrame(counts.toarray(),columns=cv.get_feature_names_out())
cv_df.iloc[4:10,:]

Unnamed: 0,1130,1130one,2023,abilities,accidently,according,accursed,act,action,actually,...,willing,with,within,won,world,worst,years,yet,you,your
4,0,0,0,0,0,1,1,2,0,0,...,0,1,1,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
6,0,0,0,1,0,0,0,0,0,0,...,0,1,0,0,1,0,0,0,0,0
7,0,0,1,0,0,0,0,0,1,0,...,0,4,0,0,1,0,1,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Aggregate the counts across all documents for each term
total_counts = cv_df.sum(axis=0)
total_counts_df = total_counts.reset_index()
total_counts_df.columns = ['Term', 'Total Count']
total_counts_df.sort_values(by='Total Count', ascending=False).reset_index(drop=True).head(10)



Unnamed: 0,Term,Total Count
0,the,53
1,to,25
2,of,24
3,and,22
4,loki,20
5,is,19
6,his,16
7,in,15
8,as,15
9,luffy,14


In [None]:
para_word_count = cv_df.sum(axis=1)
para_wc_df = para_word_count.reset_index()
para_wc_df.columns = ['Paragraph', 'Word Count']
para_wc_df.sort_values(by='Word Count', ascending=False).head(10)

Unnamed: 0,Paragraph,Word Count
2,2,115
4,4,105
1,1,101
7,7,97
0,0,78
5,5,60
6,6,60
16,16,41
17,17,38
15,15,37


In [None]:
para_wc_df['Word Count'].describe()

count     25.000000
mean      38.600000
std       34.795833
min        3.000000
25%       14.000000
50%       25.000000
75%       60.000000
max      115.000000
Name: Word Count, dtype: float64

### Find Total Word Count

In [None]:
tot_wc = para_wc_df['Word Count'].sum(axis=0)
print(f'Total Number of Words in Article:\n{tot_wc}')

Total Number of Words in Article:
965


### Estimating Read Time

In [None]:
# words per minute read time estimator:
#upper/lower bound vector
ublb = np.array([1/200, 1/250]) # conversation factor between 200 to 250 words per minute
art = tot_wc * ublb
art = art[::-1]
print(art.mean())
print(f'Estimated reading time from {round(art[0],1)} to {round(art[1],1)} minutes')

4.3425
Estimated reading time from 3.9 to 4.8 minutes


## TfidfVectorizer
TF: Term Frequency

IDF: Inverse Term Frequency


In [None]:
tv = TfidfVectorizer(stop_words='english')
f = tv.fit_transform(para_text)
tva = pd.DataFrame(f.toarray(),columns=tv.get_feature_names_out())
tva.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25 entries, 0 to 24
Columns: 375 entries, 1130 to years
dtypes: float64(375)
memory usage: 73.4 KB


In [None]:
tva.sum(axis=0).sort_values(ascending=False).head(10)

loki       2.004649
email      1.840183
luffy      1.528530
sent       1.479987
saved      1.326531
chapter    0.821103
series     0.819336
comment    0.748384
changes    0.748384
land       0.694053
dtype: float64

In [None]:
tv.idf_.shape, tv.idf_.max(), tv.idf_.min()

((375,), 3.5649493574615367, 2.0608719606852626)

# Estimating Read Times Experiment 
Take a random sample of length/complexity documents and time the users read time for each document. Model read time against length/complexity.

Test for random repeat documents. How does a second reading within a certain time period effect read time?