In [1]:
import pandas as pd
from collections import Counter
import numpy as np
import random
import json

## 1. Load the children's book data

In [2]:
# Read the raw book table from a CSV in the working directory, then show its
# dimensions and the first five rows as a quick sanity check.
data = pd.read_csv('children_books.csv')
print(data.shape)
display(data.head(5))

(3269, 5)


Unnamed: 0,Title,Author,Desc,Inerest_age,Reading_age
0,The Girl Who Drank the Moon,Kelly Barnhill,"Every year, the evil Protectorate offers a bab...",10-14,10-14
1,Time Between Us,Tamara Ireland Stone,Sixteen-year-old Anna is struggling to underst...,13+,12+
2,Girl Out of Water,Nat Luurtsema,Lou Brown's swimming ambitions sank without tr...,13-18,10+
3,Captive,A J Grainger,Robyn is scared. Ever since the attempted assa...,13+,13
4,The School of Music,Rachel Bowen and Meurig Bowen Illustrator: Dan...,Welcome to the School of Music. In charge is M...,10+,10+


In [22]:
# Peek at five random rows.
# NOTE(review): the saved output of this cell already contains the 'tok' and
# 'tok_top' columns, which are only created further down, so it was executed
# out of order; a fresh top-to-bottom run shows only the raw columns here.
# sample() is unseeded, so the rows shown differ between runs.
data.sample(5)

Unnamed: 0,title,author,desc,inerest_age,reading_age,tok,tok_top
8,I Capture the Castle,Dodie Smith,"Originally published in 1949, this delightful ...",12+,12+,"[originally published in 1949, this delightful...","[(0, [originally published in 1949, this delig..."
6,Dark Inside,Jeyn Roberts,After a huge earthquake strikes every continen...,14+,10+,[after a huge earthquake strikes every contine...,"[(0, [after a huge earthquake strikes every co..."
21,Holding Up the Universe,Jennifer Niven,Jennifer Niven's deeply compelling and highly ...,14+,12+,"[and just as powerfully written, the book is n...","[(0, [and just as powerfully written]), (3, [t..."
26,Five Hundred Miles,Kevin Brooks,"In the industrial wastelands of east London, t...",13+,9+,"[in the industrial wastelands of east london, ...","[(0, [in the industrial wastelands of east lon..."
16,The Song from Somewhere Else,AF Harrold\nIllustrator: Levi Pinfold,"Frank is being bullied - for what, she's not s...",8+,8+,"[frank is being bullied, for what, she's not s...","[(0, [frank is being bullied]), (6, [and an ex..."


In [3]:
# Lower-case every column label so later cells can refer to 'title',
# 'author' and 'desc', then show the first description in full.
data.columns = data.columns.str.lower()
data.loc[0, 'desc']

'Every year, the evil Protectorate offers a baby to the evil witch in the forest to ensure their village’s safety. Yet, unknown to them, the witch is Xan, the plump and knowledgeable protector of the babies that she takes to the neighbouring village to be raised as special children, fed on starlight and destined to do wonderful things.\nYet, one day, Xan feeds one baby the moon by mistake and fills her with a huge amount of magic\xa0– a dangerous amount, as it turns out\xa0– so Xan decides to raise Luna as her own. But nothing is straightforward, and when Luna turns 13, everything will change.\nKelly Barnhill won the Newbery Medal in the US for this thoughtful and magical book about belonging, fear and family, and it’s easy to see why. The writing flows beautifully and Barnhill’s keen observation of human nature fits naturally into the story, which moves along at a satisfying pace.\nXan is a delightfully pleasant and caring crone witch who dispels the traditional image of cackling evil

In [4]:
# Flag every row whose title already appeared earlier in the frame.
repeated_title = data['title'].duplicated(keep='first')

# How many rows are repeats ...
print(repeated_title.sum())
# ... and confirmation that no repeats remain once those rows are masked out.
print(data[~repeated_title]['title'].duplicated().sum())

3228
0


In [5]:
# Keep only the first row for each title and renumber the index from 0.
data = data.drop_duplicates(subset='title', keep='first').reset_index(drop=True)

In [6]:
# Re-check the dimensions and the first rows after de-duplication.
print(data.shape)
display(data.head())

(41, 5)


Unnamed: 0,title,author,desc,inerest_age,reading_age
0,The Girl Who Drank the Moon,Kelly Barnhill,"Every year, the evil Protectorate offers a bab...",10-14,10-14
1,Time Between Us,Tamara Ireland Stone,Sixteen-year-old Anna is struggling to underst...,13+,12+
2,Girl Out of Water,Nat Luurtsema,Lou Brown's swimming ambitions sank without tr...,13-18,10+
3,Captive,A J Grainger,Robyn is scared. Ever since the attempted assa...,13+,13
4,The School of Music,Rachel Bowen and Meurig Bowen Illustrator: Dan...,Welcome to the School of Music. In charge is M...,10+,10+


## 2. Tokenize the text data

In [7]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords

# Fetch the tokenizer model and the stop-word corpus *before* using them:
# stopwords.words() raises LookupError when the corpus is not installed yet,
# so the downloads have to come first for this cell to run on a fresh machine.
nltk.download('punkt')
nltk.download('stopwords')

# NOTE: this rebinds `stopwords` from the NLTK corpus reader to a plain set of
# English stop words; get_word_freq() looks the set up under this name.
stopwords = set(stopwords.words("english"))

[nltk_data] Downloading package punkt to /Users/jay.je/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jay.je/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [9]:
def get_word_freq(x):
    """Add the non-stop-word token counts of ``x`` to ``word_frequencies``.

    ``x`` is an iterable of strings; the strings are joined with single
    spaces, lower-cased and tokenized with ``nltk.word_tokenize``. Every token
    that is not in the module-level ``stopwords`` set increments its entry in
    the module-level ``word_frequencies`` dict. Nothing is returned.
    """
    text = ' '.join(x).lower()
    for token in nltk.word_tokenize(text):
        if token in stopwords:
            continue
        # First sighting starts at 1, later sightings add 1.
        word_frequencies[token] = word_frequencies.get(token, 0) + 1

In [10]:
def split_sentences(x):
    """Split a book description into lower-cased sentence fragments.

    Parameters
    ----------
    x : mapping
        Row-like object supporting ``x['desc']``, ``x['author']`` and
        ``x['title']`` (e.g. a DataFrame row or a plain dict).

    Returns
    -------
    list of str
        The description split on ``.``, ``;``, ``:``, `` – `` and `` - ``
        (non-breaking spaces are turned into normal spaces first), with each
        fragment lower-cased and stripped. Empty fragments and fragments that
        contain the author or the title are removed. A description that is not
        a string (e.g. NaN) yields an empty list.
    """
    def _needle(value):
        # A missing value (None / NaN) must not become the literal 'nan',
        # which would match inside ordinary words such as "nanny".
        if value is None or (isinstance(value, float) and value != value):
            return ''
        return str(value).lower().strip()

    s = x['desc']
    if not isinstance(s, str):
        return []

    author = _needle(x['author'])
    title = _needle(x['title'])

    s = s.replace('\xa0', ' ')
    # Same delimiters, applied in the same order as before.
    for delim in ('.', ';', ':', ' – ', ' - '):
        s = s.replace(delim, '<DELIM>')

    # The trailing fragment is no longer dropped unconditionally: when the text
    # ends with a delimiter it is empty and removed by the filter below, and
    # when it does not, it is a real sentence that must be kept.
    sents = [frag.lower().strip() for frag in s.split('<DELIM>')]

    # An empty author/title is skipped, because '' is a substring of every
    # fragment and would otherwise filter everything out.
    return [
        frag for frag in sents
        if frag
        and not (author and author in frag)
        and not (title and title in frag)
    ]

In [11]:

def extract_key_sentences(ls_sent, n_top=5):
    """Pick up to ``n_top`` sentences: the first one plus the lowest-scoring ones.

    Each sentence is scored by the mean of the ``word_frequencies`` values of
    its tokens (tokens missing from that module-level dict are ignored). The
    first sentence is always kept; further sentences are added in their
    original order if their score is among the ``n_top`` lowest scores, until
    ``n_top`` sentences have been collected.

    Parameters
    ----------
    ls_sent : list of str
        Sentences of one description.
    n_top : int, optional
        Maximum number of sentences to return (expected to be >= 1).
        Defaults to 5, the previously hard-coded value.

    Returns
    -------
    list of (int, list of str) or None
        ``(index, nltk.sent_tokenize(sentence))`` pairs sorted by index, or
        ``None`` when there are fewer than ``n_top`` sentences.
    """
    if not ls_sent or len(ls_sent) < n_top:
        return None

    sentences_scores = {}
    for idx, sent in enumerate(ls_sent):
        weights = [
            word_frequencies[word]
            for word in nltk.word_tokenize(sent.lower())
            if word in word_frequencies
        ]
        # A sentence with no scored word (e.g. only stop words) is left
        # unscored; previously this case raised KeyError.
        if weights:
            sentences_scores[idx] = np.mean(weights)

    # Lowest mean weights first.
    lowest_scores = sorted(sentences_scores.values())[:n_top]

    # The opening sentence is always part of the selection.
    top_sentences = {0: nltk.sent_tokenize(ls_sent[0])}

    for idx, score in sentences_scores.items():
        if len(top_sentences) >= n_top:
            break
        if score in lowest_scores and idx not in top_sentences:
            top_sentences[idx] = nltk.sent_tokenize(ls_sent[idx])

    return sorted(top_sentences.items())


In [24]:
# Start from an empty counter so re-running this cell does not double count.
word_frequencies = {}

# Split every description into sentence fragments ...
data['tok'] = data.apply(split_sentences, axis=1)

# ... and accumulate raw token counts over all books.
for book_sentences in data['tok']:
    get_word_freq(book_sentences)

word_frequencies['every']

4

In [25]:
# Spot-check the sentence fragments produced for the book at index 10.
data.loc[10,'tok']

['when poppy sinclair finds the body of a young woman washed up on the shore of a lake, she is convinced it is a case of murder rather than the tragic accident or suicide which is the theory favoured by police',
 'as poppy struggles with memories of her own near-death experience and the pain of seeing the boy she loves with another girl, she finds it difficult to draw clear lines between reality and imagination',
 'especially as people and things are not always what they seem',
 'yet despite the risks, she is determined to prove that someone close by is a murderer',
 'this is an engaging, complex and  thoughtful thriller, featuring a plot thick with lies, secrets, deceit and jealousy']

In [26]:
# Rescale every count by the largest count, so the most frequent word gets 1.0.
max_freq = max(word_frequencies.values())
for word, count in word_frequencies.items():
    word_frequencies[word] = count / max_freq

In [27]:
# Select the key sentences of every book (None where a book has too few sentences).
data['tok_top'] = data['tok'].apply(extract_key_sentences)

In [28]:
# Plain Python list of the per-book selections, in row order.
data_clean = list(data['tok_top'])

In [21]:
# Show the selected sentences for three example books.
for book_idx in (5, 10, 30):
    display(data_clean[book_idx])

[(0, ['caz and will live in a settlement for survivors of the frozen world']),
 (1,
  ['caz hasn’t seen her father for nine years, when the blue star poisoned and froze the planet']),
 (3,
  ['caz and will soon find themselves out in the frozen landscape once again, and it’s even more dangerous than last time']),
 (4,
  ['this is a fantastic sequel for this dystopian series, keeping the fast pace of at the world’s end']),
 (5, ['it will certainly leave the reader asking for more'])]

[(0,
  ['when poppy sinclair finds the body of a young woman washed up on the shore of a lake, she is convinced it is a case of murder rather than the tragic accident or suicide which is the theory favoured by police']),
 (1,
  ['as poppy struggles with memories of her own near-death experience and the pain of seeing the boy she loves with another girl, she finds it difficult to draw clear lines between reality and imagination']),
 (2, ['especially as people and things are not always what they seem']),
 (3,
  ['yet despite the risks, she is determined to prove that someone close by is a murderer']),
 (4,
  ['this is an engaging, complex and  thoughtful thriller, featuring a plot thick with lies, secrets, deceit and jealousy'])]

[(0,
  ['farway gaius mccarthy was born in the future, in the past and out of time']),
 (3, ['what he doesn’t realise is that the glitch was planned']),
 (4, ['someone is playing him']),
 (5, ['someone who is a lot closer to him than he could ever have imagined']),
 (7,
  ['graudin weaves an exciting plot where time travel is just the beginning'])]

In [29]:
# Map book position -> {sequence number -> sentence text}.
# Books without a selection (falsy entry in data_clean) are left out, and only
# the first string of each (index, [strings]) pair is kept, lower-cased.
children_books_data = {
    book_idx: {
        seq: pair[1][0].lower()
        for seq, pair in enumerate(selection)
    }
    for book_idx, selection in enumerate(data_clean)
    if selection
}

In [30]:
# Display the nested {book index: {sequence: sentence}} dictionary.
children_books_data

{0: {0: 'every year, the evil protectorate offers a baby to the evil witch in the forest to ensure their village’s safety',
  1: 'so xan decides to raise luna as her own',
  2: 'the writing flows beautifully and barnhill’s keen observation of human nature fits naturally into the story, which moves along at a satisfying pace',
  3: 'xan is a delightfully pleasant and caring crone witch who dispels the traditional image of cackling evildoer, and replaces it with something far more loving and magical',
  4: 'perfect for young fans of neil gaiman, ursula le guin and diana wynne jones'},
 1: {0: 'sixteen-year-old anna is struggling to understand bennett, the new boy at school',
  1: 'one minute he seems interested in her and she feels they have a connection',
  2: 'when he saves her life, she finally discovers his secret and all the pieces of the puzzle fall into place',
  3: 'anna realises they should never have been able to meet, let alone fall in love, because although he is a year older

In [31]:
# Sanity check: dimensions of the frame restricted to first occurrences of each title.
data[~data['title'].duplicated(keep='first')].shape

(41, 7)

In [32]:
# Serialise first, then write; the `with` block closes the file on exit.
serialized = json.dumps(children_books_data, indent=4)
with open("children_books_clean.json", "w") as out_file:
    out_file.write(serialized)

### Count the number of tokens

In [33]:
# Total tokens: number of single-space-separated pieces over all kept sentences
total_count = sum(
    len(sentence.split(' '))
    for book in children_books_data.values()
    for sentence in book.values()
)

In [34]:
# Count of space-separated pieces summed over every kept sentence (computed in the previous cell).
total_count # this is the minimum number of tokens sent to chatGPT

3087

In [37]:
# Print the kept sentences of the book stored under key 40, one per line.
for sentence in children_books_data[40].values():
    print(sentence)

mare barrow knows she is different
we follow mare as she tries to escape from the king who wants to control her and her ability
the reader sees mare's inner struggle as she tries to understand if she is right to recruit the others with abilities similar to hers, release the prisoners of the silver king, and unite with the red guard
the characters now wage political and physical battles
if you enjoyed the hunger games and the maze runner, then the red queen series will not disappoint
