In [1]:
import pandas as pd
from collections import Counter
import numpy as np
import random
import json

## 1. Load the children's book data

In [2]:
# Read the raw book listing, then show its dimensions and its first five rows.
data = pd.read_csv('../datasets/childrenBook/children_books.csv')
print(data.shape)
display(data.head(5))

(3269, 5)


Unnamed: 0,Title,Author,Desc,Inerest_age,Reading_age
0,The Girl Who Drank the Moon,Kelly Barnhill,"Every year, the evil Protectorate offers a bab...",10-14,10-14
1,Time Between Us,Tamara Ireland Stone,Sixteen-year-old Anna is struggling to underst...,13+,12+
2,Girl Out of Water,Nat Luurtsema,Lou Brown's swimming ambitions sank without tr...,13-18,10+
3,Captive,A J Grainger,Robyn is scared. Ever since the attempted assa...,13+,13
4,The School of Music,Rachel Bowen and Meurig Bowen Illustrator: Dan...,Welcome to the School of Music. In charge is M...,10+,10+


In [3]:
# Peek at five random rows; the fixed seed makes the same rows appear on every re-run.
data.sample(5, random_state=42)

Unnamed: 0,Title,Author,Desc,Inerest_age,Reading_age
1859,Five Hundred Miles,Kevin Brooks,"In the industrial wastelands of east London, t...",13+,9+
1245,Buffalo Soldier,Tanya Landman,Plantation slave Charley Smith is eleven when ...,13+,11+
2635,Paper Avalanche,Lisa Williamson,Fourteen-year-old Ro Snow’s got a secret: her ...,12+,12+
480,The School of Music,Rachel Bowen and Meurig Bowen Illustrator: Dan...,Welcome to the School of Music. In charge is M...,10+,10+
3160,Time Between Us,Tamara Ireland Stone,Sixteen-year-old Anna is struggling to underst...,13+,12+


In [4]:
# Normalise every column label to lower case.
data.columns = list(map(str.lower, data.columns))
# Show the full description text of the first book.
data.loc[0, 'desc']

'Every year, the evil Protectorate offers a baby to the evil witch in the forest to ensure their village’s safety. Yet, unknown to them, the witch is Xan, the plump and knowledgeable protector of the babies that she takes to the neighbouring village to be raised as special children, fed on starlight and destined to do wonderful things.\nYet, one day, Xan feeds one baby the moon by mistake and fills her with a huge amount of magic\xa0– a dangerous amount, as it turns out\xa0– so Xan decides to raise Luna as her own. But nothing is straightforward, and when Luna turns 13, everything will change.\nKelly Barnhill won the Newbery Medal in the US for this thoughtful and magical book about belonging, fear and family, and it’s easy to see why. The writing flows beautifully and Barnhill’s keen observation of human nature fits naturally into the story, which moves along at a satisfying pace.\nXan is a delightfully pleasant and caring crone witch who dispels the traditional image of cackling evil

In [5]:
# Number of rows whose title repeats an earlier row.
print(data['title'].duplicated().sum())
# Sanity check: once only the first occurrence of each title is kept, no repeats remain.
print(data[~data['title'].duplicated(keep='first')]['title'].duplicated().sum())

3228
0


In [6]:
# Keep only the first row for each title and renumber the index from zero.
data = data.drop_duplicates(subset='title', keep='first').reset_index(drop=True)

In [7]:
# Confirm the size of the de-duplicated frame and show its first rows.
print(data.shape)
display(data.head())

(41, 5)


Unnamed: 0,title,author,desc,inerest_age,reading_age
0,The Girl Who Drank the Moon,Kelly Barnhill,"Every year, the evil Protectorate offers a bab...",10-14,10-14
1,Time Between Us,Tamara Ireland Stone,Sixteen-year-old Anna is struggling to underst...,13+,12+
2,Girl Out of Water,Nat Luurtsema,Lou Brown's swimming ambitions sank without tr...,13-18,10+
3,Captive,A J Grainger,Robyn is scared. Ever since the attempted assa...,13+,13
4,The School of Music,Rachel Bowen and Meurig Bowen Illustrator: Dan...,Welcome to the School of Music. In charge is M...,10+,10+


## 2. Clean the text data

In [8]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords

In [9]:
# Fetch the NLTK resources used below: the stop-word lists and the 'punkt' tokenizer data.
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /home/jay.je/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/jay.je/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [10]:
# Build the English stop-word set from the corpus reader reached through the
# `nltk.corpus` namespace rather than through the imported name `stopwords`.
# This assignment rebinds `stopwords` to a plain set, so the previous
# `stopwords.words(...)` form raised AttributeError when the cell was run twice.
stopwords = set(nltk.corpus.stopwords.words("english"))

In [11]:
def get_word_freq(x):
    """Add the non-stop-word tokens of ``x`` to the global ``word_frequencies`` tally.

    Parameters
    ----------
    x : iterable of str
        Text fragments; they are joined with single spaces and lower-cased
        before being tokenised with ``nltk.word_tokenize``.

    Returns
    -------
    None
        The function works purely by side effect: every token that is not in
        the module-level ``stopwords`` collection has its count in the
        module-level ``word_frequencies`` dict incremented by one.
    """
    text = ' '.join(x).lower()
    for token in nltk.word_tokenize(text):
        if token in stopwords:
            continue
        # First sighting starts at 1, later sightings add 1.
        word_frequencies[token] = word_frequencies.get(token, 0) + 1

In [12]:
def _normalise_field(value):
    """Return ``value`` lower-cased and stripped, or '' when it is missing (None or NaN)."""
    # NaN is the only float that does not compare equal to itself.
    if value is None or (isinstance(value, float) and value != value):
        return ''
    return str(value).lower().strip()


def split_sentences(x):
    """Split a book description into cleaned, lower-cased sentence fragments.

    Parameters
    ----------
    x : mapping-like (for example a DataFrame row)
        Must provide the keys ``'desc'``, ``'author'`` and ``'title'``.

    Returns
    -------
    list of str
        The description split on newlines, full stops, semicolons, colons and
        spaced dashes; each fragment lower-cased and stripped. Empty fragments
        are removed, as are fragments that contain the (lower-cased) author or
        title string. A missing or non-string description yields ``[]``.

    Notes
    -----
    * A missing author/title (None/NaN) or an empty one is not used as a
      filter. Previously ``str(nan)`` became the filter ``'nan'`` and removed
      unrelated fragments, and an empty string removed every fragment.
    * The final fragment is kept when the text does not end with a delimiter;
      the trailing empty fragment produced by a closing delimiter is removed
      by the empty-string filter instead of by slicing off the last element.
    """
    s = x['desc']
    if not isinstance(s, str):
        # Missing description (e.g. NaN): there is nothing to split.
        return []

    author = _normalise_field(x['author'])
    title = _normalise_field(x['title'])

    # Non-breaking spaces become plain spaces before the dash delimiters are
    # matched, so that '\xa0– ' is recognised as ' – '.
    s = s.replace('\n', '<DELIM>').replace('\xa0', ' ')
    for delimiter in ('.', ';', ':', ' – ', ' - '):
        s = s.replace(delimiter, '<DELIM>')

    sents = [fragment.lower().strip() for fragment in s.split('<DELIM>')]
    return [
        sent for sent in sents
        if sent != ''
        and not (author and author in sent)
        and not (title and title in sent)
    ]

In [13]:

def extract_key_sentences(ls_sent):
    """Pick up to five key fragments of a description, always including the first.

    Parameters
    ----------
    ls_sent : list of str
        Sentence fragments of one description.

    Returns
    -------
    list of (int, list of str) or None
        ``None`` when fewer than five fragments are supplied. Otherwise a list
        of ``(fragment index, nltk.sent_tokenize(fragment))`` pairs sorted by
        index.

    Notes
    -----
    Each fragment is scored by the mean of the module-level
    ``word_frequencies`` values of those of its tokens that appear in that
    dict. A fragment with no such token now scores ``0.0``; previously this
    case raised ``KeyError``.
    """
    if len(ls_sent) < 5:
        return None

    sentences_scores = {}
    for idx, sent in enumerate(ls_sent):
        known_word_scores = [
            word_frequencies[word]
            for word in nltk.word_tokenize(sent.lower())
            if word in word_frequencies
        ]
        # Guard against fragments made up only of unscored tokens.
        sentences_scores[idx] = np.mean(known_word_scores) if known_word_scores else 0.0

    # The five highest fragment scores (the list is sorted ascending).
    vals = sorted(sentences_scores.values())[-5:]

    # The opening fragment is always kept, whatever its score.
    top_sentences = {0: nltk.sent_tokenize(ls_sent[0])}

    # Walk the fragments in their original order and add those whose score is
    # among the five highest, stopping once five fragments have been collected.
    for idx, score in sentences_scores.items():
        if score in vals and idx not in top_sentences:
            top_sentences[idx] = nltk.sent_tokenize(ls_sent[idx])
            if len(top_sentences) == 5:
                break

    return sorted(top_sentences.items())


In [14]:
# Start from an empty global tally each time this cell runs.
word_frequencies = {}
# Break every description into sentence fragments.
data['tok'] = data.apply(split_sentences, axis=1)
# Accumulate the corpus-wide counts of non-stop-word tokens.
for fragments in data['tok']:
    get_word_freq(fragments)
# Spot-check one entry of the tally.
word_frequencies['every']

4

In [15]:
# Inspect the sentence fragments produced for the book at index 10.
data.loc[10,'tok']

['when poppy sinclair finds the body of a young woman washed up on the shore of a lake, she is convinced it is a case of murder rather than the tragic accident or suicide which is the theory favoured by police',
 'as poppy struggles with memories of her own near-death experience and the pain of seeing the boy she loves with another girl, she finds it difficult to draw clear lines between reality and imagination',
 'especially as people and things are not always what they seem',
 'yet despite the risks, she is determined to prove that someone close by is a murderer',
 'this is an engaging, complex and  thoughtful thriller, featuring a plot thick with lies, secrets, deceit and jealousy']

In [16]:
# Rescale every count by the largest count so the most frequent token maps to 1.
max_freq = max(word_frequencies.values())
for word, count in word_frequencies.items():
    # Only values are replaced, so the dict size is stable during iteration.
    word_frequencies[word] = count / max_freq

In [17]:
# Select the key fragments of every book (None where a book has fewer than five fragments).
data['tok_top'] = data['tok'].apply(extract_key_sentences)

In [18]:
# Plain Python list of the per-book selections, in row order.
data_clean = data['tok_top'].tolist()

In [19]:
# Show the selected key sentences for three sample books (positions 5, 10 and 30).
display(data_clean[5])
display(data_clean[10])
display(data_clean[30])

[(0, ['caz and will live in a settlement for survivors of the frozen world']),
 (1,
  ['caz hasn’t seen her father for nine years, when the blue star poisoned and froze the planet']),
 (2,
  ['she can’t let go of the feeling that he may still be alive somewhere, and when she finds a secret file containing information about him, she’s determined to investigate']),
 (3,
  ['caz and will soon find themselves out in the frozen landscape once again, and it’s even more dangerous than last time']),
 (4,
  ['this is a fantastic sequel for this dystopian series, keeping the fast pace of at the world’s end'])]

[(0,
  ['when poppy sinclair finds the body of a young woman washed up on the shore of a lake, she is convinced it is a case of murder rather than the tragic accident or suicide which is the theory favoured by police']),
 (1,
  ['as poppy struggles with memories of her own near-death experience and the pain of seeing the boy she loves with another girl, she finds it difficult to draw clear lines between reality and imagination']),
 (2, ['especially as people and things are not always what they seem']),
 (3,
  ['yet despite the risks, she is determined to prove that someone close by is a murderer']),
 (4,
  ['this is an engaging, complex and  thoughtful thriller, featuring a plot thick with lies, secrets, deceit and jealousy'])]

[(0,
  ['farway gaius mccarthy was born in the future, in the past and out of time']),
 (1,
  ['in a world where time travel is a possible career, he has a bright future (and past) in front of him']),
 (2,
  ['he doesn’t just want it, he needs it, because his mother vanished on the time travelling ship ab eterno, and he wants to find her more than anything']),
 (6,
  ['what is at stake is more than his career, more than his mother, and even more than his life']),
 (8,
  ['her characters are spunky, smart-mouthed, tormented, emotionally yearning and contemporary'])]

In [20]:
# Nested mapping {book position: {sentence order: text}}.
# Books without a selection (None or empty) are skipped; for each selected
# fragment only the first element of its tokenised list is kept, lower-cased.
children_books_data = {
    idx: {seq: pieces[0].lower() for seq, (_, pieces) in enumerate(ann)}
    for idx, ann in enumerate(data_clean)
    if ann
}

In [21]:
# Display the nested {book position: {sentence order: text}} mapping.
children_books_data

{0: {0: 'every year, the evil protectorate offers a baby to the evil witch in the forest to ensure their village’s safety',
  1: 'yet, unknown to them, the witch is xan, the plump and knowledgeable protector of the babies that she takes to the neighbouring village to be raised as special children, fed on starlight and destined to do wonderful things',
  2: 'yet, one day, xan feeds one baby the moon by mistake and fills her with a huge amount of magic',
  3: 'a dangerous amount, as it turns out',
  4: 'but nothing is straightforward, and when luna turns 13, everything will change'},
 1: {0: 'sixteen-year-old anna is struggling to understand bennett, the new boy at school',
  1: 'but the next, he avoids her and acts as if he hardly knows her',
  2: 'when he saves her life, she finally discovers his secret and all the pieces of the puzzle fall into place',
  3: 'anna realises they should never have been able to meet, let alone fall in love, because although he is a year older than her, be

In [22]:
# Shape of the frame restricted to first-occurrence titles.
data[~data['title'].duplicated(keep='first')].shape

(41, 7)

In [25]:
# Number of books that ended up with a selection.
len(children_books_data)

35

In [23]:
# Serialise the nested mapping as indented JSON text.
fin = json.dumps(children_books_data, indent=4)
# The `with` block closes the file on exit, so no explicit close() is needed.
# NOTE(review): the destination is an absolute, user-specific path — consider
# deriving it from a shared, configurable data directory.
with open("/home/jay.je/IMspiredStoryTelling/datasets/childrenBook/children_books_clean.json", "w") as f:
    f.write(fin)

### Count the number of tokens

In [26]:
# Total tokens
total_count = 0
for k in children_books_data:
    for s in children_books_data[k]:
        total_count += len(children_books_data[k][s].split(' '))

In [27]:
# Space-separated word count over all kept sentences; the author uses it as the
# minimum number of tokens sent to ChatGPT.
total_count

3465

In [28]:
# Print the kept sentences of the book stored under key 40, one per line.
for i in children_books_data[40].values():
    print(i)

mare barrow knows she is different
her blood is red, but she has an ability that only someone with silver blood should have
the reader sees mare's inner struggle as she tries to understand if she is right to recruit the others with abilities similar to hers, release the prisoners of the silver king, and unite with the red guard
the characters now wage political and physical battles
if you enjoyed the hunger games and the maze runner, then the red queen series will not disappoint
