<a href="https://colab.research.google.com/github/gmsarti/FairyTalesNLP/blob/main/Tales2DB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Tales2DB
In this experiment, fairy tales go through a natural language processing (NLP) pipeline.

Parts of the process
*   Scrape tales
*   Extract information
*   Save data in database
*   Create original tales and save them
*   Make a web app with Streamlit
<br />
<br />
---
<br />

Information extraction is central to this pipeline, so it is the first part to be developed.


In [None]:
# IMPORTS

import requests, sqlite3, pandas as pd, spacy, random
from bs4 import BeautifulSoup
from typing import List
from pydantic import BaseModel
from pprint import pprint

In [None]:
# Load the small English spaCy pipeline; the cells below read its named
# entities through `doc.ents` and later fetch its "ner" component.
nlp = spacy.load("en_core_web_sm")

class TaleInfo(BaseModel):
    """Structured record describing one fairy tale.

    The list fields are filled from named entities found in the tale text
    (see `from_text`); the remaining metadata fields are plain strings.
    """

    tale_name: str
    tale_story: str
    characters: List[str]
    animals: List[str]
    monsters: List[str]
    setting: List[str]
    theme: str
    category: str
    country_of_origin: str
    author: str

    @classmethod
    def from_text(cls, tale_text: str) -> "TaleInfo":
        """Build a TaleInfo by running the module-level `nlp` pipeline on a tale.

        Args:
            tale_text: Full text of the tale.

        Returns:
            A TaleInfo whose `tale_story` is `tale_text`, whose entity lists
            hold the matching entity texts in document order (duplicates are
            kept), and whose other metadata fields are set to "Unknown".
        """
        # Process the tale text with SpaCy NER
        doc = nlp(tale_text)

        # Place-like labels. The stock spaCy English pipelines emit GPE, LOC
        # and FAC for places; "SETTING" is the custom label used by this
        # notebook's generated training data. "LOCATION" is kept only so a
        # custom model that emits it keeps working.
        setting_labels = {"GPE", "LOC", "FAC", "SETTING", "LOCATION"}

        # "ANIMAL" and "MONSTER" are custom labels: these lists stay empty
        # unless `nlp` is a pipeline whose NER was trained with them.
        characters = [ent.text for ent in doc.ents if ent.label_ == "PERSON"]
        animals = [ent.text for ent in doc.ents if ent.label_ == "ANIMAL"]
        monsters = [ent.text for ent in doc.ents if ent.label_ == "MONSTER"]
        setting = [ent.text for ent in doc.ents if ent.label_ in setting_labels]

        # Metadata that cannot be derived from NER is left as a placeholder.
        return cls(
            tale_name="Unknown",
            tale_story=tale_text,
            characters=characters,
            animals=animals,
            monsters=monsters,
            setting=setting,
            theme="Unknown",
            category="Unknown",
            country_of_origin="Unknown",
            author="Unknown",
        )

# Example usage:
tale_text_example = """
Once upon a time, in a kingdom far, far away, there lived a brave knight named Sir Lancelot.
He embarked on a quest to defeat the fire-breathing dragon that terrorized the land.
Accompanying him were his loyal steed, Thunder, and the wise wizard Merlin.
"""

# Run the NER-based constructor on the sample tale defined above.
tale_info = TaleInfo.from_text(tale_text_example)

# Accessing the extracted information
# (pprint sorts dictionary keys, so the fields print in alphabetical order).
pprint(tale_info.dict())


{'animals': [],
 'author': 'Unknown',
 'category': 'Unknown',
 'characters': ['Lancelot', 'Merlin'],
 'country_of_origin': 'Unknown',
 'monsters': [],
 'setting': [],
 'tale_name': 'Unknown',
 'tale_story': '\n'
               'Once upon a time, in a kingdom far, far away, there lived a '
               'brave knight named Sir Lancelot. \n'
               'He embarked on a quest to defeat the fire-breathing dragon '
               'that terrorized the land. \n'
               'Accompanying him were his loyal steed, Thunder, and the wise '
               'wizard Merlin.\n',
 'theme': 'Unknown'}


Several of the desired NER categories returned nothing: `animals`, `monsters` and `setting` are all empty, so neither the dragon nor the steed *Thunder* was recognised. Let's fine-tune the NER model to add the new categories.

In [9]:
import random

# Lists of entities
# These vocabularies feed create_training_examples(): each list can be the
# source of the labelled entity, and all three are also sampled as filler
# words for the other slots of the sentence templates.

# Monsters and magical creatures.
# NOTE: "Banshee" appears twice, so it yields twice as many generated examples.
monsters_and_creatures = [
    "Dragon", "Witch", "Goblin", "Troll", "Werewolf", "Vampire", "Zombie", "Ghost", "Ogre", "Cyclops",
    "Banshee", "Minotaur", "Griffin", "Harpy", "Chimera", "Basilisk", "Sphinx", "Kelpie", "Siren", "Cerberus",
    "Kraken", "Medusa", "Djinn", "Fairy", "Elf", "Dwarf", "Giant", "Mermaid", "Centaur", "Pegasus", "Unicorn",
    "Phoenix", "Leprechaun", "Changeling", "Kobold", "Nymph", "Satyr", "Succubus", "Incubus", "Banshee",
    "Boggart", "Imp", "Hobgoblin", "Will-o'-the-Wisp", "Yeti", "Chupacabra", "Manticore", "Wendigo", "Baba Yaga"
]

# Places and settlements; several entries are multi-word and some contain
# another entry as a substring (e.g. "Well" / "Wishing Well").
places_and_settlements = [
    "Castle", "Kingdom", "Forest", "Cottage", "Enchanted Garden", "Village", "Town", "City", "Mountain",
    "Valley", "Cave", "Lake", "River", "Island", "Desert", "Palace", "Mansion", "Tower", "Bridge", "Well",
    "Meadow", "Moor", "Labyrinth", "Marketplace", "School of Magic", "Wishing Well", "Witch's Hut", "Haunted House",
    "Underground Kingdom", "Underwater City", "Floating Island", "Fairy Ring", "Secret Garden", "Ice Palace",
    "Gingerbread House", "Cloud Castle", "Golden City", "Lost City", "Oasis", "Forbidden City", "Whimsical Wonderland",
    "Sky Kingdom", "Dark Forest", "Crystal Caves", "Dreamscape", "Starlit Observatory", "Timeless Realm", "Celestial Citadel"
]

# Animals; some names are prefixes of others or of monster names
# (e.g. "Cat" / "Caterpillar", "Dragonfly" / "Dragon").
common_animals_in_fairy_tales = [
    "Wolf", "Bear", "Lion", "Monkey", "Elephant", "Cheetah", "Penguin", "Eagle", "Giraffe", "Whale", "Kangaroo",
    "Bee", "Panda", "Tiger", "Crocodile", "Koala", "Butterfly", "Frog", "Owl", "Mouse", "Cat", "Dog", "Horse",
    "Donkey", "Rabbit", "Squirrel", "Fox", "Deer", "Hedgehog", "Turtle", "Snail", "Dragonfly", "Swan", "Fish",
    "Shark", "Octopus", "Crab", "Dolphin", "Seagull", "Peacock", "Parrot", "Snake", "Spider", "Bat", "Caterpillar",
    "Ladybug", "Ant", "Hummingbird"
]

# Sentence patterns used to synthesise NER training text.
# Slots: {0} = the entity being labelled, {1} = a random place,
# {2} = a random monster/creature, {3} = a random animal.
_SENTENCE_TEMPLATES = (
    "The {0} was seen near the {1}.",
    "{1} is home to the {0}.",
    "The {0} is a common character in fairy tales and is often depicted as {2}.",
    "The {0} and the {3} lived in harmony in the {1}.",
    "{0} is known for its association with {2} in the {1}.",
    "In the land of {1}, the {0} is a legendary creature.",
    "Legends speak of the {0} residing in the enchanted {1}.",
    "Once upon a time, the {0} roamed freely in the {1}.",
    "A brave {2} encountered the {0} in the {1}.",
    "The {0} guarded the {1} with its magical powers.",
    "The {0} and the {3} were friends in the mystical {1}.",
    "In the heart of the {1}, a {2} met the {0}.",
    "Children often dream of meeting the {0} in the {1}.",
    "The {0} was known to dwell near the {1}.",
    "The {0} was a fearsome creature in the {1}.",
    "Legends tell of the {0}'s presence in the {1}.",
    "Once upon a time, the {0} and the {3} coexisted in the {1}.",
    "The magical {1} was home to the {0}.",
    "A {2} encountered the {0} in the {1}.",
)


def _whole_word_spans(sentence, entity, label):
    """Return (start, end, label) for every whole-word occurrence of `entity`.

    An occurrence counts only when it is not glued to a letter or digit on
    either side. This keeps "Cat" from matching inside "Caterpillar", which
    would otherwise produce a span that ends in the middle of a word.
    """
    spans = []
    if not entity:
        return spans

    start = sentence.find(entity)
    while start != -1:
        end = start + len(entity)
        starts_word = start == 0 or not sentence[start - 1].isalnum()
        ends_word = end == len(sentence) or not sentence[end].isalnum()
        if starts_word and ends_word:
            spans.append((start, end, label))
        start = sentence.find(entity, start + 1)
    return spans


# Function to create training examples
def create_training_examples(entity_list, label, *, examples_per_entity=5,
                             places=None, monsters=None, animals=None):
    """Generate synthetic NER examples for each entity in `entity_list`.

    For every entity, `examples_per_entity` sentences are built by filling a
    randomly chosen template with the entity plus one random place, monster
    and animal. Only occurrences of the entity itself are annotated.

    Args:
        entity_list: Entity strings to generate sentences for.
        label: NER label attached to every annotated span.
        examples_per_entity: Number of sentences generated per entity.
        places: Filler place names; defaults to the module-level
            `places_and_settlements`.
        monsters: Filler monster names; defaults to the module-level
            `monsters_and_creatures`.
        animals: Filler animal names; defaults to the module-level
            `common_animals_in_fairy_tales`.

    Returns:
        A list of `(sentence, {"entities": [(start, end, label), ...]})`
        tuples, with character offsets into `sentence`.

    Raises:
        IndexError: If any filler list is empty (raised by `random.choice`).

    Note:
        The filler place/monster/animal words are left unannotated. A
        whole-word match inside a longer filler (e.g. "Well" inside
        "Wishing Well") is still annotated with `label`.
    """
    places = places_and_settlements if places is None else places
    monsters = monsters_and_creatures if monsters is None else monsters
    animals = common_animals_in_fairy_tales if animals is None else animals

    examples = []
    for entity in entity_list:
        for _ in range(examples_per_entity):
            template = random.choice(_SENTENCE_TEMPLATES)
            sentence = template.format(
                entity,
                random.choice(places),
                random.choice(monsters),
                random.choice(animals),
            )
            spans = _whole_word_spans(sentence, entity, label)
            examples.append((sentence, {"entities": spans}))

    return examples

# Create training data for each category
# NOTE(review): the "CHARACTER" examples are generated from
# monsters_and_creatures, the same list used for "MONSTER" below, so every
# monster name ends up annotated under two different labels. This looks like a
# placeholder until a dedicated character list exists; confirm the intent.
character_examples = create_training_examples(monsters_and_creatures, "CHARACTER")
animal_examples = create_training_examples(common_animals_in_fairy_tales, "ANIMAL")
monster_examples = create_training_examples(monsters_and_creatures, "MONSTER")
setting_examples = create_training_examples(places_and_settlements, "SETTING")

# Combine all examples
train_data = character_examples + animal_examples + monster_examples + setting_examples

# Shuffle the training data
# (in place, so examples of the four labels are interleaved)
random.shuffle(train_data)

# Print a sample of the training data
for example in train_data[:10]:
    print(example)


('The Kobold and the Butterfly were friends in the mystical Oasis.', {'entities': [(4, 10, 'CHARACTER')]})
('Children often dream of meeting the Kobold in the Enchanted Garden.', {'entities': [(36, 42, 'MONSTER')]})
('In the land of Cloud Castle, the Dragon is a legendary creature.', {'entities': [(33, 39, 'MONSTER')]})
('The Lion was seen near the Valley.', {'entities': [(4, 8, 'ANIMAL')]})
('The Banshee was a fearsome creature in the Floating Island.', {'entities': [(4, 11, 'MONSTER')]})
('A brave Changeling encountered the Snake in the Underwater City.', {'entities': [(35, 40, 'ANIMAL')]})
('Celestial Citadel is home to the Seagull.', {'entities': [(33, 40, 'ANIMAL')]})
('Lost City is home to the Wendigo.', {'entities': [(25, 32, 'MONSTER')]})
('Once upon a time, the Manticore roamed freely in the Haunted House.', {'entities': [(22, 31, 'MONSTER')]})
('The magical Town was home to the Fox.', {'entities': [(33, 36, 'ANIMAL')]})


In [10]:
# Report how many examples were generated, then display a small slice of the
# shuffled data as a spot check.
print(len(train_data))
train_data[55:60]

970


[("The Witch's Hut was known to dwell near the Sky Kingdom.",
  {'entities': [(4, 15, 'SETTING')]}),
 ('The Snail and the Cat were friends in the mystical Golden City.',
  {'entities': [(4, 9, 'ANIMAL')]}),
 ('The Chimera was known to dwell near the Valley.',
  {'entities': [(4, 11, 'CHARACTER')]}),
 ('The Leprechaun is a common character in fairy tales and is often depicted as Phoenix.',
  {'entities': [(4, 14, 'CHARACTER')]}),
 ('The Incubus was known to dwell near the Lost City.',
  {'entities': [(4, 11, 'MONSTER')]})]

In [None]:
def _animal_example(text, mention):
    """Build one NER example labelling the first occurrence of `mention` as ANIMAL.

    The character offsets are computed from `text` rather than typed by hand,
    so they always cover the mention exactly.

    Raises:
        ValueError: If `mention` does not occur in `text`.
    """
    start = text.index(mention)
    return (text, {"entities": [(start, start + len(mention), "ANIMAL")]})


# Hand-written ANIMAL examples.
# NOTE: this rebinds `train_data`, replacing any examples assigned to that
# name by earlier cells. Plural mentions are labelled as whole words.
train_data = [
    _animal_example("There is a lion in the savannah.", "lion"),
    _animal_example("The monkey swung from tree to tree.", "monkey"),
    _animal_example("A group of elephants gathered at the watering hole.", "elephants"),
    _animal_example("The cheetah is the fastest land animal.", "cheetah"),
    _animal_example("Penguins waddle on the icy shores.", "Penguins"),
    _animal_example("The eagle soared high in the sky.", "eagle"),
    _animal_example("Giraffes have long necks and distinctive spotted coats.", "Giraffes"),
    _animal_example("The whale breached the surface of the ocean.", "whale"),
    _animal_example("Kangaroos are known for their powerful hind legs.", "Kangaroos"),
    _animal_example("A swarm of bees buzzed around the hive.", "bees"),
    _animal_example("The panda peacefully ate bamboo in the bamboo forest.", "panda"),
    _animal_example("Tigers are majestic creatures with distinctive stripes.", "Tigers"),
    _animal_example("The crocodile lurked in the murky waters.", "crocodile"),
    _animal_example("The koala slept in the eucalyptus tree.", "koala"),
    _animal_example("Butterflies fluttered among the flowers.", "Butterflies"),
    # Add more examples with different animals
]


In [None]:
# Register the new label on the existing NER component before training.
ner = nlp.get_pipe("ner")
ner.add_label("ANIMAL")


from spacy.training import Example

# Fine-tune the model
epochs = 10

# The examples carry entity annotations only, so train just the "ner"
# component; every other pipe is disabled while the block runs and restored
# afterwards.
with nlp.select_pipes(enable="ner"):
    # Continue from the pretrained weights instead of re-initialising them.
    optimizer = nlp.resume_training()

    for epoch in range(epochs):
        # Visit the examples in a fresh random order each epoch, without
        # reordering `train_data` itself.
        shuffled_examples = random.sample(train_data, len(train_data))
        for text, annotations in shuffled_examples:
            doc = nlp.make_doc(text)
            example = Example.from_dict(doc, annotations)
            # drop is the dropout rate (a value between 0.0 and 1.0)
            nlp.update([example], drop=0.5, sgd=optimizer)

# NOTE(review): training on examples of the new label alone can make the model
# forget its original labels; consider mixing in sentences annotated with the
# pretrained labels as well.




In [None]:
# Persist the in-memory pipeline `nlp` under the "fine_tuned_model" path so a
# later cell can reload it with spacy.load().
nlp.to_disk("fine_tuned_model")

In [None]:
# Reload the pipeline that was saved to "fine_tuned_model".
fine_tuned_nlp = spacy.load("fine_tuned_model")

# Example usage
# A single sentence as a quick check that the reloaded model emits entities.
text = "The elephant is a majestic creature."
doc = fine_tuned_nlp(text)

# Extract entities
# Each tuple is (entity text, start character offset, end character offset, label).
entities = [(ent.text, ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]
pprint(entities)


[('elephant', 4, 12, 'ANIMAL')]


In [None]:
# Re-run the fine-tuned pipeline on the sample tale for comparison with the
# first extraction.
# NOTE(review): the recorded output for this cell labels the person names as
# ANIMAL, which suggests the fine-tuning disturbed the original labels.
doc = fine_tuned_nlp(tale_text_example)

# Extract entities
# Each tuple is (entity text, start character offset, end character offset, label).
entities = [(ent.text, ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]
pprint(entities)


[('Lancelot', 84, 92, 'ANIMAL'),
 ('Thunder', 220, 227, 'ANIMAL'),
 ('Merlin', 249, 255, 'ANIMAL')]


In [None]:
def create_database(db_path='fairy_tales.db'):
    """Create the `fairy_tales` table if it does not exist yet.

    Args:
        db_path: Path of the SQLite database file. The file is created when
            it does not exist. Defaults to 'fairy_tales.db'.

    Raises:
        sqlite3.Error: If the database cannot be opened or the statement fails.
    """
    create_table_sql = '''
        CREATE TABLE IF NOT EXISTS fairy_tales (
            id INTEGER PRIMARY KEY,
            title TEXT,
            content TEXT
        )
    '''

    # Connect to SQLite database (or create it if it doesn't exist)
    conn = sqlite3.connect(db_path)
    try:
        # `with conn` commits on success and rolls back if the statement raises.
        with conn:
            conn.execute(create_table_sql)
    finally:
        # Always release the file handle, even when the statement fails.
        conn.close()

In [None]:
def save_fairy_tale_to_database(title, content, db_path='fairy_tales.db'):
    """Insert one fairy tale into the `fairy_tales` table.

    Args:
        title: Title of the tale.
        content: Full text of the tale.
        db_path: Path of the SQLite database file. Defaults to
            'fairy_tales.db'.

    Raises:
        sqlite3.Error: If the insert fails, for example because the
            `fairy_tales` table has not been created yet.
    """
    # Connect to the SQLite database
    conn = sqlite3.connect(db_path)
    try:
        # Values are bound through placeholders, never interpolated into the
        # SQL. `with conn` commits on success and rolls back on error.
        with conn:
            conn.execute(
                'INSERT INTO fairy_tales (title, content) VALUES (?, ?)',
                (title, content),
            )
    finally:
        # Always release the file handle, even when the insert fails.
        conn.close()

## Fine-tune spaCy for animals, monsters and settings

In [None]:
import pandas as pd

# Connect to the SQLite database
conn = sqlite3.connect('fairy_tales.db')

# Query to retrieve all rows from the "fairy_tales" table
query = 'SELECT * FROM fairy_tales'

# Use pandas to read the data into a DataFrame
# (one DataFrame column per table column)
df = pd.read_sql_query(query, conn)

# Close the connection
conn.close()

# Display the DataFrame
print(df)


Empty DataFrame
Columns: [id, title, content]
Index: []
