<a href="https://colab.research.google.com/github/gmsarti/FairyTalesNLP/blob/main/Tales2DB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import requests, sqlite3, pandas as pd, spacy
from bs4 import BeautifulSoup
from typing import List
from pydantic import BaseModel

In [None]:
def create_database(db_path='fairy_tales.db'):
    """Create the ``fairy_tales`` table if it does not exist yet.

    The SQLite file is created on first connect. Calling this function
    repeatedly is safe because the statement uses ``IF NOT EXISTS``.

    Args:
        db_path: Path of the SQLite database file. Defaults to
            ``'fairy_tales.db'`` in the current working directory.

    Raises:
        sqlite3.Error: If the database cannot be opened or the statement
            fails.
    """
    conn = sqlite3.connect(db_path)
    try:
        # The connection context manager commits on success and rolls back
        # on error; it does NOT close the connection, hence the finally.
        with conn:
            conn.execute('''
                CREATE TABLE IF NOT EXISTS fairy_tales (
                    id INTEGER PRIMARY KEY,
                    title TEXT,
                    content TEXT
                )
            ''')
    finally:
        conn.close()

In [None]:
def save_fairy_tale_to_database(title, content, db_path='fairy_tales.db'):
    """Insert one fairy tale row into the ``fairy_tales`` table.

    The table must already exist; this function does not create it.

    Args:
        title: Title of the tale.
        content: Full text of the tale.
        db_path: Path of the SQLite database file. Defaults to
            ``'fairy_tales.db'`` in the current working directory.

    Raises:
        sqlite3.Error: If the database cannot be opened or the insert
            fails (for example because the table is missing).
    """
    conn = sqlite3.connect(db_path)
    try:
        # Commit on success, roll back on error; values are bound as
        # parameters so quotes in the text cannot break the statement.
        with conn:
            conn.execute(
                'INSERT INTO fairy_tales (title, content) VALUES (?, ?)',
                (title, content),
            )
    finally:
        # Always release the file handle, even when the insert fails.
        conn.close()

In [None]:
# Notebook shell command: download the large English spaCy pipeline, which
# this notebook loads with spacy.load("en_core_web_lg").
# The medium pipeline below is the disabled alternative.
# !python -m spacy download en_core_web_md
!python -m spacy download en_core_web_lg

2024-01-29 19:23:32.376276: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-01-29 19:23:32.376356: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-01-29 19:23:32.378633: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
Collecting en-core-web-lg==3.6.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.6.0/en_core_web_lg-3.6.0-py3-none-any.whl (587.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.7/587.7 MB[0m [31m1.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: en-core-web-lg
Successfully install

In [None]:
from typing import List
from pydantic import BaseModel
import spacy
from pprint import pprint

# Load the large English spaCy pipeline once at module level.
# TaleInfo.from_text reads this shared `nlp` object to run NER.
nlp = spacy.load("en_core_web_lg")

class TaleInfo(BaseModel):
    """Structured record describing one fairy tale.

    Instances are normally built with :meth:`from_text`, which fills the
    list fields from named entities and leaves the descriptive fields as
    the placeholder ``"Unknown"``.
    """

    # Title of the tale ("Unknown" when built by from_text).
    tale_name: str
    # Full, unmodified text of the tale.
    tale_story: str
    # Entity texts labelled PERSON.
    characters: List[str]
    # Entity texts labelled ANIMAL (a custom label, see from_text).
    animals: List[str]
    # Entity texts labelled MONSTER (a custom label, see from_text).
    monsters: List[str]
    # Entity texts carrying a place label (LOC, GPE, FAC or LOCATION).
    setting: List[str]
    # Descriptive metadata; from_text sets all four to "Unknown".
    theme: str
    category: str
    country_of_origin: str
    author: str

    @classmethod
    def from_text(cls, tale_text: str) -> "TaleInfo":
        """Build a ``TaleInfo`` by running NER over ``tale_text``.

        Uses the module-level ``nlp`` pipeline. Entity texts are kept in
        document order and are not de-duplicated.

        Args:
            tale_text: Raw text of the tale.

        Returns:
            A ``TaleInfo`` whose list fields hold the matching entity
            texts and whose remaining string fields are ``"Unknown"``.
        """
        # Process the tale text with SpaCy NER
        doc = nlp(tale_text)

        # Map NER labels onto the field they feed. The stock English
        # pipelines emit LOC / GPE / FAC for places and never "LOCATION";
        # "LOCATION" is kept so a custom-trained pipeline using that label
        # keeps working. ANIMAL and MONSTER are not stock labels either,
        # so those lists stay empty unless the pipeline was trained on them.
        label_to_field = {
            "PERSON": "characters",
            "ANIMAL": "animals",
            "MONSTER": "monsters",
            "LOC": "setting",
            "GPE": "setting",
            "FAC": "setting",
            "LOCATION": "setting",
        }
        extracted = {
            "characters": [],
            "animals": [],
            "monsters": [],
            "setting": [],
        }

        # Single pass over the entities instead of one pass per field.
        for ent in doc.ents:
            field_name = label_to_field.get(ent.label_)
            if field_name is not None:
                extracted[field_name].append(ent.text)

        # Instantiate the TaleInfo class with extracted information
        return cls(
            tale_name="Unknown",
            tale_story=tale_text,
            theme="Unknown",
            category="Unknown",
            country_of_origin="Unknown",
            author="Unknown",
            **extracted,
        )

# Example usage:
tale_text_example = """
Once upon a time, in a kingdom far, far away, there lived a brave knight named Sir Lancelot.
He embarked on a quest to defeat the fire-breathing dragon that terrorized the land.
Accompanying him were his loyal steed, Thunder, and the wise wizard Merlin.
"""

# Run the extractor on the sample tale above.
tale_info = TaleInfo.from_text(tale_text_example)

# Dump the model to a plain dict and pretty-print every extracted field.
tale_fields = tale_info.dict()
pprint(tale_fields)


{'animals': [],
 'author': 'Unknown',
 'category': 'Unknown',
 'characters': ['Lancelot', 'Merlin'],
 'country_of_origin': 'Unknown',
 'monsters': [],
 'setting': [],
 'tale_name': 'Unknown',
 'tale_story': '\n'
               'Once upon a time, in a kingdom far, far away, there lived a '
               'brave knight named Sir Lancelot. \n'
               'He embarked on a quest to defeat the fire-breathing dragon '
               'that terrorized the land. \n'
               'Accompanying him were his loyal steed, Thunder, and the wise '
               'wizard Merlin.\n',
 'theme': 'Unknown'}


## Fine-tune spaCy to recognize animals, monsters, and settings

In [None]:
# NER training examples in spaCy's (text, annotations) format.
# Each entity is (start_char, end_char, label); end_char is exclusive, so
# text[start_char:end_char] must be exactly the annotated word. Offsets
# that do not line up with token boundaries are treated as misaligned.
train_data = [
    ("There is a lion in the savannah.", {"entities": [(11, 15, "ANIMAL")]}),
    ("The monkey swung from tree to tree.", {"entities": [(4, 10, "ANIMAL")]}),
    ("A group of elephants gathered at the watering hole.", {"entities": [(11, 20, "ANIMAL")]}),
    ("The cheetah is the fastest land animal.", {"entities": [(4, 11, "ANIMAL")]}),
    ("Penguins waddle on the icy shores.", {"entities": [(0, 8, "ANIMAL")]}),
    ("The eagle soared high in the sky.", {"entities": [(4, 9, "ANIMAL")]}),
    ("Giraffes have long necks and distinctive spotted coats.", {"entities": [(0, 8, "ANIMAL")]}),
    ("The whale breached the surface of the ocean.", {"entities": [(4, 9, "ANIMAL")]}),
    ("Kangaroos are known for their powerful hind legs.", {"entities": [(0, 9, "ANIMAL")]}),
    ("A swarm of bees buzzed around the hive.", {"entities": [(11, 15, "ANIMAL")]}),
    ("The panda peacefully ate bamboo in the bamboo forest.", {"entities": [(4, 9, "ANIMAL")]}),
    ("Tigers are majestic creatures with distinctive stripes.", {"entities": [(0, 6, "ANIMAL")]}),
    ("The crocodile lurked in the murky waters.", {"entities": [(4, 13, "ANIMAL")]}),
    ("The koala slept in the eucalyptus tree.", {"entities": [(4, 9, "ANIMAL")]}),
    ("Butterflies fluttered among the flowers.", {"entities": [(0, 11, "ANIMAL")]}),
    # Add more examples with different animals
]


In [None]:
import random

from spacy.training.example import Example

# Register the new label on the pipeline's existing NER component.
ner = nlp.get_pipe("ner")
ner.add_label("ANIMAL")

# Fine-tune the model
epochs = 10

# NOTE(review): train_data only carries ANIMAL annotations. Training on it
# alone may degrade the labels the pipeline already knew; the recorded
# output further down tags person names as ANIMAL. Consider mixing in
# examples that carry the original labels - TODO confirm.

# train_data only annotates entities, so train the NER component alone;
# the other pipes are restored when the block exits.
with nlp.select_pipes(enable="ner"):
    for epoch in range(epochs):
        # Shuffle a copy each epoch so updates do not always arrive in the
        # same order; train_data itself is left untouched.
        shuffled_examples = random.sample(train_data, len(train_data))
        losses = {}  # per-epoch loss accumulator filled by nlp.update
        for text, annotations in shuffled_examples:
            doc = nlp.make_doc(text)
            example = Example.from_dict(doc, annotations)
            # drop is the dropout rate, a value between 0.0 and 1.0
            nlp.update([example], drop=0.5, losses=losses)




In [None]:
# Serialize the current `nlp` pipeline to the "fine_tuned_model" directory
# so it can be reloaded with spacy.load("fine_tuned_model").
nlp.to_disk("fine_tuned_model")

In [None]:
# Reload the pipeline from the "fine_tuned_model" directory.
fine_tuned_nlp = spacy.load("fine_tuned_model")

# Probe it with a single sentence.
text = "The elephant is a majestic creature."
doc = fine_tuned_nlp(text)

# Collect (surface text, start offset, end offset, label) per entity.
entities = []
for ent in doc.ents:
    entities.append((ent.text, ent.start_char, ent.end_char, ent.label_))
pprint(entities)


[('elephant', 4, 12, 'ANIMAL')]


In [None]:
# Run the reloaded pipeline over the sample tale.
doc = fine_tuned_nlp(tale_text_example)

# One (text, start_char, end_char, label) tuple per entity, in document order.
entities = [
    (span.text, span.start_char, span.end_char, span.label_)
    for span in doc.ents
]
pprint(entities)


[('Lancelot', 84, 92, 'ANIMAL'),
 ('Thunder', 220, 227, 'ANIMAL'),
 ('Merlin', 249, 255, 'ANIMAL')]


In [None]:
import pandas as pd

# Connect to the SQLite database
conn = sqlite3.connect('fairy_tales.db')
try:
    # Query to retrieve every row from the "fairy_tales" table
    query = 'SELECT * FROM fairy_tales'

    # Use pandas to read the data into a DataFrame
    df = pd.read_sql_query(query, conn)
finally:
    # Close the connection even if the query fails (e.g. missing table)
    conn.close()

# Display the DataFrame
print(df)


Empty DataFrame
Columns: [id, title, content]
Index: []
