### Data Ingestion

In [1]:
### Document Structure

from langchain_core.documents import Document

In [2]:
# Build a single Document by hand: the text lives in page_content and the
# descriptive fields travel alongside it in the metadata dict.
doc = Document(
    metadata=dict(
        source="example.txt",
        pages=1,
        author="Damilola Adekoya",
        date_created="2024-06-15",
    ),
    page_content="This is the main content I am using to create a RAG file.",
)

# Bare last expression so the notebook renders the Document's repr.
doc

Document(metadata={'source': 'example.txt', 'pages': 1, 'author': 'Damilola Adekoya', 'date_created': '2024-06-15'}, page_content='This is the main content I am using to create a RAG file.')

In [3]:
## make sure the folder that will hold the sample txt files exists

import os

# exist_ok=True keeps this cell safe to re-run when the folder is already there.
os.makedirs(name="../data/text_files", exist_ok=True)

In [4]:
# Raw text for each sample file. The indentation inside the triple-quoted
# strings is part of the text and is written to disk verbatim.
keys_file_text = """This is the content of sample text file 1.
    Arsenal is my favorite football club.
    I love programming in Python.
    I enjoy learning about AI and machine learning.

    Key Types:
    Company Key
        - Used by Company-level Clients (e.g., Travelden, Finchglow, etc.)
        - Enables access to endpoints for managing agencies, branches, currencies, and      consolidated records.


    Agency Key
        - Used by Agency-level Clients (e.g., Touch Down Travels).
        - Enables agency-level access (e.g., their bookings, flight rules, and customers).



    Middleware Logic:
    The middleware currently:
        - Detects the type of key (Company or Agency).
        - Determines authorization scope.
        - Routes accordingly to service endpoints.

    """

arsenal_file_text = """
        "The Year the Gunners Remembered"

        The season began with a whisper and a glare — the Emirates shimmering under a London sun that seemed to promise something more this time. Mikel Arteta’s Arsenal had come close before — heartbreakingly close — and the ghosts of “almost” still hovered in the stands. But this year, something felt different.

        Declan Rice ran the midfield like a general with unfinished business, Martin Ødegaard painted passes like poetry, and Bukayo Saka… well, he was still Saka — fearless, smiling, and impossible to catch. Gabriel and Saliba turned defense into art, while the young blood from Hale End carried that old-school fire that fans hadn’t seen since the Invincibles.

        There were battles, of course — a scrappy draw at Old Trafford, a late winner at Anfield, and that night at the Etihad when they refused to bow. Each match stitched a new thread into the fabric of belief. The Emirates roared louder, the chants carried longer, and even neutrals began to admit — Arsenal were not just back; they were becoming something else entirely.

        By spring, they stood at the summit. The same script that had broken them before — fatigue, pressure, doubt — tried to creep back in. But this time, they didn’t blink. When the final whistle blew on the last day, the red shirts stood tall, drenched in confetti and disbelief. After two decades of waiting, the Gunners were champions again.

        Arteta smiled — not wide, not wild — just that quiet grin of a man who’d seen his plan finally bloom.
        Arsenal weren’t just a team anymore.
        They were a statement: Beautiful football can still win everything.
    """

# Map each destination path to the text that should be written there.
sample_text = {
    "../data/text_files/keys.txt": keys_file_text,
    "../data/text_files/arsenal.txt": arsenal_file_text,
}

# Write each sample text to the path used as its key in sample_text.
# The target directory must already exist; open() does not create it.
for target_path in sample_text:
    with open(target_path, mode="w", encoding="utf-8") as out_file:
        out_file.write(sample_text[target_path])

print("Sample text files created.")

Sample text files created.


In [6]:
### Read one text file with LangChain's TextLoader
from langchain_community.document_loaders import TextLoader

# The loader is configured with the file to read and the encoding to decode it with.
loader = TextLoader(file_path="../data/text_files/keys.txt", encoding="utf-8")

# load() returns a list of Document objects (see the printed output below).
document = loader.load()

print(document)

[Document(metadata={'source': '../data/text_files/keys.txt'}, page_content='This is the content of sample text file 1.\n    Arsenal is my favorite football club.\n    I love programming in Python.\n    I enjoy learning about AI and machine learning.\n\n    Key Types:\n    Company Key\n        - Used by Company-level Clients (e.g., Travelden, Finchglow, etc.)\n        - Enables access to endpoints for managing agencies, branches, currencies, and      consolidated records.\n\n\n    Agency Key\n        - Used by Agency-level Clients (e.g., Touch Down Travels).\n        - Enables agency-level access (e.g., their bookings, flight rules, and customers).\n\n\n\n    Middleware Logic:\n    The middleware currently:\n        - Detects the type of key (Company or Agency).\n        - Determines authorization scope.\n        - Routes accordingly to service endpoints.\n\n    ')]


In [9]:
### Load every .txt file under a directory with DirectoryLoader
from langchain_community.document_loaders import DirectoryLoader

# Each file matched by the glob is handed to TextLoader, which receives
# loader_kwargs as its own keyword arguments.
dir_loader = DirectoryLoader(
    path="../data/text_files",
    glob="**/*.txt",
    loader_cls=TextLoader,
    loader_kwargs=dict(encoding="utf-8"),
    # show_progress=True
)

documents = dir_loader.load()

print(documents)
print("Loaded {} documents.".format(len(documents)))

[Document(metadata={'source': '../data/text_files/keys.txt'}, page_content='This is the content of sample text file 1.\n    Arsenal is my favorite football club.\n    I love programming in Python.\n    I enjoy learning about AI and machine learning.\n\n    Key Types:\n    Company Key\n        - Used by Company-level Clients (e.g., Travelden, Finchglow, etc.)\n        - Enables access to endpoints for managing agencies, branches, currencies, and      consolidated records.\n\n\n    Agency Key\n        - Used by Agency-level Clients (e.g., Touch Down Travels).\n        - Enables agency-level access (e.g., their bookings, flight rules, and customers).\n\n\n\n    Middleware Logic:\n    The middleware currently:\n        - Detects the type of key (Company or Agency).\n        - Determines authorization scope.\n        - Routes accordingly to service endpoints.\n\n    '), Document(metadata={'source': '../data/text_files/arsenal.txt'}, page_content='\n        "The Year the Gunners Remembered"\

In [10]:
### Load every .pdf file under a directory with DirectoryLoader
from langchain_community.document_loaders import PyMuPDFLoader

# Same pattern as the text loader above, with PyMuPDFLoader parsing each match.
# NOTE: dir_loader is rebound here, replacing the loader from the previous cell.
dir_loader = DirectoryLoader(
    path="../data/pdf",
    glob="**/*.pdf",
    loader_cls=PyMuPDFLoader,
    show_progress=False,
)

pdf_documents = dir_loader.load()

print(pdf_documents)
print("Loaded {} documents.".format(len(pdf_documents)))

[Document(metadata={'producer': 'dompdf 2.0.3 + CPDF', 'creator': '', 'creationdate': '2025-10-10T16:31:36+01:00', 'source': '../data/pdf/ticket.pdf', 'file_path': '../data/pdf/ticket.pdf', 'total_pages': 1, 'format': 'PDF 1.7', 'title': 'Whatadeal | Ticket Details', 'author': '', 'subject': '', 'keywords': '', 'moddate': '2025-10-10T16:31:36+01:00', 'trapped': '', 'modDate': "D:20251010163136+01'00'", 'creationDate': "D:20251010163136+01'00'", 'page': 0}, page_content='W\nWorld Usability Day Africa 2025\nHackathon: The Prompt is You!\nGeneral Admission\nWORBFZC780\nSlack Channel\nVirtual\nFriday, 17 October 2025 09:00 AM\nOrdered by Damilola Adekoya\non Oct 10, 2025 4:31 PM'), Document(metadata={'producer': 'Skia/PDF m143 Google Docs Renderer', 'creator': '', 'creationdate': '', 'source': '../data/pdf/Damilola Adekoya CV 3.pdf', 'file_path': '../data/pdf/Damilola Adekoya CV 3.pdf', 'total_pages': 6, 'format': 'PDF 1.4', 'title': 'Damilola Adekoya CV 3', 'author': '', 'subject': '', 'k

In [16]:
### Directory Loader to load all csv files in a directory
from langchain_community.document_loaders import CSVLoader

# CSVLoader produces one Document per CSV *row*, not one per file, so a few
# files can easily expand into thousands of Documents.
# NOTE: dir_loader is rebound here, replacing the loader from the previous cell.
dir_loader = DirectoryLoader(
    "../data/csv",
    glob="**/*.csv",
    loader_cls=CSVLoader,
    show_progress=False
)

csv_documents = dir_loader.load()

# Show a short preview instead of dumping every row Document into the output.
print(csv_documents[:3])

# Each row Document carries its originating file in metadata["source"];
# counting distinct sources gives the number of CSV files actually read.
csv_file_count = len({row_doc.metadata.get("source") for row_doc in csv_documents})
print(f"Loaded {len(csv_documents)} row documents from {csv_file_count} csv files.")

Loaded 2990 csvs.


In [23]:
### Directory Loader to load all excel files in a directory
# NOTE(review): this cell is currently commented out. The recorded run failed on
# "../data/excel/correct data.xlsx" with
# "ModuleNotFoundError: No module named 'msoffcrypto'".
# Presumably the missing module has to be installed into the kernel environment
# (likely the `msoffcrypto-tool` package - TODO confirm) before re-enabling it.
# from langchain_community.document_loaders import UnstructuredExcelLoader

# excel_loader = DirectoryLoader(
#     "../data/excel", 
#     glob="**/*.xlsx", 
#     loader_cls=UnstructuredExcelLoader, 
#     show_progress=False,
# )

# excel_documents = excel_loader.load()
# print(excel_documents)
# print(f"Loaded {len(excel_documents)} excel files.")

Error loading file ../data/excel/correct data.xlsx


ModuleNotFoundError: No module named 'msoffcrypto'