### Data Ingestion

In [6]:
###Document structure
from langchain_core.documents import Document



In [7]:
doc=Document(page_content="This is the content of the document.", 
metadata={
    "source": "example.txt",
    "pages": 10,
    "author": "John Doe",
    "date_created": "2026-01-15"
    }
)
doc

Document(metadata={'source': 'example.txt', 'pages': 10, 'author': 'John Doe', 'date_created': '2026-01-15'}, page_content='This is the content of the document.')

In [8]:
import os

# Define the file path
file_path = os.path.join(os.getcwd(), "text_files/test_document.txt")

# Check if the file exists
if os.path.exists(file_path):
    print(f"File found at: {file_path}")
    print(f"File size: {os.path.getsize(file_path)} bytes")
    
    # Read the file content
    with open(file_path, 'r') as f:
        content = f.read()
    
    # Create a Document object with the file content
    doc = Document(
        page_content=content,
        metadata={
            "source": file_path,
            "file_name": os.path.basename(file_path),
            "directory": os.path.dirname(file_path),
            "file_size": os.path.getsize(file_path)
        }
    )
    
    print("\n--- Document Object ---")
    print(f"Page Content:\n{doc.page_content}")
    print(f"\nMetadata: {doc.metadata}")
else:
    print(f"File not found at: {file_path}")


File found at: e:\Programming\ML\jupyter-notebooks\notebook\text_files/test_document.txt
File size: 333 bytes

--- Document Object ---
Page Content:
This is a sample document for testing the Document class.
It contains multiple lines of text.
The Document class from langchain_core can process this content.
We can also store metadata about this document such as source, author, and creation date.
This file is created to demonstrate basic file operations using the os module.


Metadata: {'source': 'e:\\Programming\\ML\\jupyter-notebooks\\notebook\\text_files/test_document.txt', 'file_name': 'test_document.txt', 'directory': 'e:\\Programming\\ML\\jupyter-notebooks\\notebook\\text_files', 'file_size': 333}


In [12]:
### TextLoaders
from langchain_community.document_loaders import TextLoader

loader=TextLoader("./text_files/test_document.txt", encoding="utf-8")
document=loader.load()
print(document)

[Document(metadata={'source': './text_files/test_document.txt'}, page_content='This is a sample document for testing the Document class.\nIt contains multiple lines of text.\nThe Document class from langchain_core can process this content.\nWe can also store metadata about this document such as source, author, and creation date.\nThis file is created to demonstrate basic file operations using the os module.\n')]


In [13]:
### Directory Loader
from langchain_community.document_loaders import DirectoryLoader
##load all the text files from the directory
dir_loader=DirectoryLoader(
    "./text_files",
    glob="**/*.txt",
    loader_cls= TextLoader,
    loader_kwargs={"encoding":"utf-8"},
    show_progress=False
)

documents=dir_loader.load()
documents


[Document(metadata={'source': 'text_files\\data_science.txt'}, page_content='Data Science and Analytics\n\nData science is an interdisciplinary field that uses scientific methods, processes, algorithms and systems to extract meaningful information from data.\n\nKey Components of Data Science:\n1. Data Collection - Gathering data from various sources\n2. Data Cleaning - Preparing and validating data quality\n3. Exploratory Data Analysis - Understanding data patterns and relationships\n4. Statistical Analysis - Applying statistical methods to draw conclusions\n5. Machine Learning Modeling - Building predictive models\n6. Data Visualization - Presenting insights through visual representations\n\nTools and Technologies:\n- Python with libraries like pandas, scikit-learn, and matplotlib\n- R for statistical computing\n- SQL for database queries\n- Spark for big data processing\n- Tableau and Power BI for visualization\n\nCareer Path:\nData scientists combine programming skills, statistical 

In [18]:
from langchain_community.document_loaders import PyPDFLoader, PyMuPDFLoader

##load all the text files from the directory
dir_loader=DirectoryLoader(
    "./pdf_files",
    glob="**/*.pdf",
    loader_cls= PyMuPDFLoader,
    show_progress=False
)

pdf_documents=dir_loader.load()
pdf_documents


[Document(metadata={'producer': 'ReportLab PDF Library - (opensource)', 'creator': 'anonymous', 'creationdate': '2026-01-30T17:23:53+05:00', 'source': 'pdf_files\\document1.pdf', 'file_path': 'pdf_files\\document1.pdf', 'total_pages': 1, 'format': 'PDF 1.3', 'title': 'untitled', 'author': 'anonymous', 'subject': 'unspecified', 'keywords': '', 'moddate': '2026-01-30T17:23:53+05:00', 'trapped': '', 'modDate': "D:20260130172353+05'00'", 'creationDate': "D:20260130172353+05'00'", 'page': 0}, page_content='Machine Learning Fundamentals\nThis is the first dummy PDF document.\nIt contains information about machine learning basics.\nMachine learning algorithms learn patterns from data.\nCommon types: Supervised, Unsupervised, Reinforcement Learning.\nApplications include image recognition, NLP, and recommendations.\nDeep learning uses neural networks for complex tasks.'),
 Document(metadata={'producer': 'ReportLab PDF Library - (opensource)', 'creator': 'anonymous', 'creationdate': '2026-01-30

In [19]:
type(pdf_documents[0])

langchain_core.documents.base.Document