## CSV and Excel files - Structured Data

In [1]:
import pandas as pd
import os
from langchain_community.document_loaders import UnstructuredExcelLoader, CSVLoader

In [3]:
# Create Sample Data
data = {
    "Product": ['Laptop', 'Mouse', 'Keyboard', 'Monitor','Webcam'],
    'Category': ['Electronics', 'Accessories', 'Accessoriec','Electronics','Electronics'],
    'Price': [999.99, 29.99, 79.99, 299.99, 89.99],
    'Stock': [50, 200, 150, 75, 100],
    'Description': [
        'hig-Performance laptop with 16GB RAM and 512GB SSD',
        'Wireless optical mouse with ergonomic design',
        'Mechanical keyboard with RGB backlighting',
        '27-inch 4K monitor with HDR support',
        '1080p webcam with noise cancellation'
    ]
}

# Save as CSV
df = pd.DataFrame(data)
df.to_csv("data/structured_files/products.csv")

## CSV Processing

In [4]:
from langchain_community.document_loaders import CSVLoader, UnstructuredCSVLoader

In [5]:
# Method 1: Using CSVLoader
print("Using CSVLoader")

try:
    csvloader = CSVLoader(
        "data/structured_files/products.csv",
        encoding='utf-8',
        csv_args={
            'delimiter':',',
            'quotechar': '"',
        }
    )
    csv_docs = csvloader.load()

    print(f"Loaded {len(csv_docs)} documents (one per row)")
    print("\nFirst document:")
    print(f"Content: {csv_docs[0].page_content}")
    print(f"Metadata: {csv_docs[0].metadata}")
except Exception as e:
    print(f"Error: {e}")

Using CSVLoader
Loaded 5 documents (one per row)

First document:
Content: : 0
Product: Laptop
Category: Electronics
Price: 999.99
Stock: 50
Description: hig-Performance laptop with 16GB RAM and 512GB SSD
Metadata: {'source': 'data/structured_files/products.csv', 'row': 0}


In [7]:
# Method 1: Using CSVLoader
print("Using UnstructuredCSVLoader")

try:
    u_csvloader = UnstructuredCSVLoader(
        "data/structured_files/products.csv",
        mode="elements",
        unstructured_csv_args={
            'delimiter':',',
            'quotechar': '"',
            'encoding': 'utf-8',
        }
    )
    csv_docs = u_csvloader.load()

    for i, doc in enumerate(csv_docs[:3]):
        print(f"\nElement {i+1}:")
        print(f"Type: {doc.metadata.get('category', 'unknown')}")
        print(f"Content: {doc.page_content[:100]}...")
except Exception as e:
    print(f"Error: {e}")

Using UnstructuredCSVLoader

Element 1:
Type: Table
Content: Product Category Price Stock Description 0.0 Laptop Electronics 999.99 50 hig-Performance laptop wit...


## Excel processing

In [16]:
from typing import List
from langchain_core.documents import Document

# Method 1: Using pandas for full control
print("Pandas based excel processing")
def process_excel_with_pandas(filepath: str) -> List[Document]:
    """Process Excel with shee awareness"""
    
    documents = []
    
    # Real all sheets
    excel_file = pd.ExcelFile(filepath)
    
    for sheet_name in excel_file.sheet_names:
        df = pd.read_excel(filepath, sheet_name=sheet_name)
        
        # Create document for each sheet
        sheet_content = f"Sheet: {sheet_name}\n"
        sheet_content += f"Columns: {', '.join(df.columns)}\n"
        sheet_content += f"Rows: {len(df)}\n\n"
        sheet_content += df.to_string(index=False)
        
        doc = Document(
            page_content=sheet_content,
            metadata={
                "source": filepath,
                "sheet_name": sheet_name,
                "num_rows": len(df),
                "num_columns": len(df.columns),
                "data_type": "excel_sheet",
            }                            
        )
        documents.append(doc)
        
    return documents

Pandas based excel processing


In [17]:
excel_docs = process_excel_with_pandas("data/structured_files/inventory.xlsx")
print(f"Processed {len(excel_docs)} sheets")

Processed 2 sheets


In [22]:
## Using UnstrcuturedExcelLoader
print("\nUsing UnstructuredExcelloader")

excel_loader = UnstructuredExcelLoader(
    "data/structured_files/inventory.xlsx",
    mode="elements"
)

docs = excel_loader.load()
docs


Using UnstructuredExcelloader


[Document(metadata={'source': 'data/structured_files/inventory.xlsx', 'file_directory': 'data/structured_files', 'filename': 'inventory.xlsx', 'last_modified': '2025-07-07T14:26:30', 'page_name': 'Products', 'page_number': 1, 'text_as_html': '<table><tr><td>Product</td><td>Category</td><td>Price</td><td>Stock</td><td>Description</td></tr><tr><td>Laptop</td><td>Electronics</td><td>999.99</td><td>50</td><td>High-performance laptop with 16GB RAM and 512GB SSD</td></tr><tr><td>Mouse</td><td>Accessories</td><td>29.99</td><td>200</td><td>Wireless optical mouse with ergonomic design</td></tr><tr><td>Keyboard</td><td>Accessories</td><td>79.99</td><td>150</td><td>Mechanical keyboard with RGB backlighting</td></tr><tr><td>Monitor</td><td>Electronics</td><td>299.99</td><td>75</td><td>27-inch 4K monitor with HDR support</td></tr><tr><td>Webcam</td><td>Electronics</td><td>89.99</td><td>100</td><td>1080p webcam with noise cancellation</td></tr></table>', 'languages': ['eng'], 'filetype': 'applicatio