In [42]:
import os
import pandas as pd
from typing import List,Dict,Any
from langchain_core.documents import Document

In [43]:
from langchain_community.document_loaders import CSVLoader,UnstructuredCSVLoader

In [44]:
csv_loader=CSVLoader(
    file_path='data/csv/products.csv',
    encoding='utf-8',
    csv_args={
        'delimiter': ',',
        'quotechar': '"',
    }
)
csv_docs=csv_loader.load()
print(f"Loaded {len(csv_docs)} documents")
print("\nFirst document")
print(f"Content : {csv_docs[0].page_content}")
print(f"Metadata : {csv_docs[0].metadata}")

Loaded 5 documents

First document
Content : Product: Laptop
Category: Electronics
Price: 999.99
Stock: 50
Description: High-performance laptop with 16GB RAM and 512GB SSD
Metadata : {'source': 'data/csv/products.csv', 'row': 0}


In [45]:
def process_csv_intelligent(filepath:str) -> List[Document]:
    """Process Csv with intelligent document creation"""
    df=pd.read_csv(filepath)
    documents=[]
    for idx,row in df.iterrows():
        content = f""" Product information
                Name: {row.name},
                Category: {row.Category},
                Description: {row.Description},
                Price: {row.Price},
            """
        doc=Document(
            page_content=content,
            metadata={
                'source':filepath,
                'language':'en',
                'row_index':idx,
                'product_name':row.name,
                'product_category':row.Category,
                'product_description':row.Description,
                'product_price':row.Price,
            }
        )
        documents.append(doc)
    return documents

In [46]:
info=process_csv_intelligent('data/csv/products.csv')

In [47]:
for i in range(len(info)):
    print(info[i])

page_content=' Product information
                Name: 0,
                Category: Electronics,
                Description: High-performance laptop with 16GB RAM and 512GB SSD,
                Price: 999.99,
            ' metadata={'source': 'data/csv/products.csv', 'language': 'en', 'row_index': 0, 'product_name': 0, 'product_category': 'Electronics', 'product_description': 'High-performance laptop with 16GB RAM and 512GB SSD', 'product_price': 999.99}
page_content=' Product information
                Name: 1,
                Category: Accessories,
                Description: Wireless optical mouse with ergonomic design,
                Price: 29.99,
            ' metadata={'source': 'data/csv/products.csv', 'language': 'en', 'row_index': 1, 'product_name': 1, 'product_category': 'Accessories', 'product_description': 'Wireless optical mouse with ergonomic design', 'product_price': 29.99}
page_content=' Product information
                Name: 2,
                Category: Access

In [62]:
def process_excel_with_pandas(filepath:str) -> List[Document]:
    documents=[]
    excel_file=pd.ExcelFile(filepath, engine='openpyxl')
    for sheet_name in excel_file.sheet_names:
        df=pd.read_excel(excel_file, sheet_name=sheet_name)
        sheet_content=f"sheet_name={sheet_name}"
        sheet_content+=f"Columns: {' '.join(df.columns)}"
        sheet_content+=f"Rows: {len(df)}"
        sheet_content+=f"Content: {df.to_string(index=False)}"

        doc =Document(
            page_content=sheet_content,
            metadata={
                'source':filepath,
            }
        )

        documents.append(doc)
    return documents


In [66]:
print(process_excel_with_pandas('data/csv/inventory.xlsx'))

[Document(metadata={'source': 'data/csv/inventory.xlsx'}, page_content='sheet_name=ProductsColumns: Product Category Price Stock DescriptionRows: 5Content:  Product    Category  Price  Stock                                         Description\n  Laptop Electronics 999.99     50 High-performance laptop with 16GB RAM and 512GB SSD\n   Mouse Accessories  29.99    200        Wireless optical mouse with ergonomic design\nKeyboard Accessories  79.99    150           Mechanical keyboard with RGB backlighting\n Monitor Electronics 299.99     75                 27-inch 4K monitor with HDR support\n  Webcam Electronics  89.99    100                1080p webcam with noise cancellation'), Document(metadata={'source': 'data/csv/inventory.xlsx'}, page_content='sheet_name=SummaryColumns: Category Total_Items Total_ValueRows: 2Content:    Category  Total_Items  Total_Value\nElectronics            3      1389.97\nAccessories            2       109.98')]


In [68]:
def unstructured_loading(file_path:str) -> List[Document]:
    documents=[]
    unstructured_csv=UnstructuredCSVLoader(
        file_path=file_path,
        mode='elements'
    )
    return unstructured_csv


UnicodeDecodeError: 'utf-8' codec can't decode byte 0xde in position 16: invalid continuation byte