In [1]:
import langchain

In [2]:
langchain.__version__

'0.2.6'

# Document

In [3]:
from langchain_core.documents import Document

In [24]:
# Build a Document by hand: text in page_content, plus an id and free-form metadata.
data = Document(
    page_content='This is the article about document loaders of Langchain',
    id=1,
    metadata={'source': 'AV'},
)

In [17]:
# Scratch check: construct a Document with a non-string page_content.
# Bound to its own name so the `data` document inspected in the cells below is not
# replaced when the notebook is run top to bottom (this cell originally re-bound `data`).
numeric_content_doc = Document(page_content=21, id=1, metadata={'source': 'AV'})

In [21]:
# Scratch check: assign a non-string page_content after construction.
# Done on a copy so `data` keeps the text content that the following cells display.
reassigned_doc = data.copy()
reassigned_doc.page_content = 43

In [26]:
data

Document(id='1', metadata={'source': 'AV'}, page_content='This is the article about document loaders of Langchain')

In [28]:
data.page_content

'This is the article about document loaders of Langchain'

In [29]:
# Re-assign the id in place. The repr displayed right after this cell shows id=2 (an int),
# whereas the id=1 passed to the constructor was displayed as the string '1'.
# NOTE(review): the data.dict() output further down still shows id '1' — it was executed
# earlier (In [12]); a top-to-bottom run would show the re-assigned id instead.
data.id = 2

In [30]:
data

Document(id=2, metadata={'source': 'AV'}, page_content='This is the article about document loaders of Langchain')

In [12]:
data.dict()

{'id': '1',
 'metadata': {'source': 'AV'},
 'page_content': 'This is the article about document loaders of Langchain',
 'type': 'Document'}

In [16]:
data.schema()

{'title': 'Document',
 'description': 'Class for storing a piece of text and associated metadata.\n\nExample:\n\n    .. code-block:: python\n\n        from langchain_core.documents import Document\n\n        document = Document(\n            page_content="Hello, world!",\n            metadata={"source": "https://example.com"}\n        )',
 'type': 'object',
 'properties': {'id': {'title': 'Id', 'type': 'string'},
  'metadata': {'title': 'Metadata', 'type': 'object'},
  'page_content': {'title': 'Page Content', 'type': 'string'},
  'type': {'title': 'Type',
   'default': 'Document',
   'enum': ['Document'],
   'type': 'string'}},
 'required': ['page_content']}

# CSV

In [308]:
from langchain_community.document_loaders.csv_loader import CSVLoader

In [309]:
file_path = "./iris.csv"

# source_column='species': the metadata shown for this variant has 'source' set to the
# row's species ('setosa') rather than the file path.
# metadata_columns=['species']: the species value is carried in metadata.
loader = CSVLoader(
    file_path=file_path,
    source_column='species',
    metadata_columns=['species'],
    csv_args={"delimiter": ","},
)

In [313]:
# Same CSV without source_column: the metadata displayed for this variant has
# 'source' equal to the file path. Reuses `file_path` instead of repeating the literal.
loader = CSVLoader(
    file_path=file_path,
    metadata_columns=['species'],
    csv_args={"delimiter": ","},
)

In [314]:
data = loader.load()

In [315]:
len(data)

150

In [312]:
data[0].metadata

{'source': 'setosa', 'row': 0, 'species': 'setosa'}

In [316]:
data[0].metadata

{'source': './iris.csv', 'row': 0, 'species': 'setosa'}

In [317]:
data[0].dict()

{'id': None, 'metadata': {'source': './iris.csv', 'row': 0, 'species': 'setosa'}, 'page_content': 'sepal_length: 5.1\nsepal_width: 3.5\npetal_length: 1.4\npetal_width: 0.2', 'type': 'Document'}

In [318]:
data[130].dict()

{'id': None, 'metadata': {'source': './iris.csv', 'row': 130, 'species': 'virginica'}, 'page_content': 'sepal_length: 7.4\nsepal_width: 2.8\npetal_length: 6.1\npetal_width: 1.9', 'type': 'Document'}

In [319]:
# Show the printed form (page_content + metadata) of the first two row documents.
for row_doc in data[:2]:
    print(row_doc)

page_content='sepal_length: 5.1
sepal_width: 3.5
petal_length: 1.4
petal_width: 0.2' metadata={'source': './iris.csv', 'row': 0, 'species': 'setosa'}
page_content='sepal_length: 4.9
sepal_width: 3.0
petal_length: 1.4
petal_width: 0.2' metadata={'source': './iris.csv', 'row': 1, 'species': 'setosa'}


# HTML

In [1]:
from langchain_community.document_loaders import UnstructuredHTMLLoader

In [320]:
from langchain_community.document_loaders import UnstructuredURLLoader

In [327]:
# mode='elements': the load below returns many documents for the single URL (61 here),
# each carrying a 'category' entry in its metadata.
loader = UnstructuredURLLoader(
    urls=['https://diataxis.fr'],
    mode='elements',
)

In [328]:
data = loader.load()

In [329]:
len(data)

61

In [333]:
data[28].metadata

{'languages': ['eng'], 'parent_id': '312017038db4f2ad1e9332fc5a40bb9d', 'filetype': 'text/html', 'url': 'https://diataxis.fr', 'category': 'NarrativeText'}

In [45]:
data[10].metadata

{'category_depth': 1,
 'link_texts': ['Reference'],
 'link_urls': ['reference/'],
 'link_start_indexes': [0],
 'languages': ['eng'],
 'parent_id': '0fad7a6f72048eb5d179263aebc48c5a',
 'filetype': 'text/html',
 'url': 'https://diataxis.fr',
 'category': 'ListItem'}

In [334]:
data[10].metadata

{'category_depth': 1, 'link_texts': ['Reference'], 'link_urls': ['reference/'], 'link_start_indexes': [0], 'languages': ['eng'], 'parent_id': '0fad7a6f72048eb5d179263aebc48c5a', 'filetype': 'text/html', 'url': 'https://diataxis.fr', 'category': 'ListItem'}

In [51]:
print(data[40].page_content)

Reference


In [332]:
print(data[28].page_content)

Diátaxis is a way of thinking about and doing documentation.


In [54]:
# Print the text of ten consecutive elements (indices 27-36) to read the page intro in order.
# The unused enumerate() index and the commented-out debug prints were dropped.
for element in data[27:37]:
    print(element.page_content)

A systematic approach to technical documentation authoring.
Diátaxis is a way of thinking about and doing documentation.
It prescribes approaches to content, architecture and form that emerge from a systematic approach to understanding the needs of documentation users.
Diátaxis, from the Ancient Greek δῐᾰ́τᾰξῐς: dia (“across”) and taxis (“arrangement”).
Diátaxis identifies four distinct needs, and four corresponding forms of documentation - tutorials, how-to guides, technical reference and explanation. It places them in a systematic relationship, and proposes that documentation should itself be organised around the structures of those needs.
Diátaxis solves problems related to documentation content (what to write), style (how to write it) and architecture (how to organise it).
As well as serving the users of documentation, Diátaxis has value for documentation creators and maintainers. It is light-weight, easy to grasp and straightforward to apply. It doesn’t impose implementation const

# Markdown

In [2]:
from langchain_community.document_loaders import UnstructuredMarkdownLoader

In [10]:
# Element-level load of the README: the load below returns 1458 documents,
# each with a 'category' (e.g. 'Title') in its metadata.
loader = UnstructuredMarkdownLoader(
    'README.md',
    mode='elements',
)

In [7]:
# Alternative 'paged' mode, kept under its own name: the outputs shown below were
# produced with the 'elements' loader, which this cell used to overwrite in file order.
paged_loader = UnstructuredMarkdownLoader('README.md', mode='paged')

In [11]:
data = loader.load()

In [12]:
len(data)

1458

In [349]:
data[0].metadata

{'source': 'README.md', 'last_modified': '2024-07-09T12:52:53', 'languages': ['eng'], 'filetype': 'text/markdown', 'filename': 'README.md', 'category': 'Title'}

In [350]:
data[0].metadata

{'source': 'README.md', 'last_modified': '2024-07-09T12:52:53', 'languages': ['eng'], 'filetype': 'text/markdown', 'filename': 'README.md', 'category': 'Title'}

In [351]:
data[700].metadata

{'source': 'README.md', 'last_modified': '2024-07-09T12:52:53', 'languages': ['eng'], 'filetype': 'text/markdown', 'filename': 'README.md', 'category': 'Title'}

In [352]:
data[700].page_content

'NeuralProphet (🥈28 ·  ⭐ 3.7K) - NeuralProphet: A simple forecasting package.'

# JSON

In [58]:
import json

In [87]:
from pprint import pprint

In [353]:
from langchain_community.document_loaders import JSONLoader

In [354]:
# jq_schema='.' selects the whole file: the load below returns a single document
# whose page_content is the JSON text of the top-level object.
# NOTE(review): text_content=False is presumably needed because the selected value is
# an object rather than a string — confirm against the JSONLoader documentation.
loader = JSONLoader(
    file_path='chat.json',
    jq_schema='.',
    text_content=False,
)

In [355]:
data = loader.load()

In [356]:
len(data)

1

In [357]:
data[0].metadata

{'source': '/home/santhosh/Projects/RAG/Langchain/chat.json', 'seq_num': 1}

In [358]:
pprint(data[0].page_content)

('{"image": {"creation_timestamp": 1675549016, "uri": '
 '"image_of_the_chat.jpg"}, "is_still_participant": true, "joinable_mode": '
 '{"link": "", "mode": 1}, "magic_words": [], "messages": [{"content": "Bye!", '
 '"sender_name": "User 2", "timestamp_ms": 1675597571851}, {"content": "Oh no '
 'worries! Bye", "sender_name": "User 1", "timestamp_ms": 1675597435669}, '
 '{"content": "No Im sorry it was my mistake, the blue one is not for sale", '
 '"sender_name": "User 2", "timestamp_ms": 1675596277579}, {"content": "I '
 'thought you were selling the blue one!", "sender_name": "User 1", '
 '"timestamp_ms": 1675595140251}, {"content": "Im not interested in this bag. '
 'Im interested in the blue one!", "sender_name": "User 1", "timestamp_ms": '
 '1675595109305}, {"content": "Here is $129", "sender_name": "User 2", '
 '"timestamp_ms": 1675595068468}, {"photos": [{"creation_timestamp": '
 '1675595059, "uri": "url_of_some_picture.jpg"}], "sender_name": "User 2", '
 '"timestamp_ms": 16755950

In [368]:
# One document per message body: the load below returns 11 documents,
# the first of which is 'Bye!'.
loader = JSONLoader(
    file_path='chat.json',
    jq_schema='.messages[].content',
    text_content=False,
)

In [363]:
# Variant selecting whole message objects instead of just their content.
# Kept under its own name so it no longer re-binds `loader`; the outputs shown below
# come from the '.messages[].content' loader.
message_loader = JSONLoader(file_path='chat.json', jq_schema='.messages[]', text_content=False)

In [359]:
# Variant selecting only the top-level 'title' field.
# Kept under its own name so it no longer re-binds `loader`; in file order it used to
# be the loader actually loaded below, which does not match the displayed outputs.
title_loader = JSONLoader(file_path='chat.json', jq_schema='.title', text_content=False)

In [369]:
data = loader.load()

In [370]:
len(data)

11

In [371]:
data[0].metadata

{'source': '/home/santhosh/Projects/RAG/Langchain/chat.json', 'seq_num': 1}

In [372]:
pprint(data[0].page_content)

'Bye!'


# Word

In [108]:
from langchain_community.document_loaders import UnstructuredWordDocumentLoader

Reference for the chunking options used below (the chunking dispatcher in `unstructured`): https://github.com/Unstructured-IO/unstructured/blob/main/unstructured/chunking/dispatch.py

In [231]:
# Element-level load of the .docx with by-title chunking; the first chunk displayed
# below has category 'CompositeElement'.
# NOTE(review): new_after_n_chars=20 is far below max_characters=200 — presumably a
# soft vs. hard chunk-size limit in unstructured's chunking; confirm against its docs.
chunking_options = dict(
    chunking_strategy='by_title',
    max_characters=200,
    new_after_n_chars=20,
)
loader = UnstructuredWordDocumentLoader(
    file_path='Polars.docx',
    mode='elements',
    **chunking_options,
)

In [232]:
data = loader.load()

In [233]:
len(data)

67

In [234]:
data[0].metadata

{'source': 'Polars.docx', 'emphasized_text_contents': ['Introduction'], 'emphasized_text_tags': ['b'], 'filename': 'Polars.docx', 'languages': ['eng'], 'last_modified': '2024-07-09T17:49:42', 'orig_elements': 'eJxtj81uwjAQhF8F+VyjkDi4yRtw64EbQtbi3QRL/omSRYIi3h07nCr1OvONZub0FOQpUGTjUPQbYS+oW2hR0qAbqbDZS9BoJbbfzb7W1ClVia+NCMSAwJAzT2GBaUzzwyBNfM3SLhMUpiss7pfQMN3Z2BQ59yzZPolD5DnhzbJLUZz/oRnGD3lZ7cF5ihCoTPxJHuZli8neyxIPcbzBSB+c4rgGPCxsQkI3OFqP1VWtZKVl1R13ulddr2rxymBpK/6fRUV/TGvb0bEn8Tq/Ad4UXrY=', 'filetype': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', 'category': 'CompositeElement'}

In [235]:
data[0].page_content

'Introduction'

In [236]:
data[40].page_content

'q.show_graph(optimized=False)123456789python'

# PDF

Benchmarks comparing PDF text-extraction libraries: https://github.com/py-pdf/benchmarks

In [79]:
from langchain_community.document_loaders import PyMuPDFLoader

In [80]:
# PDF load via PyMuPDF: the load below returns 23 documents, matching the
# 'total_pages': 23 reported in the metadata.
loader = PyMuPDFLoader(
    file_path='how-to-formulate-successful-business-strategy.pdf',
    extract_images=True,
)

In [81]:
data = loader.load()

In [377]:
len(data)

23

In [378]:
data[0].metadata

{'source': 'how-to-formulate-successful-business-strategy.pdf', 'file_path': 'how-to-formulate-successful-business-strategy.pdf', 'page': 0, 'total_pages': 23, 'format': 'PDF 1.7', 'title': 'How to Formulate a Successful Business Strategy', 'author': 'Harvard Business School Online', 'subject': '', 'keywords': '', 'creator': 'Adobe InDesign 17.4 (Macintosh)', 'producer': 'Adobe PDF Library 16.0.7', 'creationDate': "D:20221018124609-04'00'", 'modDate': "D:20221018124635-04'00'", 'trapped': ''}

In [379]:
data[0].page_content

'How to Formulate \na\xa0Successful \nBusiness\xa0Strategy\nOnline'

In [1]:
from langchain_community.document_loaders import UnstructuredPDFLoader

In [2]:
# Element-level load of the same PDF via unstructured: the load below returns
# 177 documents, each with a 'category' and 'page_number' in its metadata.
loader = UnstructuredPDFLoader(
    'how-to-formulate-successful-business-strategy.pdf',
    mode='elements',
    strategy="auto",
)

In [3]:
data = loader.load()

In [383]:
len(data)

177

In [267]:
data[0].metadata

{'source': 'how-to-formulate-successful-business-strategy.pdf', 'coordinates': {'points': ((71.94, 211.28069999999997), (71.94, 407.2807), (587.8439999999999, 407.2807), (587.8439999999999, 211.28069999999997)), 'system': 'PixelSpace', 'layout_width': 792.0, 'layout_height': 612.0}, 'filename': 'how-to-formulate-successful-business-strategy.pdf', 'languages': ['eng'], 'last_modified': '2023-02-24T01:43:06', 'page_number': 1, 'filetype': 'application/pdf', 'category': 'Title'}

In [268]:
data[0].page_content

'How to Formulate a Successful Business Strategy'

In [269]:
data[16].page_content

'Strategic planning is the ongoing organizational process of using available knowledge to document a business’s intended direction. This process is used to prioritize efforts, effectively allocate resources, align shareholders and employees on goals, and ensure those goals are backed by data and sound reasoning.'

# Directory

In [270]:
from langchain_community.document_loaders import DirectoryLoader

In [304]:
# Load every *.json file under the current directory (recursively) with JSONLoader.
# The options dict is passed to JSONLoader via loader_kwargs.
json_loader_options = {'jq_schema': '.', 'text_content': False}
loader = DirectoryLoader(
    ".",
    glob="**/*.json",
    loader_cls=JSONLoader,
    loader_kwargs=json_loader_options,
    show_progress=True,
    use_multithreading=True,
)

In [305]:
docs = loader.load()






100%|██████████| 1/1 [00:00<00:00, 154.49it/s]


In [306]:
len(docs)

1

In [307]:
docs[0].page_content

'{"image": {"creation_timestamp": 1675549016, "uri": "image_of_the_chat.jpg"}, "is_still_participant": true, "joinable_mode": {"link": "", "mode": 1}, "magic_words": [], "messages": [{"content": "Bye!", "sender_name": "User 2", "timestamp_ms": 1675597571851}, {"content": "Oh no worries! Bye", "sender_name": "User 1", "timestamp_ms": 1675597435669}, {"content": "No Im sorry it was my mistake, the blue one is not for sale", "sender_name": "User 2", "timestamp_ms": 1675596277579}, {"content": "I thought you were selling the blue one!", "sender_name": "User 1", "timestamp_ms": 1675595140251}, {"content": "Im not interested in this bag. Im interested in the blue one!", "sender_name": "User 1", "timestamp_ms": 1675595109305}, {"content": "Here is $129", "sender_name": "User 2", "timestamp_ms": 1675595068468}, {"photos": [{"creation_timestamp": 1675595059, "uri": "url_of_some_picture.jpg"}], "sender_name": "User 2", "timestamp_ms": 1675595060730}, {"content": "Online is at least $100", "sende

# YouTube

In [31]:
from langchain_community.document_loaders import YoutubeLoader

In [40]:
video_url = 'https://www.youtube.com/watch?v=LKCVKw9CzFo'

In [66]:
# Build the loader from `video_url` (defined above) rather than repeating the video id
# by hand; from_youtube_url extracts the id from the URL.
# add_video_info=True adds title, author, view count, etc. to the metadata shown below.
loader = YoutubeLoader.from_youtube_url(video_url, add_video_info=True)

In [67]:
data = loader.load()

In [68]:
len(data)

1

In [69]:
data[0].metadata

{'source': 'LKCVKw9CzFo',
 'title': '100+ Linux Things you Need to Know',
 'description': 'Unknown',
 'view_count': 713960,
 'thumbnail_url': 'https://i.ytimg.com/vi/LKCVKw9CzFo/hq720.jpg?sqp=-oaymwEmCIAKENAF8quKqQMa8AEB-AH-CYAC0AWKAgwIABABGDsgUShyMA8=&rs=AOn4CLBBFIS2VzRUY1irEVO3f41PZkS1aw',
 'publish_date': '2024-07-06 00:00:00',
 'length': 742,
 'author': 'Fireship'}

In [70]:
data[0].page_content

"statistically 96% of the humans watching this video are not using Linux and that's just like really sad because it's a superior free open source operating system but only has a 4% share of the PC market luckily though 96% of the non-human Bots watching this video are using Linux because it is the dominant OS on the server if you're a programmer or developer you need to know Linux that's where your code will eventually run and fail and if you can't SSH into a Linux terminal and fix it you are screwed in today's video you'll learn everything you need to know about Linux by looking at 101 essential Concepts over the next 10 minutes if you survive until the end you should magically grow neck beard and be able to technobabble like an arch user before one can understand Linux though one must recognize what came before it Unix an operating system developed at AT&T Bell labs in the 70s its development led to a standardization called posix or portable operating system interface to ensure that 

# Wikipedia

In [47]:
from langchain_community.document_loaders import WikipediaLoader

In [71]:
# Wikipedia search for the query; the load below returns 5 documents.
# Going by the parameter names: at most 5 pages, content capped at 5000 characters,
# and all available metadata fields included (see the metadata keys listed below).
loader = WikipediaLoader(
    query='Generative AI',
    load_max_docs=5,
    doc_content_chars_max=5000,
    load_all_available_meta=True,
)

In [72]:
data = loader.load()

In [73]:
len(data)

5

In [76]:
data[0].metadata.keys()

dict_keys(['title', 'summary', 'source', 'categories', 'page_url', 'image_urls', 'related_titles', 'parent_id', 'references', 'revision_id', 'sections'])

In [78]:
# List the title of every Wikipedia page that was loaded.
for wiki_doc in data:
    print(wiki_doc.metadata['title'])

Generative artificial intelligence
AI boom
Generative pre-trained transformer
ChatGPT
Artificial intelligence


In [77]:
data[0].metadata['title']

'Generative artificial intelligence'

In [60]:
data[0].page_content

'Generative artificial intelligence (generative AI, GenAI, or GAI) is artificial intelligence capable of generating text, images, videos, or other data using generative models, often in response to prompts. Generative AI models learn the patterns and structure of their input training data and then generate new data that has similar characteristics.\nImprovements in transformer-based deep neural networks, particularly large language models (LLMs), enabled an AI boom of generative AI systems in the early 2020s. These include chatbots such as ChatGPT, Copilot, Gemini and LLaMA, text-to-image artificial intelligence image generation systems such as Stable Diffusion, Midjourney and DALL-E, and text-to-video AI generators such as Sora. Companies such as OpenAI, Anthropic, Microsoft, Google, and Baidu as well as numerous smaller firms have developed generative AI models.\nGenerative AI has uses across a wide range of industries, including software development, healthcare, finance, entertainme