# Data Ingestion - Document Loaders

Reference: [LangChain document loader integrations](https://python.langchain.com/docs/integrations/document_loaders/)

In [1]:
## Text Loader
from langchain_community.document_loaders import TextLoader

# Pin the encoding explicitly: when no encoding is given the file is decoded
# with the platform's default text encoding, so the same notebook can fail or
# produce garbled text on another machine.
# NOTE(review): assumes speech.txt is UTF-8 encoded -- confirm.
loader = TextLoader('speech.txt', encoding='utf-8')
loader

<langchain_community.document_loaders.text.TextLoader at 0x107b80070>

In [2]:
# Run whichever loader is currently bound to `loader` and show the resulting
# list of Document objects as the cell output.
# NOTE: `loader` is reassigned by later cells, so this cell depends on being
# executed right after the text-loader cell.
text_documents = loader.load()
text_documents

[Document(metadata={'source': 'speech.txt'}, page_content='In deep learning, transformer is a neural network architecture based on the multi-head attention mechanism, in which text is converted to numerical representations called tokens, and each token is converted into a vector via lookup from a word embedding table.[1] At each layer, each token is then contextualized within the scope of the context window with other (unmasked) tokens via a parallel multi-head attention mechanism, allowing the signal for key tokens to be amplified and less important tokens to be diminished.\n\nTransformers have the advantage of having no recurrent units, therefore requiring less training time than earlier recurrent neural architectures (RNNs) such as long short-term memory (LSTM).[2] Later variations have been widely adopted for training large language models (LLMs) on large (language) datasets.[3]\n\nThe modern version of the transformer was proposed in the 2017 paper "Attention Is All You Need" by r

In [3]:
## Reading a PDF File
from langchain_community.document_loaders import PyPDFLoader

# Rebinds the shared name `loader` to a PDF loader for attention.pdf.
# NOTE: the same name is used for the text and web loaders elsewhere in this
# notebook, so any cell calling loader.load() is execution-order dependent.
loader = PyPDFLoader(file_path="attention.pdf")

In [None]:
# Load through the object currently bound to `loader` and display the
# returned Document list as the cell result.
pdf_documents = loader.load()
pdf_documents

[Document(metadata={'producer': 'pdfTeX-1.40.25', 'creator': 'LaTeX with hyperref', 'creationdate': '2024-04-10T21:11:43+00:00', 'author': '', 'keywords': '', 'moddate': '2024-04-10T21:11:43+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5', 'subject': '', 'title': '', 'trapped': '/False', 'source': 'attention.pdf', 'total_pages': 15, 'page': 0, 'page_label': '1'}, page_content='Provided proper attribution is provided, Google hereby grants permission to\nreproduce the tables and figures in this paper solely for use in journalistic or\nscholarly works.\nAttention Is All You Need\nAshish Vaswani∗\nGoogle Brain\navaswani@google.com\nNoam Shazeer∗\nGoogle Brain\nnoam@google.com\nNiki Parmar∗\nGoogle Research\nnikip@google.com\nJakob Uszkoreit∗\nGoogle Research\nusz@google.com\nLlion Jones∗\nGoogle Research\nllion@google.com\nAidan N. Gomez∗ †\nUniversity of Toronto\naidan@cs.toronto.edu\nŁukasz Kaiser∗\nGoogle Brain\nlukaszk

In [None]:
## Web based loader
from langchain_community.document_loaders import WebBaseLoader

# No parsing filter is supplied here, so the loaded text contains the page's
# navigation and sidebar boilerplate in addition to the article body.
loader = WebBaseLoader(
    web_paths=("https://en.wikipedia.org/wiki/Resident_Evil_4",),
)
web_docs = loader.load()
web_docs

[Document(metadata={'source': 'https://en.wikipedia.org/wiki/Resident_Evil_4', 'title': 'Resident Evil 4 - Wikipedia', 'language': 'en'}, page_content='\n\n\n\nResident Evil 4 - Wikipedia\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nJump to content\n\n\n\n\n\n\n\nMain menu\n\n\n\n\n\nMain menu\nmove to sidebar\nhide\n\n\n\n\t\tNavigation\n\t\n\n\nMain pageContentsCurrent eventsRandom articleAbout WikipediaContact us\n\n\n\n\n\n\t\tContribute\n\t\n\n\nHelpLearn to editCommunity portalRecent changesUpload fileSpecial pages\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nSearch\n\n\n\n\n\n\n\n\n\n\n\nSearch\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nAppearance\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nDonate\n\nCreate account\n\nLog in\n\n\n\n\n\n\n\n\nPersonal tools\n\n\n\n\n\nDonate Create account Log in\n\n\n\n\n\n\t\tPages for logged out editors learn more\n\n\n\nContributionsTalk\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nContents\nmove to sidebar\nhide\n\n\n\n\n(Top)\n

In [9]:
import bs4

# Same page as the unfiltered web-loader cell, but parsing is restricted with
# a SoupStrainer so only the matching element ends up in the document.
# `class_` is ONE string, not a tuple of class names.
# NOTE(review): presumably it has to match the element's full class attribute
# value in this exact order -- confirm against the Beautiful Soup docs.
loader = WebBaseLoader(
    web_paths=("https://en.wikipedia.org/wiki/Resident_Evil_4",),
    bs_kwargs={
        "parse_only": bs4.SoupStrainer(class_="infobox ib-video-game hproduct"),
    },
)
web_docs = loader.load()
web_docs

[Document(metadata={'source': 'https://en.wikipedia.org/wiki/Resident_Evil_4'}, page_content='Resident Evil 4North American cover artDeveloper(s)Capcom Production Studio 4[a]Publisher(s)\nCapcom\nWindowsJP: CapcomNA/PAL: UbisoftOculus Quest 2WW: Oculus Studios\nDirector(s)Shinji MikamiProducer(s)Hiroyuki KobayashiDesigner(s)Hiroshi ShibataKouji KakaeShigenori NishikawaProgrammer(s)Kiyohiko SakataWriter(s)Shinji MikamiComposer(s)Misao SenbongiShusaku UchiyamaSeriesResident EvilPlatform(s)\nGameCube\nPlayStation 2WindowsWiiiOSZeeboPlayStation 3Xbox 360AndroidPlayStation 4Xbox OneNintendo SwitchOculus Quest 2\nRelease\nJanuary 11, 2005\nGameCubeNA: January 11, 2005JP: January 27, 2005PAL: March 18, 2005PlayStation 2NA: October 25, 2005EU: November 4, 2005AU: November 9, 2005JP: December 1, 2005WindowsAU: March 1, 2007EU: March 2, 2007NA: May 15, 2007JP: June 7, 2007WW: February 27, 2014 (HD)WiiJP: May 31, 2007NA: June 19, 2007EU: June 29, 2007AU: July 5, 2007iOSNA: July 27, 2009JP: July 2

In [12]:
## Arxiv
from langchain_community.document_loaders import ArxivLoader

# The query string is an arXiv identifier rather than free-text search terms.
# NOTE(review): load_max_docs=2 presumably caps the number of papers
# returned -- confirm against the ArxivLoader docs.
docs = ArxivLoader(
    query="1706.03762",
    load_max_docs=2,
).load()
docs

[Document(metadata={'Published': '2023-08-02', 'Title': 'Attention Is All You Need', 'Authors': 'Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz Kaiser, Illia Polosukhin', 'Summary': 'The dominant sequence transduction models are based on complex recurrent or\nconvolutional neural networks in an encoder-decoder configuration. The best\nperforming models also connect the encoder and decoder through an attention\nmechanism. We propose a new simple network architecture, the Transformer, based\nsolely on attention mechanisms, dispensing with recurrence and convolutions\nentirely. Experiments on two machine translation tasks show these models to be\nsuperior in quality while being more parallelizable and requiring significantly\nless time to train. Our model achieves 28.4 BLEU on the WMT 2014\nEnglish-to-German translation task, improving over the existing best results,\nincluding ensembles by over 2 BLEU. On the WMT 2014 English-to-French\ntr