# Document loaders
Document Loaders are responsible for loading documents from a variety of sources.

# How to load PDFs

In [7]:
from langchain_community.document_loaders import PyPDFLoader

# Initialize the PyPDFLoader with the file path
loader = PyPDFLoader(
    file_path="C:\\Users\\Admin\\Downloads\\machine-learning-life-cycle.pdf",
    password=None,
    extract_images=False
)

# Load all documents synchronously
docs = loader.load()

# Print content and metadata for each page
for i, doc in enumerate(docs):
    print(f"Page {i}:")
    print(doc.page_content[:100])  # Print first 100 characters of page content
    print(doc.metadata)


Page 0:
Machine learning Life cycle
computer application (D. Y. Patil Agriculture and Technical University, 
{'source': 'C:\\Users\\Admin\\Downloads\\machine-learning-life-cycle.pdf', 'page': 0}
Page 1:
Machine learning Life cycle
Machine  learning  has  given  the  computer  systems  the  abil ities  
{'source': 'C:\\Users\\Admin\\Downloads\\machine-learning-life-cycle.pdf', 'page': 1}
Page 2:
determine the efficiency of the output. The more will be the data, the more accurate wil l be 
the p
{'source': 'C:\\Users\\Admin\\Downloads\\machine-learning-life-cycle.pdf', 'page': 2}
Page 3:
oMissing Values
oDuplicate data
oInvalid data
oNoise
4. Data Analysis
Now the cleaned and prepared d
{'source': 'C:\\Users\\Admin\\Downloads\\machine-learning-life-cycle.pdf', 'page': 3}
Page 4:
7. Deployment
The last step of machine learning life cycle is depl oyment, where we
deploy the model
{'source': 'C:\\Users\\Admin\\Downloads\\machine-learning-life-cycle.pdf', 'page': 4}


In [8]:
#pip install pypdf

# How to load web pages

In [9]:
%pip install -qU langchain-community beautifulsoup4

Note: you may need to restart the kernel to use updated packages.


In [16]:
import asyncio
import nest_asyncio
from langchain_community.document_loaders import WebBaseLoader

# Allow nested async calls
nest_asyncio.apply()

async def load_page_with_retry(url, retries=3):
    loader = WebBaseLoader(web_paths=[url])
    for attempt in range(retries):
        try:
            docs = []
            async for doc in loader.alazy_load():
                docs.append(doc)
            return docs
        except Exception as e:
            print(f"Attempt {attempt + 1} failed: {e}")
            if attempt < retries - 1:
                await asyncio.sleep(2)  # wait before retrying
    return []

# Usage
page_url = "https://python.langchain.com/docs/how_to/chatbots_memory/"
docs = await load_page_with_retry(page_url)
print(f"Number of documents loaded: {len(docs)}")

# Display the content of the loaded documents
for doc in docs:
    print(doc.page_content)  # or doc if you want to see the entire document object


Number of documents loaded: 1





How to add memory to chatbots | 🦜️🔗 LangChain






Skip to main contentIntegrationsAPI ReferenceMoreContributingPeopleError referenceLangSmithLangGraphLangChain HubLangChain JS/TSv0.3v0.3v0.2v0.1💬SearchIntroductionTutorialsBuild a Question Answering application over a Graph DatabaseTutorialsBuild a Simple LLM Application with LCELBuild a Query Analysis SystemBuild a ChatbotConversational RAGBuild an Extraction ChainBuild an AgentTaggingdata_generationBuild a Local RAG ApplicationBuild a PDF ingestion and Question/Answering systemBuild a Retrieval Augmented Generation (RAG) AppVector stores and retrieversBuild a Question/Answering system over SQL dataSummarize TextHow-to guidesHow-to guidesHow to use tools in a chainHow to use a vectorstore as a retrieverHow to add memory to chatbotsHow to use example selectorsHow to map values to a graph databaseHow to add a semantic layer over graph databaseHow to invoke runnables in parallelHow to stream chat model

In [17]:
import requests
from bs4 import BeautifulSoup

def scrape_page(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Customize the following line to match the HTML structure
    content = soup.find_all('p')  # Example: find all paragraph tags
    
    # Combine the content from all found elements
    full_content = "\n".join([p.get_text() for p in content])
    return full_content

# Usage
url = "https://python.langchain.com/docs/how_to/chatbots_memory/"
page_content = scrape_page(url)
print(page_content)


A key feature of chatbots is their ability to use content of previous conversation turns as context. This state management can take several forms, including:
We'll go into more detail on a few techniques below!
This how-to guide previously built a chatbot using RunnableWithMessageHistory. You can access this version of the guide in the v0.2 docs.
As of the v0.3 release of LangChain, we recommend that LangChain users take advantage of LangGraph persistence to incorporate memory into new LangChain applications.
If your code is already relying on RunnableWithMessageHistory or BaseChatMessageHistory, you do not need to make any changes. We do not plan on deprecating this functionality in the near future as it works for simple chat applications and any code that uses RunnableWithMessageHistory will continue to work as expected.
Please see How to migrate to LangGraph Memory for more details.
You'll need to install a few packages, and have your OpenAI API key set as an environment variable na

  self._namespaces = namespaces or {}


# How to load CSVs

* Loading a CSV file:

In [19]:
from langchain_community.document_loaders.csv_loader import CSVLoader

file_path = "C:\/Users\\Admin\\Desktop\\10-20-2024\\data\\user_behavior_dataset.csv"  # Update this to your CSV file path

loader = CSVLoader(file_path=file_path)
data = loader.load()

# Print the first two records
for record in data[:2]:
    print(record)


page_content='User ID: 1
Device Model: Google Pixel 5
Operating System: Android
App Usage Time (min/day): 393
Screen On Time (hours/day): 6.4
Battery Drain (mAh/day): 1872
Number of Apps Installed: 67
Data Usage (MB/day): 1122
Age: 40
Gender: Male
User Behavior Class: 4' metadata={'source': 'C:\\/Users\\Admin\\Desktop\\10-20-2024\\data\\user_behavior_dataset.csv', 'row': 0}
page_content='User ID: 2
Device Model: OnePlus 9
Operating System: Android
App Usage Time (min/day): 268
Screen On Time (hours/day): 4.7
Battery Drain (mAh/day): 1331
Number of Apps Installed: 42
Data Usage (MB/day): 944
Age: 47
Gender: Female
User Behavior Class: 3' metadata={'source': 'C:\\/Users\\Admin\\Desktop\\10-20-2024\\data\\user_behavior_dataset.csv', 'row': 1}


* Customizing CSV Loader

In [20]:
loader = CSVLoader(
    file_path=file_path,
    csv_args={
        "delimiter": ",",  # Separator
        "quotechar": '"',  # Character used for quotes
        "fieldnames": ["MLB Team", "Payroll in millions", "Wins"],  # Specify field names
    },
)

data = loader.load()
for record in data[:2]:
    print(record)


page_content='MLB Team: User ID
Payroll in millions: Device Model
Wins: Operating System
None: App Usage Time (min/day),Screen On Time (hours/day),Battery Drain (mAh/day),Number of Apps Installed,Data Usage (MB/day),Age,Gender,User Behavior Class' metadata={'source': 'C:\\/Users\\Admin\\Desktop\\10-20-2024\\data\\user_behavior_dataset.csv', 'row': 0}
page_content='MLB Team: 1
Payroll in millions: Google Pixel 5
Wins: Android
None: 393,6.4,1872,67,1122,40,Male,4' metadata={'source': 'C:\\/Users\\Admin\\Desktop\\10-20-2024\\data\\user_behavior_dataset.csv', 'row': 1}


* Specifying Source Colum

# Load CSV from String

In [22]:
import tempfile
from langchain_community.document_loaders.csv_loader import CSVLoader

string_data = """
Team,Payroll (millions),Wins
Nationals,81.34,98
Reds,82.20,97
Yankees,197.96,95
Giants,117.62,94
""".strip()

with tempfile.NamedTemporaryFile(delete=False, mode="w+") as temp_file:
    temp_file.write(string_data)
    temp_file_path = temp_file.name

loader = CSVLoader(file_path=temp_file_path)
data = loader.load()
for record in data[:2]:
    print(record)


page_content='Team: Nationals
Payroll (millions): 81.34
Wins: 98' metadata={'source': 'C:\\Users\\Admin\\AppData\\Local\\Temp\\tmpl3whpezq', 'row': 0}
page_content='Team: Reds
Payroll (millions): 82.20
Wins: 97' metadata={'source': 'C:\\Users\\Admin\\AppData\\Local\\Temp\\tmpl3whpezq', 'row': 1}
