#### Loading CSVs

In [1]:
from langchain_community.document_loaders.csv_loader import CSVLoader

In [2]:
# Path to the sample CSV, relative to the notebook's working directory.
file_path = "mlb_teams_2012.csv"

In [3]:
# Default CSVLoader: only the file path is given, no csv_args or source_column.
loader = CSVLoader(file_path=file_path)

In [4]:
# load() returns a list of Document objects; for this file there is one per CSV row.
data = loader.load()

In [5]:
# Check the container type returned by load().
type(data)

list

In [6]:
# Number of Documents produced from the CSV.
len(data)

30

In [8]:
# Print the raw list repr to show each Document's metadata and page_content.
# Note: this prints every loaded Document, so the output is long.
print(data)

[Document(metadata={'source': 'mlb_teams_2012.csv', 'row': 0}, page_content='Team: Nationals\n"Payroll (millions)": 81.34\n"Wins": 98'), Document(metadata={'source': 'mlb_teams_2012.csv', 'row': 1}, page_content='Team: Reds\n"Payroll (millions)": 82.20\n"Wins": 97'), Document(metadata={'source': 'mlb_teams_2012.csv', 'row': 2}, page_content='Team: Yankees\n"Payroll (millions)": 197.96\n"Wins": 95'), Document(metadata={'source': 'mlb_teams_2012.csv', 'row': 3}, page_content='Team: Giants\n"Payroll (millions)": 117.62\n"Wins": 94'), Document(metadata={'source': 'mlb_teams_2012.csv', 'row': 4}, page_content='Team: Braves\n"Payroll (millions)": 83.31\n"Wins": 94'), Document(metadata={'source': 'mlb_teams_2012.csv', 'row': 5}, page_content='Team: Athletics\n"Payroll (millions)": 55.37\n"Wins": 94'), Document(metadata={'source': 'mlb_teams_2012.csv', 'row': 6}, page_content='Team: Rangers\n"Payroll (millions)": 120.51\n"Wins": 93'), Document(metadata={'source': 'mlb_teams_2012.csv', 'row': 7

Customizing how the CSV is parsed

In [9]:
# Override the CSV parsing options; csv_args is passed through to the
# underlying csv reader.
# NOTE: because explicit fieldnames are supplied, the file's own header line is
# no longer consumed as column names -- it is read as an ordinary data row and
# shows up as the Document with row 0 when this loader is loaded.
loader = CSVLoader(
    file_path=file_path,
    csv_args=dict(
        delimiter=",",
        quotechar='"',
        fieldnames=["MLB Team", "Payroll in millions", "Wins"],
    ),
)

In [11]:
# Reload with the custom parsing options and preview the first two Documents.
data = loader.load()

# end="\n\n" terminates each record and leaves one blank line after it.
for record in data[:2]:
    print(record, end="\n\n")

page_content='MLB Team: Team
Payroll in millions: "Payroll (millions)"
Wins: "Wins"' metadata={'source': 'mlb_teams_2012.csv', 'row': 0}

page_content='MLB Team: Nationals
Payroll in millions: 81.34
Wins: 98' metadata={'source': 'mlb_teams_2012.csv', 'row': 1}



Specify a column to identify the document source

In [12]:
# source_column="Team": each Document's metadata["source"] is taken from the
# row's "Team" value instead of the CSV file path.
loader = CSVLoader(file_path=file_path, source_column="Team")

In [13]:
# Reload so the Documents pick up the source_column setting.
data = loader.load()

In [14]:
# Show only the first five Documents rather than the whole list.
data[:5]

[Document(metadata={'source': 'Nationals', 'row': 0}, page_content='Team: Nationals\n"Payroll (millions)": 81.34\n"Wins": 98'),
 Document(metadata={'source': 'Reds', 'row': 1}, page_content='Team: Reds\n"Payroll (millions)": 82.20\n"Wins": 97'),
 Document(metadata={'source': 'Yankees', 'row': 2}, page_content='Team: Yankees\n"Payroll (millions)": 197.96\n"Wins": 95'),
 Document(metadata={'source': 'Giants', 'row': 3}, page_content='Team: Giants\n"Payroll (millions)": 117.62\n"Wins": 94'),
 Document(metadata={'source': 'Braves', 'row': 4}, page_content='Team: Braves\n"Payroll (millions)": 83.31\n"Wins": 94')]

#### Loading PDFs

In [15]:
pip install -qU pypdf

Note: you may need to restart the kernel to use updated packages.


In [16]:
from langchain_community.document_loaders import PyPDFLoader

In [17]:
# Path to the sample PDF, relative to the notebook's working directory.
# NOTE: this rebinds `file_path`, which the CSV cells above also read; re-running
# those cells after this one would hand them the PDF path.
file_path = "./layout-parser-paper.pdf"

In [18]:
# Build a PDF loader for the file; this rebinds `loader` from the CSV loader above.
loader = PyPDFLoader(file_path)

In [19]:
# Accumulator for the items yielded by loader.alazy_load() in the next cell.
pages = []

In [20]:
async for page in loader.alazy_load():
    pages.append(page)

In [21]:
# print() (rather than a bare expression) so the newlines in the extracted text
# render as line breaks instead of "\n" escapes.
print(pages[0].page_content)

LayoutParser: A Uniﬁed Toolkit for Deep
Learning Based Document Image Analysis
Zejiang Shen1 (  ), Ruochen Zhang2, Melissa Dell3, Benjamin Charles Germain
Lee4, Jacob Carlson3, and Weining Li5
1 Allen Institute for AI
shannons@allenai.org
2 Brown University
ruochen zhang@brown.edu
3 Harvard University
{melissadell,jacob carlson}@fas.harvard.edu
4 University of Washington
bcgl@cs.washington.edu
5 University of Waterloo
w422li@uwaterloo.ca
Abstract. Recent advances in document image analysis (DIA) have been
primarily driven by the application of neural networks. Ideally, research
outcomes could be easily deployed in production and extended for further
investigation. However, various factors like loosely organized codebases
and sophisticated model conﬁgurations complicate the easy reuse of im-
portant innovations by a wide audience. Though there have been on-going
eﬀorts to improve reusability and simplify deep learning (DL) model
development in disciplines like natural language processin

In [22]:
# Number of items collected from the PDF.
len(pages)

16