In [1]:
#! pip install langchain
#! pip install langchain_community
#! pip install openai

import os
from openai import OpenAI

# Document loading 
There are 80+ document loaders to load data from web-sites, YouTube etc.


In [2]:
#! pip install pypdf

In [3]:
from langchain_community.document_loaders import PyPDFLoader
loader = PyPDFLoader("data/MachineLearning-Lecture01.pdf")
pages = loader.load()

In [4]:
len(pages)

22

In [5]:
page = pages[0]
print(page.page_content[0:500])

MachineLearning-Lecture01  
Instructor (Andrew Ng):  Okay. Good morning. Welcome to CS229, the machine 
learning class. So what I wanna do today is ju st spend a little time going over the logistics 
of the class, and then we'll start to  talk a bit about machine learning.  
By way of introduction, my name's  Andrew Ng and I'll be instru ctor for this class. And so 
I personally work in machine learning, and I' ve worked on it for about 15 years now, and 
I actually think that machine learning i


In [6]:
page.metadata

{'source': 'data/MachineLearning-Lecture01.pdf', 'page': 0}

## Load YouTube 

In [7]:
from langchain.document_loaders.generic import GenericLoader

# Whisper parser converts audio to text
from langchain.document_loaders.parsers import OpenAIWhisperParser

# This module will allow us to pull audi. 
from langchain.document_loaders.blob_loaders.youtube_audio import YoutubeAudioLoader

In [8]:
#! pip install yt_dlp
#! pip install pydub

In [9]:
url="https://www.youtube.com/watch?v=jGwO_UgTS7I"
save_dir="docs/youtube/"
loader = GenericLoader(
    YoutubeAudioLoader([url],save_dir),
    OpenAIWhisperParser()
)
docs = loader.load()

[youtube] Extracting URL: https://www.youtube.com/watch?v=jGwO_UgTS7I
[youtube] jGwO_UgTS7I: Downloading webpage
[youtube] jGwO_UgTS7I: Downloading ios player API JSON
[youtube] jGwO_UgTS7I: Downloading web creator player API JSON
[youtube] jGwO_UgTS7I: Downloading m3u8 information
[info] jGwO_UgTS7I: Downloading 1 format(s): 140
[download] docs/youtube//Stanford CS229： Machine Learning Course, Lecture 1 - Andrew Ng (Autumn 2018).m4a has already been downloaded
[download] 100% of   69.76MiB
[ExtractAudio] Not converting audio docs/youtube//Stanford CS229： Machine Learning Course, Lecture 1 - Andrew Ng (Autumn 2018).m4a; file is already in target format m4a
Transcribing part 1!
Transcribing part 2!
Transcribing part 3!
Transcribing part 4!


In [12]:
len(docs)

4

In [13]:
docs[0].page_content[0:500]

"Welcome to CS229 Machine Learning. Uh, some of you know that this is a class that's taught at Stanford for a long time. And this is often the class that, um, I most look forward to teaching each year because this is where we've helped, I think, several generations of Stanford students become experts in machine learning, got- built many of their products and services and startups that I'm sure, many of you or probably all of you are using, uh, uh, today. Um, so what I want to do today was spend s"

## Load from URLs 

In [17]:
from langchain.document_loaders import WebBaseLoader

url = 'https://raw.githubusercontent.com/basecamp/handbook/refs/heads/master/how-we-work.md'
loader = WebBaseLoader(url)

In [18]:
docs = loader.load()
print(docs[0].page_content[:500])

# How We Work

## Remotely

37signals is a fully distributed company. Our team works from all over the world, across 5 continents. We don't care where employees choose to live and work, just that they're here to do great work on exceptional products, alongside a world-class team. We’ve been remote since we started, and our founders literally [wrote the book](https://basecamp.com/books/remote) on the subject.

You can work from anywhere, but please be sure to inform your People Ops team when you 


# Document splitter 

We will now explore different ways to split text 

In [19]:
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter

In [20]:
# Artifically small values to see how things work 
chunk_size =26
chunk_overlap = 4

In [22]:
#setup both the splitters
r_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap
)
c_splitter = CharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap
)

In [26]:
# First string doesn't even require splitting since it is less than 26
text1 = 'abcdefghijklmnopqrstuvwxyz'
r_splitter.split_text(text1)

['abcdefghijklmnopqrstuvwxyz']

In [27]:
# In this case the string is more than 26 char. In output notice the overlap of 4 "wxyz" in both chunks
text2 = 'abcdefghijklmnopqrstuvwxyzabcdefg'
r_splitter.split_text(text2)

['abcdefghijklmnopqrstuvwxyz', 'wxyzabcdefg']

In [33]:
# Note that recursive splitter is counting the spaces 
text3 = "a b c d e f g h i j k l m n o p q r s t u v w x y z"
r_splitter.split_text(text3)

['a b c d e f g h i j k l m', 'l m n o p q r s t u v w x', 'w x y z']

In [34]:
# By default char splitter splits on newline.
c_splitter.split_text(text3)

['a b c d e f g h i j k l m', 'l m n o p q r s t u v w x', 'w x y z']

In [35]:
# Now we set separator to space, and now it behaves same as recursive splitter
c_splitter = CharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
    separator = ' '
)
c_splitter.split_text(text3)

['a b c d e f g h i j k l m', 'l m n o p q r s t u v w x', 'w x y z']

# Recursive splitter 
Recursive splitter is recommended for generic text since we can provide rules on chunk boundaries beyond a simple separator.

In [36]:
some_text = """When writing documents, writers will use document structure to group content. \
This can convey to the reader, which idea's are related. For example, closely related ideas \
are in sentances. Similar ideas are in paragraphs. Paragraphs form a document. \n\n  \
Paragraphs are often delimited with a carriage return or two carriage returns. \
Carriage returns are the "backslash n" you see embedded in this string. \
Sentences have a period at the end, but also, have a space.\
and words are separated by space."""

In [38]:
print(some_text)

When writing documents, writers will use document structure to group content. This can convey to the reader, which idea's are related. For example, closely related ideas are in sentances. Similar ideas are in paragraphs. Paragraphs form a document. 

  Paragraphs are often delimited with a carriage return or two carriage returns. Carriage returns are the "backslash n" you see embedded in this string. Sentences have a period at the end, but also, have a space.and words are separated by space.


In [39]:
c_splitter = CharacterTextSplitter(
    chunk_size=450,
    chunk_overlap=0,
    separator = ' '
)
c_splitter.split_text(some_text)
# char splitter breaks the chunk in middle of a sentence

['When writing documents, writers will use document structure to group content. This can convey to the reader, which idea\'s are related. For example, closely related ideas are in sentances. Similar ideas are in paragraphs. Paragraphs form a document. \n\n Paragraphs are often delimited with a carriage return or two carriage returns. Carriage returns are the "backslash n" you see embedded in this string. Sentences have a period at the end, but also,',
 'have a space.and words are separated by space.']

In [40]:
r_splitter = RecursiveCharacterTextSplitter(
    chunk_size=450,
    chunk_overlap=0, 
    separators=["\n\n", "\n", " ", ""]
)
r_splitter.split_text(some_text)

# Recursive splitter does a better job since we can specify rules like break on two newlines, even if chunk-size is not reached



["When writing documents, writers will use document structure to group content. This can convey to the reader, which idea's are related. For example, closely related ideas are in sentances. Similar ideas are in paragraphs. Paragraphs form a document.",
 'Paragraphs are often delimited with a carriage return or two carriage returns. Carriage returns are the "backslash n" you see embedded in this string. Sentences have a period at the end, but also, have a space.and words are separated by space.']

In [51]:
# will make smaller chunks now. Notice that period is being added to the next chunk
r_splitter = RecursiveCharacterTextSplitter(
    chunk_size=150,
    chunk_overlap=0,
    separators=["\n\n", "\n", ". ", " ", ""]
)
r_splitter.split_text(some_text)

["When writing documents, writers will use document structure to group content. This can convey to the reader, which idea's are related",
 '. For example, closely related ideas are in sentances. Similar ideas are in paragraphs. Paragraphs form a document.',
 'Paragraphs are often delimited with a carriage return or two carriage returns',
 '. Carriage returns are the "backslash n" you see embedded in this string',
 '. Sentences have a period at the end, but also, have a space.and words are separated by space.']

In [53]:
r_splitter = RecursiveCharacterTextSplitter(
    chunk_size=150,
    chunk_overlap=0,
    separators=["\n\n", "\n", r"(?<=\. )", " ", ""]
)
r_splitter.split_text(some_text)
# This doesn't reall appear to be working since regex is not taking effect :-) 

["When writing documents, writers will use document structure to group content. This can convey to the reader, which idea's are related. For example,",
 'closely related ideas are in sentances. Similar ideas are in paragraphs. Paragraphs form a document.',
 'Paragraphs are often delimited with a carriage return or two carriage returns. Carriage returns are the "backslash n" you see embedded in this',
 'string. Sentences have a period at the end, but also, have a space.and words are separated by space.']

## Split a PDF file 

In [54]:
from langchain.document_loaders import PyPDFLoader
loader = PyPDFLoader("data/MachineLearning-Lecture01.pdf")
pages = loader.load()


In [55]:
from langchain.text_splitter import CharacterTextSplitter
text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=1000,
    chunk_overlap=150,
    length_function=len
)
docs = text_splitter.split_documents(pages)

In [56]:
len(docs)  # Note we have more chunks than the number of pages 

77

In [57]:
len(pages)

22

## Splitting on token 
LLM split on token which are often 4 characters long. 


In [60]:
#!pip install tiktoken

In [61]:
from langchain.text_splitter import TokenTextSplitter
text_splitter = TokenTextSplitter(chunk_size=1, chunk_overlap=0)

In [62]:
text1 = "foo bar bazzyfoo"
text_splitter.split_text(text1)

['foo', ' bar', ' b', 'az', 'zy', 'foo']

In [65]:
text_splitter = TokenTextSplitter(chunk_size=10, chunk_overlap=0)
docs = text_splitter.split_documents(pages)

In [66]:
docs[0]

Document(metadata={'source': 'data/MachineLearning-Lecture01.pdf', 'page': 0}, page_content='MachineLearning-Lecture01  \n')

In [68]:
pages[0].metadata   # Note that docs[0] has the same metadata as the page it came from

{'source': 'data/MachineLearning-Lecture01.pdf', 'page': 0}