
- Document loaders, to convert any document to plain text
    
    文档加载器，将任何文档转换为纯文本
    
- Text splitters, to split each large document into many smaller ones
    
    文本分割器，将每个大文档拆分成多个小文档
    
- Embeddings models, to create a numeric representation of the meaning of each split
    
    嵌入模型，用于为每个切分后的文本片段生成表示其语义的数值向量

In [2]:
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_ollama import OllamaEmbeddings

## Step 1: load the source text
# loader.load() hands back the file contents wrapped as LangChain documents,
# which is the input shape split_documents() expects below.
loader = TextLoader("./resources/novel_sample.txt")
doc = loader.load()

## Step 2: split the loaded text into smaller chunks
# chunk_size / chunk_overlap are measured with the splitter's length function
# (presumably characters with the default settings -- confirm in the docs).
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=20)
chunks = text_splitter.split_documents(doc)

## Step 3: embed every chunk
embeddings_model = OllamaEmbeddings(
    model="nomic-embed-text",  # must be an embedding model available in your Ollama installation
    base_url="http://localhost:11434",  # default Ollama server URL
)
# Only the raw text of each chunk is sent to the embedding model.
chunk_texts = [piece.page_content for piece in chunks]
embeddings = embeddings_model.embed_documents(chunk_texts)

# Bare expression: displays the vectors as the notebook cell's output.
embeddings

[[0.034702655,
  0.06108691,
  -0.104553506,
  0.0004990069,
  0.0028659361,
  0.0042683957,
  0.044904742,
  0.020312155,
  0.015674539,
  -0.029570904,
  -0.10089369,
  -0.018836645,
  0.02462114,
  0.05830507,
  0.014520485,
  -0.042356353,
  -0.09267805,
  -0.022755688,
  -0.056249876,
  0.01994593,
  -0.033650875,
  0.025364937,
  -0.05529755,
  0.020620411,
  -0.013892355,
  0.056751378,
  -0.01772632,
  0.055154227,
  -0.015944986,
  -0.039569914,
  0.0040777707,
  0.009678016,
  0.014040016,
  0.0058912444,
  -0.01748948,
  0.057171438,
  -0.061467156,
  0.052767973,
  0.058255427,
  0.039237622,
  0.07053009,
  0.056373935,
  0.037933163,
  -0.061361074,
  0.046425045,
  -0.024074856,
  0.043417845,
  -0.017166935,
  0.07545527,
  -0.02769911,
  0.03043958,
  0.030713072,
  0.07222929,
  0.02855009,
  0.06390899,
  0.08858945,
  -0.03201263,
  -0.013942841,
  -0.010252175,
  -0.052643925,
  0.0152835995,
  0.01971981,
  -0.021242643,
  -0.01952762,
  0.056065023,
  -0.05326576