In [15]:
import random

import pandas as pd
import torch
from langchain_text_splitters import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer

# Pick the compute device for the embedding model.
# Prefer the Apple-silicon GPU (MPS), then CUDA, and fall back to CPU so the
# notebook still runs on machines without an accelerator.
_mps_backend = getattr(torch.backends, "mps", None)  # absent on older torch builds
if _mps_backend is not None and _mps_backend.is_available():
    device = 'mps'
elif torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

### LlamaIndex (LlamaParse) setup

In [16]:
import os
from getpass import getpass

import nest_asyncio
from llama_parse import LlamaParse

# Allow nested event loops inside the notebook kernel before LlamaParse runs.
nest_asyncio.apply()

# Never hard-code the key, and never overwrite an already exported key with an
# empty string: only prompt (without echoing) when the variable is missing/blank.
if not os.environ.get('LLAMA_CLOUD_API_KEY'):
    os.environ['LLAMA_CLOUD_API_KEY'] = getpass('LLAMA_CLOUD_API_KEY: ')

# Parse the PDF into markdown text.
documents = LlamaParse(result_type="markdown").load_data('../lbg_relationship_tnc.pdf')
if not documents:
    # Fail with a clear message instead of an IndexError further down.
    raise ValueError("LlamaParse returned no documents for '../lbg_relationship_tnc.pdf'")

# Keep the text of every returned Document. With a single Document this is
# exactly documents[0].text; if the parser returns several (NOTE(review):
# presumably one per page in some llama_parse versions - confirm), they are
# joined with the same '\n---\n' separator that appears between pages in the
# parsed output.
llama_text = '\n---\n'.join(doc.text for doc in documents)

Started parsing the file under job_id e1c87a8b-421b-44ae-91df-74648c084392


### Split text

In [17]:
# Cut the parsed markdown into chunks of at most 1000 characters with no
# overlap between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=0,
)
llama_split = text_splitter.split_text(text=llama_text)

In [18]:
# Pair each chunk with its position in `llama_split`, producing list[dict]
# records that are easy to load into a DataFrame and save to CSV.
llama_text_n_pages = [
    {'page_n': chunk_idx, 'sentence_chunk': chunk_text}  # 'page_n' is a reference number
    for chunk_idx, chunk_text in enumerate(llama_split)
]

### Sample the chunked texts

In [19]:
# Show one chunk record as a spot check. A private, fixed-seed RNG keeps the
# displayed sample identical across "Restart & Run All" without touching the
# global `random` state; min() avoids a ValueError if there are no chunks.
random.Random(42).sample(llama_text_n_pages, k=min(1, len(llama_text_n_pages)))

[{'page_n': 36,
  'sentence_chunk': '9.5 In compliance with the Payment Services Regulations, most domestic and cross border payments made within the UK or EEA must be made on the basis that the person making the payment pays any charges levied by its bank or other financial institution and the person receiving the payment pays any charges levied by its bank or other financial institution. We will notify you if a payment instruction does not comply with these requirements. You agree that we are authorised to alter such payment instruction so that it meets these requirements and that we are authorised to act upon such altered payment instruction.\n---\nCore Banking Agreement\nCHARGES\nYou can find details of any standard charges pat apply to our Products and pe amount of pose charges in pe General Information On Payments, Charges & Contacts\n15\n---\n## Core Banking Agreement\n\nInterest paid and charged\n\nHow we pay and charge interest'}]

In [20]:
# Display the first chunk together with the total number of chunks.
first_chunk = llama_split[0]
n_chunks = len(llama_split)
(first_chunk, n_chunks)

('CORE BANKING AGREEMENT RELATIONSHIP TERMS & CONDITIONS\n---\nNO_CONTENT_HERE\n---\n|Contents|Page|\n|---|---|\n|Important information|1|\n|General| |\n|Information about our relationship with you|5|\n|Definitions and interpretation|6|\n|Your relationship with us|9|\n|Providing services to you|9|\n|Your warranties|9|\n|Who is authorised to give instructions to us|11|\n|Confidentiality|12|\n|Changes to the Terms and Conditions|13|\n|Terms applying to charges|14|\n|Interest paid and charged|16|\n|Third Party Providers| |\n|Partnerships|18|\n|Suspension of a product|19|\n|Termination|21|\n|Your rights to cancel|24|\n|What happens after termination or cancellation|24|\n|Liability|25|\n|Circumstances beyond your or our control|26|\n|Set off|26|\n|Other terms you need to know about your agreement with us|28|\n|Information about us and our regulators|29|\n|Payments| |\n|Payment instructions|32|\n|Providing and withdrawing consent|34|\n|Receipt of your payment instructions|36|\n|Processing yo

### Embedding split text

In [21]:
# Load the sentence-embedding model directly onto the selected device.
# Passing `device=` to the constructor (rather than calling `.to(device)`
# afterwards) lets the library place the model itself, so the device it uses
# in `encode` matches `device`.
# Model names noted in this cell: mixedbread-ai/mxbai-embed-large-v1, all-mpnet-base-v2
emb_model = SentenceTransformer('mixedbread-ai/mxbai-embed-large-v1', device=device)

In [22]:
# Embed every chunk in batches of 16 and keep the result as a tensor.
llama_chunk_embs = emb_model.encode(
    llama_split,
    batch_size=16,
    convert_to_tensor=True,
)
# Show the tensor's shape next to the (truncated) tensor itself.
(llama_chunk_embs.shape, llama_chunk_embs)

(torch.Size([148, 1024]),
 tensor([[-3.6564e-01, -6.9501e-01, -3.4750e-01,  ..., -3.3676e-01,
           9.8287e-02,  3.7684e-01],
         [-4.5282e-01, -8.0915e-01, -4.4601e-01,  ..., -5.7205e-01,
          -7.6268e-01,  6.9607e-01],
         [-3.5574e-01, -7.5971e-01, -3.9172e-01,  ...,  1.7896e-04,
          -3.0669e-01,  3.0704e-01],
         ...,
         [ 1.6842e-02, -8.7632e-02,  4.0924e-02,  ..., -1.3430e-01,
          -3.5314e-01,  3.4196e-01],
         [-8.3761e-01, -6.4777e-01, -3.3409e-01,  ..., -5.6327e-01,
          -1.8565e-01,  5.6231e-01],
         [-7.4132e-01, -1.0874e+00, -4.3583e-01,  ..., -5.0362e-01,
          -4.0053e-01,  6.6981e-01]], device='mps:0'))

### Save embeddings to CSV (small size)

In [23]:
# Build one row per chunk record, attach its embedding as a plain Python list
# (moved off the accelerator first), and write the table to CSV without the
# index column.
emb_df_save_path = 'emb_chunks_df.csv'
embedding_rows = llama_chunk_embs.cpu().numpy().tolist()
emb_chunks_df = pd.DataFrame(llama_text_n_pages).assign(embedding=embedding_rows)
emb_chunks_df.to_csv(emb_df_save_path, index=False)

### Inspect embeddings

In [24]:
# Display the embedding column (one list of floats per chunk).
emb_chunks_df.loc[:, 'embedding']

0      [-0.3656424283981323, -0.6950072050094604, -0....
1      [-0.45282310247421265, -0.8091470003128052, -0...
2      [-0.35574114322662354, -0.7597120404243469, -0...
3      [-0.11259084939956665, -0.4700072407722473, -0...
4      [-0.09853293001651764, -0.5731832981109619, -0...
                             ...                        
143    [0.08263994008302689, -0.3015971779823303, 0.4...
144    [0.4743483364582062, 0.027600960806012154, -0....
145    [0.016842350363731384, -0.08763177692890167, 0...
146    [-0.837609589099884, -0.6477715373039246, -0.3...
147    [-0.7413213849067688, -1.0873749256134033, -0....
Name: embedding, Length: 148, dtype: object

In [25]:
# Summary statistics of the numeric columns, rounded to two decimals.
summary_stats = emb_chunks_df.describe()
summary_stats.round(decimals=2)

Unnamed: 0,page_n
count,148.0
mean,73.5
std,42.87
min,0.0
25%,36.75
50%,73.5
75%,110.25
max,147.0
