In [1]:
import fitz
import random
import pandas as pd
import re
from sentence_transformers import SentenceTransformer
from langchain_text_splitters import RecursiveCharacterTextSplitter

device = 'mps'

In [2]:
pdf_path = 'lbg_relationship_tnc.pdf'
# lbg_relationship_tnc.pdf account_bank_tnc.pdf
def open_and_read_pdf(pdf_path: str) -> list[dict]:
    doc = fitz.open(pdf_path)
    pages_n_texts = []

    for page_n, page in enumerate(doc):
        text = page.get_text()
        text = text.replace('\n', ' ').replace('  ', ' ')

        pages_n_texts.append({
            'page_n': page_n,
            'page_char_count': len(text),
            'page_word_count_raw': len(text.split(' ')),
            'page_sentence_count_raw': len(text.split('. ')),
            'page_token_count': len(text) / 4, # 1 token ~= 4 chars
            'text': text
        })

    return pages_n_texts

pages_n_texts = open_and_read_pdf(pdf_path)

In [3]:
random.sample(pages_n_texts, k=1)

[{'page_n': 48,
  'page_char_count': 15,
  'page_word_count_raw': 3,
  'page_sentence_count_raw': 1,
  'page_token_count': 3.75,
  'text': ' COMMUNICATION '}]

In [4]:
df = pd.DataFrame(pages_n_texts)
df.head()

Unnamed: 0,page_n,page_char_count,page_word_count_raw,page_sentence_count_raw,page_token_count,text
0,0,55,8,1,13.75,RELATIONSHIP TERMS & CONDITIONS CORE BANKING A...
1,1,0,1,1,0.0,
2,2,1619,271,39,404.75,Contents Important information 1 1 General 1...
3,3,247,40,3,61.75,Core Banking Agreement (“The Agreement”) conta...
4,4,1044,163,6,261.0,Core Banking Agreement 1 Important Information...


In [5]:
df.describe().round(2)

Unnamed: 0,page_n,page_char_count,page_word_count_raw,page_sentence_count_raw,page_token_count
count,56.0,56.0,56.0,56.0,56.0
mean,27.5,1997.73,343.46,10.29,499.43
std,16.31,1457.97,252.91,8.29,364.49
min,0.0,0.0,1.0,1.0,0.0
25%,13.75,268.75,45.0,2.0,67.19
50%,27.5,2216.0,392.0,10.0,554.0
75%,41.25,3082.5,532.0,14.5,770.62
max,55.0,4390.0,783.0,39.0,1097.5


In [6]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=0)

pages_n_chunks_new = []

for item in pages_n_texts:
    item['sentence_chunks'] = text_splitter.split_text(item['text'])
    for chunk in item['sentence_chunks']:
        chunk_dict = {}
        chunk_dict['page_n'] = item['page_n']
        # print('\nNEW CHUNK: ', chunk)
        joined_sentence_chunk = ''.join(chunk).replace('  ', ' ').strip()
        joined_sentence_chunk = re.sub(r'\.([A-Z])', r'. \1', joined_sentence_chunk)
        joined_sentence_chunk = re.sub(r'\d+(\.\d+)+', '', joined_sentence_chunk)
        # print('\nMODIFIED CHUNK: ', joined_sentence_chunk)
        chunk_dict['sentence_chunk'] = joined_sentence_chunk

        # # add metadata
        chunk_dict['chunk_chars'] = len(joined_sentence_chunk)
        chunk_dict['chunk_words'] = len([word for word in joined_sentence_chunk.split(' ')])
        chunk_dict['chunk_tokens'] = len(joined_sentence_chunk) / 4

        pages_n_chunks_new.append(chunk_dict)

len(pages_n_chunks_new)

103

### Embedding chunks

In [7]:
pages_n_chunks_new

[{'page_n': 0,
  'sentence_chunk': 'RELATIONSHIP TERMS & CONDITIONS CORE BANKING AGREEMENT',
  'chunk_chars': 54,
  'chunk_words': 7,
  'chunk_tokens': 13.5},
 {'page_n': 2,
  'sentence_chunk': 'Contents Important information 1 1 General 1. Information about our relationship with you 5 2. Definitions and interpretation 6 3. Your relationship with us 9 4. Providing services to you 9 5. Your warranties 9 6. Who is authorised to give instructions to us 11 7. Confidentiality 12 8. Changes to the Terms and Conditions 13 9. Terms applying to charges 14 10. Interest paid and charged 16 11. Third Party Providers 12. Partnerships 18 13. Suspension of a product 19 14. Termination 21 15. Your rights to cancel 24 16. What happens after termination or cancellation 24 17. Liability 25 18. Circumstances beyond your or our control 26 19. Set off 26 20. Other terms you need to know about your agreement with us 28 21. Information about us and our regulators 29 2 Payments 22. Payment instructions 32 23. 

In [8]:
emb_model = SentenceTransformer('mixedbread-ai/mxbai-embed-large-v1').to(device)
#mixedbread-ai/mxbai-embed-large-v1 all-mpnet-base-v2
text_chunks = [item['sentence_chunk'] for item in pages_n_chunks_new]
len(text_chunks)

103

In [9]:
text_chunk_embs = emb_model.encode(text_chunks, batch_size=16, convert_to_tensor=True)
text_chunk_embs.shape, text_chunk_embs

(torch.Size([103, 1024]),
 tensor([[-0.2853, -0.5011, -0.1342,  ..., -0.5641, -0.0289,  0.3733],
         [-0.2918, -0.7689, -0.4976,  ..., -0.5951,  0.0964,  0.5677],
         [-0.1221, -0.7235, -0.7040,  ..., -0.1265, -0.2623,  0.3828],
         ...,
         [-1.1459, -0.9477, -0.4285,  ..., -0.5960, -0.3391, -0.1244],
         [-0.4814, -0.1554, -0.4521,  ..., -0.4333,  0.0816,  0.3699],
         [ 0.1392, -0.1616,  0.5991,  ...,  0.2702,  0.2376, -0.7100]],
        device='mps:0'))

In [10]:
emb_chunks_df = pd.DataFrame(pages_n_chunks_new)
embs_only_df = pd.DataFrame(text_chunk_embs.to('cpu'))
emb_chunks_df['embedding'] = embs_only_df.values.tolist()
emb_df_save_path = 'emb_chunks_df.csv'
emb_chunks_df.to_csv(emb_df_save_path, index=False)

In [11]:
emb_chunks_df['embedding']

0      [-0.28527069091796875, -0.5010668635368347, -0...
1      [-0.2918225824832916, -0.7688528895378113, -0....
2      [-0.12205161154270172, -0.7234632968902588, -0...
3      [-0.3948034644126892, -0.8225910663604736, -0....
4      [-0.030174657702445984, -0.39982807636260986, ...
                             ...                        
98     [0.08262401819229126, -0.17973926663398743, -0...
99     [-0.7153952717781067, -0.7902843952178955, -0....
100    [-1.1459448337554932, -0.9477049112319946, -0....
101    [-0.4814109206199646, -0.1554449200630188, -0....
102    [0.13922488689422607, -0.16157987713813782, 0....
Name: embedding, Length: 103, dtype: object

In [12]:
emb_chunks_df.describe().round(2)

Unnamed: 0,page_n,chunk_chars,chunk_words,chunk_tokens
count,103.0,103.0,103.0,103.0
mean,28.92,1063.27,185.26,265.82
std,15.13,515.87,91.62,128.97
min,0.0,7.0,1.0,1.75
25%,16.5,629.5,112.5,157.38
50%,29.0,1402.0,234.0,350.5
75%,41.5,1467.0,258.0,366.75
max,55.0,1497.0,282.0,374.25
