In [139]:
import fitz
import random
import pandas as pd
from spacy.lang.en import English
import re
from sentence_transformers import SentenceTransformer

device = 'mps'

In [224]:
pdf_path = 'lloyds_personal_t_n_c.pdf'

def open_and_read_pdf(pdf_path: str) -> list[dict]:
    doc = fitz.open(pdf_path)
    pages_n_texts = []

    for page_n, page in enumerate(doc):
        text = page.get_text()
        text = text.replace('\n', ' ')
        tabs = page.find_tables()

        for tab in tabs:
            text += 'Helper table/s from this page: '
            for idx, line in enumerate(tab.extract()):
                text += f'Row: {idx}, Values: {line}'

        pages_n_texts.append({
            'page_n': page_n,
            'page_char_count': len(text),
            'page_word_count': len(text.split(' ')),
            'page_sentence_count_raw': len(text.split('. ')),
            'page_token_count': len(text) / 4, # 1 token ~= 4 chars
            'text': text
        })

    return pages_n_texts

pages_n_texts = open_and_read_pdf(pdf_path)

In [225]:
random.sample(pages_n_texts, k=1)

[{'page_n': 27,
  'page_char_count': 4528,
  'page_word_count': 836,
  'page_sentence_count_raw': 37,
  'page_token_count': 1132.0,
  'text': '28 Club Lloyds Account variable interest rates applicable to each part of the account balance AER% Gross p.a. % From £1 - £3,999.99  1.50 1.49  From £4,000 - £5,000  3.00 2.96 £5,000.01 + We won\'t pay interest on any balance amounts over £5,000 AER stands for Annual Equivalent Rate and illustrates what the interest rate would be if interest was paid and  compounded once each year.  We will pay your interest ‘gross’, this means we will not deduct tax automatically from it. Depending on your personal  circumstances you may need to pay tax on the interest you earn. You will be responsible for paying any tax you may owe  to HM Revenue & Customs (HMRC). We provide a range of exclusive offers on banking products to our Club Lloyds customers. You can find details about them  and the conditions that apply at: lloydsbank.com/clublloyds in our UK branche

In [226]:
df = pd.DataFrame(pages_n_texts)
df.head()

Unnamed: 0,page_n,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,text
0,0,143,28,1,35.75,Personal Banking Terms and Conditions and Ba...
1,1,381,78,3,95.25,2 This booklet: • Explains the agreement betwe...
2,2,4258,769,34,1064.5,3 Section A – Our agreement When you open a cu...
3,3,3604,634,11,901.0,4 Where to find what you want to know Section ...
4,4,2543,421,8,635.75,5 Section Can we take money from your account ...


In [227]:
df.describe().round(2)

Unnamed: 0,page_n,page_char_count,page_word_count,page_sentence_count_raw,page_token_count
count,36.0,36.0,36.0,36.0,36.0
mean,17.5,3720.22,693.5,24.11,930.06
std,10.54,1166.06,217.72,9.68,291.51
min,0.0,143.0,28.0,1.0,35.75
25%,8.75,3269.25,609.0,19.0,817.31
50%,17.5,4038.0,745.5,25.0,1009.5
75%,26.25,4519.75,845.0,31.25,1129.94
max,35.0,5294.0,973.0,42.0,1323.5


In [228]:
nlp = English()
nlp.add_pipe('sentencizer')

for item in pages_n_texts:
    item['sentences'] = list(nlp(item['text']).sents)
    item['sentences'] = [str(sentence) for sentence in item['sentences']]
    item['page_sentence_count_spacy'] = len(item['sentences'])

In [229]:
df = pd.DataFrame(pages_n_texts)
df.describe().round(2)

Unnamed: 0,page_n,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,page_sentence_count_spacy
count,36.0,36.0,36.0,36.0,36.0,36.0
mean,17.5,3720.22,693.5,24.11,930.06,26.86
std,10.54,1166.06,217.72,9.68,291.51,9.76
min,0.0,143.0,28.0,1.0,35.75,1.0
25%,8.75,3269.25,609.0,19.0,817.31,21.0
50%,17.5,4038.0,745.5,25.0,1009.5,27.0
75%,26.25,4519.75,845.0,31.25,1129.94,33.5
max,35.0,5294.0,973.0,42.0,1323.5,46.0


In [252]:
sentence_chunk_size = 20
lag = 10

def split_list(input_list: list[str], lag: int=lag, slice_size: int=sentence_chunk_size) -> list[list[int]]:
    lag = slice_size - lag
    return [input_list[i:i+slice_size] for i in range(0, len(input_list) - slice_size + 1 + lag, lag)]

test_list = list(range(70))
split_list(test_list)

[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19],
 [10,
  11,
  12,
  13,
  14,
  15,
  16,
  17,
  18,
  19,
  20,
  21,
  22,
  23,
  24,
  25,
  26,
  27,
  28,
  29],
 [20,
  21,
  22,
  23,
  24,
  25,
  26,
  27,
  28,
  29,
  30,
  31,
  32,
  33,
  34,
  35,
  36,
  37,
  38,
  39],
 [30,
  31,
  32,
  33,
  34,
  35,
  36,
  37,
  38,
  39,
  40,
  41,
  42,
  43,
  44,
  45,
  46,
  47,
  48,
  49],
 [40,
  41,
  42,
  43,
  44,
  45,
  46,
  47,
  48,
  49,
  50,
  51,
  52,
  53,
  54,
  55,
  56,
  57,
  58,
  59],
 [50,
  51,
  52,
  53,
  54,
  55,
  56,
  57,
  58,
  59,
  60,
  61,
  62,
  63,
  64,
  65,
  66,
  67,
  68,
  69],
 [60, 61, 62, 63, 64, 65, 66, 67, 68, 69]]

In [253]:
for item in pages_n_texts:
    item['sentence_chunks'] = split_list(item['sentences'])
    item['n_chunks'] = len(item['sentence_chunks'])

In [254]:
random.sample(pages_n_texts, k=1)

[{'page_n': 33,
  'page_char_count': 3375,
  'page_word_count': 562,
  'page_sentence_count_raw': 19,
  'page_token_count': 843.75,
  'text': "34 Direct debits The company or organisation you are paying will usually tell you 10\xa0working days before they change the amount or date  of the payment. Tax We pay any interest to you without taking tax off. Depending on your personal circumstances, you may need to pay tax on  the interest you earn. You are responsible for paying any tax you owe to HMRC. Any interest we pay you will count towards  your personal savings allowance. Taxes or costs may apply to you that aren't charged by us and/or won't be paid through us.  Company information Who we are • Our company details: Lloyds Bank plc (company number 2065) is a company registered with the Registrar of Companies for England and Wales.  Registered Office: 25 Gresham Street, London EC2V\xa07HN.  To find out more about our company, see the Registrar's website: companieshouse.gov.uk   or call 

In [255]:
df = pd.DataFrame(pages_n_texts)
df.describe().round(2)

Unnamed: 0,page_n,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,page_sentence_count_spacy,n_chunks
count,36.0,36.0,36.0,36.0,36.0,36.0,36.0
mean,17.5,3720.22,693.5,24.11,930.06,26.86,2.22
std,10.54,1166.06,217.72,9.68,291.51,9.76,0.93
min,0.0,143.0,28.0,1.0,35.75,1.0,0.0
25%,8.75,3269.25,609.0,19.0,817.31,21.0,2.0
50%,17.5,4038.0,745.5,25.0,1009.5,27.0,2.0
75%,26.25,4519.75,845.0,31.25,1129.94,33.5,3.0
max,35.0,5294.0,973.0,42.0,1323.5,46.0,4.0


In [256]:
pages_n_chunks = []

for item in pages_n_texts:
    for sentence_chunk in item['sentence_chunks']:
        chunk_dict = {}
        chunk_dict['page_n'] = item['page_n']

        # join chunks into a paragraph-like structure
        joined_sentence_chunk = ''.join(sentence_chunk).replace('  ', ' ').strip()
        joined_sentence_chunk = re.sub(r'\.([A-Z])', r'. \1', joined_sentence_chunk)
        chunk_dict['sentence_chunk'] = joined_sentence_chunk

        # add metadata
        chunk_dict['chunk_chars'] = len(joined_sentence_chunk)
        chunk_dict['chunk_words'] = len([word for word in joined_sentence_chunk.split(' ')])
        chunk_dict['chunk_tokens'] = len(joined_sentence_chunk) / 4

        pages_n_chunks.append(chunk_dict)

len(pages_n_chunks)

80

In [257]:
random.sample(pages_n_chunks, k=1)

[{'page_n': 24,
  'sentence_chunk': "25 Overdraft information for Under 19s, Islamic*, Cash* and Basic Accounts We don't offer arranged overdrafts on these accounts. We won't charge you interest if you go overdrawn. *We don't offer these accounts to new customers. Account fees The information below in the sections 'Withdrawing cash', 'Buying goods or services in a foreign currency' and 'Other services' apply to personal current accounts and savings accounts (if your savings accounts offer the relevant card  or services). Withdrawing cash If you take money in pounds from your account using any Lloyds Bank or Bank of Scotland cash machine in the UK, we won't charge you for the withdrawal. If you use any other cash machine you may be charged by the machine owner. The machine will show you the amount and tell you it will be taken from your account when you withdraw the cash. Paying-in cash using a machine You can pay cash into your account at one of our branches using a Lloyds Bank or Bank

In [258]:
df = pd.DataFrame(pages_n_chunks)
df.describe().round(2) 

Unnamed: 0,page_n,chunk_chars,chunk_words,chunk_tokens
count,80.0,80.0,80.0,80.0
mean,18.21,2386.96,431.11,596.74
std,9.5,736.47,125.6,184.12
min,2.0,1224.0,225.0,306.0
25%,10.75,1915.75,345.75,478.94
50%,17.5,2267.0,404.0,566.75
75%,26.25,2785.0,508.75,696.25
max,35.0,4682.0,776.0,1170.5


In [259]:
df.head()

Unnamed: 0,page_n,sentence_chunk,chunk_chars,chunk_words,chunk_tokens
0,2,3 Section A – Our agreement When you open a cu...,2307,391,576.75
1,2,They may be included in letters or application...,2155,381,538.75
2,2,You must not hold money for someone else in yo...,1920,348,480.0
3,3,4 Where to find what you want to know Section ...,3033,531,758.25
4,3,"Explains what happens if you use Open Banking,...",2237,365,559.25


### Embedding chunks

In [260]:
emb_model = SentenceTransformer('all-mpnet-base-v2').to(device)
#mixedbread-ai/mxbai-embed-large-v1
text_chunks = [item['sentence_chunk'] for item in pages_n_chunks]
len(text_chunks)

80

In [261]:
text_chunk_embs = emb_model.encode(text_chunks, batch_size=16, convert_to_tensor=True)
text_chunk_embs.shape, text_chunk_embs

(torch.Size([80, 768]),
 tensor([[ 0.0239, -0.0968,  0.0073,  ...,  0.0380, -0.0420,  0.0084],
         [ 0.0288, -0.0775, -0.0199,  ...,  0.0540, -0.0324,  0.0072],
         [ 0.0148, -0.0559, -0.0035,  ...,  0.0430, -0.0391, -0.0011],
         ...,
         [ 0.0060, -0.0109, -0.0181,  ...,  0.0235, -0.0758, -0.0099],
         [-0.0331, -0.0685, -0.0053,  ...,  0.0332, -0.0491,  0.0223],
         [ 0.0079, -0.0691, -0.0206,  ...,  0.0266, -0.0643, -0.0058]],
        device='mps:0'))

In [262]:
emb_chunks_df = pd.DataFrame(pages_n_chunks)
embs_only_df = pd.DataFrame(text_chunk_embs.to('cpu'))
emb_chunks_df['embedding'] = embs_only_df.values.tolist()
emb_df_save_path = 'emb_chunks_df.csv'
emb_chunks_df.to_csv(emb_df_save_path, index=False)

In [263]:
emb_chunks_df['embedding']

0     [0.023868504911661148, -0.09681934118270874, 0...
1     [0.02879343554377556, -0.07750559598207474, -0...
2     [0.014817321673035622, -0.055876679718494415, ...
3     [0.033143796026706696, -0.0936892107129097, -0...
4     [0.016689995303750038, -0.05574336647987366, -...
                            ...                        
75    [0.010411597788333893, -0.0706917941570282, -0...
76    [0.035197194665670395, -0.016004489734768867, ...
77    [0.00603171531111002, -0.01086839847266674, -0...
78    [-0.03309999406337738, -0.06848695874214172, -...
79    [0.007898915559053421, -0.06907203048467636, -...
Name: embedding, Length: 80, dtype: object