# Intro

## Standard modules

In [1]:
import os, sys, pickle
import pathlib
import numpy as np
from tqdm.auto import tqdm, trange

In [2]:
from sentence_transformers import util

## My modules

In [3]:
from toolbox.updater import time_is_now

In [4]:
from file2text import file2text

In [5]:
from first_rough_cleaning import first_text_cleaner

In [6]:
from embed_me_II_second_wipe import file2vecfile

  return torch._C._cuda_getDeviceCount() > 0


In [7]:
from jina4lote import jina4lote

In [8]:
from embed_me_II import file2vecfile

## Alternatives

In [9]:
import pymupdf4llm

## Parameters

In [10]:
N_RUNS=10

## Folders

In [11]:
PDF_FOLDER='./pdf/'

In [30]:
RTF_FOLDER='./rtf/'

In [36]:
TARGET_FOLDER='./NewProcessedData/'

## Files

In [17]:
pdfs=[file for file in os.listdir(PDF_FOLDER) if file.endswith('.pdf')]
pdfs.sort()

In [18]:
pdfs

['10_Telecommunications_VODAFONE GROUP PLC_2021.pdf',
 '1_Basic Materials_SYNTHOMER PLC_2021.pdf',
 '3_Consumer Staples_Imperial Brands PLC_2016.pdf',
 '8_Real Estate_RIGHTMOVE PLC_2016.pdf',
 '8_Real Estate_RIGHTMOVE PLC_2019.pdf']

# pdf + md vs. rtf

## From pdf to md

In [19]:
MD_FOLDER='./markdown/'

In [20]:
for pdf in tqdm(pdfs):
    md_text = pymupdf4llm.to_markdown(PDF_FOLDER+pdf)
    file_name=file2vecfile(pdf, extension='.pdf').replace('.txt', '.md')
    pathlib.Path(MD_FOLDER+file_name).write_bytes(md_text.encode())

  0%|          | 0/5 [00:00<?, ?it/s]

Processing ./pdf/10_Telecommunications_VODAFONE GROUP PLC_2021.pdf...
Processing ./pdf/1_Basic Materials_SYNTHOMER PLC_2021.pdf...
Processing ./pdf/3_Consumer Staples_Imperial Brands PLC_2016.pdf...
Processing ./pdf/8_Real Estate_RIGHTMOVE PLC_2016.pdf...
Processing ./pdf/8_Real Estate_RIGHTMOVE PLC_2019.pdf...


## New vectors vs. old vectors

In [40]:
for i_pdf, pdf in enumerate(tqdm(pdfs)):
    # read the md file
    with open(MD_FOLDER+file_name, 'r') as f:
        markdown_string = f.read()
    # embed the text
    md_vec=jina4lote(first_text_cleaner(markdown_string))
    
    # go to the old rtf file
    rtf=pdf.replace('.pdf', '.rtf')
    # get the embedding
    vecfile=file2vecfile(rtf)
    rtf_vec=np.genfromtxt(TARGET_FOLDER+vecfile)
    # compare the embeddings
    cos_sim=float(util.cos_sim(rtf_vec, md_vec))
    print(f'{pdf:})cos={cos_sim:.3f}')
    
    # save the md vec
    np.savetxt('md_'+vecfile, md_vec)

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

10_Telecommunications_VODAFONE GROUP PLC_2021.pdf)cos=0.855


  0%|          | 0/1 [00:00<?, ?it/s]

1_Basic Materials_SYNTHOMER PLC_2021.pdf)cos=0.857


  0%|          | 0/1 [00:00<?, ?it/s]

3_Consumer Staples_Imperial Brands PLC_2016.pdf)cos=0.804


  0%|          | 0/1 [00:00<?, ?it/s]

8_Real Estate_RIGHTMOVE PLC_2016.pdf)cos=0.984


  0%|          | 0/1 [00:00<?, ?it/s]

8_Real Estate_RIGHTMOVE PLC_2019.pdf)cos=0.999


Ok, the indications are not clear: sometimes it works, sometimes it does not. What we have is quite stable and make sense, but it seems as we re-invented the wheel. What should I do?   
Probably the first thing should be enlarging the set: so far this was simply intended to be used to check the resilience of the entire process against the removal of few sentences (i.e. the target of the previous notebook).  
Then I should probably arrange the first cleaning to target the output data from pymupdf4llm.   
But since everything is working, should I really care?

# A greater subset to test

## Selecting files

In [41]:
RTF_FOLDER='./rtf/'

In [42]:
TARGET_FOLDER='./NewProcessedData/'

In [43]:
rtf_files=[file for file in os.listdir(RTF_FOLDER) if file.endswith('.rtf')]

In [44]:
rtf_files.sort()

In [45]:
N=len(rtf_files)

In [46]:
selection_10=np.random.choice(rtf_files, size=int(N/10))

In [48]:
selection_10.sort()

In [50]:
selection_01=np.array(['10_Telecommunications_VODAFONE GROUP PLC_2021.rtf',
       '1_Basic Materials_SYNTHOMER PLC_2021.rtf',
       '3_Consumer Staples_Imperial Brands PLC_2016.rtf',
       '8_Real Estate_RIGHTMOVE PLC_2016.rtf',
       '8_Real Estate_RIGHTMOVE PLC_2019.rtf'], dtype='<U67')

In [52]:
len([s10 for s10 in selection_10 if s10 not in selection_01])==len(selection_10)

True

In [53]:
selection_10

array(['10_Telecommunications_BT GROUP PLC_2016.rtf',
       '10_Telecommunications_SPIRENT COMMUNICATIONS PLC_2023.rtf',
       '10_Telecommunications_VODAFONE GROUP PLC_2018.rtf',
       '10_Telecommunications_VODAFONE GROUP PLC_2023.rtf',
       '11_Utilities_3_SEVERN TRENT PLC_2022.rtf',
       '11_Utilities_8_RENEWI PLC_2023.rtf',
       '1_Basic Materials_CRODA INTERNATIONAL PLC_2015.rtf',
       '1_Basic Materials_EVRAZ PLC_2021.rtf',
       '1_Basic Materials_SYNTHOMER PLC_2016.rtf',
       '1_Basic Materials_SYNTHOMER PLC_2018.rtf',
       '2_Consumer Discretionary_BARRATT DEVELOPMENTS PLC_2021.rtf',
       '2_Consumer Discretionary_BERKELEY GROUP HOLDINGS (THE) PLC_2022.rtf',
       '2_Consumer Discretionary_BURBERRY GROUP PLC_2020.rtf',
       '2_Consumer Discretionary_INTERCONTINENTAL HOTELS GROUP PLC_2019.rtf',
       '2_Consumer Discretionary_RELX PLC_2016.rtf',
       '2_Consumer Discretionary_RELX PLC_2016.rtf',
       '2_Consumer Discretionary_RELX PLC_2017_1.rtf',
   

Actually, I got tired after a while in selecting the various .pdf, therefore I made a selection of the list therein.

## Files

In [54]:
new_pdfs=[file for file in os.listdir(PDF_FOLDER) if file.endswith('.pdf') and file not in pdfs]
new_pdfs.sort()

In [56]:
len(new_pdfs)

17

## From pdf to md

In [58]:
for pdf in tqdm(new_pdfs):
    md_text = pymupdf4llm.to_markdown(PDF_FOLDER+pdf)
    file_name=file2vecfile(pdf, extension='.pdf').replace('.txt', '.md')
    pathlib.Path(MD_FOLDER+file_name).write_bytes(md_text.encode())

  0%|          | 0/17 [00:00<?, ?it/s]

Processing ./pdf/10_Telecommunications_BT GROUP PLC_2016.pdf...
Processing ./pdf/10_Telecommunications_SPIRENT COMMUNICATIONS PLC_2023.pdf...
Processing ./pdf/10_Telecommunications_VODAFONE GROUP PLC_2018.pdf...
Processing ./pdf/10_Telecommunications_VODAFONE GROUP PLC_2023.pdf...
Processing ./pdf/11_Utilities_3_SEVERN TRENT PLC_2022.pdf...


Processing ./pdf/11_Utilities_8_RENEWI PLC_2023.pdf...
Processing ./pdf/1_Basic Materials_CRODA INTERNATIONAL PLC_2015.pdf...
Processing ./pdf/1_Basic Materials_EVRAZ PLC_2021.pdf...


Processing ./pdf/1_Basic Materials_SYNTHOMER PLC_2016.pdf...
Processing ./pdf/1_Basic Materials_SYNTHOMER PLC_2018.pdf...
Processing ./pdf/2_Consumer Discretionary_BARRATT DEVELOPMENTS PLC_2021.pdf...
Processing ./pdf/2_Consumer Discretionary_BERKELEY GROUP HOLDINGS (THE) PLC_2022.pdf...


Processing ./pdf/2_Consumer Discretionary_BURBERRY GROUP PLC_2020.pdf...
Processing ./pdf/2_Consumer Discretionary_INTERCONTINENTAL HOTELS GROUP PLC_2019.pdf...


Processing ./pdf/2_Consumer Discretionary_RELX PLC_2016.pdf...
Processing ./pdf/2_Consumer Discretionary_RELX PLC_2017.pdf...
Processing ./pdf/2_Consumer Discretionary_RELX PLC_2021.pdf...




## New vectors vs. old vectors

In [None]:
for i_pdf, pdf in enumerate(tqdm(new_pdfs)):
    # read the md file
    with open(MD_FOLDER+file_name, 'r') as f:
        markdown_string = f.read()
    # embed the text
    md_vec=jina4lote(first_text_cleaner(markdown_string))
    
    # go to the old rtf file
    rtf=pdf.replace('.pdf', '.rtf')
    # get the embedding
    vecfile=file2vecfile(rtf)
    rtf_vec=np.genfromtxt(TARGET_FOLDER+vecfile)
    # compare the embeddings
    cos_sim=float(util.cos_sim(rtf_vec, md_vec))
    print(f'{pdf:})cos={cos_sim:.3f}')
    
    # save the md vec
    np.savetxt(MD_FOLDER+'md_'+vecfile, md_vec)

  0%|          | 0/17 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

10_Telecommunications_BT GROUP PLC_2016.pdf)cos=0.931


  0%|          | 0/6 [00:00<?, ?it/s]

10_Telecommunications_SPIRENT COMMUNICATIONS PLC_2023.pdf)cos=0.928


  0%|          | 0/6 [00:00<?, ?it/s]

In [60]:
print('gotcha!')

gotcha!


In [63]:
for i_pdf, pdf in enumerate(tqdm(new_pdfs)):
    # go to the old rtf file
    rtf=pdf.replace('.pdf', '.rtf')
    # get the embedding
    vecfile=file2vecfile(rtf)
    rtf_vec=np.genfromtxt(TARGET_FOLDER+vecfile)
    # read the md vec
    md_vec=np.genfromtxt(MD_FOLDER+'md_'+vecfile)
    # compare the embeddings
    cos_sim=float(util.cos_sim(rtf_vec, md_vec))
    print(f'{pdf:})cos={cos_sim:.3f}')

  0%|          | 0/17 [00:00<?, ?it/s]

10_Telecommunications_BT GROUP PLC_2016.pdf)cos=0.931
10_Telecommunications_SPIRENT COMMUNICATIONS PLC_2023.pdf)cos=0.928
10_Telecommunications_VODAFONE GROUP PLC_2018.pdf)cos=0.918
10_Telecommunications_VODAFONE GROUP PLC_2023.pdf)cos=0.906
11_Utilities_3_SEVERN TRENT PLC_2022.pdf)cos=0.932
11_Utilities_8_RENEWI PLC_2023.pdf)cos=0.910
1_Basic Materials_CRODA INTERNATIONAL PLC_2015.pdf)cos=0.939
1_Basic Materials_EVRAZ PLC_2021.pdf)cos=0.885
1_Basic Materials_SYNTHOMER PLC_2016.pdf)cos=0.907
1_Basic Materials_SYNTHOMER PLC_2018.pdf)cos=0.903
2_Consumer Discretionary_BARRATT DEVELOPMENTS PLC_2021.pdf)cos=0.920
2_Consumer Discretionary_BERKELEY GROUP HOLDINGS (THE) PLC_2022.pdf)cos=0.911
2_Consumer Discretionary_BURBERRY GROUP PLC_2020.pdf)cos=0.900
2_Consumer Discretionary_INTERCONTINENTAL HOTELS GROUP PLC_2019.pdf)cos=0.898
2_Consumer Discretionary_RELX PLC_2016.pdf)cos=0.987
2_Consumer Discretionary_RELX PLC_2017.pdf)cos=0.989
2_Consumer Discretionary_RELX PLC_2021.pdf)cos=0.999
