# Intro

## Standard modules

In [2]:
import os, sys, pickle
import pathlib
import numpy as np
from tqdm.auto import tqdm, trange

In [3]:
from sentence_transformers import util

## My modules

In [4]:
from toolbox.updater import time_is_now

In [5]:
from toolbox.email_sender import email_sender

In [6]:
from file2text import file2text

In [7]:
from first_rough_cleaning import first_text_cleaner

In [8]:
from file_handler import *

In [9]:
from jina4lote import jina4lote

  return torch._C._cuda_getDeviceCount() > 0


## Alternatives

In [10]:
import pymupdf4llm

## Parameters

In [11]:
# Number of runs. NOTE(review): not referenced by any cell visible in this
# notebook section — confirm it is still needed.
N_RUNS = 10

## Folders

In [12]:
# Folder that is scanned for the source PDF files (trailing slash required:
# paths are built by plain string concatenation).
PDF_FOLDER = './pdf/'

In [13]:
# Folder of the RTF files. NOTE(review): not referenced by any cell visible in
# this notebook section — presumably holds the older RTF inputs; confirm.
RTF_FOLDER = './rtf/'

In [14]:
# Root folder of the previously processed data; the stored embedding vectors
# are read from its 'vectors' sub-folder.
TARGET_FOLDER = './NewProcessedData/'

## Files

In [15]:
# Alphabetically sorted names of every '.pdf' file found in PDF_FOLDER.
pdfs = sorted(
    entry for entry in os.listdir(PDF_FOLDER) if entry.endswith('.pdf')
)

In [16]:
# Display the sorted list of PDF file names collected above.
pdfs

['10_Telecommunications_BT GROUP PLC_2016.pdf',
 '10_Telecommunications_SPIRENT COMMUNICATIONS PLC_2023.pdf',
 '10_Telecommunications_VODAFONE GROUP PLC_2018.pdf',
 '10_Telecommunications_VODAFONE GROUP PLC_2021.pdf',
 '10_Telecommunications_VODAFONE GROUP PLC_2023.pdf',
 '11_Utilities_3_SEVERN TRENT PLC_2022.pdf',
 '11_Utilities_8_RENEWI PLC_2023.pdf',
 '1_Basic Materials_CRODA INTERNATIONAL PLC_2015.pdf',
 '1_Basic Materials_EVRAZ PLC_2021.pdf',
 '1_Basic Materials_SYNTHOMER PLC_2016.pdf',
 '1_Basic Materials_SYNTHOMER PLC_2018.pdf',
 '1_Basic Materials_SYNTHOMER PLC_2021.pdf',
 '2_Consumer Discretionary_BARRATT DEVELOPMENTS PLC_2021.pdf',
 '2_Consumer Discretionary_BERKELEY GROUP HOLDINGS (THE) PLC_2022.pdf',
 '2_Consumer Discretionary_BURBERRY GROUP PLC_2020.pdf',
 '2_Consumer Discretionary_INTERCONTINENTAL HOTELS GROUP PLC_2019.pdf',
 '2_Consumer Discretionary_RELX PLC_2016.pdf',
 '2_Consumer Discretionary_RELX PLC_2017.pdf',
 '2_Consumer Discretionary_RELX PLC_2021.pdf',
 '3_Cons

# pdf + md vs. rtf

In [17]:
# Folder where the markdown extracted from each PDF is written to and later
# read back from.
MD_FOLDER = './markdown/'

## From pdf to md

In [20]:
# Convert every PDF to markdown with pymupdf4llm and store one '.md' file per
# PDF inside MD_FOLDER.

# Make sure the output folder exists; write_bytes() does not create it and
# would otherwise fail on a fresh checkout.
md_dir = pathlib.Path(MD_FOLDER)
md_dir.mkdir(parents=True, exist_ok=True)

for pdf in tqdm(pdfs):
    # Extract the whole document as a single markdown string.
    md_text = pymupdf4llm.to_markdown(PDF_FOLDER + pdf)
    # Derive the output name from the helper's '.txt' name, swapping the
    # extension for '.md'.
    file_name = file2vecfile(pdf, extension='.pdf').replace('.txt', '.md')
    # Written as raw UTF-8 bytes so no newline translation is applied.
    (md_dir / file_name).write_bytes(md_text.encode('utf-8'))

  0%|          | 0/5 [00:00<?, ?it/s]

Processing ./pdf/10_Telecommunications_VODAFONE GROUP PLC_2021.pdf...
Processing ./pdf/1_Basic Materials_SYNTHOMER PLC_2021.pdf...
Processing ./pdf/3_Consumer Staples_Imperial Brands PLC_2016.pdf...
Processing ./pdf/8_Real Estate_RIGHTMOVE PLC_2016.pdf...
Processing ./pdf/8_Real Estate_RIGHTMOVE PLC_2019.pdf...


# New vectors vs. old vectors

In [18]:
# For every PDF, embed the markdown text extracted from it and compare that
# embedding (cosine similarity) with the vector previously stored for the
# matching RTF file. The new markdown vector is also saved to disk.
for pdf in tqdm(pdfs):
    # Markdown file name: '.txt' name from the helper with the extension swapped.
    file_name = file2vecfile(pdf, extension='.pdf').replace('.txt', '.md')
    # The markdown files are written as UTF-8 bytes, so decode them explicitly
    # instead of relying on the platform default encoding.
    with open(os.path.join(MD_FOLDER, file_name), 'r', encoding='utf-8') as f:
        markdown_string = f.read()
    # Clean the text, then embed it.
    md_vec = jina4lote(first_text_cleaner(markdown_string))

    # Locate the vector that was stored for the corresponding RTF file.
    rtf = pdf.replace('.pdf', '.rtf')
    vecfile = file2vecfile(rtf)
    rtf_vec = np.genfromtxt(os.path.join(TARGET_FOLDER, 'vectors', vecfile))

    # Compare old (RTF-based) and new (markdown-based) embeddings.
    cos_sim = float(util.cos_sim(rtf_vec, md_vec))
    # Fixed format string: the original '{pdf:})cos=' printed a stray ')'.
    print(f'{pdf}: cos={cos_sim:.3f}')

    # Save the markdown embedding next to the notebook (current working
    # directory), prefixed with 'md_'.
    np.savetxt('md_' + vecfile, md_vec)

  0%|          | 0/22 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

10_Telecommunications_BT GROUP PLC_2016.pdf)cos=0.997


  0%|          | 0/4 [00:00<?, ?it/s]

10_Telecommunications_SPIRENT COMMUNICATIONS PLC_2023.pdf)cos=0.999


  0%|          | 0/7 [00:00<?, ?it/s]

10_Telecommunications_VODAFONE GROUP PLC_2018.pdf)cos=0.999


  0%|          | 0/3 [00:00<?, ?it/s]

10_Telecommunications_VODAFONE GROUP PLC_2021.pdf)cos=0.999


  0%|          | 0/2 [00:00<?, ?it/s]

10_Telecommunications_VODAFONE GROUP PLC_2023.pdf)cos=0.999


  0%|          | 0/5 [00:00<?, ?it/s]

11_Utilities_3_SEVERN TRENT PLC_2022.pdf)cos=0.999


  0%|          | 0/3 [00:00<?, ?it/s]

11_Utilities_8_RENEWI PLC_2023.pdf)cos=0.998


  0%|          | 0/3 [00:00<?, ?it/s]

1_Basic Materials_CRODA INTERNATIONAL PLC_2015.pdf)cos=0.996


  0%|          | 0/6 [00:00<?, ?it/s]

1_Basic Materials_EVRAZ PLC_2021.pdf)cos=0.999


  0%|          | 0/3 [00:00<?, ?it/s]

1_Basic Materials_SYNTHOMER PLC_2016.pdf)cos=0.998


  0%|          | 0/3 [00:00<?, ?it/s]

1_Basic Materials_SYNTHOMER PLC_2018.pdf)cos=0.998


  0%|          | 0/4 [00:00<?, ?it/s]

1_Basic Materials_SYNTHOMER PLC_2021.pdf)cos=0.999


  0%|          | 0/8 [00:00<?, ?it/s]

2_Consumer Discretionary_BARRATT DEVELOPMENTS PLC_2021.pdf)cos=1.000


  0%|          | 0/2 [00:00<?, ?it/s]

2_Consumer Discretionary_BERKELEY GROUP HOLDINGS (THE) PLC_2022.pdf)cos=0.998


  0%|          | 0/7 [00:00<?, ?it/s]

2_Consumer Discretionary_BURBERRY GROUP PLC_2020.pdf)cos=0.999


  0%|          | 0/3 [00:00<?, ?it/s]

2_Consumer Discretionary_INTERCONTINENTAL HOTELS GROUP PLC_2019.pdf)cos=0.998


  0%|          | 0/4 [00:00<?, ?it/s]

2_Consumer Discretionary_RELX PLC_2016.pdf)cos=0.998


  0%|          | 0/4 [00:00<?, ?it/s]

2_Consumer Discretionary_RELX PLC_2017.pdf)cos=0.997


  0%|          | 0/6 [00:00<?, ?it/s]

2_Consumer Discretionary_RELX PLC_2021.pdf)cos=0.999


  0%|          | 0/1 [00:00<?, ?it/s]

3_Consumer Staples_Imperial Brands PLC_2016.pdf)cos=0.993


  0%|          | 0/1 [00:00<?, ?it/s]

8_Real Estate_RIGHTMOVE PLC_2016.pdf)cos=0.999


  0%|          | 0/1 [00:00<?, ?it/s]

8_Real Estate_RIGHTMOVE PLC_2019.pdf)cos=0.999


In [19]:
# Send a notification e-mail once the comparison is done.
# NOTE(review): the two positional arguments are presumably subject and
# message body — confirm against toolbox.email_sender.
email_sender(
    '[SDGs] Comparison between different strategies of text extraction',
    'Check the results, but differences seem to be quite limited.',
)