# Intro

## Standard modules

In [1]:
import os, sys, pickle
import pathlib
import numpy as np
from tqdm.auto import tqdm, trange

In [2]:
from sentence_transformers import util

## My modules

In [25]:
from toolbox.updater import time_is_now

In [3]:
from file2text import file2text

In [4]:
from first_rough_cleaning import first_text_cleaner

In [5]:
from embed_me_II_second_wipe import file2vecfile

  return torch._C._cuda_getDeviceCount() > 0


In [6]:
from jina4lote import jina4lote

## Alternatives

In [7]:
import pymupdf4llm

## Parameters

In [9]:
# Number of randomised sentence-removal runs per document.
N_RUNS = 10

# Subset to test

In [10]:
# Folder that is scanned for the input .rtf files.
RTF_FOLDER = './rtf/'

In [11]:
# Folder the stored vector files are read from.
TARGET_FOLDER = './NewProcessedData/'

In [12]:
# Names (not full paths) of every entry in RTF_FOLDER that ends with '.rtf'.
rtf_files = [entry for entry in os.listdir(RTF_FOLDER) if entry.endswith('.rtf')]

In [13]:
# os.listdir returns entries in arbitrary order; sort in place for a stable listing.
rtf_files.sort()

In [14]:
# Total number of .rtf files found; the test subset below is sized from it.
N = len(rtf_files)

In [15]:
# Draw about 1% of the files as a test subset.
# replace=False guarantees distinct files: np.random.choice samples WITH
# replacement by default, so the same file could be picked twice, and because
# the results are keyed by file name that would silently shrink the sample.
# NOTE: no random seed is set, so the selection changes from run to run.
selection_01 = np.random.choice(rtf_files, size=N // 100, replace=False)

In [16]:
# Sort the selected file names in place so they are processed in a stable order.
selection_01.sort()

In [17]:
# Display the selected file names.
selection_01

array(['10_Telecommunications_VODAFONE GROUP PLC_2021.rtf',
       '1_Basic Materials_SYNTHOMER PLC_2021.rtf',
       '3_Consumer Staples_Imperial Brands PLC_2016.rtf',
       '8_Real Estate_RIGHTMOVE PLC_2016.rtf',
       '8_Real Estate_RIGHTMOVE PLC_2019.rtf'], dtype='<U67')

## Get both the text and the vector

In [18]:
# For every selected file, keep its extracted text and its stored vector,
# keyed by the file name without the '.rtf' extension.
texts_01 = {}
for rtf in tqdm(selection_01):
    # Strip only the trailing extension: str.replace('.rtf', '') would also
    # delete any '.rtf' that happens to occur in the middle of a file name.
    file_name = os.path.splitext(rtf)[0]
    long_text = file2text(RTF_FOLDER + rtf)
    # file2vecfile maps the .rtf name to the name of its vector file, which is
    # then read from TARGET_FOLDER.
    vecfile = file2vecfile(rtf)
    vector = np.genfromtxt(TARGET_FOLDER + vecfile)
    texts_01[file_name] = {'text': long_text, 'vec': vector}

  0%|          | 0/5 [00:00<?, ?it/s]

In [19]:
# Cache the sample (texts and vectors) on disk.
# Create the folder first: open() does not create missing parent directories
# and would raise FileNotFoundError on a fresh checkout.
os.makedirs('./Check', exist_ok=True)
with open('./Check/text_and_vector_sample_01.pickle', 'wb') as f:
    pickle.dump(texts_01, f)

# How meaningful is cosine similarity?

In [None]:
# Reload the cached sample. pickle.load executes arbitrary code from the file,
# so only use it on files written by this notebook.
with open('./Check/text_and_vector_sample_01.pickle', mode='rb') as cache_file:
    texts_01 = pickle.load(cache_file)

## Remove 1% of the sentences

### Function

In [20]:
def sentence_remover(text, fraction, rng=None):
    """Return ``text`` with a random ``fraction`` of its sentences removed.

    The text is split on '.', every piece is stripped of surrounding
    whitespace, ``int(fraction * n_pieces)`` distinct pieces are dropped and
    the remaining ones are joined back together with '. '.

    Parameters
    ----------
    text : str
        Text to thin out.
    fraction : float
        Share of the sentences to drop, strictly between 0 and 1.
    rng : numpy.random.Generator or numpy.random.RandomState, optional
        Source of randomness, useful for reproducible runs. When omitted,
        NumPy's global random state is used.

    Returns
    -------
    str
        The surviving sentences, in their original order.

    Raises
    ------
    AssertionError
        If ``fraction`` is not strictly between 0 and 1.
    """
    assert 0 < fraction < 1
    sentences = [sentence.strip() for sentence in text.split('.')]
    n_to_drop = int(fraction * len(sentences))
    sampler = np.random if rng is None else rng
    # replace=False: the default samples WITH replacement, so repeated indices
    # would remove fewer sentences than the requested fraction.
    # A set makes each membership test below O(1) instead of scanning an array.
    death_list = set(
        sampler.choice(len(sentences), size=n_to_drop, replace=False).tolist()
    )
    survivors = [
        sentence for i, sentence in enumerate(sentences) if i not in death_list
    ]
    return '. '.join(survivors)

### Run!

In [21]:
# Document names, i.e. the keys of texts_01, in insertion order.
keys = list(texts_01)

In [None]:
# For every document: N_RUNS times, remove 1% of the sentences at random,
# pass the thinned text through jina4lote and record the cosine similarity
# between the stored vector and the new one.
for key in tqdm(keys):
    _text = texts_01[key]['text']
    _vec = texts_01[key]['vec']
    coss = np.zeros(N_RUNS)
    # Use a named index: `_` is IPython's "last output" shortcut, and assigning
    # to it stops IPython from updating it for the rest of the session.
    for run in trange(N_RUNS, leave=False):
        _new_text = sentence_remover(_text, .01)
        _new_vec = jina4lote(_new_text)
        coss[run] = float(util.cos_sim(_vec, _new_vec))
    texts_01[key]['cos_sample_001'] = coss

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

In [24]:
# Mean and standard deviation of the 1%-removal similarities per document.
# The removal loop stores its results under 'cos_sample_001'; a plain
# 'cos_sample' key is never written in this notebook, so reading it raises
# KeyError on a fresh kernel.
for key in keys:
    cos_sample = texts_01[key]['cos_sample_001']
    cos_mean = np.mean(cos_sample)
    cos_std = np.std(cos_sample)
    print(f'{key:} cos_mean={cos_mean:.3f}, cos_std={cos_std:.1e}')

10_Telecommunications_VODAFONE GROUP PLC_2021 cos_mean=1.000, cos_std=1.2e-05
1_Basic Materials_SYNTHOMER PLC_2021 cos_mean=0.997, cos_std=4.3e-05
3_Consumer Staples_Imperial Brands PLC_2016 cos_mean=1.000, cos_std=7.6e-05
8_Real Estate_RIGHTMOVE PLC_2016 cos_mean=1.000, cos_std=0.0e+00
8_Real Estate_RIGHTMOVE PLC_2019 cos_mean=1.000, cos_std=1.5e-05


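O.O — removing 1% of the sentences barely moves the embedding: every mean cosine similarity above is at least 0.997.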

### Save

In [27]:
# Write texts_01, now including the 1%-removal similarities, back to the cache file.
with open('./Check/text_and_vector_sample_01.pickle', mode='wb') as cache_file:
    pickle.dump(texts_01, cache_file)

## Remove 10% of the sentences

### Run!

In [31]:
# Display the timestamp string returned by time_is_now().
time_is_now()

'[2025-04-01 09:19:07]'

In [None]:
# Same experiment as the 1% run, but removing 10% of the sentences.
# Timestamps are printed per document to track how long each one takes.
print(time_is_now()+'\tstarting...')
for key in tqdm(keys, leave=False):
    print(time_is_now()+f'\t{key:}')
    _text = texts_01[key]['text']
    _vec = texts_01[key]['vec']
    coss = np.zeros(N_RUNS)
    # Use a named index: `_` is IPython's "last output" shortcut, and assigning
    # to it stops IPython from updating it for the rest of the session.
    for run in trange(N_RUNS, leave=False):
        _new_text = sentence_remover(_text, .1)
        _new_vec = jina4lote(_new_text)
        coss[run] = float(util.cos_sim(_vec, _new_vec))
    texts_01[key]['cos_sample_010'] = coss

[2025-04-01 09:21:17]	starting...


  0%|          | 0/5 [00:00<?, ?it/s]

[2025-04-01 09:21:17]	10_Telecommunications_VODAFONE GROUP PLC_2021


  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
# Mean and standard deviation of the 10%-removal similarities per document.
# The 10% loop stores its results under 'cos_sample_010'; a plain 'cos_sample'
# key is never written in this notebook, so reading it raises KeyError on a
# fresh kernel.
for key in keys:
    cos_sample = texts_01[key]['cos_sample_010']
    cos_mean = np.mean(cos_sample)
    cos_std = np.std(cos_sample)
    print(f'{key:} cos_mean={cos_mean:.3f}, cos_std={cos_std:.1e}')

O.O