# Intro

## Standard modules

In [1]:
import os, sys, pickle

In [2]:
import numpy as np

In [3]:
from numpy.linalg import norm

In [4]:
from tqdm.auto import tqdm, trange

In [5]:
import tiktoken
# Load the tokenizer for text-embedding-ada-002
# NOTE(review): `enc` is later used to size chunks for the Jina model, whose own
# tokenizer may count tokens differently — confirm the counts are close enough.
enc = tiktoken.encoding_for_model("text-embedding-ada-002")

In [6]:
from multiprocessing import Pool

In [7]:
# Hugging Face `tokenizers` reads this switch from the process environment, not from a
# Python variable, so a bare assignment has no effect. Keep the variable (in case other
# cells read it) and export the setting so the library actually sees it.
TOKENIZERS_PARALLELISM=True
os.environ["TOKENIZERS_PARALLELISM"]="true"

In [8]:
from hf_token import token

### Check

In [9]:
import inspect

In [16]:
list(sys.modules.keys())

['sys',
 'builtins',
 '_frozen_importlib',
 '_imp',
 '_io',
 'marshal',
 'posix',
 '_frozen_importlib_external',
 '_thread',
 '_weakref',
 'time',
 'zipimport',
 '_codecs',
 'codecs',
 'encodings.aliases',
 'encodings',
 'encodings.utf_8',
 '_signal',
 '__main__',
 'encodings.latin_1',
 '_abc',
 'abc',
 'io',
 '_stat',
 'stat',
 '_collections_abc',
 'genericpath',
 'posixpath',
 'os.path',
 'os',
 '_sitebuiltins',
 '_locale',
 '_bootlocale',
 '_operator',
 'operator',
 'keyword',
 '_heapq',
 'heapq',
 'itertools',
 'reprlib',
 '_collections',
 'collections',
 '_functools',
 'functools',
 'importlib._bootstrap',
 'importlib._bootstrap_external',
 'types',
 'importlib',
 'importlib.machinery',
 'importlib.abc',
 'contextlib',
 'importlib.util',
 '_virtualenv',
 '_distutils_hack',
 'mpl_toolkits',
 'site',
 '_weakrefset',
 'weakref',
 'pkgutil',
 'runpy',
 'enum',
 '_sre',
 'sre_constants',
 'sre_parse',
 'sre_compile',
 'copyreg',
 're',
 'fnmatch',
 'ntpath',
 'errno',
 'urllib',
 'urll

In [17]:
inspect.getsource(sys.modules['lifesafer'])

'import sys\nimport inspect\n\n# Tentare di importare il modulo in memoria\ntry:\n    import embed_me\n    print(inspect.getsource(\'embed_me\'))\nexcept Exception as e:\n    print(f"Errore: {e}")\n'

## Log in to Hugging Face?

In [None]:
from huggingface_hub import login
login()

## Homemade modules

In [13]:
from file2whatever import file2text

ModuleNotFoundError: No module named 'file2whatever'

In [15]:
import lifesafer

Errore: No module named 'embed_me'


## Files

In [9]:
# get the files
rtf_dir="rtf/"

In [10]:
# Keep only the entries of `rtf_dir` whose name ends with '.rtf'.
rtf_files=[name for name in os.listdir(rtf_dir) if name.endswith('.rtf')]

In [11]:
rtf_files.sort()

In [12]:
l_files=len(rtf_files)
l_files

592

## Auxiliary functions

In [13]:
def file2tokens(file):
    '''
    Tokenize the text extracted from one file of `rtf_dir`.

    Parameters
    ----------
    file : str
        File name, relative to `rtf_dir`.

    Returns
    -------
    The token sequence produced by `enc.encode` on the text returned by `file2text`.
    '''
    # os.path.join keeps working even when `rtf_dir` has no trailing slash
    text=file2text(os.path.join(rtf_dir, file))
    return enc.encode(text)

In [14]:
def file2ntokens(file):
    '''
    Return the number of tokens that `file2tokens` produces for `file`.
    '''
    tokens=file2tokens(file)
    return len(tokens)

# Count the total number of tokens to be embedded

## Functions

In [15]:
def etn(i, num_tokens, n_files=None):
    '''
    Estimated token number.

    Linearly extrapolates the token count of the whole corpus from the tokens
    counted in the first ``i + 1`` files.

    Parameters
    ----------
    i : int
        Zero-based index of the last file processed so far.
    num_tokens : int
        Tokens accumulated over files ``0..i``.
    n_files : int, optional
        Total number of files; defaults to the notebook-level `l_files`.

    Returns
    -------
    float
        ``num_tokens / (i + 1) * n_files``.
    '''
    if n_files is None:
        # fall back to the global file count, as the original two-argument call did
        n_files=l_files
    return num_tokens/(i+1)*n_files

## Run

In [22]:
# Start from zero so the cell gives the same result on every run and works on a
# fresh kernel (the counter was previously expected to exist already).
num_tokens = 0
# Files that could not be tokenized, with the error that stopped them.
failed_files = []
for i_f, file in enumerate(tqdm(rtf_files)):
    try:
        num_tokens += file2ntokens(file)
    except Exception as exc:
        # Best effort: skip the file but remember it. Catching `Exception` rather than
        # using a bare `except:` keeps the loop interruptible with Ctrl-C.
        failed_files.append((file, exc))
        continue
    else:
        print(f'etn={etn(i_f, num_tokens):.2e}', end='\r')

print(f"Numero di token: {num_tokens}")
if failed_files:
    print(f"Skipped {len(failed_files)} file(s); see `failed_files` for details")

  0%|          | 0/592 [00:00<?, ?it/s]

Numero di token: 15891796


# Play with tokens

## Prerequisites

In [15]:
cacca=file2tokens(rtf_files[0])

In [16]:
text_0=file2text(rtf_dir+rtf_files[0])

In [17]:
# jina-embeddings-2
max_tokens=8192

In [131]:
def text_butcher(text, max_tokens=max_tokens):
    '''
    Cut the original text into pieces that can be used to feed Jina.

    The text is consumed front to back. While what is left is longer than
    `max_tokens` tokens (as counted by `enc`), the first `max_tokens` tokens are
    decoded and cut back to the last period mark (.), so each piece ends with a
    complete sentence. The remainder is stripped and processed again.

    Parameters
    ----------
    text : str
        Text to split.
    max_tokens : int
        Maximum number of `enc` tokens allowed in one piece.

    Returns
    -------
    list of str
        The pieces, in order. An empty `text` yields an empty list.

    Raises
    ------
    ValueError
        If a window of `max_tokens` tokens contains no period to cut at.
    '''
    chunked_texts=[]
    new_text=text
    while len(new_text)>0:
        tokens=enc.encode(new_text)
        if len(tokens)<=max_tokens:
            # What is left fits in one piece (exactly `max_tokens` tokens included),
            # so take it as it is and stop.
            chunked_texts.append(new_text)
            break
        chunked_text=enc.decode(tokens[:max_tokens])
        last_period=chunked_text.rfind('.')
        if last_period<0:
            raise ValueError(
                f'no period (.) found within {max_tokens} tokens; '
                'cannot cut the text at a sentence boundary')
        # keep everything up to and including the last complete sentence
        chunked_text=chunked_text[:last_period+1]
        chunked_texts.append(chunked_text)
        new_text=new_text[len(chunked_text):].strip()
    return chunked_texts

### Debug

In [132]:
cacca=text_butcher(text_0, max_tokens=max_tokens)

In [133]:
len(cacca), [len(c) for c in cacca]/np.sum([len(c) for c in cacca])

(3, array([0.44389373, 0.43566893, 0.12043735]))

## From text to tokens and back

In [22]:
text_0

"Transforming lives means creating a sustainable future As an African business, we recognise the opportunities this continent has to offer: the talent of the people it nurtures and the potential for responsible development. But we are also aware of the challenges the communities across the continent face and we’re determined to make In our Annual Report and Accounts 2020/21, we told our stakeholders that the development of our sustainability strategy was one of the most important steps Airtel Africa had ever taken. Since then, the business has been focused on identifying the risks and opportunities that moving to a more sustainable future will bring, and developing the programmes and Message from the Board of directors long-term goals that will guide us. I am proud of the significant work that has gone into producing the sustainability strategy that I am delighted to present to you today. Airtel Africa is a business that is driven by the purpose to ‘transform lives’ – it lies at the he

In [23]:
enc.decode(cacca)==text_0

True

## Jina in action

In [38]:
from sentence_transformers import SentenceTransformer, util

In [39]:
import sentence_transformers

In [40]:
sentence_transformers.__version__

'2.7.0'

In [41]:
# Load the Jina embedding model through sentence-transformers.
# NOTE(review): trust_remote_code=True allows Python code shipped with the model
# repository to be executed at load time — presumably required by this model; confirm,
# and consider pinning a revision.
# `token` comes from the local `hf_token` module (presumably a Hugging Face access token).
model = SentenceTransformer(
    "jinaai/jina-embeddings-v2-base-en", 
    trust_remote_code=True, 
    token=token)

  return torch._C._cuda_getDeviceCount() > 0


In [42]:
model.max_seq_length

8192

Supposedly, the model automatically splits a long text into paragraphs and averages their embeddings. Let's check...

### Is it true?

#### Embedding the entire text uncut

In [135]:
embedding_uncut = model.encode(text_0, normalize_embeddings=True)

Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x7f3e19a70b50>>
Traceback (most recent call last):
  File "/home/sarawalk/sdgs_py38/lib/python3.8/site-packages/ipykernel/ipkernel.py", line 775, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(
KeyboardInterrupt: 
Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x7f3e19a70b50>>
Traceback (most recent call last):
  File "/home/sarawalk/sdgs_py38/lib/python3.8/site-packages/ipykernel/ipkernel.py", line 775, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(
KeyboardInterrupt: 

KeyboardInterrupt



In [None]:
embedding_uncut.shape

In [None]:
# text uncut
with open('./tmp_tuc.pickle', 'wb') as f:
    pickle.dump(embedding_uncut, f)

#### Handmade averaging

Let's handle the text as it should be handled: embed each chunk separately and then average.

In [138]:
# `text_0_butchered` is not assigned in any visible cell (the butchered text is only
# stored in `cacca`), so build it here to keep this cell runnable on a fresh kernel.
text_0_butchered=text_butcher(text_0, max_tokens=max_tokens)
# Embed every chunk on its own; each embedding is L2-normalized by the model.
embeddings=[]
for cut in tqdm(text_0_butchered):
    embedding = model.encode(cut, normalize_embeddings=True)
    print(len(embedding))
    embeddings.append(embedding)

  0%|          | 0/3 [00:00<?, ?it/s]

768
768
768


In [139]:
text_0_butchered_lens=[len(text) for text in text_0_butchered]

In [140]:
text_0_butchered_lens

[45227, 44389, 12271]

In [141]:
embeddings=np.array(embeddings)

In [142]:
embeddings.shape

(3, 768)

In [143]:
[norm(embedding) for embedding in embeddings]

[1.0, 1.0, 1.0]

In [144]:
avg_embedding=np.average(embeddings, axis=0, weights=text_0_butchered_lens)

In [145]:
avg_embedding/=norm(avg_embedding)

Check

In [146]:
avg_embedding.shape==embedding_uncut.shape

True

In [147]:
norm(avg_embedding)

1.0

In [148]:
# text with handmade average
with open('./tmp_twha.pickle', 'wb') as f:
    pickle.dump(avg_embedding, f)

In [149]:
# all texts' embeddings
with open('./tmp_ate.pickle', 'wb') as f:
    pickle.dump(embeddings, f)

#### Cosine Similarity, dot score and other checks

##### Load

In [167]:
# text uncut
# Reload the three pickles written earlier in this notebook, so this section can be
# run without encoding the text again.
# NOTE: pickle.load can execute arbitrary code — only load files this notebook wrote.
# embedding of the uncut text
with open('./tmp_tuc.pickle', 'rb') as f:
    embedding_uncut=pickle.load(f)
# embeddings of all the text chunks, one row per chunk
with open('./tmp_ate.pickle', 'rb') as f:
    embeddings=pickle.load(f)
# handmade weighted average of the chunk embeddings
with open('./tmp_twha.pickle', 'rb') as f:
    avg_embedding=pickle.load(f)

##### Jina automated 

The embedding of the uncut text and the embedding of the first chunk alone should differ, but let's check:

In [168]:
embedding_0=embeddings[0]

In [169]:
norm(embedding_uncut), norm(embedding_0)

(1.0, 1.0)

Since all vectors are normalized, there is no difference between the cosine similarity and dot similarity (i.e. the dot product):

In [170]:
util.dot_score(embedding_uncut, embedding_0)

tensor([[1.0000]])

In [171]:
util.cos_sim(embedding_uncut, embedding_0)

tensor([[1.0000]])

They are identical, gosh... Therefore, Jina does not average anything: it silently truncates the input to the first 8192 tokens and ignores the rest.

##### Fabio's average

In [212]:
avg_embedding=np.average(embeddings, axis=0, weights=text_0_butchered_lens)

In [213]:
norm(avg_embedding)

0.9894095642414317

The average is not normalized, as expected: the norm is not linear, so a weighted average of unit vectors generally has a norm smaller than 1.

In [214]:
util.dot_score(embedding_0.astype('f8'), avg_embedding.astype('f8'))

tensor([[0.9809]], dtype=torch.float64)