# Intro

## Standard modules

In [13]:
import os, sys

In [21]:
from tqdm.notebook import tqdm, trange

In [2]:
from sentence_transformers import SentenceTransformer, util

In [3]:
import numpy as np

## Homemade modules

In [4]:
from file2text import file2text, file2roughtext

In [5]:
from jina4lote import jina4lote

  return torch._C._cuda_getDeviceCount() > 0


# First entry

In [6]:
# Folder holding the source reports; list everything in it (the listing is filtered and sorted in the next cell)
rtf_folder='./rtf/'
files=os.listdir(rtf_folder)

In [7]:
# Keep only the .rtf reports, in alphabetical order
files=sorted(name for name in files if name.endswith('.rtf'))

In [8]:
# First report of the sorted list: the single test case used in the cells below
files[0]

'10_Telecommunications_AIRTEL AFRICA PLC_2021.rtf'

## Standard cleaning

In [9]:
# Text of the first report after the standard cleaning (file2text is a homemade helper; its cleaning steps are not shown in this notebook)
long_text=file2text(rtf_folder+files[0])

In [10]:
# Embedding of the standard-cleaned text (homemade helper; presumably processes the long text in pieces, given the progress bar — confirm in jina4lote)
vec0=jina4lote(long_text)

  0%|          | 0/3 [00:00<?, ?it/s]

## Alternative cleaning

### Function

In [13]:
def alt_clean(long_text):
    """Rebuild sentences from roughly extracted text and join them with single spaces.

    Tabs are removed and the text is split on '.'. Each piece is split into its
    non-blank lines ("chunks"). A piece made of one chunk that contains no space
    (isolated word, page number, ...) is dropped. For the remaining pieces:

    * if the last chunk starts with an uppercase character, that chunk alone is
      kept as the sentence (the chunks before it are discarded);
    * otherwise the sentence is assumed to be cut over several lines: chunks are
      walked backwards while they are entirely lowercase, and everything from
      the chunk where the walk stops is joined with single spaces.

    Pieces without any non-blank chunk (empty input, text ending with '.',
    consecutive dots such as '...') are skipped; they used to raise IndexError.

    Parameters
    ----------
    long_text : str
        Raw text, possibly containing tabs and hard line breaks.

    Returns
    -------
    str
        The kept sentences, each terminated by '.', separated by one space.
        Empty string when nothing is kept.
    """
    no_t_text=long_text.replace('\t', '')
    new_text=[]
    for sentence in no_t_text.split('.'):
        # Drop blank lines (empty or whitespace-only): a trailing blank chunk would
        # otherwise be taken for the sentence and produce a bare '.'
        chunks=[s for s in sentence.split('\n') if s.strip()]
        if not chunks:
            # nothing between two dots (or empty input)
            continue
        # The space test is done on the unstripped chunk, so a single word preceded
        # by a space (the usual case right after a '.') still goes through
        if len(chunks)==1 and ' ' not in chunks[-1]:
            continue
        if chunks[-1][0].isupper():
            # the last element is a sentence
            new_text.append(chunks[-1].strip()+'.')
        else:
            # the sentence was cut in several pieces: then look for the first capital letter
            counter=len(chunks)-1
            while counter>0 and chunks[counter].islower():
                counter-=1
            _new_sentence=' '.join(c.strip() for c in chunks[counter:])+'.'
            new_text.append(_new_sentence)
    return ' '.join(new_text)

### Run!

In [14]:
# Rough extraction of the same report (homemade helper), to be cleaned by alt_clean instead of the standard cleaning
rough_long_text=file2roughtext(rtf_folder+files[0])

In [15]:
# Apply the alternative sentence-rebuilding cleaning to the rough text
alt_long_text=alt_clean(rough_long_text)

In [21]:
# Eyeball the rebuilt text to judge the quality of the alternative cleaning
alt_long_text

"As an African business, we recognise the opportunities this continent has to offer: the talent of the people it nurtures and the potential for responsible development. But we are also aware of the challenges the communities across the continent face and we’re determined to make a positive impact. In our Annual Report and Accounts 2020/21, we told our stakeholders that the development of our sustainability strategy was one of the most important steps Airtel Africa had ever taken. 5  Our stakeholder engagement long-term goals that will guide us. significant work that has gone into producing the sustainability strategy that I am delighted to present to you today. Airtel Africa is a business that is driven by the purpose to ‘transform lives’ – it lies at the heart of every decision we have ever made and is a genuine passion of my own. As an African business, we recognise the opportunities this continent has to offer: the talent of the people it nurtures and the potential for responsible d

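Actually, the alternative cleaning is much worse than the standard one: the output contains stray fragments (e.g. "5  Our stakeholder engagement long-term goals that will guide us.") and repeats whole paragraphs.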

In [16]:
# Embedding of the alternatively cleaned text, computed with the same helper as vec0
alt_vec0=jina4lote(alt_long_text)

  0%|          | 0/2 [00:00<?, ?it/s]

## Final comparison

In [11]:
# Embedding stored earlier for what appears to be the same report (Airtel Africa, 2021), read from a text file as float64 values
previous_embedding=np.genfromtxt('./ProcessedData/2021_10_AIRTEL_AFRICA_PLC.txt', dtype='f8')

In [12]:
# Standard cleaning vs. previously stored embedding
float(util.cos_sim(vec0, previous_embedding))

0.9990540664002608

In [19]:
# Alternative cleaning vs. previously stored embedding
float(util.cos_sim(alt_vec0, previous_embedding))

0.998503983753686

In [20]:
# Alternative cleaning vs. standard cleaning
float(util.cos_sim(alt_vec0, vec0))

0.9985707613464394

# All of them!

In [27]:
# Folders holding the two sets of stored embeddings; files are paired by identical name in the comparison loop
old_folder='./ProcessedData/'
new_folder='./NewProcessedData/'

In [28]:
# File names present in each folder
old_files=os.listdir(old_folder)
new_files=os.listdir(new_folder)

In [29]:
# Sort in place so the comparison loop visits files in name order (new_files is only used for membership tests, so it stays unsorted)
old_files.sort()

In [30]:
# Cosine similarity between the old and the new embedding of every file present in both folders
cos_sims=[]
for file in tqdm(old_files):
    if file not in new_files:
        # no counterpart in the new folder: nothing to compare
        continue
    old_vec=np.genfromtxt(os.path.join(old_folder, file))
    new_vec=np.genfromtxt(os.path.join(new_folder, file))
    _cs=float(util.cos_sim(old_vec, new_vec))
    print(f'{file:40}\tcos-sim={_cs:.4f}')
    cos_sims.append(_cs)

  0%|          | 0/190 [00:00<?, ?it/s]

2015_01_ANGLO_AMERICAN_PLC.txt          	cos-sim=0.9998
2015_10_BT_GROUP_PLC.txt                	cos-sim=0.9984
2015_10_VODAFONE_GROUP_PLC.txt          	cos-sim=0.9998
2016_01_ANGLO_AMERICAN_PLC.txt          	cos-sim=0.9998
2016_10_BT_GROUP_PLC.txt                	cos-sim=0.9983
2016_10_VODAFONE_GROUP_PLC.txt          	cos-sim=0.9999
2016_11_GOOD_ENERGY_GROUP_PLC.txt       	cos-sim=0.9994
2016_11_SSE_PLC.txt                     	cos-sim=0.9992
2017_01_ANGLO_AMERICAN_PLC.txt          	cos-sim=0.9997
2017_10_VODAFONE_GROUP_PLC.txt          	cos-sim=0.9995
2017_11_GOOD_ENERGY_GROUP_PLC.txt       	cos-sim=0.9989
2017_11_RENEWI_PLC.txt                  	cos-sim=0.9984
2017_11_SSE_PLC.txt                     	cos-sim=0.9995
2018_01_ANGLO_AMERICAN_PLC.txt          	cos-sim=0.9998
2018_10_BT_GROUP_PLC.txt                	cos-sim=0.9986
2018_10_SPIRENT_COMMUNICATIONS_PLC.txt  	cos-sim=0.9996
2018_10_VODAFONE_GROUP_PLC.txt          	cos-sim=0.9999
2018_11_CENTRICA_PLC.txt                	cos-sim

In [31]:
# Lowest similarity among the compared files, i.e. the pair of embeddings that differs the most
min(cos_sims)

0.9863803088162657