# Intro

## Modules

In [1]:
import os, sys, pickle
import numpy as np

In [42]:
import shutil

In [46]:
from sentence_transformers import util

In [19]:
from tqdm.notebook import tqdm, trange

## Folders

In [2]:
# Path constant; no cell visible in this notebook reads it.
VECTOR_FOLDER = './NewProcessedData/vectors/'

In [3]:
# Path constant; no cell visible in this notebook reads it.
BENCHMARK_FOLDER = './NewProcessedData/benchmarks/'

In [4]:
# Folder from which every vector in this notebook is listed and loaded.
# File names are appended by plain string concatenation, so the trailing slash matters.
# NOTE(review): the name says "PCA centred" while the path says "abtt5" — presumably the
# same post-processing step; confirm.
PCA_CENTR_VECTOR_FOLDER = './NewProcessedData/abtt5_vectors/'

In [5]:
# Path constant; no cell visible in this notebook reads it.
ABTT1_VECTOR_FOLDER = './NewProcessedData/abtt1_vectors/'

In [6]:
# Path constant; no cell visible in this notebook reads it
# (the Wikipedia vectors used below are taken from PCA_CENTR_VECTOR_FOLDER).
WIKI_FOLDER = './NewProcessedData/wikipedia/vectors/'

In [7]:
# Path constant; no cell visible in this notebook reads it.
WIKI_TEXT_FOLDER = './NewProcessedData/wikipedia/texts/'

In [8]:
# Folder holding the report texts whose lengths are measured further down.
# File names are appended by plain string concatenation, so the trailing slash matters.
TEXT_FOLDER = './NewProcessedData/texts/'

## Vectors

In [23]:
# Every entry name in the vector folder; os.listdir returns them in arbitrary order.
vecfiles = os.listdir(PCA_CENTR_VECTOR_FOLDER)

In [24]:
# Sort in place (lexicographic) so that lists derived from vecfiles have a deterministic order.
vecfiles.sort()

### Reports

In [25]:
# A report file is recognised by a name whose first four characters are all numeric.
reports_files = [fname for fname in vecfiles if fname[:4].isnumeric()]

In [26]:
# In-place lexicographic sort of the report file names.
reports_files.sort()

### SDGs

In [37]:
# SDG files are the ones whose name starts with 'Goal'.
goalvecfiles = [fname for fname in vecfiles if fname.startswith('Goal')]

In [38]:
# In-place lexicographic sort of the SDG file names.
goalvecfiles.sort()

In [39]:
# Display the sorted SDG file names; later cells rely on this order (17 goals, then the overall file).
goalvecfiles

['Goal-01-Fast-Facts.txt',
 'Goal-02-Fast-Facts.txt',
 'Goal-03-Fast-Facts.txt',
 'Goal-04-Fast-Facts.txt',
 'Goal-05-Fast-Facts.txt',
 'Goal-06-Fast-Facts.txt',
 'Goal-07-Fast-Facts.txt',
 'Goal-08-Fast-Facts.txt',
 'Goal-09-Fast-Facts.txt',
 'Goal-10-Fast-Facts.txt',
 'Goal-11_Fast-Facts.txt',
 'Goal-12_Fast-Facts.txt',
 'Goal-13_Fast-Facts.txt',
 'Goal-14_Fast-Facts.txt',
 'Goal-15-Fast-Facts.txt',
 'Goal-16-Fast-Facts.txt',
 'Goal-17-Fast-Facts.txt',
 'Goal-Fast-Facts.txt']

In [40]:
# Load one vector per SDG file, keeping the order of goalvecfiles.
goalvecs = [
    np.genfromtxt(PCA_CENTR_VECTOR_FOLDER + fname)
    for fname in goalvecfiles
]

In [45]:
# Sanity check: number of SDG vectors loaded.
len(goalvecs)

18

### Wikipedia's vectors

In [114]:
# Wikipedia files are the ones whose name starts with 'wiki'.
wikivecfiles = [fname for fname in vecfiles if fname.startswith('wiki')]

In [115]:
# In-place lexicographic sort of the Wikipedia file names.
wikivecfiles.sort()

In [116]:
# Sanity check: number of Wikipedia vector files found.
len(wikivecfiles)

4001

In [117]:
# Load one vector per Wikipedia file, keeping the order of wikivecfiles.
wikivecs = [
    np.genfromtxt(PCA_CENTR_VECTOR_FOLDER + fname)
    for fname in wikivecfiles
]

# Reports

For reports, I am producing 3 files:
1. the vectors;
2. the wikipedia benchmark;
3. the texts' lengths + cosine similarity vs. SDGs + the concreteness measure

## Reports' vectors

Define the numpy array

In [31]:
# One zero-initialised record per report file: filename metadata plus the vector itself.
vectors = np.zeros(
    len(reports_files),
    dtype=[
        ('year', 'i4'),
        ('sector', 'U2'),     # fixed width: a longer string would be truncated
        ('company', 'U100'),  # fixed width: a longer string would be truncated
        ('vector', object),   # holds the numpy array loaded for the report
    ],
)

In [32]:
# Fill `vectors` row by row from the report files.
for row, fname in enumerate(reports_files):
    # The name is split on '_': first token is the year, second the sector,
    # the remaining tokens form the company name.
    parts = fname.split('_')

    report_year = int(parts[0])
    report_sector = parts[1]
    if parts[-1] == 'PLC.txt':
        # a trailing 'PLC.txt' token is dropped from the company name
        company_name = '_'.join(parts[2:-1]).lower()
    else:
        # otherwise every '.txt' occurrence is removed from the joined name
        company_name = '_'.join(parts[2:]).replace('.txt', '').lower()

    # Load the vector and check it has the expected 768 components.
    embedding = np.genfromtxt(PCA_CENTR_VECTOR_FOLDER + fname)
    assert len(embedding) == 768

    # Store metadata and vector in the structured array.
    vectors[row]['year'] = report_year
    vectors[row]['sector'] = report_sector
    vectors[row]['company'] = company_name
    vectors[row]['vector'] = embedding

Save me!

In [82]:
# Persist the structured array of report vectors.
with open('./NewProcessedData/Data2BXported/reports_vectors.pickle', 'wb') as out_file:
    pickle.dump(vectors, out_file)

## Cosine against Wikipedia

Since they are too many, they will have to handle everything on their own.

In [84]:
# Load the stored benchmark array (a single 'cos_sim' object field per record).
# pickle.load can execute arbitrary code: only open files produced by this pipeline.
with open('./NewProcessedData/wiki_benchmark_abtt5.pickle', 'rb') as in_file:
    cacca = pickle.load(in_file)

Reshaping the file

In [93]:
# Inspect the record layout of the loaded array.
cacca.dtype

dtype([('cos_sim', 'O')])

In [92]:
# Pre-allocate the reports x Wikipedia matrix of cosine similarities.
# The row count comes from `cacca`, the array that is copied into this matrix.
# It used to come from `concr`, which is only loaded further down the notebook,
# so a Restart & Run All failed here with a NameError.
rep_wiki = np.zeros((len(cacca), 4001))

In [100]:
# Peek at the similarity values of one record.
# Index `cacca` explicitly: `_` is IPython's last-output variable (or a leftover loop
# variable), so what it points to depends on the execution history of the kernel.
cacca[0]['cos_sim']

array([-0.02529323, -0.12933999, -0.170446  , ...,  0.05794667,
        0.01123935, -0.00921661])

In [101]:
# Copy each record's similarity values into its row of rep_wiki.
# - The loop variable is a regular name instead of `_`, which would overwrite
#   IPython's last-output variable.
# - The whole row is assigned at once instead of element by element; NumPy rejects a
#   row whose length cannot be broadcast to the row width, whereas the element-wise
#   copy silently left zeros behind when a record was too short.
for i_rep, record in enumerate(tqdm(cacca)):
    rep_wiki[i_rep, :] = record['cos_sim']

  0%|          | 0/573 [00:00<?, ?it/s]

In [104]:
# Persist the dense reports x Wikipedia similarity matrix.
with open('./NewProcessedData/Data2BXported/reports_vs_wikipedia.pickle', 'wb') as out_file:
    pickle.dump(rep_wiki, out_file)

## Final data

### Text lengths

#### Reports' texts

In [13]:
# Every entry name in the text folder; os.listdir returns them in arbitrary order.
textfiles = os.listdir(TEXT_FOLDER)

In [14]:
# In-place lexicographic sort of the text file names.
textfiles.sort()

In [15]:
# Files from 2024 are unreliable: they probably refer to a different
# accounting year, due to differences with other countries.
# This is the file count before they are filtered out.
len(textfiles)

575

In [16]:
# Drop every file whose name starts with '2024'.
textfiles = [fname for fname in textfiles if not fname.startswith('2024')]

In [17]:
# File count after removing the 2024 files.
len(textfiles)

573

#### Reports' lengths

In [20]:
# Character count of every report text, in the order of textfiles.
# f.read() measures the whole file. The previous f.readline() stopped at the first
# newline, so a report spanning several lines was under-counted; files that consist
# of a single line give exactly the same value as before.
l_reports = np.zeros(len(textfiles))
for i_tf, textfile in enumerate(tqdm(textfiles)):
    with open(TEXT_FOLDER + textfile, 'r') as f:
        _text = f.read()
    l_reports[i_tf] = len(_text)

  0%|          | 0/573 [00:00<?, ?it/s]

### Cosine Similarity against SDGs

In [48]:
# cos_sdgs[i, j] is the cosine similarity between report i (order of `vectors`)
# and SDG vector j (order of `goalvecs`).
cos_sdgs = np.zeros((len(vectors), len(goalvecs)))
for row, report in enumerate(tqdm(vectors)):
    report_vec = report['vector']
    for col, sdg_vec in enumerate(goalvecs):
        cos_sdgs[row, col] = float(util.cos_sim(sdg_vec, report_vec))

  0%|          | 0/573 [00:00<?, ?it/s]

### Concreteness

In [10]:
# Load the stored per-report concreteness table.
# pickle.load can execute arbitrary code: only open files produced by this pipeline.
with open('./NewProcessedData/concreteness_reports.pickle', 'rb') as in_file:
    concr = pickle.load(in_file)

In [49]:
# Inspect the record layout of the concreteness table.
concr.dtype

dtype([('year', '<i8'), ('sector', '<U2'), ('company', '<U50'), ('concreteness', '<f8')])

### Gathering all together and saving

In [63]:
# Fields of the export table: the four fields of `concr` plus the text length.
dtype_entries = [
    ('year', '<i8'),
    ('sector', '<U2'),
    ('company', '<U50'),
    ('length', '<i8'),
    ('concreteness', '<f8'),
]

In [64]:
# Append one float field per goal: SDG01 ... SDG17.
# Not idempotent: running this cell again appends the same 17 fields a second time.
dtype_entries += [('SDG{:02d}'.format(goal), '<f8') for goal in range(1, 18)]

In [65]:
# Append the field for the similarity against the overall SDG text.
# Not idempotent: running this cell again appends the field a second time.
dtype_entries.append(('AllSDGs', '<f8'))

In [66]:
# Display the complete field list before building the dtype.
dtype_entries

[('year', '<i8'),
 ('sector', '<U2'),
 ('company', '<U50'),
 ('length', '<i8'),
 ('concreteness', '<f8'),
 ('SDG01', '<f8'),
 ('SDG02', '<f8'),
 ('SDG03', '<f8'),
 ('SDG04', '<f8'),
 ('SDG05', '<f8'),
 ('SDG06', '<f8'),
 ('SDG07', '<f8'),
 ('SDG08', '<f8'),
 ('SDG09', '<f8'),
 ('SDG10', '<f8'),
 ('SDG11', '<f8'),
 ('SDG12', '<f8'),
 ('SDG13', '<f8'),
 ('SDG14', '<f8'),
 ('SDG15', '<f8'),
 ('SDG16', '<f8'),
 ('SDG17', '<f8'),
 ('AllSDGs', '<f8')]

In [68]:
# Structured dtype of the export table, built from the field list.
final_data_dtype = np.dtype(dtype_entries)

In [69]:
# Zero-initialised export table with one record per row of `concr`.
final_data = np.zeros(shape=len(concr), dtype=final_data_dtype)

In [71]:
# Field names that will be copied verbatim from `concr` into the export table.
concr.dtype.names

('year', 'sector', 'company', 'concreteness')

In [77]:
# Fields that receive cosine similarities: every field whose name contains 'SDG'
# (so 'AllSDGs' is included), in dtype order.
sdg_names = [field for field in final_data.dtype.names if 'SDG' in field]
sdg_names

['SDG01',
 'SDG02',
 'SDG03',
 'SDG04',
 'SDG05',
 'SDG06',
 'SDG07',
 'SDG08',
 'SDG09',
 'SDG10',
 'SDG11',
 'SDG12',
 'SDG13',
 'SDG14',
 'SDG15',
 'SDG16',
 'SDG17',
 'AllSDGs']

In [78]:
# Assemble the export table column by column.
# concr, l_reports and cos_sdgs are matched purely by row position, so first check that
# they describe the same number of reports: a longer l_reports or cos_sdgs used to be
# accepted silently.
# NOTE(review): equal lengths do not prove that the rows refer to the same reports in the
# same order — the ordering of `concr` is not visible in this notebook; confirm.
assert len(l_reports) == len(final_data) == len(cos_sdgs), 'per-report arrays differ in length'
assert cos_sdgs.shape[1] == len(sdg_names), 'expected one cos_sdgs column per SDG field'

# copy from concr (whole columns at once; `_` is no longer used as a loop variable,
# so IPython's last-output variable is left alone)
for name in concr.dtype.names:
    final_data[name] = concr[name]
# lengths are stored in an integer field, so the float values are cast on assignment
final_data['length'] = l_reports
# column j of cos_sdgs goes to the j-th SDG field
for j_sdg, sdg in enumerate(sdg_names):
    final_data[sdg] = cos_sdgs[:, j_sdg]

  0%|          | 0/573 [00:00<?, ?it/s]

In [83]:
# Persist the assembled per-report table.
with open('./NewProcessedData/Data2BXported/reports_allin.pickle', 'wb') as out_file:
    pickle.dump(final_data, out_file)

# SDGs

Analogously, I am exporting SDG vectors and their similarity against Wikipedia.

## Vectors

In [106]:
# One record per SDG file: a label and its vector (stored as a Python object).
# The label field is 7 characters wide because the longest label written into it,
# 'AllSDGs', has 7 characters; with 'U6' NumPy silently stored it as 'AllSDG'.
sdgs_vectors = np.zeros(len(goalvecfiles), dtype=[('SDG', 'U7'), ('vector', object)])

In [109]:
# Label the first 17 vectors SDG01 ... SDG17 and any later one 'AllSDGs',
# then store each vector next to its label.
for idx, sdg_vec in enumerate(goalvecs):
    label = ('SDG' + str(idx + 1).zfill(2)) if idx < 17 else 'AllSDGs'
    sdgs_vectors[idx]['SDG'] = label
    sdgs_vectors[idx]['vector'] = sdg_vec

In [110]:
# Display the filled array. This prints every vector in full;
# sdgs_vectors['SDG'] would show only the labels.
sdgs_vectors

array([('SDG01', array([-6.24305378e-03, -5.60462251e-02, -6.79502157e-04,  3.58487483e-03,
               1.84286490e-02,  8.27720153e-03, -9.57212538e-03,  2.21839766e-02,
               1.02020779e-02, -1.48966316e-03,  2.54297603e-02,  1.87023774e-02,
              -1.36060070e-02, -2.08789407e-02, -1.96089991e-02,  1.54714277e-02,
               2.68714112e-02, -1.16651763e-02, -4.95372915e-03,  4.91844762e-02,
               3.05534017e-02,  8.54567917e-03,  1.98892305e-03, -2.77706779e-03,
              -2.02846245e-02,  1.25386178e-02,  1.84497888e-02,  9.46753224e-03,
              -8.06248622e-04, -6.17318489e-03,  1.97421119e-02, -5.85717234e-03,
              -1.24095690e-02,  6.75010572e-03,  5.72537619e-03,  1.47197109e-02,
               1.68969559e-02, -8.11309521e-03, -9.19293453e-03,  7.85790818e-03,
               3.29535683e-02,  2.64164057e-03,  6.20581237e-03, -5.89690142e-03,
              -6.14444507e-03, -8.24265149e-03, -1.03606432e-03,  7.15077510e-03,
      

In [111]:
# Persist the labelled SDG vectors.
with open('./NewProcessedData/Data2BXported/sdgs_vectors.pickle', 'wb') as out_file:
    pickle.dump(sdgs_vectors, out_file)

## Cosine against Wikipedia

In [118]:
# Pre-allocate one row per SDG vector and one column per Wikipedia vector.
# The column count is taken from `wikivecs` instead of a hard-coded 4001, so the matrix
# always matches the list that the fill loop iterates over.
sdg_wiki = np.zeros((len(sdgs_vectors), len(wikivecs)))

In [119]:
# sdg_wiki[r, c] is the cosine similarity between SDG vector r and Wikipedia vector c.
# The inner progress bar is removed after each SDG (leave=False).
for row, sdg_vec in enumerate(tqdm(goalvecs)):
    for col, wiki_vec in enumerate(tqdm(wikivecs, leave=False)):
        sdg_wiki[row, col] = float(util.cos_sim(wiki_vec, sdg_vec))

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

In [120]:
# Persist the SDGs x Wikipedia similarity matrix.
with open('./NewProcessedData/Data2BXported/sdgs_vs_wikipedia.pickle', 'wb') as out_file:
    pickle.dump(sdg_wiki, out_file)