# Intro

## Standard modules

In [1]:
import os, sys, pickle
import pathlib
import requests

In [2]:
import numpy as np

In [3]:
from tqdm.auto import tqdm, trange

In [4]:
from sentence_transformers import util

In [5]:
import matplotlib.pyplot as plt
plt.rcParams['axes.linewidth'] = 2
plt.rcParams['xtick.major.size'] = 10
plt.rcParams['xtick.major.width'] = 2
plt.rcParams['ytick.major.size'] = 10
plt.rcParams['ytick.major.width'] = 2

plt.rcParams['xtick.labelsize'] = 14
plt.rcParams['ytick.labelsize'] = 14

plt.rcParams['xtick.minor.size'] = 5
plt.rcParams['xtick.minor.width'] = 1
plt.rcParams['ytick.minor.size'] = 5
plt.rcParams['ytick.minor.width'] = 1

In [6]:
from sklearn.decomposition import PCA

In [7]:
import seaborn

In [8]:
import pymannkendall as mk

In [9]:
from scipy.spatial.distance import cosine

In [10]:
import requests
import re
from bs4 import BeautifulSoup  # Per pulizia HTML nelle REST API

In [11]:
import datetime as dt

## Personal modules

In [12]:
from toolbox.updater import time_is_now

In [13]:
from file2text import file2text

In [14]:
from first_rough_cleaning import first_text_cleaner

In [15]:
from file_handler import *

In [16]:
from jina4lote import jina4lote

  return torch._C._cuda_getDeviceCount() > 0


In [17]:
from tabulate import tabulate

## Folders

In [18]:
VECTOR_FOLDER='./NewProcessedData/vectors/'

In [19]:
BENCHMARK_FOLDER='./NewProcessedData/benchmarks/'

In [20]:
ABTT1_VECTOR_FOLDER='./NewProcessedData/abtt1_vectors/'

## Load vectors

In [21]:
vecfiles=os.listdir(ABTT1_VECTOR_FOLDER)

In [22]:
vecfiles.sort()

### Reports' vectors

In [23]:
reports_files=[vecfile for vecfile in vecfiles if vecfile[:4].isnumeric()]

In [24]:
reports_files.sort()

In [25]:
years, k_y=np.unique([rf[:4] for rf in reports_files], return_counts=True)

In [26]:
np.vstack((years, k_y)).T

array([['2015', '40'],
       ['2016', '44'],
       ['2017', '51'],
       ['2018', '56'],
       ['2019', '67'],
       ['2020', '69'],
       ['2021', '76'],
       ['2022', '80'],
       ['2023', '90']], dtype='<U21')

In [27]:
vectors=np.zeros(len(reports_files), dtype=[('year', 'i4'), ('sector', 'U2'), ('company', 'U100'), ('vector', object)])

In [28]:
for i_rf, reports_file in enumerate(reports_files):
    # info from the filename
    splitted_filename=reports_file.split('_')
    
    year=int(splitted_filename[0])
    sector=splitted_filename[1]
    if splitted_filename[-1]=='PLC.txt':
        company='_'.join(splitted_filename[2:-1])
    else:
        company='_'.join(splitted_filename[2:])
        company=company.replace('.txt', '')
    company=company.lower()
    # get the vector
    vector=np.genfromtxt(ABTT1_VECTOR_FOLDER+reports_file)
    
    # organize data
    vectors[i_rf]['year']=year
    vectors[i_rf]['sector']=sector
    vectors[i_rf]['company']=company
    vectors[i_rf]['vector']=vector

### Reports aggregated per sector per year

For each and sector I have an only entry, therefore the final vector has 

In [29]:
_tracks=np.unique(vectors[['year', 'sector']])
l_aggr_rep=len(_tracks)
l_aggr_rep

96

entries.

In [30]:
aggr_vectors=np.zeros(l_aggr_rep, dtype=[('year', 'i4'), ('sector', 'U2'), ('vector', object)])

In [31]:
for i_t, _track in enumerate(tqdm(_tracks)):
    year=_track[0]
    sector=_track[1]
    # select entries in vectors
    _mask=np.logical_and(vectors['year']==year, vectors['sector']==sector)
    _selection=vectors[_mask]
    # define te entries in aggr_vectors
    aggr_vectors[i_t]['year']=year
    aggr_vectors[i_t]['sector']=sector
    aggr_vectors[i_t]['vector']=np.mean(_selection['vector'])

  0%|          | 0/96 [00:00<?, ?it/s]

### Witten's vector

In [32]:
wittenvecfile=[file for file in os.listdir(ABTT1_VECTOR_FOLDER) if file.startswith('Witten') and file.endswith('.txt')][0]

In [33]:
wittenvec=np.genfromtxt(ABTT1_VECTOR_FOLDER+wittenvecfile)

### Goals' vectors

In [34]:
goalvecfiles=[vecfile for vecfile in vecfiles if vecfile.startswith('Goal')]

In [35]:
goalvecfiles.sort()

In [36]:
goalvecfiles

['Goal-01-Fast-Facts.txt',
 'Goal-02-Fast-Facts.txt',
 'Goal-03-Fast-Facts.txt',
 'Goal-04-Fast-Facts.txt',
 'Goal-05-Fast-Facts.txt',
 'Goal-06-Fast-Facts.txt',
 'Goal-07-Fast-Facts.txt',
 'Goal-08-Fast-Facts.txt',
 'Goal-09-Fast-Facts.txt',
 'Goal-10-Fast-Facts.txt',
 'Goal-11_Fast-Facts.txt',
 'Goal-12_Fast-Facts.txt',
 'Goal-13_Fast-Facts.txt',
 'Goal-14_Fast-Facts.txt',
 'Goal-15-Fast-Facts.txt',
 'Goal-16-Fast-Facts.txt',
 'Goal-17-Fast-Facts.txt',
 'Goal-Fast-Facts.txt']

In [37]:
goalvecs=[np.genfromtxt(ABTT1_VECTOR_FOLDER+goalvecfile) for goalvecfile in goalvecfiles]

### Wikipedia's vectors

In [38]:
wikivecfiles=[vecfile for vecfile in vecfiles if vecfile.startswith('wiki')]

In [39]:
wikivecfiles.sort()

In [40]:
len(wikivecfiles)

4001

In [41]:
wikivecs=[np.genfromtxt(ABTT1_VECTOR_FOLDER+wikivecfile) for wikivecfile in wikivecfiles]

## Sector helper

In [42]:
rtf_files=os.listdir('./rtf')
rtf_files.sort()

In [43]:
sec_helper={}
for file in rtf_files:
    splitted_name=file.split('_')
    if splitted_name[0].isnumeric():
        if splitted_name[0] not in sec_helper.keys():
            sec_helper[splitted_name[0].zfill(2)]=splitted_name[1]

In [44]:
sec_helper

{'10': 'Telecommunications',
 '11': 'Utilities',
 '01': 'Basic Materials',
 '02': 'Consumer Discretionary',
 '03': 'Consumer Staples',
 '04': 'Energy',
 '05': 'Financials',
 '06': 'Health Care',
 '07': 'Industrials',
 '08': 'Real Estate',
 '09': 'Technology'}

## SDG helper

In [45]:
sdg_helper={'01':'No Poverty', 
 '02': 'Zero hunger', 
 '03': 'Good health and well-being', 
 '04': 'Quality education',
 '05': 'Gender equality',
 '06': 'Clean water and sanitation',
 '07': 'Affordable and clean energy',
 '08': 'Decent work and economic growth',
 '09': 'Industry, Innovation, Technology and Infrastructure',
 '10': 'Reduced inequality',
 '11': 'Sustainable cities and communities',
 '12': 'Responsible consumption and production',
 '13': 'Climate action',
 '14': 'Life below water',
 '15': 'Life on land',
 '16': 'Peace, justice and strong institutions',
 '17': 'Partnerships for the goals'
}

## FDR

In [46]:
def fdr_th(p_vals, alpha):
    _aux=np.sort(p_vals)
    fdr_ths=np.arange(1, 1+len(_aux))*alpha/len(_aux)
    fdr_mask=_aux<=fdr_ths
    if np.sum(fdr_mask)>0:
        return fdr_ths[fdr_mask][-1]
    else:
        return 0.

# The benchmark

In [47]:
years=years.astype('i4')

## Creating the benchmark, at the firm level _[DO NOT EVALUATE IT AGAIN!!!]_

In [48]:
wiki_bench=np.zeros(len(vectors), dtype=[('cos_sim', object)])

In [49]:
for i_v in trange(len(vectors)):
    wiki_bench[i_v]['cos_sim']=np.zeros(len(wikivecs))
    for i_w, wikivec in enumerate(tqdm(wikivecs, leave=False)):
        wiki_bench[i_v]['cos_sim'][i_w]=float(util.cos_sim(wikivec, vectors[i_v]['vector']))

  0%|          | 0/573 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

In [50]:
with open('./NewProcessedData/wiki_benchmark_abtt1.pickle', 'wb') as f:
    pickle.dump(wiki_bench, f)

## Creating the benchmark, at the sector level _[DO NOT EVALUATE IT AGAIN!!!]_

In [51]:
wiki_bench_per_sector=np.zeros(l_aggr_rep, dtype=[('cos_sim', object)])

In [52]:
for i_v in trange(l_aggr_rep):
    wiki_bench_per_sector[i_v]['cos_sim']=np.zeros(len(wikivecs))
    for i_w, wikivec in enumerate(tqdm(wikivecs, leave=False)):
        wiki_bench_per_sector[i_v]['cos_sim'][i_w]=float(util.cos_sim(wikivec, aggr_vectors[i_v]['vector']))

  0%|          | 0/96 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

  0%|          | 0/4001 [00:00<?, ?it/s]

In [53]:
with open('./NewProcessedData/wiki_bench_per_sector_abtt1.pickle', 'wb') as f:
    pickle.dump(wiki_bench_per_sector, f)

# Validation

In [54]:
alpha=0.05

## All SDGs

In [55]:
with open('./NewProcessedData/wiki_benchmark_abtt1.pickle', 'rb') as f:
    wiki_bench=pickle.load(f)

In [56]:
cos_sims_global=np.zeros(len(years), dtype=[('mean', 'f8'), ('all', object), ('p_val', object)])

In [57]:
for i_y, year in enumerate(tqdm(years)):
    vecs_y=vectors[vectors['year']==year]['vector']
    wiki_bench_y=wiki_bench[vectors['year']==year]['cos_sim']
    cos_sims_global[i_y]['all']=np.zeros(len(vecs_y))
    cos_sims_global[i_y]['p_val']=np.zeros(len(vecs_y))
    for i_vec, vec in enumerate(tqdm(vecs_y, leave=False)):
        _cos_sim=float(util.cos_sim(goalvecs[-1], vec))
        cos_sims_global[i_y]['all'][i_vec]=_cos_sim
        cos_sims_global[i_y]['p_val'][i_vec]=np.sum(wiki_bench_y[i_vec]>=_cos_sim)/len(wikivecs)
    cos_sims_global[i_y]['mean']=np.mean(cos_sims_global[i_y]['all'])

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/44 [00:00<?, ?it/s]

  0%|          | 0/51 [00:00<?, ?it/s]

  0%|          | 0/56 [00:00<?, ?it/s]

  0%|          | 0/67 [00:00<?, ?it/s]

  0%|          | 0/69 [00:00<?, ?it/s]

  0%|          | 0/76 [00:00<?, ?it/s]

  0%|          | 0/80 [00:00<?, ?it/s]

  0%|          | 0/90 [00:00<?, ?it/s]

In [58]:
for i_y, year in enumerate(tqdm(years)):
    _fdr_th=fdr_th(cos_sims_global[i_y]['p_val'], alpha)
    compa_y=vectors[vectors['year']==year]['company']
    sec_y=vectors[vectors['year']==year]['sector']
    validated_compa=compa_y[cos_sims_global[i_y]['p_val']<=_fdr_th]
    validated_sec=sec_y[cos_sims_global[i_y]['p_val']<=_fdr_th]
    print(f'{year:}) len(validated_companies)={len(validated_compa):}')
    if len(validated_compa)>0:
        for i_vc, vc in enumerate(validated_compa):
            print(f'\t\t{vc:25} sector={sec_helper[validated_sec[i_vc]]:}')

  0%|          | 0/9 [00:00<?, ?it/s]

2015) len(validated_companies)=3
		unilever                  sector=Consumer Staples
		standard_chartered        sector=Financials
		halma                     sector=Industrials
2016) len(validated_companies)=3
		unilever                  sector=Consumer Staples
		standard_chartered        sector=Financials
		halma                     sector=Industrials
2017) len(validated_companies)=1
		halma                     sector=Industrials
2018) len(validated_companies)=0
2019) len(validated_companies)=0
2020) len(validated_companies)=7
		intercontinental_hotels_group sector=Consumer Discretionary
		reckitt_benckiser_group   sector=Consumer Staples
		bp                        sector=Energy
		hsbc_holdings             sector=Financials
		standard_chartered        sector=Financials
		astrazeneca               sector=Health Care
		rentokil_initial          sector=Industrials
2021) len(validated_companies)=6
		reckitt_benckiser_group   sector=Consumer Staples
		hsbc_holdings             sector=Fin

### Table for $\LaTeX$

In [59]:
_table=[]
for i_y, year in enumerate(tqdm(years)):
    _fdr_th=fdr_th(cos_sims_global[i_y]['p_val'], alpha)
    compa_y=vectors[vectors['year']==year]['company']
    sec_y=vectors[vectors['year']==year]['sector']
    validated_compa=compa_y[cos_sims_global[i_y]['p_val']<=_fdr_th]
    validated_sec=sec_y[cos_sims_global[i_y]['p_val']<=_fdr_th]
    print(f'{year:}) len(validated_companies)={len(validated_compa):}')
    if len(validated_compa)>0:
        for i_vc, vc in enumerate(validated_compa):
            company_name_smooth=vc.split('_')
            company_name_smooth=[_.capitalize() for _ in company_name_smooth]
            company_name_smooth=' '.join(company_name_smooth)
            if i_vc==0:
                _table.append([year, company_name_smooth, sec_helper[validated_sec[i_vc]]])
            else:
                _table.append(['', company_name_smooth, sec_helper[validated_sec[i_vc]]])
    else:
        _table.append([year, '', ''])

  0%|          | 0/9 [00:00<?, ?it/s]

2015) len(validated_companies)=3
2016) len(validated_companies)=3
2017) len(validated_companies)=1
2018) len(validated_companies)=0
2019) len(validated_companies)=0
2020) len(validated_companies)=7
2021) len(validated_companies)=6
2022) len(validated_companies)=4
2023) len(validated_companies)=0


In [60]:
header=['Year','Firm', 'Sector']

In [61]:
print(tabulate(_table, header, tablefmt="latex"))

\begin{tabular}{lll}
\hline
 Year   & Firm                          & Sector                 \\
\hline
 2015   & Unilever                      & Consumer Staples       \\
        & Standard Chartered            & Financials             \\
        & Halma                         & Industrials            \\
 2016   & Unilever                      & Consumer Staples       \\
        & Standard Chartered            & Financials             \\
        & Halma                         & Industrials            \\
 2017   & Halma                         & Industrials            \\
 2018   &                               &                        \\
 2019   &                               &                        \\
 2020   & Intercontinental Hotels Group & Consumer Discretionary \\
        & Reckitt Benckiser Group       & Consumer Staples       \\
        & Bp                            & Energy                 \\
        & Hsbc Holdings                 & Financials             \\
        & Sta

## All SDGs, per sector

In [62]:
cos_sims_global_sec=np.zeros(len(years), dtype=[('all', object), ('p_val', object)])

In [63]:
with open('./NewProcessedData/wiki_bench_per_sector_abtt1.pickle', 'rb') as f:
    wiki_bench_per_sector=pickle.load(f)

In [64]:
for i_y, year in enumerate(tqdm(years)):
    vecs_y=aggr_vectors[aggr_vectors['year']==year]['vector']
    wiki_bench_y=wiki_bench_per_sector[aggr_vectors['year']==year]['cos_sim']
    cos_sims_global_sec[i_y]['all']=np.zeros(len(vecs_y))
    cos_sims_global_sec[i_y]['p_val']=np.zeros(len(vecs_y))
    for i_vec, vec in enumerate(vecs_y):
        _cos_sim=float(util.cos_sim(goalvecs[-1], vec))
        cos_sims_global_sec[i_y]['all'][i_vec]=_cos_sim
        cos_sims_global_sec[i_y]['p_val'][i_vec]=np.sum(wiki_bench_y[i_vec]>=_cos_sim)/len(wikivecs)

  0%|          | 0/9 [00:00<?, ?it/s]

In [65]:
for i_y, year in enumerate(tqdm(years)):
    _fdr_th=fdr_th(cos_sims_global_sec[i_y]['p_val'], alpha)
    sec_y=aggr_vectors[aggr_vectors['year']==year]['sector']
    validated_sec=sec_y[cos_sims_global_sec[i_y]['p_val']<=_fdr_th]
    print(f'{year:}) len(validated_sectors)={len(validated_sec):}')
    if len(validated_sec)>0:
        for i_vs, vs in enumerate(validated_sec):
            print(f'\t\t{vs:25} sector={sec_helper[vs]:}')

  0%|          | 0/9 [00:00<?, ?it/s]

2015) len(validated_sectors)=0
2016) len(validated_sectors)=0
2017) len(validated_sectors)=0
2018) len(validated_sectors)=1
		05                        sector=Financials
2019) len(validated_sectors)=0
2020) len(validated_sectors)=0
2021) len(validated_sectors)=0
2022) len(validated_sectors)=2
		05                        sector=Financials
		07                        sector=Industrials
2023) len(validated_sectors)=0


## Clustered SDGs

### Define cSDGs

In [66]:
social_sdgs=['01','02','03','04','05','10','16','17']
economic_sdgs=['07','08','09','11','12']
environmental_sdgs=['06','13','14','15']

In [67]:
social_vec=[np.genfromtxt(ABTT1_VECTOR_FOLDER+goalvecfile) for goalvecfile in goalvecfiles if any([sdg in goalvecfile for sdg in social_sdgs])]
social_vec=np.mean(social_vec, axis=0)

In [68]:
economic_vec=[np.genfromtxt(ABTT1_VECTOR_FOLDER+goalvecfile) for goalvecfile in goalvecfiles if any([sdg in goalvecfile for sdg in economic_sdgs])]
economic_vec=np.mean(economic_vec, axis=0)

In [69]:
environmental_vec=[np.genfromtxt(ABTT1_VECTOR_FOLDER+goalvecfile) for goalvecfile in goalvecfiles if any([sdg in goalvecfile for sdg in environmental_sdgs])]
environmental_vec=np.mean(environmental_vec, axis=0)

In [70]:
clustered_sdgs=np.array([social_vec, economic_vec, environmental_vec])

In [71]:
clustered_sdgs.shape

(3, 768)

In [72]:
cSDGS_names=['social', 'economic', 'environmental']

In [73]:
l_csdgs=len(cSDGS_names)

### Cosine similarity validation

In [74]:
cos_sims_cSDGs=np.zeros(len(years)*len(cSDGS_names), dtype=[('cSDG', 'U20'), ('mean', 'f8'), ('all', object), ('p_val', object)])

In [75]:
for i_y, year in enumerate(tqdm(years)):
    vecs_y=vectors[vectors['year']==year]['vector']
    wiki_bench_y=wiki_bench[vectors['year']==year]['cos_sim']
    for i_c, csdg in enumerate(cSDGS_names):
        cos_sims_cSDGs[i_y*l_csdgs+i_c]['cSDG']=csdg
        cos_sims_cSDGs[i_y*l_csdgs+i_c]['all']=np.zeros(len(vecs_y))
        cos_sims_cSDGs[i_y*l_csdgs+i_c]['p_val']=np.zeros(len(vecs_y))
        for i_vec, vec in enumerate(vecs_y):
            _cos_sim=float(util.cos_sim(clustered_sdgs[i_c], vec))
            cos_sims_cSDGs[i_y*l_csdgs+i_c]['all'][i_vec]=_cos_sim
            cos_sims_cSDGs[i_y*l_csdgs+i_c]['p_val'][i_vec]=np.sum(wiki_bench_y[i_vec]>=_cos_sim)/len(wikivecs)
        cos_sims_cSDGs[i_y*l_csdgs+i_c]['mean']=np.mean(cos_sims_cSDGs[i_y*l_csdgs+i_c]['all'])

  0%|          | 0/9 [00:00<?, ?it/s]

In [76]:
for i_y, year in enumerate(tqdm(years)):
    print(f'\n{year:}')
    compa_y=vectors[vectors['year']==year]['company']
    sec_y=vectors[vectors['year']==year]['sector']
    for i_c, csdg in enumerate(cSDGS_names):
        _fdr_th=fdr_th(cos_sims_cSDGs[i_y*l_csdgs+i_c]['p_val'], alpha)
        validation_mask=cos_sims_cSDGs[i_y*l_csdgs+i_c]['p_val']<=_fdr_th
        validated_compa=compa_y[validation_mask]
        validated_sec=sec_y[validation_mask]
        if len(validated_compa)>0:
            print(f'\033[1m{csdg.capitalize():20}\033[0m len(validated_companies)={len(validated_compa):}')
            for i_vc, vc in enumerate(validated_compa):
                print(f'\t\033[1m{vc:25}\033[0m sector=\033[1m{sec_helper[validated_sec[i_vc]]:}\033[0m')
            print('\n')
        else:
            print(f'{csdg.capitalize():20} len(validated_companies)={len(validated_compa):}')

  0%|          | 0/9 [00:00<?, ?it/s]


2015
[1mSocial              [0m len(validated_companies)=3
	[1munilever                 [0m sector=[1mConsumer Staples[0m
	[1mstandard_chartered       [0m sector=[1mFinancials[0m
	[1mgsk                      [0m sector=[1mHealth Care[0m


Economic             len(validated_companies)=0
Environmental        len(validated_companies)=0

2016
[1mSocial              [0m len(validated_companies)=2
	[1munilever                 [0m sector=[1mConsumer Staples[0m
	[1mstandard_chartered       [0m sector=[1mFinancials[0m


[1mEconomic            [0m len(validated_companies)=2
	[1mstandard_chartered       [0m sector=[1mFinancials[0m
	[1mhalma                    [0m sector=[1mIndustrials[0m


Environmental        len(validated_companies)=0

2017
Social               len(validated_companies)=0
[1mEconomic            [0m len(validated_companies)=1
	[1mhalma                    [0m sector=[1mIndustrials[0m


Environmental        len(validated_companies)=0

2018
S

### Table for $\LaTeX$

In [77]:
_table=[]
for i_y, year in enumerate(tqdm(years)):
    print(f'\n{year:}')
    compa_y=vectors[vectors['year']==year]['company']
    sec_y=vectors[vectors['year']==year]['sector']
    for i_c, csdg in enumerate(cSDGS_names):
        _fdr_th=fdr_th(cos_sims_cSDGs[i_y*l_csdgs+i_c]['p_val'], alpha)
        validation_mask=cos_sims_cSDGs[i_y*l_csdgs+i_c]['p_val']<=_fdr_th
        validated_compa=compa_y[validation_mask]
        validated_sec=sec_y[validation_mask]
        if len(validated_compa)>0:
            print(f'\033[1m{csdg.capitalize():20}\033[0m len(validated_companies)={len(validated_compa):}')
            for i_vc, vc in enumerate(validated_compa):
                company_name_smooth=vc.split('_')
                company_name_smooth=[_.capitalize() for _ in company_name_smooth]
                company_name_smooth=' '.join(company_name_smooth)
                if i_vc==0 and i_c==0:
                    _table.append([year, csdg.capitalize(), company_name_smooth, sec_helper[validated_sec[i_vc]]])
                elif i_vc==0:
                    _table.append(['', csdg.capitalize(), company_name_smooth, sec_helper[validated_sec[i_vc]]])
                else:
                    _table.append(['', '', company_name_smooth, sec_helper[validated_sec[i_vc]]])
        elif i_c==0:
            _table.append([year, csdg.capitalize(), '', ''])
        else:
            _table.append(['', csdg.capitalize(), '', ''])

  0%|          | 0/9 [00:00<?, ?it/s]


2015
[1mSocial              [0m len(validated_companies)=3

2016
[1mSocial              [0m len(validated_companies)=2
[1mEconomic            [0m len(validated_companies)=2

2017
[1mEconomic            [0m len(validated_companies)=1

2018

2019

2020
[1mSocial              [0m len(validated_companies)=6

2021
[1mSocial              [0m len(validated_companies)=2
[1mEconomic            [0m len(validated_companies)=1
[1mEnvironmental       [0m len(validated_companies)=3

2022
[1mEnvironmental       [0m len(validated_companies)=6

2023
[1mEnvironmental       [0m len(validated_companies)=8


In [78]:
header=['Year',"SDGs' dimension",'Firm', 'Sector']

In [79]:
print(tabulate(_table, header, tablefmt="latex"))

\begin{tabular}{llll}
\hline
 Year   & SDGs' dimension   & Firm                          & Sector                 \\
\hline
 2015   & Social            & Unilever                      & Consumer Staples       \\
        &                   & Standard Chartered            & Financials             \\
        &                   & Gsk                           & Health Care            \\
        & Economic          &                               &                        \\
        & Environmental     &                               &                        \\
 2016   & Social            & Unilever                      & Consumer Staples       \\
        &                   & Standard Chartered            & Financials             \\
        & Economic          & Standard Chartered            & Financials             \\
        &                   & Halma                         & Industrials            \\
        & Environmental     &                               &                       

## Clustered SDGs, per sector

### Cosine similarity validation

In [80]:
cos_sims_cSDGs_sec=np.zeros(len(years)*len(cSDGS_names), dtype=[('cSDG', 'U20'), ('all', object), ('p_val', object)])

In [81]:
for i_y, year in enumerate(tqdm(years)):
    vecs_y=aggr_vectors[aggr_vectors['year']==year]['vector']
    wiki_bench_y=wiki_bench_per_sector[aggr_vectors['year']==year]['cos_sim']
    for i_c, csdg in enumerate(cSDGS_names):
        cos_sims_cSDGs_sec[i_y*l_csdgs+i_c]['cSDG']=csdg
        cos_sims_cSDGs_sec[i_y*l_csdgs+i_c]['all']=np.zeros(len(vecs_y))
        cos_sims_cSDGs_sec[i_y*l_csdgs+i_c]['p_val']=np.zeros(len(vecs_y))
        for i_vec, vec in enumerate(vecs_y):
            _cos_sim=float(util.cos_sim(clustered_sdgs[i_c], vec))
            cos_sims_cSDGs_sec[i_y*l_csdgs+i_c]['all'][i_vec]=_cos_sim
            cos_sims_cSDGs_sec[i_y*l_csdgs+i_c]['p_val'][i_vec]=np.sum(wiki_bench_y[i_vec]>=_cos_sim)/len(wikivecs)

  0%|          | 0/9 [00:00<?, ?it/s]

In [82]:
for i_y, year in enumerate(tqdm(years)):
    print(f'\n{year:}')
    sec_y=aggr_vectors[aggr_vectors['year']==year]['sector']
    for i_c, csdg in enumerate(cSDGS_names):
        _fdr_th=fdr_th(cos_sims_cSDGs_sec[i_y*l_csdgs+i_c]['p_val'], alpha)
        validation_mask=cos_sims_cSDGs_sec[i_y*l_csdgs+i_c]['p_val']<=_fdr_th
        validated_sec=sec_y[validation_mask]
        if len(validated_sec)>0:
            print(f'\033[1m{csdg.capitalize():20}')
            for i_vs, vs in enumerate(validated_sec):
                print(f'\t\033[1m{vs:25}\033[0m sector=\033[1m{sec_helper[vs]:}\033[0m')
            print('\n')
        else:
            print(f'{csdg.capitalize():20} len(validated_sec)={len(validated_sec):}')

  0%|          | 0/9 [00:00<?, ?it/s]


2015
[1mSocial              
	[1m05                       [0m sector=[1mFinancials[0m
	[1m06                       [0m sector=[1mHealth Care[0m


Economic             len(validated_sec)=0
Environmental        len(validated_sec)=0

2016
Social               len(validated_sec)=0
Economic             len(validated_sec)=0
Environmental        len(validated_sec)=0

2017
Social               len(validated_sec)=0
Economic             len(validated_sec)=0
Environmental        len(validated_sec)=0

2018
Social               len(validated_sec)=0
[1mEconomic            
	[1m05                       [0m sector=[1mFinancials[0m


Environmental        len(validated_sec)=0

2019
Social               len(validated_sec)=0
Economic             len(validated_sec)=0
Environmental        len(validated_sec)=0

2020
[1mSocial              
	[1m02                       [0m sector=[1mConsumer Discretionary[0m
	[1m05                       [0m sector=[1mFinancials[0m
	[1m06              

### Table for $\LaTeX$

In [83]:
_table=[]
for i_y, year in enumerate(tqdm(years)):
    print(f'\n{year:}')
    compa_y=vectors[vectors['year']==year]['company']
    sec_y=vectors[vectors['year']==year]['sector']
    for i_c, csdg in enumerate(cSDGS_names):
        _fdr_th=fdr_th(cos_sims_cSDGs[i_y*l_csdgs+i_c]['p_val'], alpha)
        validation_mask=cos_sims_cSDGs[i_y*l_csdgs+i_c]['p_val']<=_fdr_th
        validated_compa=compa_y[validation_mask]
        validated_sec=sec_y[validation_mask]
        if len(validated_compa)>0:
            print(f'\033[1m{csdg.capitalize():20}\033[0m len(validated_companies)={len(validated_compa):}')
            for i_vc, vc in enumerate(validated_compa):
                company_name_smooth=vc.split('_')
                company_name_smooth=[_.capitalize() for _ in company_name_smooth]
                company_name_smooth=' '.join(company_name_smooth)
                if i_vc==0 and i_c==0:
                    _table.append([year, csdg.capitalize(), company_name_smooth, sec_helper[validated_sec[i_vc]]])
                elif i_vc==0:
                    _table.append(['', csdg.capitalize(), company_name_smooth, sec_helper[validated_sec[i_vc]]])
                else:
                    _table.append(['', '', company_name_smooth, sec_helper[validated_sec[i_vc]]])
        elif i_c==0:
            _table.append([year, csdg.capitalize(), '', ''])
        else:
            _table.append(['', csdg.capitalize(), '', ''])

  0%|          | 0/9 [00:00<?, ?it/s]


2015
[1mSocial              [0m len(validated_companies)=3

2016
[1mSocial              [0m len(validated_companies)=2
[1mEconomic            [0m len(validated_companies)=2

2017
[1mEconomic            [0m len(validated_companies)=1

2018

2019

2020
[1mSocial              [0m len(validated_companies)=6

2021
[1mSocial              [0m len(validated_companies)=2
[1mEconomic            [0m len(validated_companies)=1
[1mEnvironmental       [0m len(validated_companies)=3

2022
[1mEnvironmental       [0m len(validated_companies)=6

2023
[1mEnvironmental       [0m len(validated_companies)=8


In [84]:
header=['Year',"SDGs' dimension",'Firm', 'Sector']

In [85]:
print(tabulate(_table, header, tablefmt="latex"))

\begin{tabular}{llll}
\hline
 Year   & SDGs' dimension   & Firm                          & Sector                 \\
\hline
 2015   & Social            & Unilever                      & Consumer Staples       \\
        &                   & Standard Chartered            & Financials             \\
        &                   & Gsk                           & Health Care            \\
        & Economic          &                               &                        \\
        & Environmental     &                               &                        \\
 2016   & Social            & Unilever                      & Consumer Staples       \\
        &                   & Standard Chartered            & Financials             \\
        & Economic          & Standard Chartered            & Financials             \\
        &                   & Halma                         & Industrials            \\
        & Environmental     &                               &                       

## Each SDG

In [86]:
l_sdgs=len(sdg_helper.keys())

In [87]:
cos_sims_SDGs=np.zeros(len(years)*l_sdgs, dtype=[('SDG', 'U20'), ('mean', 'f8'), ('all', object), ('p_val', object)])

In [88]:
for i_y, year in enumerate(tqdm(years)):
    vecs_y=vectors[vectors['year']==year]['vector']
    wiki_bench_y=wiki_bench[vectors['year']==year]['cos_sim']
    for i_s, sdg in enumerate(sdg_helper.keys()):
        cos_sims_SDGs[i_y*l_sdgs+i_s]['SDG']=sdg
        cos_sims_SDGs[i_y*l_sdgs+i_s]['all']=np.zeros(len(vecs_y))
        cos_sims_SDGs[i_y*l_sdgs+i_s]['p_val']=np.zeros(len(vecs_y))
        for i_vec, vec in enumerate(vecs_y):
            _cos_sim=float(util.cos_sim(goalvecs[i_s], vec))
            cos_sims_SDGs[i_y*l_sdgs+i_s]['all'][i_vec]=_cos_sim
            cos_sims_SDGs[i_y*l_sdgs+i_s]['p_val'][i_vec]=np.sum(wiki_bench_y[i_vec]>=_cos_sim)/len(wikivecs)
        cos_sims_SDGs[i_y*l_sdgs+i_s]['mean']=np.mean(cos_sims_SDGs[i_y*l_sdgs+i_s]['all'])

  0%|          | 0/9 [00:00<?, ?it/s]

In [89]:
for i_y, year in enumerate(tqdm(years)):
    print(f'\n\033[1m{year:}\033[0m')
    compa_y=vectors[vectors['year']==year]['company']
    sec_y=vectors[vectors['year']==year]['sector']
    for i_s, sdg in enumerate(sdg_helper.keys()):
        _fdr_th=fdr_th(cos_sims_SDGs[i_y*l_sdgs+i_s]['p_val'], alpha)
        validation_mask=cos_sims_SDGs[i_y*l_sdgs+i_s]['p_val']<=_fdr_th
        validated_compa=compa_y[validation_mask]
        validated_sec=sec_y[validation_mask]
        if len(validated_compa)>0:
            print(f'\033[1m{sdg:}: {sdg_helper[sdg]:60}\033[0m len(validated_companies)={len(validated_compa):}')
            for i_vc, vc in enumerate(validated_compa):
                print(f'\t\033[1m{vc:25}\033[0m sector=\033[1m{sec_helper[validated_sec[i_vc]]:}\033[0m')
            print('\n')
        else:
            print(f'{sdg:}: {sdg_helper[sdg]:60} len(validated_companies)={len(validated_compa):}')

  0%|          | 0/9 [00:00<?, ?it/s]


[1m2015[0m
[1m01: No Poverty                                                  [0m len(validated_companies)=2
	[1munilever                 [0m sector=[1mConsumer Staples[0m
	[1mstandard_chartered       [0m sector=[1mFinancials[0m


[1m02: Zero hunger                                                 [0m len(validated_companies)=1
	[1munilever                 [0m sector=[1mConsumer Staples[0m


[1m03: Good health and well-being                                  [0m len(validated_companies)=2
	[1mastrazeneca              [0m sector=[1mHealth Care[0m
	[1mgsk                      [0m sector=[1mHealth Care[0m


04: Quality education                                            len(validated_companies)=0
[1m05: Gender equality                                             [0m len(validated_companies)=1
	[1munilever                 [0m sector=[1mConsumer Staples[0m


[1m06: Clean water and sanitation                                  [0m len(validated_companies)=3
