# Intro

## Standard modules

In [1]:
import os, sys
import pathlib
import requests

In [58]:
import pickle

In [2]:
import numpy as np

In [3]:
from tqdm.auto import tqdm, trange

In [4]:
from sentence_transformers import util

In [5]:
import matplotlib.pyplot as plt
# Global matplotlib look used by every figure in this notebook: thick axes
# frame, long/thick major ticks with 14-pt labels, shorter/thinner minor ticks.
plt.rcParams.update({
    # axes frame
    'axes.linewidth': 2,
    # major ticks
    'xtick.major.size': 10,
    'xtick.major.width': 2,
    'ytick.major.size': 10,
    'ytick.major.width': 2,
    # tick labels
    'xtick.labelsize': 14,
    'ytick.labelsize': 14,
    # minor ticks
    'xtick.minor.size': 5,
    'xtick.minor.width': 1,
    'ytick.minor.size': 5,
    'ytick.minor.width': 1,
})

In [6]:
from matplotlib.ticker import ScalarFormatter # to put the ticks in scientific notation

## Personal modules

In [7]:
from toolbox.updater import time_is_now

## Folders

In [18]:
# Relative path (from the notebook's working directory) of the `vectors/` data folder.
VECTOR_FOLDER = './NewProcessedData/vectors/'

In [19]:
# Relative path (from the notebook's working directory) of the `benchmarks/` data folder.
BENCHMARK_FOLDER = './NewProcessedData/benchmarks/'

In [20]:
# Relative path of the `abtt5_vectors/` data folder.
# NOTE(review): the constant is named PCA_CENTR but the folder is called
# `abtt5_vectors` -- presumably the same post-processing; confirm.
PCA_CENTR_VECTOR_FOLDER = './NewProcessedData/abtt5_vectors/'

In [21]:
# Relative path (from the notebook's working directory) of the `abtt1_vectors/` data folder.
ABTT1_VECTOR_FOLDER = './NewProcessedData/abtt1_vectors/'

In [22]:
# Relative path (from the notebook's working directory) of the `wikipedia/vectors/` data folder.
WIKI_FOLDER = './NewProcessedData/wikipedia/vectors/'

In [23]:
# Folder holding the report text files; it is listed to build `textfiles` and
# its files are opened when the concreteness scores are computed.
TEXT_FOLDER = './NewProcessedData/texts/'

## Reports' texts

In [24]:
# Names of all entries in TEXT_FOLDER (os.listdir returns them in arbitrary order).
textfiles=os.listdir(TEXT_FOLDER)

In [25]:
# Put the file names in lexicographic order.
textfiles = sorted(textfiles)

In [26]:
# Files from 2024 are unreliable: they probably refer to a different
# accounting year, due to differences with other countries.
len(textfiles)

575

In [27]:
# Keep only the reports whose file name does not start with '2024'.
textfiles = [name for name in textfiles if not name.startswith('2024')]

In [28]:
len(textfiles)

573

## FDR

In [19]:
def fdr(pvals, alpha):
    """Return the Benjamini-Hochberg style significance threshold.

    The p-values are sorted in ascending order and the k-th smallest one is
    compared with the level ``k / m * alpha`` (``m`` = number of p-values).

    Parameters
    ----------
    pvals : sequence or array of float
        The p-values of the tests.
    alpha : float
        The target false discovery rate.

    Returns
    -------
    The level ``k / m * alpha`` of the largest rank ``k`` whose sorted
    p-value does not exceed it, or ``0`` when no p-value passes.
    """
    n_tests = len(pvals)
    ranked = np.sort(pvals)
    # Level associated with each rank 1..m.
    levels = np.arange(1, n_tests + 1) / n_tests * alpha
    passing = levels[ranked <= levels]
    if passing.size == 0:
        return 0
    # The last passing level belongs to the highest passing rank.
    return passing[-1]

## Sector helper

In [20]:
# Sorted names of the entries found in the './rtf' folder.
rtf_files = sorted(os.listdir('./rtf'))

In [21]:
# Build {zero-padded sector code: sector name} from file names of the form
# '<code>_<sector name>_...'. The first file seen for a code sets its name.
sec_helper={}
for file in rtf_files:
    splitted_name=file.split('_')
    # Skip names that do not start with a numeric code or lack a second part.
    if not splitted_name[0].isnumeric() or len(splitted_name)<2:
        continue
    sec_code=splitted_name[0].zfill(2)
    # Test the padded code: keys are stored padded, so checking the raw prefix
    # (e.g. '1' against the key '01') would never match and every later file
    # would silently overwrite the entry.
    if sec_code not in sec_helper:
        sec_helper[sec_code]=splitted_name[1]

In [22]:
sec_helper

{'10': 'Telecommunications',
 '11': 'Utilities',
 '01': 'Basic Materials',
 '02': 'Consumer Discretionary',
 '03': 'Consumer Staples',
 '04': 'Energy',
 '05': 'Financials',
 '06': 'Health Care',
 '07': 'Industrials',
 '08': 'Real Estate',
 '09': 'Technology'}

In [23]:
# Sector codes, i.e. the keys of sec_helper.
secs = list(sec_helper)

In [24]:
# Order the sector codes; they are zero-padded strings, so lexicographic
# order coincides with numeric order.
secs=sorted(secs)

In [25]:
 sec_helper=dict(sorted(sec_helper.items()))

In [26]:
sec_helper

{'01': 'Basic Materials',
 '02': 'Consumer Discretionary',
 '03': 'Consumer Staples',
 '04': 'Energy',
 '05': 'Financials',
 '06': 'Health Care',
 '07': 'Industrials',
 '08': 'Real Estate',
 '09': 'Technology',
 '10': 'Telecommunications',
 '11': 'Utilities'}

### Aggregated sectors

In [27]:
# Aggregated sector groups, keyed 'A'-'D'. Each entry carries a display name
# and the set of two-digit sector codes that belong to the group.
agg_secs = {
    'A': {
        'name': 'Consumer Goods',
        'sectors': {'02', '03'},
    },
    'B': {
        'name': 'Essential Infrastructure & Services',
        'sectors': {'08', '10', '11'},
    },
    'C': {
        'name': 'Production Sectors',
        'sectors': {'01', '04', '07'},
    },
    'D': {
        'name': 'Specialized Services',
        'sectors': {'05', '06', '09'},
    },
}

In [28]:
# Two-digit sector code -> key ('A'-'D') of the aggregated group it belongs to.
sec2agg_d = {
    '01': 'C',
    '02': 'A',
    '03': 'A',
    '04': 'C',
    '05': 'D',
    '06': 'D',
    '07': 'C',
    '08': 'B',
    '09': 'D',
    '10': 'B',
    '11': 'B',
}

## SDG helper

In [29]:
# Zero-padded goal number ('01'-'17') -> goal title. The titles are listed in
# goal order and the keys are derived from their position.
sdg_helper = {
    f'{number:02d}': title
    for number, title in enumerate(
        (
            'No Poverty',
            'Zero hunger',
            'Good health and well-being',
            'Quality education',
            'Gender equality',
            'Clean water and sanitation',
            'Affordable and clean energy',
            'Decent work and economic growth',
            'Industry, Innovation, Technology and Infrastructure',
            'Reduced inequality',
            'Sustainable cities and communities',
            'Responsible consumption and production',
            'Climate action',
            'Life below water',
            'Life on land',
            'Peace, justice and strong institutions',
            'Partnerships for the goals',
        ),
        start=1,
    )
}

## Plot standards for sectors

### Disaggregated sectors

In [30]:
_colors=['navy', 'cyan', 'darkturquoise', 'orange', 'magenta', 'darkorchid', 'darkred', 'tomato', 'crimson', 'cadetblue', 'slateblue']

In [31]:
_lss=['-', '--', ':', '-.']

In [32]:
_markers=['^', 's', 'p', 'H', '8', 'X', 'D', '*', 'o']

In [33]:
# Plot style (colour, line style, marker) for each sector code in `secs`.
# Every palette is indexed modulo its own length, so the lookup cannot run
# past the end of a list whatever the number of sectors; palettes shorter
# than `secs` simply cycle.
sec_plots_feat={
    sec: {
        'color': _colors[i_s % len(_colors)],
        'ls': _lss[i_s % len(_lss)],
        'marker': _markers[i_s % len(_markers)],
    }
    for i_s, sec in enumerate(secs)
}

In [34]:
sec_plots_feat

{'01': {'color': 'navy', 'ls': '-', 'marker': '^'},
 '02': {'color': 'cyan', 'ls': '--', 'marker': 's'},
 '03': {'color': 'darkturquoise', 'ls': ':', 'marker': 'p'},
 '04': {'color': 'orange', 'ls': '-.', 'marker': 'H'},
 '05': {'color': 'magenta', 'ls': '-', 'marker': '8'},
 '06': {'color': 'darkorchid', 'ls': '--', 'marker': 'X'},
 '07': {'color': 'darkred', 'ls': ':', 'marker': 'D'},
 '08': {'color': 'tomato', 'ls': '-.', 'marker': '*'},
 '09': {'color': 'crimson', 'ls': '-', 'marker': 'o'},
 '10': {'color': 'cadetblue', 'ls': '--', 'marker': '^'},
 '11': {'color': 'slateblue', 'ls': ':', 'marker': 's'}}

### Aggregated sectors

In [35]:
# Plot style (colour, marker, line style) for each aggregated sector group.
agg_sec_plots = {
    'A': {
        'color': 'navy',
        'marker': 'o',
        'ls': '-',
    },
    'B': {
        'color': 'darkcyan',
        'marker': '*',
        'ls': '--',
    },
    'C': {
        'color': 'magenta',
        'marker': 'D',
        'ls': '-.',
    },
    'D': {
        'color': 'orange',
        'marker': 's',
        'ls': ':',
    },
}

# Concreteness: a homemade module

Actually, a GPT-made module

In [8]:
from concrete_doc import load_model, text_concreteness, document_concreteness, word_concreteness

  return torch._C._cuda_getDeviceCount() if nvml_count < 0 else nvml_count


## Test

In [9]:
# 1. Load the model with the Brysbaert dictionary
DICT_PATH='./concrete_doc/Brysbaert_et_al_2024.xlsx'
model = load_model(DICT_PATH)

README.md: 0.00B [00:00, ?B/s]

In [11]:
# 2. Singola parola
print(word_concreteness("dog"))       # ~4.99
print(word_concreteness("freedom"))   # ~1.07

4.99332785045438
1.076624942544107


In [12]:
# 3. Frase
print(text_concreteness("The dog chased the ball in the garden."))
print(text_concreteness("Freedom and justice are essential values."))

2.723169907517171
1.1669834035260722


In [14]:
# 4. Lista di testi
docs = [
    "The dog chased the ball in the garden.",
    "Freedom and justice are essential values."
]
print(document_concreteness(docs))

[2.723169907517171, 1.1669834035260722]


In [17]:
# 5. Più frasi
print(text_concreteness("The dog chased the ball in the garden. Its (i.e. the ball) freedom and justice are essential values to me."))

2.0420061233854163


## Let's check it on the entire dataset...

Ok, let's assume it works...

In [59]:
# One record per report file: year, sector code, company name and concreteness
# score. The string fields are fixed-width (2 and 50 characters); numpy
# truncates longer values on assignment.
output = np.zeros(
    len(textfiles),
    dtype=[
        ('year', int),
        ('sector', 'U2'),
        ('company', 'U50'),
        ('concreteness', float),
    ],
)

In [None]:
# Fill one record of `output` per report. File names are parsed as
# '<year>_<sector>_<company>_text.txt'.
for i_tf, textfile in enumerate(tqdm(textfiles)):
    # get info from the file name
    textfile_splitted=textfile.split('_')
    year=textfile_splitted[0]
    sector=textfile_splitted[1]
    # what is left after removing the '<year>_<sector>_' part and the suffix
    company=textfile.replace(year+'_'+sector+'_', '').replace('_text.txt', '')
    output[i_tf]['year']=int(year)  # explicit conversion for the integer field
    output[i_tf]['sector']=sector
    output[i_tf]['company']=company
    # Read the whole file: readline() would return only the first line, so a
    # multi-line report would be scored on a fragment. The encoding is pinned
    # so that decoding does not depend on the platform default.
    with open(TEXT_FOLDER+textfile, 'r', encoding='utf-8') as f:
        _text=f.read()
    output[i_tf]['concreteness']=text_concreteness(_text)

  0%|          | 0/573 [00:00<?, ?it/s]

In [None]:
# Persist the structured array `output` with pickle (binary mode is required).
# Only unpickle this file from a trusted location: pickle.load can execute
# arbitrary code.
with open('./NewProcessedData/concreteness_reports.pickle', 'wb') as f:
    pickle.dump(output, f)

In [None]:
print('Done!')