# Intro

## Standard modules

In [1]:
import os, sys, json
import numpy as np

In [2]:
from tqdm.auto import tqdm, trange

In [3]:
from bicm import BipartiteGraph as BiG

## Hand-made modules

In [4]:
from melt import melt

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/sarawalk/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/sarawalk/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Folders

In [5]:
TEXT_FOLDER='./NewProcessedData/texts/'

## Files

In [6]:
text_files=[file for file in os.listdir(TEXT_FOLDER) if file.endswith('.txt')]

In [7]:
text_files.sort()

In [8]:
text_files[0][:4]

'2015'

# Grab all texts

In [10]:
all_texts={}
for text_file in tqdm(text_files):
    year=text_file[:4]
    with open(TEXT_FOLDER+text_file, 'r') as f:
        _text=f.readlines()
    
    if len(_text)>1:
        _text=' '.join(_text)
    elif len(_text)==1:
        _text=_text[0]
    else:
        print(text_file)
    
    if len(_text)>0:
        if year not in all_texts.keys():
            all_texts[year]={}
            all_texts[year]['firms']=[]
            all_texts[year]['texts']=[]
        all_texts[year]['texts'].append(_text)
        all_texts[year]['firms'].append(text_file[8:-9])

  0%|          | 0/574 [00:00<?, ?it/s]

To be checked

# Melt me!

## Binary

In [11]:
all_texts.keys()

dict_keys(['2015', '2016', '2017', '2018', '2019', '2020', '2021', '2022', '2023', '2024'])

In [12]:
[(key, len(all_texts[key]['firms'])) for key in all_texts.keys()]

[('2015', 40),
 ('2016', 44),
 ('2017', 51),
 ('2018', 56),
 ('2019', 67),
 ('2020', 69),
 ('2021', 76),
 ('2022', 80),
 ('2023', 89),
 ('2024', 2)]

**BINARY**

In [13]:
cacca=melt(all_texts['2023']['texts'], binary=True)

  0%|          | 0/89 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [13]:
cacca.get_projection(rows=True, alpha=0.05, approx_method='poisson', threads_num=4, progress_bar=True)


                       of the opposite layer. This may cause some convergence issues.
                      Please use the full mode providing a biadjacency matrix or an edgelist,
                       or clean your data from these nodes. 
                      


  step_fun = args[0]
  arg_step_fun = args[1]


max rows error = 4.547473508864641e-12
max columns error = 9.947598300641403e-14
total error = 5.441136430306415e-11
Solver converged.


  probs = node_xy * neighbor_xy / ((1 + node_xy) * (1 + neighbor_xy))
  probs = node_xy * neighbor_xy / ((1 + node_xy) * (1 + neighbor_xy))
  probs = node_xy * neighbor_xy / ((1 + node_xy) * (1 + neighbor_xy))
  probs = node_xy * neighbor_xy / ((1 + node_xy) * (1 + neighbor_xy))
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 89/89 [00:09<00:00,  9.79it/s]


No V-motifs will be validated. Try increasing alpha


AttributeError: 'BipartiteGraph' object has no attribute 'cols_projection'

In [14]:
cacca.get_projection(rows=False, alpha=0.05, approx_method='poisson', threads_num=4, progress_bar=True)

  probs = node_xy * neighbor_xy / ((1 + node_xy) * (1 + neighbor_xy))
  probs = node_xy * neighbor_xy / ((1 + node_xy) * (1 + neighbor_xy))
  probs = node_xy * neighbor_xy / ((1 + node_xy) * (1 + neighbor_xy))
  probs = node_xy * neighbor_xy / ((1 + node_xy) * (1 + neighbor_xy))
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 31060/31060 [1:33:21<00:00,  5.55it/s]


No V-motifs will be validated. Try increasing alpha


AttributeError: 'BipartiteGraph' object has no attribute 'cols_projection'

In [None]:
cacca.token_proj

Not only it doesn't work, but it also takes a lot of time.

## Weighted

bicm reads only weighted biadjacency matrices, therefore...

### Function

In [144]:
# biadjacency list to biadjacency matrix
def bili2bima(bili):
    # get the number of different tokens
    all_tokens=[]
    for key in tqdm(bili.keys(), leave=True):
        for _token in bili[key].keys():
            if _token not in all_tokens:
                all_tokens.append(_token)
    # transform all_tokens to a numpy array, such that I can use np.unique safely
    all_tokens=np.array(all_tokens)
    # define the biadjacency matrix 
    bima=np.zeros((len(bili.keys()), len(all_tokens)), dtype=int)
    for key in tqdm(bili.keys(), leave=True):
        for _token in bili[key].keys():
            where_token=np.where(all_tokens==_token)[0][0]
            bima[key, where_token]+=bili[key][token]
    return bima, all_tokens

### Debug

In [24]:
len(all_texts['2023']['texts'])

89

In [14]:
cacca=melt(all_texts['2023']['texts'], binary=False)

  0%|          | 0/89 [00:00<?, ?it/s]

In [162]:
len(cacca.biadj_list)

89

In [29]:
aux, aux_at=bili2bima(cacca.biadj_list)

  0%|          | 0/89 [00:00<?, ?it/s]

  0%|          | 0/89 [00:00<?, ?it/s]

IndexError: index 0 is out of bounds for axis 0 with size 0

In [30]:
bili=cacca.biadj_list.copy()

In [101]:
bad_char=['-', '©', '–', '‘', '’', '“', '”', "''", "'s",'``', '\\', '|']

In [110]:
    all_tokens=[]
    for key in tqdm(bili.keys(), leave=True):
        for _token in bili[key].keys():
            if _token not in all_tokens and all([bd not in _token for bd in bad_char]) and _token[0] not in ["'", "\\", "/", '+', "^^", ":", '£', '$'] and not _token[0].isnumeric():
                all_tokens.append(_token)
    all_tokens=np.array(all_tokens)

  0%|          | 0/89 [00:00<?, ?it/s]

In [111]:
    bima=np.zeros((len(bili.keys()), len(all_tokens)), dtype=int)
    for key in tqdm(bili.keys(), leave=True):
        for _token in bili[key].keys():
            if _token in all_tokens:
                where_token=np.where(all_tokens==_token)[0][0]
                bima[key, where_token]+=bili[key][_token]
    

  0%|          | 0/89 [00:00<?, ?it/s]

In [113]:
cacca=BiG()

In [114]:
cacca.set_biadjacency_matrix(bima)

Discrete weighted model: BiWCM_d


In [115]:
cacca.solve_tool()

  step_fun = args[0]
  arg_step_fun = args[1]


max rows error = 0.00010472287903845316
max columns error = 7.311720381864007e-06
total error = 0.00773124083942919
Solver converged.


In [116]:
cacca.compute_weighted_pvals_mat()

In [117]:
cacca.pvals_mat

array([[1.45918023e-01, 4.66682797e-05, 3.74940711e-02, ...,
        1.00000000e+00, 1.00000000e+00, 1.00000000e+00],
       [1.00000000e+00, 1.00000000e+00, 1.00000000e+00, ...,
        1.00000000e+00, 1.00000000e+00, 1.00000000e+00],
       [1.00000000e+00, 2.41590737e-03, 3.69701886e-02, ...,
        1.00000000e+00, 1.00000000e+00, 1.00000000e+00],
       ...,
       [1.00000000e+00, 1.00000000e+00, 1.00000000e+00, ...,
        1.00000000e+00, 1.00000000e+00, 1.00000000e+00],
       [1.00000000e+00, 1.00000000e+00, 1.00000000e+00, ...,
        1.00000000e+00, 1.00000000e+00, 1.00000000e+00],
       [1.00000000e+00, 1.00000000e+00, 1.00000000e+00, ...,
        1.18151329e-02, 5.45686665e-04, 1.18151329e-02]])

In [119]:
pval_m=cacca.get_validated_matrix(significance=0.05, validation_method='fdr')

In [126]:
for i in range(len(pval_m)):
    print(all_tokens[np.where(pval_m[i]==1)[0]], all_texts['2023']['firms'][i])

['africa' 'alto' 'american' 'anglo' 'barro' 'beer' 'berlin' 'botswana'
 'bronc' 'capcoal' 'carcinogen' 'chile' 'closur' 'copper' 'cpr' 'crd'
 'cybersecur' 'dam' 'dewat' 'diamond' 'edna' 'el' 'envusa' 'fcev'
 'futuresmart' 'gbv' 'gistm' 'hds' 'heritag' 'hiv' 'host' 'ibi' 'icmm'
 'irma' 'iron' 'kolomela' 'kumba' 'learn+' 'learner' 'los' 'mina' 'mine'
 'miner' 'mining™' 'mitsubishi' 'mogalakwena' 'moquegua' 'mt' 'nickel'
 'nois' 'npi' 'nutrient' 'oel' 'ore' 'peru' 'pgms' 'pionero' 'pmlu'
 'polyhalit' 'psm' 'queensland' 'quellaveco' 'rehabilit' 'rescu' 'resettl'
 'rjc' 'rock' 'rvms' 'sishen' 'slp' 'smp' 'soldado' 'south' 'southern'
 'spatial' 'steelmak' 'tail' 'teacher' 'tsfs' 'underground' 'unki'
 'valutrax™' 'vam' 'vfl' 'violenc' 'woodsmith' 'yourvoic' 'zimbabw'
 'zimel'] ANGLO_AMERICAN_PLC
['impact' 'ingredi' 'sustain' 'croda'] CRODA_INTERNATIONAL_PLC
['endeavour' 'gold' 'hectar' 'malaria' 'mine' 'pit' 'reforest' 'rehabilit'
 'resettl' 'tsf' 'villag' 'asgm' 'boungou' 'burkina' 'cyanid' 

In [124]:
all_texts['2023']['firms'][0]

'ANGLO_AMERICAN_PLC'

In [128]:
np.where(all_tokens=='sdg')

(array([2737]),)

In [133]:
all_tokens[2737:2740]

array(['sdg', 'sdg16', 'sdgs'], dtype='<U90')

In [138]:
np.sum(pval_m, axis=0)[2737:2740]

array([1, 0, 0], dtype=uint64)

In [141]:
all_texts['2023']['firms'][np.where(pval_m[:, 2737]==1)[0][0]]

'RELX_PLC'

In [143]:
np.any(np.sum(pval_m, axis=1)==0)

False

### Maximum Entropy TOpic DEtection (METODE)

In [19]:
import string

import nltk
nltk.download('stopwords')
nltk.download('punkt')

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from nltk.stem.snowball import SnowballStemmer

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/sarawalk/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/sarawalk/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [20]:
class metode:
    
    
    def __init__(self, texts, row_names=None, alpha=0.01, lang=None):
        # biadjacency list
        self.texts=texts
        # row names
        if row_names is not None:
            self.row_names=row_names
        else:
            self.row_names=np.arange(len(row_names))
        # significance threshold
        self.alpha=alpha
        assert alpha<1 and alpha>0
        # language
        if lang is None:
            self.lang="english"
        else:
            # check that english is among the accepted languages by nltk
            self.lang=lang
        # get the stemmer
        self.stemmer = SnowballStemmer(self.lang, ignore_stopwords=True)
        
        
        # get the biadjacency list
        self.get_bili()
        # get the biadjacency matrix to feed bicm
        self.get_all_tokens()
        self.bili2bima()
    
    def _tests(self, entry):
        stop_words = list(stopwords.words(self.lang))
        bad_char=['©', '–', '‘', '’', '“', '”', "''", "'s",'``']
        super_bad_char=["'", "\\", "/", '+', "^^", ":", '£', '$']
        # I am removing:
        # - stop words;
        # - punctuation
        # - fractional numbers
        _test_0=not (entry in stop_words)
        _test_1=not (entry in bad_char)
        _test_2=not (entry in string.punctuation)
        _test_3=not ('.' in entry)
        _test_4=not (',' in entry)
        _test_5=not entry.isnumeric()
        #_test_5=not entry[0].isnumeric()
        #_test_6=not (entry[0] in super_bad_char)
        return _test_0 and _test_1 and _test_2 and _test_3 and _test_4 and _test_5# and _test_6
    
    def get_bili(self):
        self.bili={}
        for i in trange(len(self.texts), leave=True, desc='get biadjacency list'):
            self.bili[self.row_names[i]]=self.text2tokens(self.texts[i])

    
    def text2tokens(self, text):
        word_tokens = [wt.lower() for wt in word_tokenize(text)]
        out=[]
        for w in word_tokens:
            if self._tests(w):
                out.append(self.stemmer.stem(w))
        # calculate the multiplicity of entries in out
        out=np.array(out)
        aux=np.unique(out, return_counts=True)
        return dict(zip(aux[0], aux[1]))
    
    def get_all_tokens(self):
        self.all_tokens=[]
        for key in tqdm(self.bili.keys(), leave=True, desc='get all tokens'):
            for _token in self.bili[key].keys():
                self.all_tokens.append(_token)
        self.all_tokens=np.array(all_tokens)

    def bili2bima(self):
        ''' 
        biadjacency list to biadjacency matrix
        '''
        self.bima=np.zeros((len(self.bili.keys()), len(self.all_tokens)), dtype=int)
        for key in tqdm(self.bili.keys(), leave=True):
            for _token in self.bili[key].keys():
                where_token=np.where(self.all_tokens==_token)[0][0]
                self.bima[key, where_token]+=self.bili[key][token]
                
    def metode(self):
        '''
        Maximum Entropy TOpic DEtection
        '''
        # Bipartite Graph of bicm
        self.mygraph=BiG()
        # initialize it with the bipartite matrix, 
        # i.e. the only way to initialize the method for bipartite weighted graphs
        self.mygraph.set_biadjacency_matrix(self.bima)
        # solve BiWCM
        self.mygraph.solve_tool()
        # calculate p-values
        self.mygraph.compute_weighted_pvals_mat()
        self.pval_m=cacca.get_validated_matrix(significance=self.alpha, validation_method='fdr')

In [21]:
debug=metode(all_texts['2023']['texts'], all_texts['2023']['firms'], 0.05)

get biadjacency list:   0%|          | 0/89 [00:00<?, ?it/s]

KeyboardInterrupt: 

### Check

In [15]:
from metode import metode as mymetode

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/sarawalk/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/sarawalk/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [16]:
debug0=mymetode(all_texts['2023']['texts'], all_texts['2023']['firms'], 0.05)

get biadjacency list:   0%|          | 0/89 [00:00<?, ?it/s]

get all tokens:   0%|          | 0/89 [00:00<?, ?it/s]

NameError: name 'all_tokens' is not defined