# CONNECTED COMPONENT

## PREPARING THE ENVIRONMENT

Importing the libraries:

In [1]:
import pickle

from agnes import DisjointSet
from collections import defaultdict

Loading the data:

In [2]:
def _load_pickle(path):
    """Return the object deserialized from the pickle file at ``path``."""
    # NOTE: pickle.load can run arbitrary code while deserializing, so it must
    # only be pointed at files produced by a trusted step of this pipeline.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


cnpj2id = _load_pickle('../data/cnpj2id.pkl')
partner2id = _load_pickle('../data/partner2id.pkl')
adjacency = _load_pickle('../data/adjacency.pkl')

# Inverse of cnpj2id, so an id can be translated back to its CNPJ later on.
id2cnpj = {idx: cnpj for cnpj, idx in cnpj2id.items()}

# Quick size check of everything that was loaded.
print('Quantidade de cnpjs =', len(cnpj2id))
print('Quantidade de sócios =', len(partner2id))
print('Quantidade de adjacencias =', len(adjacency))

Quantidade de cnpjs = 14441622
Quantidade de sócios = 14686766
Quantidade de adjacencias = 14441622


## CONNECTED COMPONENTS

Separating the partners into disjoint sets based on their relationship in the corporate structure of companies:

In [3]:
# Number of entries in partner2id; the next cell passes this value to
# DisjointSet as its size argument.
l = len(partner2id)

print('Number of patners =', l)

Number of patners = 14686766


In [4]:
# Build the disjoint-set structure sized by the partner count `l`.
# NOTE(review): DisjointSet comes from the `agnes` package; `fu[i]` is presumably
# the representative of element i's set -- confirm against that package.
fu = DisjointSet(l)

# Sanity check: print what the structure reports for the first ten vertices
# before any union has been performed.
print('Testing the first 10 disjoint sets:', end='\n\n')
for i in range(10):
    print(f'Vertex {i} = {fu[i]}')

Testing the first 10 disjoint sets:

Vertex 0 = 0
Vertex 1 = 1
Vertex 2 = 2
Vertex 3 = 3
Vertex 4 = 4
Vertex 5 = 5
Vertex 6 = 6
Vertex 7 = 7
Vertex 8 = 8
Vertex 9 = 9


In [5]:
# Each row of `adjacency` is unpacked into its first element and the rest;
# every remaining element is merged into the same set as the first one.
# A row must therefore contain at least one element.
for first, *others in adjacency:
    for other in others:
        fu.union(first, other)

# Same sanity check as before, now showing the state after the unions.
print('Testing the first 10 disjoint sets:', end='\n\n')
for i in range(10):
    print(f'Vertex {i} = {fu[i]}')

Testing the first 10 disjoint sets:

Vertex 0 = 0
Vertex 1 = 0
Vertex 2 = 3985564
Vertex 3 = 3985564
Vertex 4 = 3228018
Vertex 5 = 10511265
Vertex 6 = 10511265
Vertex 7 = 5116955
Vertex 8 = 5116955
Vertex 9 = 11760964


Getting the connected components (groups of companies) from the disjoint sets of partners:

> **Note:** Connected components that contain a single vertex of the first independent set (a single company) are ignored.

In [6]:
# Maps the value `fu[...]` returns for a row's first element to the list of
# row indices of `adjacency` that share that value.
ds = defaultdict(list)

for row_idx, row in enumerate(adjacency):
    # Only the first element of the row is needed to identify its set;
    # a row must contain at least one element.
    first_member, *_ = row
    ds[fu[first_member]].append(row_idx)

# Keep only the groups that gather two or more rows.
components = [group for group in ds.values() if len(group) >= 2]

print('Number of connected components =', len(ds))

print()

# Total number of rows that ended up in a kept (multi-row) group.
kept_total = sum(len(group) for group in components)
print('Number of basic CNPJs =', kept_total)
print('Number with more than one vertex in the subgraph =', len(components))

Number of connected components = 6258661

Number of basic CNPJs = 9759398
Number with more than one vertex in the subgraph = 1576437


Mapping cnpj to connected component and position in the component:

In [7]:
# These three objects are not referenced by any later cell shown here; drop
# their names so the interpreter can reclaim them. `id2cnpj` is kept because
# the next cell still needs it.
del cnpj2id, partner2id, adjacency

In [8]:
# Map every CNPJ to a pair (index of its component in `components`,
# position of the company inside that component).
cnpj2comp = {
    id2cnpj[emp]: (idxcomp, idxemp)
    for idxcomp, comp in enumerate(components)
    for idxemp, emp in enumerate(comp)
}

print('Displaying the first five mappings:')
print()
# The loop variable is deliberately NOT called `map`: at notebook scope that
# would shadow the builtin and break any cell that calls map() when re-run.
for i, (cnpj, position) in enumerate(cnpj2comp.items()):
    if i == 5:
        break

    print(cnpj, '->', position)

Displaying the first five mappings:

85061877 -> (0, 0)
10739258 -> (0, 1)
28815926 -> (0, 2)
66013244 -> (0, 3)
49252695 -> (0, 4)


In [9]:
# id2cnpj is not referenced by the remaining cells shown here; drop the name.
del id2cnpj

## SAVING THE DATA

Saving the data:

In [10]:
# Write the component lists to disk, then drop the name once it is saved.
with open('../data/components.pkl', 'wb') as out_file:
    pickle.dump(components, out_file)
del components

# Same for the CNPJ -> (component, position) mapping.
with open('../data/cnpj2comp.pkl', 'wb') as out_file:
    pickle.dump(cnpj2comp, out_file)
del cnpj2comp