# BIPARTITE GRAPH

## PREPARING THE ENVIRONMENT

Importing the libraries:

In [1]:
import pickle
import pandas as pd

from collections import defaultdict

Loading the data:

In [2]:
# Read the partners table from the project's parquet export.
partners = pd.read_parquet(path='../data/parquet/partners.parquet')

# Preview the first five rows.
partners.head(n=5)

Unnamed: 0,cnpj,name_partner,partnership_start
0,1685190,JUSSARA PAZIN,1997-02-24
1,1685190,ANTONIO PAZIN,1999-10-06
2,92859388,ARI JOAO MARIA,2005-09-12
3,92859636,JOSE CARLOS LIMA,2005-09-12
4,92859677,EGON ROBERTO STRASSBURGER,1989-06-09


In [4]:
# Preview the first twenty rows (positional slice, same rows as head(20)).
partners.iloc[:20]

Unnamed: 0,cnpj,name_partner,partnership_start
0,1685190,JUSSARA PAZIN,1997-02-24
1,1685190,ANTONIO PAZIN,1999-10-06
2,92859388,ARI JOAO MARIA,2005-09-12
3,92859636,JOSE CARLOS LIMA,2005-09-12
4,92859677,EGON ROBERTO STRASSBURGER,1989-06-09
5,92859677,MARIA IVONETE SANTIAGO STRASSBURGER,1989-06-09
6,4979683,LELIANA ABBUD MARCONDES,2002-03-27
7,4979683,MARIA APARECIDA ABBUD,2004-11-03
8,4979687,ANTONIO CASSIMIRO SOBRINHO,2002-02-25
9,4979687,WANDERLEI NOGUEIRA CASSIMIRO,2002-02-25


In [3]:
# Number of rows in the partners table.
partners.shape[0]

25938492

## BIPARTITE GRAPH

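Generating the bipartite graph (companies on one side, partners on the other), assigning a dense integer id to each company and each partner: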

In [3]:
# Build the bipartite graph company <-> partner.
#   cnpj2id    : CNPJ          -> dense integer id (order of first appearance)
#   partner2id : partner name  -> dense integer id (order of first appearance)
#   adjdict    : company id    -> list of partner ids, one entry per table row
cnpj2id    = {}
partner2id = {}
adjdict    = defaultdict(list)

# Iterate only the two columns that are needed. The table also carries a
# 'partnership_start' column, so unpacking whole rows from itertuples() into
# two names would raise "too many values to unpack".
edges = zip(partners['cnpj'], partners['name_partner'])

for idx, (cnpj, partner) in enumerate(edges):
    # setdefault returns the existing id, or stores and returns the next free
    # one (the current dict size) the first time a key is seen.
    cnpj_idx    = cnpj2id.setdefault(cnpj, len(cnpj2id))
    partner_idx = partner2id.setdefault(partner, len(partner2id))

    adjdict[cnpj_idx].append(partner_idx)

    # Progress report every million rows.
    if idx % 1000000 == 999999:
        print('Number of rows processed =', idx + 1)

Number of rows processed = 1000000
Number of rows processed = 2000000
Number of rows processed = 3000000
Number of rows processed = 4000000
Number of rows processed = 5000000
Number of rows processed = 6000000
Number of rows processed = 7000000
Number of rows processed = 8000000
Number of rows processed = 9000000
Number of rows processed = 10000000
Number of rows processed = 11000000
Number of rows processed = 12000000
Number of rows processed = 13000000
Number of rows processed = 14000000
Number of rows processed = 15000000
Number of rows processed = 16000000
Number of rows processed = 17000000
Number of rows processed = 18000000
Number of rows processed = 19000000
Number of rows processed = 20000000
Number of rows processed = 21000000
Number of rows processed = 22000000
Number of rows processed = 23000000
Number of rows processed = 24000000
Number of rows processed = 25000000


Converting the adjacency dictionary into a plain list indexed by company id:

In [4]:
# Turn the {company id -> partner ids} dict into a list indexed by company id.
# Each slot starts as its own empty list: `[[]] * n` would fill the list with
# n references to a single shared list object.
adjacency = [[] for _ in range(len(adjdict))]

for company_idx, partner_ids in adjdict.items():
    adjacency[company_idx] = partner_ids

# The dict is no longer needed once the list holds the same partner lists.
del adjdict

Saving the data:

In [5]:
def _dump_pickle(obj, path):
    """Serialize `obj` with pickle into the file at `path` (binary write)."""
    with open(path, 'wb') as handle:
        pickle.dump(obj, handle)


# Persist the two id mappings and the adjacency list.
_dump_pickle(cnpj2id   , '../data/cnpj2id.pkl')
_dump_pickle(partner2id, '../data/partner2id.pkl')
_dump_pickle(adjacency , '../data/adjacency.pkl')

Freeing the memory held by the mappings and the adjacency list:

In [6]:
# Unbind the three graph objects in one statement.
del cnpj2id, partner2id, adjacency

## EXPLORING THE RESULTS

Checking the number of companies and partners:

In [7]:
# Distinct companies and partner names found directly in the table.
n_companies = partners['cnpj'].nunique()
n_partners  = partners['name_partner'].nunique()

print('Number of companies in the \'partners\' file =', n_companies)
print('Number of partners  in the \'partners\' file =', n_partners)

Number of companies in the 'partners' file = 14441622
Number of partners  in the 'partners' file = 14686766


Freeing up resources:

In [8]:
# Unbind the `partners` name; the DataFrame's memory can be reclaimed once no
# other reference to it remains.
del partners

Reloading the saved mappings and adjacency list from disk:

In [9]:
def _load_pickle(path):
    """Return the object unpickled from the file at `path` (binary read)."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Read back the id mappings and the adjacency list from their pickle files.
cnpj2id    = _load_pickle('../data/cnpj2id.pkl')
partner2id = _load_pickle('../data/partner2id.pkl')
adjacency  = _load_pickle('../data/adjacency.pkl')

Confirming the number of companies and partners:

In [10]:
# Sizes of the reloaded mappings, for comparison with the table's unique counts.
n_companies = len(cnpj2id)
n_partners  = len(partner2id)

print('Number of companies in the \'cnpj2id\'    file =', n_companies)
print('Number of partners  in the \'partner2id\' file =', n_partners)

Number of companies in the 'cnpj2id'    file = 14441622
Number of partners  in the 'partner2id' file = 14686766


Obtaining the partners of a given company:

In [11]:
# Company whose partners are listed below.
CNPJ = 85061877

# Use dedicated names here: `partners` was the DataFrame's name earlier in the
# notebook, and a single `idx` should not stand for both a company id and a
# partner id.
company_idx = cnpj2id[CNPJ]                 # raises KeyError for an unknown CNPJ
pending_ids = set(adjacency[company_idx])   # partner ids still to be resolved

print(f'CNPJ members {CNPJ}:')
print()
# partner2id maps name -> id, so names are recovered by scanning it and
# stopping as soon as every pending id has been matched.
for partner_name, partner_idx in partner2id.items():
    if partner_idx in pending_ids:
        pending_ids.remove(partner_idx)

        print(partner_name)

    if not pending_ids:
        break

CNPJ members 85061877:

RICARDO BAULE ROSSI
EDUARDO GUILHERME BAULE ROSSI
