# BIPARTITE GRAPH

## PREPARING THE ENVIRONMENT

Importing the libraries:

In [1]:
import random
import unittest
import pandas as pd

from pathlib import Path

In [2]:
# Parquet files read by the cells below, and the raw CSV shards (numbered 0-9).
PARTNERS_PARQUET      =  '../data/parquet/partners.parquet'
BUSINESS_PARQUET      =  '../data/parquet/business.parquet'
SOCIOS_PATH           = [f'../data/csv/socios{i}.csv'           for i in range(10)]
ESTABELECIMENTOS_PATH = [f'../data/csv/estabelecimentos{i}.csv' for i in range(10)]

# Positional column indexes kept from the raw CSVs (COLS_PARTNERS is passed as
# `usecols` when the socios files are read in the test further down).
# NOTE(review): COLS_BUSINESS is presumably the equivalent for the
# estabelecimentos files - it is not used in this notebook; confirm.
COLS_PARTNERS     = [0, 2, 5]
COLS_BUSINESS     = [0, 4, 6, 10, 18]

# Column names given to the selected CSV columns, in the same order as the
# index lists above (NAMES_PARTNERS is passed as `names` to `read_csv`).
NAMES_PARTNERS = [
    'cnpj'             ,
    'name_partner'     ,
    'partnership_start',
]
NAMES_BUSINESS = [
    'cnpj'        ,
    'trade_name'  ,
    'closing_date',
    'opening_date',
    'cep'         ,
]


In [3]:
# Load the partners table and count the distinct values in each column.
partners = pd.read_parquet(PARTNERS_PARQUET)
partners.nunique()

cnpj                 15058103
name_partner         15050736
partnership_start       23528
dtype: int64

In [4]:
# Total number of rows in the partners table.
len(partners)

25938492

In [3]:
# Print ten randomly chosen partner records, one per line.
sample = pd.read_parquet(PARTNERS_PARQUET).sample(n=10)

for record in sample.itertuples(index=False, name=None):
    # Each record is a plain tuple holding the three partner columns.
    cnpj, name_partner, partnership_start = record
    print(cnpj, name_partner, partnership_start)

61042761 JAIRO SCHNEIDER DE SCHNEIDER 20250528
57994007 CLINICA DE TERAPIA ESPECIAL INTERAGIR LTDA 20241025
3807604 GUSTAVO COELHO DA SILVA 20000509
8728652 ANDRE FABRICIO KUHN PAZINATO 20080418
2234747 RONEISSON DE AQUINO 19971110
17141930 WESLEY JOSE DOS SANTOS REZENDE 20160401
54614134 MONICA JARDIM GONCALVES DE MORAES 20240405
55275183 YOUNG IL HAN 20240524
3769516 EGYDIO MAZINE 20010320
4961211 LEANDRO GARO GOMES 20020226


In [5]:
# Drop the reference to the sampled rows.
del sample

In [4]:
# Load the business table from its parquet file.
business = pd.read_parquet(BUSINESS_PARQUET)

In [5]:
# Summary statistics of the business table.
business.describe()


Unnamed: 0,cnpj,closing_date,opening_date,cep
count,66349380.0,66349380.0,66349380.0,66349380.0
mean,32823450.0,20131820.0,20115340.0,45643700.0
std,20618360.0,898877.6,129765.5,30452600.0
min,0.0,0.0,18911020.0,0.0
25%,15530590.0,20140910.0,20060320.0,16200970.0
50%,31759080.0,20200710.0,20160800.0,39460000.0
75%,48373470.0,20230630.0,20210830.0,74917460.0
max,99017780.0,20250620.0,20250620.0,99990970.0


In [7]:
# Number of distinct values in each column of the business table.
business.nunique()

cnpj            63235728
trade_name      15891863
closing_date       21345
opening_date       24723
cep              1143331
dtype: int64

In [8]:
# Number of rows for each (opening_date, closing_date) combination.
contagem = business.groupby(['opening_date', 'closing_date']).size()
contagem

opening_date  closing_date
18911023      20051103            1
18930705      20051103            1
18990419      20051103            1
18991016      18991016            1
19010101      0                   1
                              ...  
20250613      20250613        14199
              20250614           42
20250614      20250614         3797
              20250615            5
20250615      20250615          136
Length: 14788388, dtype: int64

In [9]:
# Keep only the (opening_date, closing_date) combinations that occur more than once.
duplicatas = contagem[contagem > 1]

# Total number of duplicated rows (counting every occurrence, not only the extras)
qtd_linhas_duplicadas = duplicatas.sum()

In [10]:
# Display the total number of rows that share their date pair with another row.
qtd_linhas_duplicadas

np.int64(56940425)

In [9]:
# Peek at one closing_date value (the entry with index label 0).
business['closing_date'][0]

np.int64(20160817)

In [None]:
class PartnersParquetTest(unittest.TestCase):
    """Checks that rows stored in the partners parquet exist in the raw CSVs."""

    # The test body reads its configuration through `self`, so expose the
    # notebook-level constants as class attributes.
    PARTNERS_PARQUET = PARTNERS_PARQUET
    SOCIOS_PATH      = SOCIOS_PATH
    COLS_PARTNERS    = COLS_PARTNERS
    NAMES_PARTNERS   = NAMES_PARTNERS

    def test_partners_parquet_contains_csv_data(self):
        """Draw one random parquet row and look for it in the socios CSV files.

        The test succeeds as soon as one CSV contains a row with the same
        cnpj, name_partner and partnership_start. If no file contains it, the
        test fails and reports the parquet row that could not be located.
        """
        # Row under test: a single record sampled at random from the parquet.
        row = pd.read_parquet(self.PARTNERS_PARQUET).sample(n=1).iloc[0]
        row_found = False

        # SOCIOS_PATH is built from range(10), so the position `i` is also the
        # number in the file name reported below.
        for i, csv_path in enumerate(self.SOCIOS_PATH):
            df_csv = pd.read_csv(
                csv_path,
                sep=';',
                usecols=self.COLS_PARTNERS,
                names=self.NAMES_PARTNERS,
                encoding='latin-1',
                on_bad_lines='skip'
            ).dropna()

            # Normalise the CSV columns before comparing them with the parquet row.
            # NOTE(review): this assumes the parquet stores partnership_start
            # as a datetime; if it holds raw YYYYMMDD integers the comparison
            # below can never match - confirm against the parquet writer.
            df_csv.cnpj = df_csv.cnpj.astype('int32')
            df_csv.partnership_start = pd.to_datetime(
                df_csv.partnership_start,
                format='%Y%m%d',
                errors='coerce'
            )

            match = df_csv[
                (df_csv.cnpj              == row.cnpj) &
                (df_csv.name_partner      == row.name_partner) &
                (df_csv.partnership_start == row.partnership_start)
            ]

            if not match.empty:
                print("\n✅ Match found!")
                print(f"📁 In file: socios{i}.csv")
                print("🔎 Row from parquet:")
                print(row.to_frame().T.to_string(index=False))
                print("📄 Matching row from CSV:")
                print(match.iloc[0].to_frame().T.to_string(index=False))
                row_found = True
                break

        if not row_found:
            self.fail(
                f"\n❌ Row from parquet not found in any socios[0-9].csv file:\n{row.to_frame().T.to_string(index=False)}"
            )


if __name__ == '__main__':
    # Inside a Jupyter kernel, sys.argv holds the kernel's own arguments and
    # unittest.main() calls sys.exit() when it finishes. Pass a dummy argv and
    # exit=False so the tests run and the kernel keeps going.
    unittest.main(argv=['first-arg-is-ignored'], exit=False)


In [1]:
import pickle
import pandas as pd

from collections import defaultdict

Loading the data:

In [2]:
# Load the partners table and preview its first rows.
partners = pd.read_parquet('../data/parquet/partners.parquet')

partners.head()

Unnamed: 0,cnpj,name_partner,partnership_start
0,1685190,JUSSARA PAZIN,1997-02-24
1,1685190,ANTONIO PAZIN,1999-10-06
2,92859388,ARI JOAO MARIA,2005-09-12
3,92859636,JOSE CARLOS LIMA,2005-09-12
4,92859677,EGON ROBERTO STRASSBURGER,1989-06-09


In [4]:
# Preview the first 20 rows of the partners table.
partners.head(20)

Unnamed: 0,cnpj,name_partner,partnership_start
0,1685190,JUSSARA PAZIN,1997-02-24
1,1685190,ANTONIO PAZIN,1999-10-06
2,92859388,ARI JOAO MARIA,2005-09-12
3,92859636,JOSE CARLOS LIMA,2005-09-12
4,92859677,EGON ROBERTO STRASSBURGER,1989-06-09
5,92859677,MARIA IVONETE SANTIAGO STRASSBURGER,1989-06-09
6,4979683,LELIANA ABBUD MARCONDES,2002-03-27
7,4979683,MARIA APARECIDA ABBUD,2004-11-03
8,4979687,ANTONIO CASSIMIRO SOBRINHO,2002-02-25
9,4979687,WANDERLEI NOGUEIRA CASSIMIRO,2002-02-25


In [3]:
# Total number of rows in the partners table.
len(partners)

25938492

## BIPARTITE GRAPH

Generating the bipartite graph:

In [3]:
# Build the bipartite graph company <-> partner:
#   cnpj2id    maps each CNPJ to a dense integer id (0, 1, 2, ...)
#   partner2id maps each partner name to a dense integer id
#   adjdict    maps a company id to the list of its partner ids
cnpj2id    = {}
partner2id = {}
adjdict    = defaultdict(list)

# Iterate over the two needed columns only. Unpacking whole rows breaks as
# soon as the frame carries any extra column (e.g. partnership_start).
edges = zip(partners['cnpj'], partners['name_partner'])

for idx, (cnpj, partner) in enumerate(edges):
    # `len(mapping)` is the next unused id; setdefault stores it only the
    # first time a key is seen and otherwise returns the id already assigned.
    cnpj_idx    = cnpj2id.setdefault(cnpj, len(cnpj2id))
    partner_idx = partner2id.setdefault(partner, len(partner2id))

    adjdict[cnpj_idx].append(partner_idx)

    # Progress report every million rows.
    if idx % 1000000 == 999999:
        print('Number of rows processed =', idx + 1)

Number of rows processed = 1000000
Number of rows processed = 2000000
Number of rows processed = 3000000
Number of rows processed = 4000000
Number of rows processed = 5000000
Number of rows processed = 6000000
Number of rows processed = 7000000
Number of rows processed = 8000000
Number of rows processed = 9000000
Number of rows processed = 10000000
Number of rows processed = 11000000
Number of rows processed = 12000000
Number of rows processed = 13000000
Number of rows processed = 14000000
Number of rows processed = 15000000
Number of rows processed = 16000000
Number of rows processed = 17000000
Number of rows processed = 18000000
Number of rows processed = 19000000
Number of rows processed = 20000000
Number of rows processed = 21000000
Number of rows processed = 22000000
Number of rows processed = 23000000
Number of rows processed = 24000000
Number of rows processed = 25000000


Converting the adjacency dictionary into a plain list indexed by company id:

In [4]:
# Company ids are dense (0 .. len(adjdict) - 1), so the dictionary can be
# flattened into a list whose position is the company id. Building it directly
# avoids the `[[]] * n` placeholder, which repeats ONE shared list object in
# every slot and would silently alias any slot that was never overwritten.
adjacency = [adjdict[company_idx] for company_idx in range(len(adjdict))]

del adjdict

Saving the data:

In [5]:
# Persist the two id mappings and the adjacency list, one pickle file each.
for pkl_path, payload in (
    ('../data/cnpj2id.pkl'   , cnpj2id   ),
    ('../data/partner2id.pkl', partner2id),
    ('../data/adjacency.pkl' , adjacency ),
):
    with open(pkl_path, 'wb') as fh:
        pickle.dump(payload, fh)

# Do not keep an extra reference to the last object alive through the loop variables.
del pkl_path, payload

Freeing the memory held by these objects:

In [6]:
# Drop the in-memory copies of the three structures.
del cnpj2id, partner2id, adjacency

## EXPLORING THE RESULTS

Checking the number of companies and partners:

In [7]:
# Distinct companies and distinct partner names present in the partners table.
n_companies = partners['cnpj'].nunique()
n_partners  = partners['name_partner'].nunique()

print("Number of companies in the 'partners' file =", n_companies)
print("Number of partners  in the 'partners' file =", n_partners)

Number of companies in the 'partners' file = 14441622
Number of partners  in the 'partners' file = 14686766


Freeing up resources:

In [8]:
# Drop the reference to the partners DataFrame.
del partners

Loading the data:

In [9]:
# Reload the three structures from their pickle files.
# pickle.load executes whatever the file encodes, so only use it on trusted files.
loaded = []
for pkl_path in (
    '../data/cnpj2id.pkl',
    '../data/partner2id.pkl',
    '../data/adjacency.pkl',
):
    with open(pkl_path, 'rb') as fh:
        loaded.append(pickle.load(fh))

cnpj2id, partner2id, adjacency = loaded
del loaded, pkl_path

Confirming the number of companies and partners:

In [10]:
# Sizes of the reloaded mappings, to compare with the counts taken from the DataFrame.
n_company_ids = len(cnpj2id)
n_partner_ids = len(partner2id)

print("Number of companies in the 'cnpj2id'    file =", n_company_ids)
print("Number of partners  in the 'partner2id' file =", n_partner_ids)

Number of companies in the 'cnpj2id'    file = 14441622
Number of partners  in the 'partner2id' file = 14686766


Obtaining the partners of a given company:

In [11]:
# Company whose partners are listed below.
CNPJ = 85061877


# Distinct names are used here so the company id is not overwritten by the
# loop variable and `partners` is not rebound from a DataFrame to a set.
company_idx         = cnpj2id[CNPJ]
pending_partner_ids = set(adjacency[company_idx])

print(f'CNPJ members {CNPJ}:')
print()
# partner2id maps name -> id, so recovering names from ids needs a scan.
# The scan stops as soon as every partner id of this company has been printed.
for partner_name, partner_idx in partner2id.items():
    if partner_idx in pending_partner_ids:
        pending_partner_ids.remove(partner_idx)

        print(partner_name)

    if not pending_partner_ids:
        break

CNPJ members 85061877:

RICARDO BAULE ROSSI
EDUARDO GUILHERME BAULE ROSSI
