# BIPARTITE GRAPH

## PREPARING THE ENVIRONMENT

In [1]:
import csv
import random
import unittest
import pandas as pd

from pathlib     import Path
from fastparquet import ParquetFile

In [2]:
PARTNERS_PARQUET = '../data/parquet/partners.parquet'
BUSINESS_PARQUET = '../data/parquet/business.parquet'

In [5]:
def get_sample_from_parquet(filename, n=10):
    samples = []

    with open(filename, mode='rb') as f:
        pf   = ParquetFile(f)
        idxs = sorted(random.sample(range(pf.count()), n))

        current_row = 0
        for rg in pf.iter_row_groups():
            num_rows = len(rg)

            local_indices = [
                i - current_row
                for i in idxs
                if current_row <= i < current_row + num_rows
            ]

            if local_indices:
                samples.append(rg.iloc[local_indices])

            current_row += num_rows
            if current_row > idxs[-1]:
                break

    return pd.concat(samples, ignore_index=True)


sample = get_sample_from_parquet(BUSINESS_PARQUET)
sample

Unnamed: 0,cnpj,trade_name,closing_date,opening_date,cep
0,36040517,,20030407,19940510,29102040
1,18519374,ARKYS CONSULTORIA,20130719,20130719,9270430
2,40226211,,20201230,20201230,38407225
3,18774161,CHEZ MARIE PANIFICADORA,20210308,20130829,77059044
4,43597524,J.V. CENTRO AUTOMOTIVO,20210921,20210921,9890060
5,46770387,POKOBAO PIZZARIA,20230213,20220613,13480002
6,51905771,,20230822,20230822,11454540
7,56118619,,20240728,20240728,32423290
8,57772591,,20241021,20241021,35530000
9,56810538,,20241231,20240815,39530000


In [7]:
def find_row_business(
    files       ,
    cols        ,
    cnpj_col    ,
    closing_col ,
    opening_col ,
    row         ,
):
    for csv_path in files:
        with open(csv_path, encoding="latin-1", newline='') as f:
            reader = csv.reader(f, delimiter=';', quotechar='"')

            for row in reader:
                if not row:
                    continue

                if (
                    int(row[cnpj_col    ]) == row.cnpj         and
                    int(row[closing_col ]) == row.closing_date and
                    int(row[opening_col ]) == row.opening_date
                ):
                    return [row[idx] for idx in cols]

    return None


ESTABELECIMENTOS_COLS = [0, 4, 6, 10, 18]
CNPJ_COL    = 0
CLOSING_COL = 6
OPENING_COL = 10

ESTABELECIMENTOS_PATH = [
    f'../data/csv/estabelecimentos/estabelecimentos{i}.csv'
    for i in range(10)
]


for _, row in sample.iterrows():
    found = find_row_business(
        ESTABELECIMENTOS_PATH,
        ESTABELECIMENTOS_COLS,
        CNPJ_COL             ,
        CLOSING_COL          ,
        OPENING_COL          ,
        row                  ,
    )

    cnpj, trade_name, closing, opening, cep = row
    if found:
        print(
            f"✅ Found match:\n"
            f"Parquet:     {cnpj}, {trade_name}, {closing}, {opening}, {cep}\n"
            f"CSV Match:   {found[0]}, {found[1]}, {found[2]}, {found[3]}, {found[4]}"
        )

AttributeError: 'list' object has no attribute 'cnpj'

In [None]:






    def find_row_partners(
        self    ,
        files   ,
        cols    ,
        cnpj_col,
        name_col,
        target  ,
    ):
        for csv_path in files:
            with open(csv_path, encoding="latin-1", newline='') as f:
                reader = csv.reader(f, delimiter=';', quotechar='"')

                for row in reader:
                    if not row:
                        continue

                    if (
                        int(row[cnpj_col]) == target.cnpj and
                            row[name_col]  == target.name_partner
                    ):
                        return [row[idx] for idx in cols]

        return None





    def test_partners_parquet_contains_csv_data(self):
        CNPJ_COL, NAME_COL, _ = SOCIOS_COLS = [0, 2, 5]
        SOCIOS_PATH = [
            self.ROOT_DIR / f'data/csv/socios/socios{i}.csv'
            for i in range(10)
        ]

        sample = self.get_sample_from_parquet(self.PARTNERS_PARQUET)

        for _, row in sample.iterrows():
            found = self.find_row_partners(
                SOCIOS_PATH,
                SOCIOS_COLS,
                CNPJ_COL   ,
                NAME_COL   ,
                row        ,
            )

            cnpj, name, start_date = row
            with self.subTest(
                cnpj=cnpj       ,
                partner=name    ,
                start=start_date,
            ):
                if found:
                    print(
                        f"✅ Found match:\n"
                        f"Parquet:     {cnpj}, {name}, {start_date}\n"
                        f"CSV Match:   {found[0]}, {found[1]}, {found[2]}"
                    )
                else:
                    self.fail(
                        f"❌ CNPJ {cnpj} from parquet not found in any socios[0-9].csv file."
                    )


    def test_business_parquet_contains_csv_data(self):
        


if __name__ == '__main__':
    unittest.main()


In [1]:
import csv
import random
import unittest
import pandas as pd

from pathlib     import Path
from fastparquet import ParquetFile

In [2]:
PARTNERS_PARQUET = '../data/parquet/partners.parquet'
BUSINESS_PARQUET = '../data/parquet/business.parquet'

In [4]:
def get_sample_from_parquet(filename, n=10):
    samples = []

    with open(filename, mode='rb') as f:
        pf = ParquetFile(f)

        idxs = sorted(random.sample(range(pf.count()), n))

        current_row = 0
        for rg in pf.iter_row_groups():
            num_rows = len(rg)

            local_indices = [
                i - current_row
                for i in idxs
                if current_row <= i < current_row + num_rows
            ]

            if local_indices:
                samples.append(rg.iloc[local_indices])

            current_row += num_rows
            if current_row > idxs[-1]:
                break

    return pd.concat(samples)

SOCIOS_COLS = [0, 2, 5]
SOCIOS_PATH = [
    f'../data/csv/socios/socios{i}.csv'
    for i in range(10)
]

sample = get_sample_from_parquet(PARTNERS_PARQUET)
sample

Unnamed: 0,cnpj,name_partner,start_date
346870,56819642,JOAO GASPAR,19861127
583620,4269158,MARIA DE JESUS DINIZ,20010129
796173,5777808,CLEIDE DA SILVA LIMA FERNANDES,20030715
523723,7776979,RICARDO ELOI DOS REIS,20051227
996263,8382971,FABIO SOARES REIS,20061004
248040,10374368,JOAO BOSCO PERES LOPES,20080925
691613,27613130,JOAO LUCAS RODRIGUES BANHOS,20200429
48857,38904406,ANDRE DE SOUZA NASCIMENTO,20200925
792047,35364915,ROGERIO DE SOUZA FURTADO,20221013
171222,25477090,PAULA LOYOLA SILVA DORNELLAS,20240927


In [None]:


for _, row in sample.iterrows():
    row_found = find_row(
        SOCIOS_PATH,
        SOCIOS_COLS,
        row
    )

    print(row)
    print(row_found)

cnpj               56819642
name_partner    JOAO GASPAR
start_date         19861127
Name: 346870, dtype: object
['56819642', 'JOAO GASPAR', '19861127']


KeyboardInterrupt: 

In [28]:
row_list = ['56819642', 'JOAO GASPAR', '19861127']
row_list

['56819642', 'JOAO GASPAR', '19861127']

In [30]:
all(str(u) == v for u, v in zip(row, row_list))

True

In [20]:
row

cnpj               56819642
name_partner    JOAO GASPAR
start_date         19861127
Name: 346870, dtype: object

In [15]:
type(row)

pandas.core.series.Series

In [None]:



def test_partners_parquet_contains_csv_data(self):




        with self.subTest(cnpj=row[0], partner=row[1], start=row[2]):
            if row_found:
                print(
                    f"✅ Found match:\nParquet:     {row[0]}, {row[1]}, {row[2]}\nCSV Match:   {row_found[0]}, {row_found[1]}, {row_found[2]}"
                )
            else:
                self.fail(
                    f"❌ CNPJ {row[0]} from parquet not found in any socios[0-9].csv file."
                )

    del sample





if __name__ == '__main__':
    unittest.main()


In [20]:
import numpy as np
from fastparquet import ParquetFile
import random

In [36]:
pf = ParquetFile('../data/parquet/partners.parquet')
nrows = pf.count()
indices = sorted(random.sample(range(nrows), 10))



In [42]:
pf(0)

TypeError: 'ParquetFile' object is not callable

            cnpj                          name_partner  start_date
789393   2060982                          VALDECI JOSE    19970822
307257   3949204                      SIDNEI DEICHMANN    20000717
399650  61198370                KRISTIAN SALERMO MOTTA    20020909
219027   6339764                   LUCIO CORREA JUNIOR    20040616
748718   9300338             NICOLAS JEAN KOURKOUNAKIS    20071207
304142  14239350  SIRLENE SILVERIO PEREIRA CONTESSOTTO    20110825
270320  14078181  JUSSARA OLIVEIRA DA COSTA NASCIMENTO    20130109
418935  20845512      ANTONIA MARIA GONZAGA DE CAMARGO    20140815
569090  27810268            HARYELL DA SILVA GUIMARAES    20210301
190178  59962718       EDUARDO GONCALVES DO NASCIMENTO    20250318


[475077,
 5146456,
 6773234,
 7204000,
 8040775,
 9856221,
 14816320,
 15612844,
 21252996,
 22885277]

Unnamed: 0,cnpj,name_partner,start_date
0,4128860,,19890103
1,6029375,,6357060
2,3801155,,20051128
3,5570652,,20060821
4,6619251,,20071112
5,7536754,,20100616
6,27191464,,20170224
7,7077993,,20180315
8,49691763,,20230223
9,4259932,,20240129


Importing the libraries:

In [29]:
sample = pd.read_parquet(PARTNERS_PARQUET).sample(n=10)

for _, (cnpj, name_partner, partnership_start) in sample.iterrows():
    print(cnpj, name_partner, partnership_start)

27620511 ANA PAULA ABREU DE ALMEIDA 20170427
51074445 MAIRA RAMOS VIEIRA ARAUJO 20230616
42129278 MARCELO ANDRADE GOMES 20210528
3080867 VERA LUCIA SILVA MOSCI 20000811
59114221 SIMONE DA SILVA BISPO 20250127
49613014 SANDRO TEIXEIRA DO NASCIMENTO 20230215
3299489 SERGIO PAULO PEDROSO SILVA 19990719
59837234 ELITIERRY BARBOSA AMORIM RODRIGUES 20250311
50564995 AMANDA DA SILVA MOTA 20230505
5926690 MARCELO GOMES VASCONCELLOS 20060117


In [54]:
sample.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10 entries, 14936674 to 6840269
Data columns (total 3 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   cnpj               10 non-null     int32 
 1   name_partner       10 non-null     object
 2   partnership_start  10 non-null     int64 
dtypes: int32(1), int64(1), object(1)
memory usage: 280.0+ bytes


In [30]:
print(cnpj, name_partner, partnership_start)

5926690 MARCELO GOMES VASCONCELLOS 20060117


In [50]:
business = pd.read_parquet(BUSINESS_PARQUET)

In [51]:
business.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 66349375 entries, 0 to 66349374
Data columns (total 5 columns):
 #   Column        Dtype 
---  ------        ----- 
 0   cnpj          int64 
 1   trade_name    object
 2   closing_date  int64 
 3   opening_date  int64 
 4   cep           int64 
dtypes: int64(4), object(1)
memory usage: 2.5+ GB


In [52]:
business.head()

Unnamed: 0,cnpj,trade_name,closing_date,opening_date,cep
0,4155878,CONSULADO GERAL DA ITALIA,0,19010101,90880481
1,8771954,,0,19010814,55900000
2,5245062,CARTORIO REGISTRO CIVIL,0,19220209,15490000
3,5014516,CARTORIO,0,19220821,75154000
4,7088608,SINAGOGA,0,19291227,3106030


In [47]:
type(name_partner)

str

In [46]:
selected[1]

'ITALO BARRETO OLIVEIRA SANTOS'

In [None]:
def test_partners_parquet_contains_csv_data(self):
        
            row_found = False

            for csv_path in self.SOCIOS_PATH:
                df_csv = pd.read_csv(
                    csv_path,
                    sep=';',
                    usecols=self.COLS_PARTNERS,
                    names=self.NAMES_PARTNERS,
                    encoding='latin-1',
                    on_bad_lines='skip'
                ).dropna()

                df_csv.cnpj = df_csv.cnpj.astype('int32')
                df_csv.partnership_start = pd.to_datetime(
                    df_csv.partnership_start,
                    format='%Y%m%d',
                    errors='coerce'
                )

                match = df_csv[
                    (df_csv.cnpj              == row.cnpj) &
                    (df_csv.name_partner      == row.name_partner) &
                    (df_csv.partnership_start == row.partnership_start)
                ]

                if not match.empty:
                    print("\n✅ Match found!")
                    print(f"📁 In file: socios{i}.csv")
                    print("🔎 Row from parquet:")
                    print(row.to_frame().T.to_string(index=False))
                    print("📄 Matching row from CSV:")
                    print(match.iloc[0].to_frame().T.to_string(index=False))
                    row_found = True
                    break

            if not row_found:
                self.fail(
                    f"\n❌ Row from parquet not found in any socios[0-9].csv file:\n{row.to_frame().T.to_string(index=False)}"
                )




if __name__ == '__main__':
    unittest.main()


In [1]:
import pickle
import pandas as pd

from collections import defaultdict

Loading the data:

In [2]:
partners = pd.read_parquet('../data/parquet/partners.parquet')

partners.head()

Unnamed: 0,cnpj,name_partner,partnership_start
0,1685190,JUSSARA PAZIN,1997-02-24
1,1685190,ANTONIO PAZIN,1999-10-06
2,92859388,ARI JOAO MARIA,2005-09-12
3,92859636,JOSE CARLOS LIMA,2005-09-12
4,92859677,EGON ROBERTO STRASSBURGER,1989-06-09


In [4]:
partners.head(20)

Unnamed: 0,cnpj,name_partner,partnership_start
0,1685190,JUSSARA PAZIN,1997-02-24
1,1685190,ANTONIO PAZIN,1999-10-06
2,92859388,ARI JOAO MARIA,2005-09-12
3,92859636,JOSE CARLOS LIMA,2005-09-12
4,92859677,EGON ROBERTO STRASSBURGER,1989-06-09
5,92859677,MARIA IVONETE SANTIAGO STRASSBURGER,1989-06-09
6,4979683,LELIANA ABBUD MARCONDES,2002-03-27
7,4979683,MARIA APARECIDA ABBUD,2004-11-03
8,4979687,ANTONIO CASSIMIRO SOBRINHO,2002-02-25
9,4979687,WANDERLEI NOGUEIRA CASSIMIRO,2002-02-25


In [3]:
len(partners)

25938492

## BIPARTITE GRAPH

Generating the bipartite graph:

In [3]:
cnpj2id    = {}
partner2id = {}
adjdict    = defaultdict(list)

cnpj_id  = 0
cnpj_idx = 0
partner_id  = 0
partner_idx = 0

for idx, (cnpj, partner) in enumerate(partners.itertuples(index=False)):
    if cnpj not in cnpj2id:
        cnpj_idx  = cnpj_id
        cnpj_id  += 1

        cnpj2id[cnpj] = cnpj_idx
    else:
        cnpj_idx = cnpj2id[cnpj]

    if partner not in partner2id:
        partner_idx  = partner_id
        partner_id  += 1

        partner2id[partner] = partner_idx
    else:
        partner_idx = partner2id[partner]

    adjdict[cnpj_idx].append(partner_idx)

    if idx % 1000000 == 999999:
        print('Number of rows processed =', idx + 1)

Number of rows processed = 1000000
Number of rows processed = 2000000
Number of rows processed = 3000000
Number of rows processed = 4000000
Number of rows processed = 5000000
Number of rows processed = 6000000
Number of rows processed = 7000000
Number of rows processed = 8000000
Number of rows processed = 9000000
Number of rows processed = 10000000
Number of rows processed = 11000000
Number of rows processed = 12000000
Number of rows processed = 13000000
Number of rows processed = 14000000
Number of rows processed = 15000000
Number of rows processed = 16000000
Number of rows processed = 17000000
Number of rows processed = 18000000
Number of rows processed = 19000000
Number of rows processed = 20000000
Number of rows processed = 21000000
Number of rows processed = 22000000
Number of rows processed = 23000000
Number of rows processed = 24000000
Number of rows processed = 25000000


Optimizing the data structure of the adjacency list:

In [4]:
adjacency = [[]] * len(adjdict)

for k, v in adjdict.items():
    adjacency[k] = v

del adjdict

Saving the data:

In [5]:
with open('../data/cnpj2id.pkl'   , 'wb') as f:
    pickle.dump(cnpj2id   , f)

with open('../data/partner2id.pkl', 'wb') as f:
    pickle.dump(partner2id, f)

with open('../data/adjacency.pkl' , 'wb') as f:
    pickle.dump(adjacency , f)

Freeing the memory of objects:

In [6]:
del cnpj2id
del partner2id
del adjacency

## EXPLORING THE RESULTS

Checking the number of companies and partners:

In [7]:
print('Number of companies in the \'partners\' file =', partners.cnpj.nunique())
print('Number of partners  in the \'partners\' file =', partners.name_partner.nunique())

Number of companies in the 'partners' file = 14441622
Number of partners  in the 'partners' file = 14686766


Freeing up resources:

In [8]:
del partners

Loading the data:

In [9]:
with open('../data/cnpj2id.pkl'   , 'rb') as f:
    cnpj2id = pickle.load(f)

with open('../data/partner2id.pkl', 'rb') as f:
    partner2id = pickle.load(f)

with open('../data/adjacency.pkl' , 'rb') as f:
    adjacency = pickle.load(f)

Confirming the number of companies and partners:

In [10]:
print('Number of companies in the \'cnpj2id\'    file =', len(cnpj2id   ))
print('Number of partners  in the \'partner2id\' file =', len(partner2id))

Number of companies in the 'cnpj2id'    file = 14441622
Number of partners  in the 'partner2id' file = 14686766


Obtaining the partners of a given company:

In [11]:
CNPJ = 85061877


idx      = cnpj2id[CNPJ]
partners = set(adjacency[idx])

print(f'CNPJ members {CNPJ}:')
print()
for nome, idx in partner2id.items():
    if idx in partners:
        partners.remove(idx)

        print(nome)

    if not partners:
        break

CNPJ members 85061877:

RICARDO BAULE ROSSI
EDUARDO GUILHERME BAULE ROSSI
