# BIPARTITE GRAPH

In [2]:
import pyarrow.parquet as pq


PARQUET_BUSINESS = '../data/parquet/business.parquet'

table = pq.ParquetFile(PARQUET_BUSINESS)
print(table.num_row_groups)

64


In [1]:
import pandas as pd

In [2]:
OUTPUT_PARTNERS  = '../data/parquet/partners.parquet'
OUTPUT_COMPANIES = '../data/parquet/companies.parquet'
OUTPUT_BUSINESS  = '../data/parquet/business.parquet'

In [6]:
chunk = pd.read_csv(
    '../data/csv/empresas/empresas0.csv',
    sep=';',
    usecols=[4],
    names=['capital'],
    nrows=1000000,
    low_memory=False      ,
    encoding='latin-1'    ,
    on_bad_lines='skip'   ,
)

chunk.capital = (
    chunk.capital
    .str
    .replace(r',.*', '', regex=True)
    .astype('int64')
)

In [7]:
chunk.capital[chunk.capital > 4294967290]

110120     26205000000
234296     18000000000
390266      7550178543
497019     12672470977
500519      5619073324
800981    172964347313
869063      8043222080
869329     24164007439
875075     21691206177
960158    422451360001
990413      6109562304
Name: capital, dtype: int64

In [3]:
import pyarrow.parquet as pq
import pandas as pd


parquet_file = pq.ParquetFile(OUTPUT_COMPANIES)

print(f'Total de row_groups: {parquet_file.num_row_groups}')

duplicados_por_rg = []
for i in range(parquet_file.num_row_groups):
    print(f'Processando row_group {i}...')

    table = parquet_file.read_row_group(i, columns=['cnpj'])
    df = table.to_pandas()

    repetidos = df[df.duplicated(subset='cnpj', keep=False)]

    if not repetidos.empty:
        repetidos['row_group'] = i
        duplicados_por_rg.append(repetidos)

if duplicados_por_rg:
    repetidos_final = pd.concat(duplicados_por_rg, ignore_index=True)
    print('Duplicados dentro dos row_groups encontrados:')
    print(repetidos_final)
else:
    print('Nenhum CNPJ duplicado dentro dos row_groups foi encontrado.')


Total de row_groups: 13
Processando row_group 0...
Processando row_group 1...
Processando row_group 2...
Processando row_group 3...
Processando row_group 4...
Processando row_group 5...
Processando row_group 6...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  repetidos['row_group'] = i


Processando row_group 7...
Processando row_group 8...
Processando row_group 9...
Processando row_group 10...
Processando row_group 11...
Processando row_group 12...
Duplicados dentro dos row_groups encontrados:
       cnpj  row_group
0  10959550          6
1  10959550          6
2  10959550          6


In [3]:
business = pd.read_parquet(OUTPUT_BUSINESS, columns=['cnpj_order'])
business

Unnamed: 0,cnpj_order
0,1
1,1
2,1
3,1
4,1
...,...
66349370,1
66349371,1
66349372,1
66349373,1


In [4]:
business.max()

cnpj_order    9999
dtype: int16

In [10]:
len(repetidos)

3

In [None]:
duplicated_cnpjs = companies[companies.duplicated(subset='cnpj', keep=False)]['cnpj']

print(duplicated_cnpjs)

In [15]:
partners[partners.name_partner.str.len() < 3]

Unnamed: 0,cnpj,name_partner,start_date
8,2450527,!,19010101
2048975,2327780,S,19980109


In [None]:
# ==============================================================
# PART 4 - Reload both files and rewrite with higher compression
# ==============================================================
print('Rewriting parquet files with pyarrow compression...')


partners.sort_values(
    by=[
        'start_date'  ,
        'name_partner',
    ],
    inplace=True,
)
partners.to_parquet(
    OUTPUT_PARTNERS ,
    engine='pyarrow',
    index=False     ,
)

del partners


business = pd.read_parquet(OUTPUT_BUSINESS)

business.sort_values(
    by=[
        'closing_date',
        'opening_date',
        'cep'         ,
    ],
    inplace=True,
)
business.to_parquet(
    OUTPUT_BUSINESS ,
    engine='pyarrow',
    index=False     ,
)

del business


## PREPARING THE ENVIRONMENT

## BIPARTITE GRAPH

Generating the bipartite graph:

In [3]:
cnpj2id    = {}
partner2id = {}
adjdict    = defaultdict(list)

cnpj_id  = 0
cnpj_idx = 0
partner_id  = 0
partner_idx = 0

for idx, (cnpj, partner) in enumerate(partners.itertuples(index=False)):
    if cnpj not in cnpj2id:
        cnpj_idx  = cnpj_id
        cnpj_id  += 1

        cnpj2id[cnpj] = cnpj_idx
    else:
        cnpj_idx = cnpj2id[cnpj]

    if partner not in partner2id:
        partner_idx  = partner_id
        partner_id  += 1

        partner2id[partner] = partner_idx
    else:
        partner_idx = partner2id[partner]

    adjdict[cnpj_idx].append(partner_idx)

    if idx % 1000000 == 999999:
        print('Number of rows processed =', idx + 1)

Number of rows processed = 1000000
Number of rows processed = 2000000
Number of rows processed = 3000000
Number of rows processed = 4000000
Number of rows processed = 5000000
Number of rows processed = 6000000
Number of rows processed = 7000000
Number of rows processed = 8000000
Number of rows processed = 9000000
Number of rows processed = 10000000
Number of rows processed = 11000000
Number of rows processed = 12000000
Number of rows processed = 13000000
Number of rows processed = 14000000
Number of rows processed = 15000000
Number of rows processed = 16000000
Number of rows processed = 17000000
Number of rows processed = 18000000
Number of rows processed = 19000000
Number of rows processed = 20000000
Number of rows processed = 21000000
Number of rows processed = 22000000
Number of rows processed = 23000000
Number of rows processed = 24000000
Number of rows processed = 25000000


Optimizing the data structure of the adjacency list:

In [4]:
adjacency = [[]] * len(adjdict)

for k, v in adjdict.items():
    adjacency[k] = v

del adjdict

Saving the data:

In [5]:
with open('../data/cnpj2id.pkl'   , 'wb') as f:
    pickle.dump(cnpj2id   , f)

with open('../data/partner2id.pkl', 'wb') as f:
    pickle.dump(partner2id, f)

with open('../data/adjacency.pkl' , 'wb') as f:
    pickle.dump(adjacency , f)

Freeing the memory of objects:

In [6]:
del cnpj2id
del partner2id
del adjacency

## EXPLORING THE RESULTS

Checking the number of companies and partners:

In [7]:
print('Number of companies in the \'partners\' file =', partners.cnpj.nunique())
print('Number of partners  in the \'partners\' file =', partners.name_partner.nunique())

Number of companies in the 'partners' file = 14441622
Number of partners  in the 'partners' file = 14686766


Freeing up resources:

In [8]:
del partners

Loading the data:

In [9]:
with open('../data/cnpj2id.pkl'   , 'rb') as f:
    cnpj2id = pickle.load(f)

with open('../data/partner2id.pkl', 'rb') as f:
    partner2id = pickle.load(f)

with open('../data/adjacency.pkl' , 'rb') as f:
    adjacency = pickle.load(f)

Confirming the number of companies and partners:

In [10]:
print('Number of companies in the \'cnpj2id\'    file =', len(cnpj2id   ))
print('Number of partners  in the \'partner2id\' file =', len(partner2id))

Number of companies in the 'cnpj2id'    file = 14441622
Number of partners  in the 'partner2id' file = 14686766


Obtaining the partners of a given company:

In [11]:
CNPJ = 85061877


idx      = cnpj2id[CNPJ]
partners = set(adjacency[idx])

print(f'CNPJ members {CNPJ}:')
print()
for nome, idx in partner2id.items():
    if idx in partners:
        partners.remove(idx)

        print(nome)

    if not partners:
        break

CNPJ members 85061877:

RICARDO BAULE ROSSI
EDUARDO GUILHERME BAULE ROSSI
