In [9]:
import pandas as pd
import os
import numpy as np

In [5]:
# File holding the dataset; the relative path below places it one directory
# above the current working directory.
filename = "data.jsonl.gz"
path = os.path.abspath(f"../{filename}")
# Echo the resolved absolute path so the reader can confirm which file is loaded.
print("path: ", path)

# lines=True: the file is parsed as one JSON object per line.
df = pd.read_json(path, lines=True)

path:  c:\Users\dimit\projects\artigo-rp2\data.jsonl.gz


In [None]:
sets_used = ['jurisdiction', 'kind',
             'date_published', 'docdb_id',
             'biblio', 'families',
             'legal_status', 'abstract']


def _first_abstract_text(abstract):
    """Return the 'text' value of the first abstract entry.

    Returns None when `abstract` is not a non-empty list, when its first
    element is not a dict, or when that dict has no 'text' key.
    """
    if isinstance(abstract, list) and abstract and isinstance(abstract[0], dict):
        return abstract[0].get('text')
    return None


# Drop rows that are missing any of the columns used by the analysis.
filtered_df = df.dropna(subset=sets_used)

# Keep Brazilian publications: both the jurisdiction and the stringified biblio
# must contain 'BR'. The jurisdiction test runs first because it is cheap and
# shrinks the frame before the much more expensive stringification of `biblio`;
# the surviving rows are the same in either order.
# NOTE(review): both tests are substring matches, so a 'BR' anywhere inside
# `biblio` (not only in a jurisdiction field) satisfies the second one - confirm
# that this is the intended definition of a Brazilian publication.
filtered_df = filtered_df[filtered_df['jurisdiction'].astype(str).str.contains('BR')]
# .copy() makes filtered_df an independent frame, so adding columns here and in
# later cells does not trigger pandas' SettingWithCopyWarning.
filtered_df = filtered_df[filtered_df['biblio'].astype(str).str.contains('BR')].copy()

# Pull the 'text' field out of the first dict of each row's 'abstract' list.
filtered_df['abstract_text'] = filtered_df['abstract'].map(_first_abstract_text)

Unnamed: 0,inventors_detailed,inventor_names,ipcr_triples,cpc_triples
3,"[{'residence': 'DE', 'name': 'TORRAS-PIQUÉ JOR...",[TORRAS-PIQUÉ JORGE],"[(B01D17/00, I, F), (B01D21/24, I, L), (C02F1/...","[(B01D21/2427, I, F), (B01D21/2411, I, L), (B0..."
4,"[{'residence': None, 'name': 'KAWATANI JORGE S...",[KAWATANI JORGE SABURO],"[(E02B9/00, I, F)]","[(Y02E10/20, A, L)]"
5,"[{'residence': None, 'name': 'SILVA CARLOS AUG...","[SILVA CARLOS AUGUSTO VERAS DA, SILVA ALVARO V...","[(E04H17/06, I, None)]",[]
6,"[{'residence': 'BR', 'name': 'FLÁVIO APARECIDO...",[FLÁVIO APARECIDO PERES],"[(E06B11/02, I, F), (E05F15/60, I, L), (F16H57...",[]
7,"[{'residence': 'PL', 'name': 'NOWAK ZYGMUNT'}]",[NOWAK ZYGMUNT],"[(F03B13/00, I, F), (E02B9/00, I, L), (F03B15/...","[(E02B9/00, A, L), (F05B2240/40, A, L), (F03B1..."


In [None]:
# Extract (symbol, value, position) triples from IPCR and CPC classifications.
def _triples_from_classifications_block(block):
    """Turn one classifications block into a list of 3-tuples.

    Args:
        block: Expected to be a dict holding a list under 'classifications'.

    Returns:
        One ``(symbol, classification_value, classification_symbol_position)``
        tuple per dict entry of that list, with None for any missing key.
        An empty list when `block` is not a dict or 'classifications' is not
        a list; entries that are not dicts are skipped.
    """
    entries = block.get('classifications') if isinstance(block, dict) else None
    if not isinstance(entries, list):
        return []
    return [
        (
            entry.get('symbol'),
            entry.get('classification_value'),
            entry.get('classification_symbol_position'),
        )
        for entry in entries
        if isinstance(entry, dict)
    ]

def extract_triples_from_biblio(biblio):
    """Return ``(ipcr_triples, cpc_triples)`` for one `biblio` record.

    Each element is the list produced by `_triples_from_classifications_block`
    for the 'classifications_ipcr' and 'classifications_cpc' entries, in that
    order. A `biblio` that is not a dict yields two empty lists; a missing or
    empty entry yields an empty list for that position.
    """
    if not isinstance(biblio, dict):
        return [], []
    return tuple(
        _triples_from_classifications_block(biblio.get(key))
        for key in ('classifications_ipcr', 'classifications_cpc')
    )

# Run the extractor once per row. Series.map keeps each (ipcr, cpc) result as a
# plain tuple, which avoids building a pd.Series object for every row.
triple_pairs = filtered_df['biblio'].map(extract_triples_from_biblio)
filtered_df['ipcr_triples'] = triple_pairs.map(lambda pair: pair[0])
filtered_df['cpc_triples'] = triple_pairs.map(lambda pair: pair[1])

# Per row: IPCR triples followed by CPC triples, as a single new list.
filtered_df['all_triples'] = triple_pairs.map(lambda pair: pair[0] + pair[1])

In [None]:
def extract_inventors_from_biblio(biblio):
    """Collect the inventors listed under ``biblio['parties']['inventors']``.

    Args:
        biblio: Bibliographic dict of one record.

    Returns:
        A pair ``(detailed, names)``. ``detailed`` has one
        ``{'residence': ..., 'name': ...}`` dict per inventor entry that is a
        dict; ``names`` has only the truthy names, in the same order. Both are
        empty when `biblio` or 'parties' is not a dict, or 'inventors' is not
        a list.
    """
    # NOTE(review): a later cell of this notebook defines this function again
    # with the same logic; keeping a single definition would avoid shadowing.
    parties = biblio.get('parties') if isinstance(biblio, dict) else None
    inventors = parties.get('inventors') if isinstance(parties, dict) else None
    if not isinstance(inventors, list):
        return [], []

    detailed, names = [], []
    for entry in (item for item in inventors if isinstance(item, dict)):
        # Preferred source of the name: entry['extracted_name']['value'].
        extracted_name = entry.get('extracted_name')
        name = extracted_name.get('value') if isinstance(extracted_name, dict) else None
        if not name:
            # Fallbacks: a plain 'name', then a string stored under 'extracted'.
            raw = entry.get('extracted')
            name = entry.get('name') or (raw if isinstance(raw, str) else None)
        detailed.append({'residence': entry.get('residence'), 'name': name})
        if name:
            names.append(name)
    return detailed, names

# Run the extractor once per row and split each (detailed, names) tuple into two
# columns. Series.map keeps the tuples as-is, which avoids building a pd.Series
# object for every row.
inventor_pairs = filtered_df['biblio'].map(extract_inventors_from_biblio)
filtered_df['inventors_detailed'] = inventor_pairs.map(lambda pair: pair[0])
filtered_df['inventor_names'] = inventor_pairs.map(lambda pair: pair[1])


In [None]:
def extract_inventors_from_biblio(biblio):
    """Collect the inventors listed under ``biblio['parties']['inventors']``.

    Args:
        biblio: Bibliographic dict of one record.

    Returns:
        A pair ``(detailed, names)``. ``detailed`` has one
        ``{'residence': ..., 'name': ...}`` dict per inventor entry that is a
        dict; ``names`` has only the truthy names, in the same order. Both are
        empty when `biblio` or 'parties' is not a dict, or 'inventors' is not
        a list.
    """
    # NOTE(review): an earlier cell of this notebook already defines a function
    # with this name and the same logic; this definition shadows it.
    parties = biblio.get('parties') if isinstance(biblio, dict) else None
    inventors = parties.get('inventors') if isinstance(parties, dict) else None
    if not isinstance(inventors, list):
        return [], []

    detailed, names = [], []
    for entry in (item for item in inventors if isinstance(item, dict)):
        # Preferred source of the name: entry['extracted_name']['value'].
        extracted_name = entry.get('extracted_name')
        name = extracted_name.get('value') if isinstance(extracted_name, dict) else None
        if not name:
            # Fallbacks: a plain 'name', then a string stored under 'extracted'.
            raw = entry.get('extracted')
            name = entry.get('name') or (raw if isinstance(raw, str) else None)
        detailed.append({'residence': entry.get('residence'), 'name': name})
        if name:
            names.append(name)
    return detailed, names

# Run the extractor once per row and split each (detailed, names) tuple into two
# columns. Series.map keeps the tuples as-is, which avoids building a pd.Series
# object for every row.
# NOTE(review): an earlier cell performs this same assignment.
inventor_pairs = filtered_df['biblio'].map(extract_inventors_from_biblio)
filtered_df['inventors_detailed'] = inventor_pairs.map(lambda pair: pair[0])
filtered_df['inventor_names'] = inventor_pairs.map(lambda pair: pair[1])

# --- new: extract invention title (owner extraction is not implemented in this cell) ---
def extract_invention_title(biblio):
    """Return the title held by the first entry of ``biblio['invention_title']``.

    A dict entry yields its 'text' value, falling back to 'value' when 'text'
    is missing or falsy; a string entry is returned unchanged. Returns None
    when `biblio` is not a dict, when 'invention_title' is not a non-empty
    list, or when the first entry is neither a dict nor a string.
    """
    titles = biblio.get('invention_title') if isinstance(biblio, dict) else None
    if not isinstance(titles, list) or not titles:
        return None
    head = titles[0]
    if isinstance(head, str):
        return head
    if isinstance(head, dict):
        return head.get('text') or head.get('value')
    return None


# Store the title extracted from each row's 'biblio' in its own column.
filtered_df['invention_title_text'] = filtered_df['biblio'].map(extract_invention_title)

# Quick check: preview the first rows of the new column.
filtered_df[['invention_title_text']].head()

Unnamed: 0,invention_title_text
3,Dispositivo de sedimentação
4,canal com turbinas acoplados nos geradores par...
5,"Mola tensionadora de fio de arame farpado, de ..."
6,Sistema contentor de lubrificantes para automa...
7,MÉTODO DE GERAÇÃO DE ENERGIA ELÉTRICA E SISTEM...


In [65]:
# Display the raw 'biblio' value of the first row of filtered_df.
filtered_df['biblio'].iloc[0]

{'publication_reference': {'jurisdiction': 'BR',
  'doc_number': '112020025554',
  'kind': 'A2',
  'date': '2021-09-28'},
 'application_reference': {'jurisdiction': 'BR',
  'doc_number': '112020025554',
  'kind': 'A',
  'date': '2020-02-05'},
 'priority_claims': {'claims': [{'jurisdiction': 'EP',
    'doc_number': '2020052846',
    'kind': 'W',
    'date': '2020-02-05',
    'sequence': 1},
   {'jurisdiction': 'DE',
    'doc_number': '102019203116',
    'kind': 'A',
    'date': '2019-03-07',
    'sequence': 2}],
  'earliest_claim': {'date': '2019-03-07'}},
 'invention_title': [{'text': 'Dispositivo de sedimentação', 'lang': 'pt'}],
 'parties': {'applicants': [{'residence': 'DE',
    'extracted_name': {'value': '3P TECHNIK FILTERSYSTEME GMBH'}},
   {'residence': 'DE', 'extracted_name': {'value': 'H2O RES GMBH'}}],
  'inventors': [{'residence': 'DE',
    'sequence': 1,
    'extracted_name': {'value': 'TORRAS-PIQUÉ JORGE'}}],
  'owners_all': [{'recorded_date': '2025-02-18',
    'execution_