## Carregar bibliotecas e banco de dados

In [5]:
import pandas as pd
import os
import numpy as np

In [6]:
# Resolve the dataset location relative to the current working directory
# (one level up) and echo the absolute path so the run is traceable.
filename = "data.jsonl.gz"
path = os.path.abspath(os.path.join(os.pardir, filename))
print("path: ", path)

# lines=True: the file is parsed as JSON Lines, one record per line.
df = pd.read_json(path, lines=True)

path:  c:\Users\dimit\projects\artigo-rp2\data.jsonl.gz


## Realizar limpeza dos dados

In [7]:
# Columns that must be non-null for a record to be kept.
sets_used = ['jurisdiction', 'kind',
             'date_published', 'docdb_id',
             'biblio', 'families',
             'legal_status', 'abstract']

# Drop rows that are missing any of the required columns.
filtered_df = df.dropna(subset=sets_used)

# Stringify `biblio` once; the result is reused for the preview and for the filter.
biblio_as_text = filtered_df['biblio'].astype(str)
display(biblio_as_text)

# Keep Brazilian publications: 'BR' must occur both in the stringified
# `biblio` and in `jurisdiction` (same two conditions as before, combined
# into a single mask).
is_brazilian = (
    biblio_as_text.str.contains('BR')
    & filtered_df['jurisdiction'].astype(str).str.contains('BR')
)

# .copy() gives an independent frame: later cells add columns to
# `filtered_df`, which on a boolean-indexed slice triggers pandas'
# SettingWithCopyWarning.
filtered_df = filtered_df[is_brazilian].copy()


3        {'publication_reference': {'jurisdiction': 'BR...
4        {'publication_reference': {'jurisdiction': 'BR...
5        {'publication_reference': {'jurisdiction': 'BR...
6        {'publication_reference': {'jurisdiction': 'BR...
7        {'publication_reference': {'jurisdiction': 'BR...
                               ...                        
49994    {'publication_reference': {'jurisdiction': 'BR...
49995    {'publication_reference': {'jurisdiction': 'BR...
49997    {'publication_reference': {'jurisdiction': 'BR...
49998    {'publication_reference': {'jurisdiction': 'BR...
49999    {'publication_reference': {'jurisdiction': 'BR...
Name: biblio, Length: 38630, dtype: object

## Extração de novos campos

### Texto do abstract

In [8]:

def _first_abstract_text(abstract):
    """Return the 'text' value of the first entry of an abstract list.

    Returns None unless `abstract` is a non-empty list whose first element
    is a dict containing the key 'text'.
    """
    if not isinstance(abstract, list) or len(abstract) == 0:
        return None
    head = abstract[0]
    if isinstance(head, dict) and 'text' in head:
        return head['text']
    return None


# Pull the 'text' field out of the first dict stored in each row of 'abstract'.
filtered_df['abstract_text'] = filtered_df['abstract'].apply(_first_abstract_text)

### Classificações IPCR e CPC

In [9]:
# extrai triplas (symbol, value, position) de classificações IPCR e CPC
def _triples_from_classifications_block(block):
    if not isinstance(block, dict):
        return []
    cls = block.get('classifications') if 'classifications' in block else None
    if not isinstance(cls, list):
        return []
    triples = []
    for c in cls:
        if not isinstance(c, dict):
            continue
        triples.append((c.get('symbol'), c.get('classification_value'), c.get('classification_symbol_position')))
    return triples

def extract_triples_from_biblio(biblio):
    """Return ``(ipcr_triples, cpc_triples)`` for one `biblio` record.

    Each element is the list produced by
    `_triples_from_classifications_block` for the value stored under
    'classifications_ipcr' and 'classifications_cpc' respectively. A missing
    or empty block yields an empty list, and a non-dict `biblio` yields
    ``([], [])``.
    """
    if not isinstance(biblio, dict):
        return [], []
    # The helper returns [] for None as well as for {}, so a plain .get() suffices.
    return tuple(
        _triples_from_classifications_block(biblio.get(block_key))
        for block_key in ('classifications_ipcr', 'classifications_cpc')
    )

# Build both triple columns in one pass. Feeding a plain list of
# (ipcr, cpc) tuples to a single DataFrame constructor avoids allocating one
# pd.Series per row, which is what `apply(lambda b: pd.Series(...))` does.
filtered_df[['ipcr_triples', 'cpc_triples']] = pd.DataFrame(
    [extract_triples_from_biblio(b) for b in filtered_df['biblio']],
    index=filtered_df.index,
    columns=['ipcr_triples', 'cpc_triples'],
)

# Concatenate the IPCR and CPC lists row by row without a row-wise
# DataFrame.apply(axis=1). `or []` keeps the original guard against
# falsy cells.
filtered_df['all_triples'] = pd.Series(
    [
        (ipcr or []) + (cpc or [])
        for ipcr, cpc in zip(filtered_df['ipcr_triples'], filtered_df['cpc_triples'])
    ],
    index=filtered_df.index,
    dtype=object,
)

### Inventores e título da invenção

In [10]:
def extract_inventors_from_biblio(biblio):
    """Collect inventor records and inventor names from one `biblio` dict.

    Reads ``biblio['parties']['inventors']``. For every dict entry the name
    is resolved in this order: ``extracted_name['value']``, then ``name``,
    then ``extracted`` (only when it is a string).

    Returns
    -------
    tuple[list[dict], list[str]]
        ``(detailed, names)`` where `detailed` holds one
        ``{'residence': ..., 'name': ...}`` dict per inventor entry (name may
        be None) and `names` holds only the truthy names, in order.
        ``([], [])`` is returned when the expected structure is absent.
    """
    if not isinstance(biblio, dict):
        return [], []

    parties = biblio.get('parties', {})
    inventors = parties.get('inventors') if isinstance(parties, dict) else None
    if not isinstance(inventors, list):
        return [], []

    records, found_names = [], []
    for entry in inventors:
        if not isinstance(entry, dict):
            continue

        # Preferred source: the nested extracted_name dict.
        extracted_name = entry.get('extracted_name')
        resolved = extracted_name.get('value') if isinstance(extracted_name, dict) else None

        # Fallbacks: plain 'name', then a string-valued 'extracted'.
        if not resolved:
            raw_extracted = entry.get('extracted')
            resolved = entry.get('name') or (raw_extracted if isinstance(raw_extracted, str) else None)

        records.append({'residence': entry.get('residence'), 'name': resolved})
        if resolved:
            found_names.append(resolved)

    return records, found_names

# One DataFrame built from a list of (detailed, names) tuples replaces the
# per-row pd.Series construction of `apply(lambda b: pd.Series(...))`.
filtered_df[['inventors_detailed', 'inventor_names']] = pd.DataFrame(
    [extract_inventors_from_biblio(b) for b in filtered_df['biblio']],
    index=filtered_df.index,
    columns=['inventors_detailed', 'inventor_names'],
)

# --- extract invention title ---
def extract_invention_title(biblio):
    """Return the text of the first 'invention_title' entry of `biblio`.

    The first list element may be a dict (its 'text' is used, falling back to
    'value') or a plain string (returned as is). None is returned when
    `biblio` is not a dict, the title list is missing or empty, or the first
    element has another type.
    """
    titles = biblio.get('invention_title') if isinstance(biblio, dict) else None
    if not isinstance(titles, list) or not titles:
        return None

    head = titles[0]
    if isinstance(head, str):
        return head
    if isinstance(head, dict):
        return head.get('text') or head.get('value')
    return None

# Element-wise title extraction over the 'biblio' column.
filtered_df['invention_title_text'] = filtered_df['biblio'].map(extract_invention_title)

### Patentes citantes (cited by)

In [11]:
# Extract patent_count and lens_ids from filtered_df['biblio'].
def extract_patents_info(biblio):
    """Return ``(count, lens_ids)`` describing the 'cited_by' section of `biblio`.

    Accepted shapes of ``biblio['cited_by']``:

    * dict with 'patents' as a list, optionally with 'patent_count';
    * dict with 'patents' as a dict (its values are used as the patent list);
    * dict with only 'patent_count';
    * a bare list of patent records.

    Parameters
    ----------
    biblio : Any
        One record of the 'biblio' column; anything that is not a dict
        yields ``(0, [])``.

    Returns
    -------
    tuple[int, list]
        `count` is 'patent_count' when present and not None, otherwise the
        number of patent records found. `lens_ids` lists, in order, the id
        found for each dict record under 'lens_id', ``document_id['lens_id']``,
        'lens id' or 'lens'; records without any of these are skipped.
    """
    if not isinstance(biblio, dict):
        return 0, []

    cited_by = biblio.get('cited_by') or {}
    patents = []
    count = 0

    if isinstance(cited_by, dict):
        # Expected layout: {'patents': [...], 'patent_count': N}
        raw_patents = cited_by.get('patents')
        if isinstance(raw_patents, list):
            patents = raw_patents
        elif isinstance(raw_patents, dict):
            patents = list(raw_patents.values())
        # A null 'patent_count' is treated like a missing one, so callers
        # always receive a number they can compare.
        declared = cited_by.get('patent_count')
        count = len(patents) if declared is None else declared
    elif isinstance(cited_by, list):
        # 'cited_by' is itself the list of patent records.
        patents = cited_by
        count = len(patents)

    lens_ids = []
    for patent in patents:
        if not isinstance(patent, dict):
            continue
        # lens_id usually sits at the root of the patent record.
        lens_id = patent.get('lens_id')
        if not lens_id:
            # Only look inside document_id when it is actually a dict.
            document_id = patent.get('document_id')
            if isinstance(document_id, dict):
                lens_id = document_id.get('lens_id')
        if not lens_id:
            # Basic handling of alternative spellings of the key.
            lens_id = patent.get('lens id') or patent.get('lens')
        if lens_id:
            lens_ids.append(lens_id)
    return count, lens_ids

# Apply to the DataFrame. A single DataFrame built from the list of
# (count, lens_ids) tuples avoids creating one pd.Series per row.
filtered_df[['patent_count', 'patent_lens_ids']] = pd.DataFrame(
    [extract_patents_info(b) for b in filtered_df['biblio']],
    index=filtered_df.index,
    columns=['patent_count', 'patent_lens_ids'],
)

# Discard the index inherited from the unfiltered frame.
filtered_df = filtered_df.reset_index(drop=True)

# Quick look at records cited by more than 6 patents.
filtered_df.loc[filtered_df['patent_count'] > 6, ['docdb_id', 'patent_count', 'patent_lens_ids']].head()

Unnamed: 0,docdb_id,patent_count,patent_lens_ids
2248,410568163,7,"[009-112-160-556-731, 126-749-947-864-359, 022..."
10065,279427515,10,"[088-469-350-389-118, 066-068-825-399-403, 083..."
10067,451354979,8,"[030-613-929-122-614, 054-771-746-098-337, 108..."
10082,522818179,26,"[064-856-910-155-483, 116-462-036-170-646, 064..."
14389,449883948,15,"[043-472-868-631-815, 090-067-059-203-736, 061..."


### Informações de status e expiração

In [12]:
import ast

# Extract patent_status and application_expiry_date from the 'legal_status' column.
def extract_legal_status_info(legal_status):
    """Return ``(patent_status, application_expiry_date)`` from a legal-status value.

    `legal_status` may be a dict or the string form of a Python literal dict
    (parsed with `ast.literal_eval`). Any other input, a string that cannot
    be parsed, or a parsed value that is not a dict yields ``(None, None)``.
    Missing keys yield None for the corresponding element.
    """
    parsed = legal_status
    if isinstance(parsed, str):
        try:
            parsed = ast.literal_eval(parsed)
        except Exception:
            # The string is not a valid Python literal.
            return None, None

    if isinstance(parsed, dict):
        return parsed.get('patent_status'), parsed.get('application_expiry_date')
    return None, None

# Apply and create the new columns. A single DataFrame built from the list
# of (status, expiry) tuples avoids creating one pd.Series per row.
filtered_df[['patent_status', 'application_expiry_date']] = pd.DataFrame(
    [extract_legal_status_info(x) for x in filtered_df['legal_status']],
    index=filtered_df.index,
    columns=['patent_status', 'application_expiry_date'],
)

# Quick check
filtered_df[['docdb_id', 'patent_status', 'application_expiry_date']].head()

Unnamed: 0,docdb_id,patent_status,application_expiry_date
0,558602417,PENDING,
1,383346957,DISCONTINUED,2013-09-03
2,383323608,DISCONTINUED,2011-07-26
3,569902860,ACTIVE,
4,586397134,DISCONTINUED,2024-09-10


## Análise Inicial

In [13]:
# List every column now present in filtered_df (original fields plus the extracted ones).
display(filtered_df.columns)

Index(['lens_id', 'jurisdiction', 'doc_number', 'kind', 'date_published',
       'doc_key', 'docdb_id', 'lang', 'biblio', 'families', 'legal_status',
       'publication_type', 'abstract', 'abstract_text', 'ipcr_triples',
       'cpc_triples', 'all_triples', 'inventors_detailed', 'inventor_names',
       'invention_title_text', 'patent_count', 'patent_lens_ids',
       'patent_status', 'application_expiry_date'],
      dtype='object')