In [1]:
import pandas as pd
import json
import time
import os

In [None]:
# Create the working directories used by the rest of the notebook
# (exist_ok=True makes re-running this cell harmless).
for directory in ('medicamentos', 'cnpjs', 'ggrem'):
    os.makedirs(directory, exist_ok=True)

In [60]:
# Load the company list; the download loop below reads its 'CNPJ' column.
# dtype=str makes pandas read every column as text
# (presumably so CNPJ values keep any leading zeros — confirm against the CSV).
df = pd.read_csv('cnpjs.csv', dtype=str)

## Preprocessing

In [None]:
!pip install grequests
import grequests
import sys
# Drop the cached "gevent.monkey" entry from sys.modules before importing grequests again.
# Raises KeyError if that module has not been imported yet.
# NOTE(review): presumably a workaround for gevent monkey-patching side effects inside the
# notebook kernel — confirm it is still needed.
del sys.modules["gevent.monkey"]
import grequests
import requests

In [4]:
def error_handler(request, exception):
    """Print the exception raised by a failed request.

    Used as the ``exception_handler`` callback of ``grequests.map``.

    Args:
        request: The request that failed (not used).
        exception: The exception that was raised for that request.

    Returns:
        None.
    """
    print(f"Exception: {exception}")
  
def renew_cookies(old_cookies, timeout=30):
    """Fetch the ANVISA consultation page and build a fresh ``Cookie`` header value.

    As a side effect, the page body is saved to ``main.html`` in the working directory.

    Args:
        old_cookies: Cookie string returned by the previous call ("" on the first call).
        timeout: Seconds to wait for the server before ``requests`` raises a timeout
            error instead of blocking indefinitely.

    Returns:
        A ";"-joined cookie string. When the response sets an ``FGTServer`` cookie, both
        cookies come from the response; otherwise the first cookie of ``old_cookies`` is
        kept and the first cookie of the response is appended. If the response carries no
        ``Set-Cookie`` header at all, ``old_cookies`` is returned unchanged.
    """
    main = requests.get("https://consultas.anvisa.gov.br/#/medicamentos/", timeout=timeout)
    with open("main.html", 'w', encoding="utf-8") as f:
        f.write(main.text)

    # Response headers are case-insensitive; .get() also avoids a KeyError when the
    # server sends no cookie on this response.
    set_cookie = main.headers.get("Set-Cookie")
    if not set_cookie:
        return old_cookies

    new_cookies = set_cookie.split(";")[:3]
    if "FGTServer" in set_cookie:
        # NOTE(review): assumes the second cookie follows a ", " separator inside the third
        # ";"-separated segment of the combined header — confirm against a live response.
        new_cookies[1] = new_cookies[2].split(", ")[1]
        return ";".join(new_cookies[:2])

    # Keep the first stored cookie, but do not emit a leading ";" when nothing is stored yet.
    previous_first = old_cookies.split(";")[0]
    parts = [previous_first, new_cookies[0]] if previous_first else [new_cookies[0]]
    return ";".join(parts)

# Start with no stored cookies; the download loops pass this value to renew_cookies()
# and overwrite it with the result before each request.
# Page fetched to obtain the cookies: https://consultas.anvisa.gov.br/#/medicamentos/
cookies = ""

In [66]:
# Download the product listing of every CNPJ in `df`, one JSON file per CNPJ under cnpjs/.
step = 1   # number of CNPJs requested concurrently per batch
total = len(df)
idx = 0    # row position to start from
for start in range(idx, total, step):
    # Clamp the upper bound so a final, shorter batch is still processed when
    # `step` does not divide `total` evenly.
    stop = min(start + step, total)
    batch_cnpjs = df['CNPJ'].iloc[start:stop]
    try:
        print(f"\rLoading {stop}/{total}", end='')
        # Renew header
        cookies = renew_cookies(cookies)
        header = {
            "Host": "consultas.anvisa.gov.br",
            "Referer": "https://consultas.anvisa.gov.br/",
            "Authorization": "Guest",
            "Cookie": cookies,
            "Connection": "keep-alive",
            "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
        }
        basic_requests = [
            grequests.get(
                f"https://consultas.anvisa.gov.br/api/consulta/medicamento/produtos/?count=1000&filter[cnpj]={cnpj}&page=1",
                headers=header)
            for cnpj in batch_cnpjs
        ]

        responses = grequests.map(basic_requests, exception_handler=error_handler)

        for cnpj, response in zip(batch_cnpjs, responses):
            # A request that raised is reported through error_handler, which returns None,
            # so it shows up here as None. Check before opening the file so a failure
            # never leaves an empty or non-JSON file behind for the parsing cell.
            if response is None or not response.ok:
                print(f"\nNo valid response for CNPJ {cnpj}")
                continue
            with open(f"cnpjs/{cnpj}.json", 'w') as f:
                f.write(response.text)
        # Pause between batches to avoid hammering the server.
        time.sleep(2)
    except Exception as e:
        # Best effort: report the failure and move on to the next batch.
        print("Outside error", e)

Loading 428/427

In [68]:
# Flatten every downloaded CNPJ listing (cnpjs/*.json) into one table of products
# and save it as medicamentos_auto.csv.
data = {
    'codigo_anvisa': [],
    'registro': [],
    'nome': [],
    'cnpj': [],
    'razao_social': []
}
# Sorted so the CSV row order is reproducible; os.listdir returns entries in arbitrary order.
for cnpjfile in sorted(os.listdir('cnpjs')):
    try:
        with open(f'cnpjs/{cnpjfile}', 'r') as f:
            results = json.load(f)
        # Extract every entry of the file before touching `data`, so a malformed entry
        # cannot leave the columns with different lengths.
        rows = [
            {
                'codigo_anvisa': result['produto']['codigo'],
                'registro': result['produto']['numeroRegistro'],
                'nome': result['produto']['nome'],
                'cnpj': result['empresa']['cnpj'],
                'razao_social': result['empresa']['razaoSocial'],
            }
            for result in results['content']
        ]
    except (OSError, ValueError, KeyError, TypeError) as e:
        # An unreadable or non-JSON file (e.g. a saved error page) is reported and skipped
        # instead of aborting the whole cell. json.JSONDecodeError is a ValueError.
        print(f"Error {cnpjfile}: {e}")
        continue
    for column in data:
        data[column].extend(row[column] for row in rows)
df = pd.DataFrame(data)
df.to_csv('medicamentos_auto.csv', index=False)

In [35]:
import random

# Download the detail record of every product listed in medicamentos_auto.csv into
# medicamentos/<registro>.json. Files that already hold a record with a "codigoProduto"
# key are skipped, so the cell can be re-run to resume or retry.
df = pd.read_csv('medicamentos_auto.csv')
for n, row in df.iterrows():
    target_path = f"medicamentos/{row['registro']}.json"
    if os.path.exists(target_path):
        with open(target_path, 'r') as f:
            try:
                content = json.load(f)
                if "codigoProduto" in content.keys():
                    continue
                else:
                    print()
                    print(f"\nRetry {row['registro']}")
            except Exception:
                # Unparseable file: report it and fall through to download it again.
                print()
                print(f"Error {row['registro']}")

    print(f"\rLoading {n + 1}/{len(df)}", end='')
    codigo = row['codigo_anvisa']
    cookies = renew_cookies(cookies)
    header = {
        "Host": "consultas.anvisa.gov.br",
        "Referer": "https://consultas.anvisa.gov.br/",
        "Authorization": "Guest",
        "Cookie": cookies,
        "Connection": "keep-alive",
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
    }
    advanced_requests = [
        grequests.get(f"https://consultas.anvisa.gov.br/api/consulta/medicamento/produtos/codigo/{codigo}",
                      headers=header)
    ]
    responses = grequests.map(advanced_requests, exception_handler=error_handler)
    for response in responses:
        # A request that raised is reported through error_handler, which returns None.
        # Check before opening the file: opening with 'w' would otherwise truncate an
        # existing file and then crash on `response.text`.
        if response is None:
            print(f"\nNo response for {row['registro']}")
            continue
        with open(target_path, 'w') as f:
            f.write(response.text)
    # Random pause between requests to avoid hammering the server.
    time.sleep(random.randint(2, 4))

Retry 103700523
Loading 6959/27354Retry 118610226
Loading 13559/27354Retry 100490101
Loading 18484/27354Retry 109740177
Loading 26865/27354

In [2]:
# Parse every medicamentos/*.json file into one row per presentation ("apresentacao").
# Columns: principio_ativo,nome,cnpj,nome_cnpj,categoria,data,vencimento,registro,situacao,forma,apresentacao
df_dados = {
    "registro": [],
    "nome": [],
    "cnpj": [],
    "razao_social": [],
    "categoria": [],
    "principio_ativo": [],
    "situacao": [],
    "data": [],
    "vencimento": [],
    "forma": [],
    "apresentacao": []
}
# List the directory once (sorted for a reproducible row order) instead of on every iteration.
filenames = sorted(os.listdir("medicamentos"))
# enumerate drives the progress counter; it was previously never incremented.
for idx, filename in enumerate(filenames, start=1):
    print(f"\rLoading {idx}/{len(filenames)}", end='')
    try:
        with open(f"medicamentos/{filename}", 'r') as f:
            dados = json.load(f)
        nome = dados['nomeComercial']
        cnpj = dados['empresa']['cnpj']
        razao_social = dados['empresa']['razaoSocial']
        categoria = dados['categoriaRegulatoria']
        principio_ativo = dados['principioAtivo']

        vencimento = "N/A"
        if dados['dataVencimentoRegistro'] is not None:
            # Keep only the date part of the "<date>T<time>" timestamp.
            vencimento = dados['dataVencimentoRegistro'].split("T")[0]

        for apresentacao in dados['apresentacoes']:
            registro = apresentacao['registro']

            # Reset for every presentation so a value from the previous one can never
            # leak through when none of the flags is set.
            situacao = "N/A"
            if apresentacao['ativa']:
                situacao = "Válido"
            elif apresentacao['inativa']:
                situacao = "Caduco/Cancelado"
            # "Em análise" takes precedence over the other two flags.
            if apresentacao['emAnalise']:
                situacao = "Em análise"

            data = "N/A"
            if apresentacao['dataPublicacao'] is not None:
                data = apresentacao['dataPublicacao'].split("T")[0]

            forma = ",".join(apresentacao['formasFarmaceuticas'])

            # Build the complete row first, then append: a missing key raises before any
            # column is touched, so all columns always keep the same length.
            linha = {
                "registro": registro,
                "nome": nome,
                "cnpj": cnpj,
                "razao_social": razao_social,
                "categoria": categoria,
                "principio_ativo": principio_ativo,
                "situacao": situacao,
                "data": data,
                "vencimento": vencimento,
                "forma": forma,
                "apresentacao": apresentacao['apresentacao'],
            }
            for coluna, valor in linha.items():
                df_dados[coluna].append(valor)
    except Exception:
        # Best effort: a file that cannot be read or lacks an expected key is reported
        # and skipped; rows already extracted from it are kept.
        print(f"Error {filename}")

Error 103700523.json
Error 100490101.json
Error 118610226.json
Error 109740177.json


In [3]:
# Build the full table from the columns collected in df_dados.
df = pd.DataFrame(df_dados)
# Missing values are written as 'N/A' in the CSV only; `df` itself is not modified here.
# NOTE(review): unlike the other to_csv calls in this notebook, this one does not pass
# index=False, so the row index is written as an extra first column — confirm this is intended.
df.fillna('N/A').to_csv("medicamentos_completo.csv")

Adicionar informação do GGREM

In [4]:
# Load the yearly tables stored under ggrem/ (all columns read as text).
# The 2013-2015 tables are kept apart in `years_pre`: the matching cell looks them up
# by 'CÓDIGO GGREM' instead of by 'REGISTRO'.
years = []
years_pre = []
# Sorted so the tables are always loaded in the same order (os.listdir order is arbitrary
# and the matching cell takes the GGREM code from the first table that lists a registro).
for year in sorted(os.listdir('ggrem')):
    # Skip anything that is not a CSV file (e.g. checkpoint folders); read_csv would fail on it.
    if not year.lower().endswith('.csv'):
        continue
    tabela = pd.read_csv(f'ggrem/{year}', dtype=object)
    if year in ['cmed_2013.csv', 'cmed_2014.csv', 'cmed_2015.csv']:
        years_pre.append(tabela)
    else:
        years.append(tabela)
        

In [5]:
# For every row of `df`, record its GGREM code (or 'N/A') and one 0/1 presence flag per year.
#   - Tables in `years` are matched by 'REGISTRO'.
#   - Tables in `years_pre` are matched by the 'CÓDIGO GGREM' found for that registro.
ggrem = []
anos = {str(ano): [] for ano in range(2013, 2024)}  # '2013' ... '2023'


def _year_label(tabela):
    """Return the year a table belongs to, taken from its first 'ANO' value."""
    return tabela['ANO'].iloc[0]


# Build the lookups once instead of scanning every yearly table for every row of `df`.
# Missing values are dropped so that a NaN can never count as a match.
registros_by_year = [(_year_label(tabela), set(tabela['REGISTRO'].dropna())) for tabela in years]
codes_by_pre_year = [(_year_label(tabela), set(tabela['CÓDIGO GGREM'].dropna())) for tabela in years_pre]

# registro -> 'CÓDIGO GGREM' of its first row in the first table (in `years` order) listing it.
first_ggrem = {}
for tabela in years:
    known = tabela.dropna(subset=['REGISTRO'])
    for reg, code in zip(known['REGISTRO'], known['CÓDIGO GGREM']):
        first_ggrem.setdefault(reg, code)

for n, registro in df['registro'].items():
    print(f"\r Row {n} Locating registro", registro, end='')
    for label, registros in registros_by_year:
        anos[label].append(1 if registro in registros else 0)

    found = registro in first_ggrem
    matching_ggrem = first_ggrem[registro] if found else ''
    ggrem.append(matching_ggrem if found else 'N/A')

    # Without a GGREM code there is nothing to look up in the 2013-2015 tables: flag 0.
    for label, codes in codes_by_pre_year:
        anos[label].append(1 if found and matching_ggrem in codes else 0)

 Row 184451 Locating registro 1514300330127

In [6]:
# Attach the GGREM code and the per-year presence flags (2013-2023) to the table,
# replace missing values with 'N/A' and save the result.
df['ggrem'] = ggrem
for ano in map(str, range(2013, 2024)):
    df[ano] = anos[ano]
df.fillna('N/A', inplace=True)
df.to_csv('tabela_completa_ggrem.csv', index=False)

Avaliar quantidade de medicamentos com GGREM

In [None]:
# Report how many rows have a GGREM code and split the table into found / not found.
# read_csv parses the literal "N/A" as a missing value by default, so it is filled back in.
df = pd.read_csv('tabela_completa_ggrem.csv', dtype=object)
df.fillna('N/A', inplace=True)

sem_ggrem = df['ggrem'] == 'N/A'   # computed once and reused below
n_sem = int(sem_ggrem.sum())
n_com = len(df) - n_sem

print("Not found GGREM: ", n_sem)
print("Found GGREM: ", n_com)
# Guard against an empty table, which would otherwise raise ZeroDivisionError.
percentual = n_com / len(df) * 100 if len(df) else 0.0
print(f"Total {percentual:.2f}% found.")

df[sem_ggrem].to_csv('ggrem_not_found.csv', index=False)
df[~sem_ggrem].to_csv('ggrem_found.csv', index=False)