In [1]:
import os
import requests
import pandas as pd
from dotenv import load_dotenv

In [5]:
# Load environment variables from the client app's local env file.
# NOTE(review): presumably this file provides VITE_API_KEY, which is read with os.getenv below — confirm.
load_dotenv("../client/.env.local")

True

In [8]:
# Search endpoint of the scanR patents index.
scanr_url = "https://cluster-production.elasticsearch.dataesr.ovh/scanr-patents/_search"

# HTTP Basic credentials; the token itself comes from the VITE_API_KEY environment variable.
scanr_key = "Basic {}".format(os.getenv("VITE_API_KEY"))
scanr_headers = dict(Authorization=scanr_key)

# Fields requested in the `_source` of every hit.
source_fields = [
    "id",
    "inpadocFamily",
    "applicants",
    "inventors",
    "title",
    "cpc",
    "year",
]


def scanr_patents(size=10, query=None, timeout=30):
    """Run a query-string search against the scanR patents index.

    Args:
        size: Maximum number of hits requested from Elasticsearch.
        query: Query-string expression. A falsy value (``None``, ``""``)
            is replaced by ``"*"``.
        timeout: Seconds to wait for the server before ``requests`` gives up.
            Without a timeout a stalled connection would block the kernel
            indefinitely.

    Returns:
        The decoded JSON response (a dict) when the server answers with
        HTTP 200; otherwise ``None``, after printing the status code, reason
        and response text.

    Raises:
        requests.RequestException: Connection errors and timeouts raised by
            ``requests.post`` are not caught here.
    """
    body = {
        "size": size,
        "_source": source_fields,
        "query": {
            "bool": {
                "must": [
                    {
                        "query_string": {"query": query or "*"},
                    },
                ],
            },
        },
    }

    res = requests.post(scanr_url, json=body, headers=scanr_headers, timeout=timeout)
    if res.status_code == 200:
        return res.json()

    # Any other status: report what the server said and signal failure with None.
    print(f"error {res.status_code}: {res.reason}")
    print(res.text)
    return None

In [15]:
# Fetch patents matching "biodiversity" (default size of 10 hits) and display the raw response.
# `res` is None when the request did not return HTTP 200.
res = scanr_patents(query="biodiversity")
res

{'took': 4634,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 7, 'relation': 'eq'},
  'max_score': 14.744541,
  'hits': [{'_index': 'scanr-patents-20240530',
    '_id': 'cHimyI8BZJJbNSwJ5oC0',
    '_score': 14.744541,
    '_ignored': ['summary.en.keyword'],
    '_source': {'id': '44243408',
     'inpadocFamily': '332873297',
     'applicants': [{'name': 'Beynet Jean Marc',
       'type': 'person',
       'country': 'FR'},
      {'name': 'Brl Ingenierie',
       'type': 'organisation',
       'country': 'FR',
       'ids': [{'id': '391484862', 'type': 'siren'}]},
      {'name': 'Fraysse Nicolas', 'type': 'person', 'country': 'FR'},
      {'name': 'De Monbrison David', 'type': 'person', 'country': 'FR'},
      {'name': "Le Groupement D'Interet Scientifique Gis Posidonie",
       'type': 'organisation',
       'country': 'FR',
       'ids': [{'id': '338863699', 'type': 'siren'}]}],
     'inventors': [],
     'title':

In [16]:
# Keep only the `_source` document of each hit in the search response.
patents = [
    hit.get("_source")
    for hit in res.get("hits").get("hits")
]
patents

[{'id': '44243408',
  'inpadocFamily': '332873297',
  'applicants': [{'name': 'Beynet Jean Marc',
    'type': 'person',
    'country': 'FR'},
   {'name': 'Brl Ingenierie',
    'type': 'organisation',
    'country': 'FR',
    'ids': [{'id': '391484862', 'type': 'siren'}]},
   {'name': 'Fraysse Nicolas', 'type': 'person', 'country': 'FR'},
   {'name': 'De Monbrison David', 'type': 'person', 'country': 'FR'},
   {'name': "Le Groupement D'Interet Scientifique Gis Posidonie",
    'type': 'organisation',
    'country': 'FR',
    'ids': [{'id': '338863699', 'type': 'siren'}]}],
  'inventors': [],
  'title': {'fr': '',
   'en': 'Artificial, modular, evolutionary, habitual device for use in aquatic environment to provide e.g. shelter for flora, has elementary module arranged in cells of support, and including reliefs favorizing development of biodiversity',
   'default': ''},
  'cpc': {'groupe': [{'code': 'A01K61/70',
     'label': 'Culture of aquatic animals - Artificial fishing banks or reefs

In [48]:
from itertools import combinations


def persons_links(patents):
    """Build co-occurrence links between persons named on the same patent.

    For every patent, the names of all applicants and inventors whose
    ``type`` is ``"person"`` are collected and deduplicated; each unordered
    pair of names yields one link. A pair found on several patents has its
    ``strength`` incremented once per patent.

    Args:
        patents: Iterable of patent dicts. The ``applicants`` and
            ``inventors`` keys are read; each is expected to hold a list of
            dicts with ``name`` and ``type`` keys. Missing or ``None`` lists
            are treated as empty, and entries with a missing or empty name
            are skipped.

    Returns:
        dict mapping ``"<name_a>---<name_b>"`` (names in sorted order) to
        ``{"source_id": name_a, "target_id": name_b, "strength": count}``.
    """
    links = {}
    for patent in patents:
        # `or []` also covers keys that are present but explicitly None.
        participants = (patent.get("applicants") or []) + (patent.get("inventors") or [])

        # Sorting the deduplicated names means every pair produced by `combinations` is
        # already ordered, and it keeps the insertion order of `links` reproducible
        # (iteration order of a set of strings varies between interpreter runs).
        persons = sorted(
            {
                participant.get("name")
                for participant in participants
                if participant.get("type") == "person" and participant.get("name")
            }
        )

        for source_id, target_id in combinations(persons, 2):
            link_id = f"{source_id}---{target_id}"
            if link_id in links:
                links[link_id]["strength"] += 1
            else:
                links[link_id] = {"source_id": source_id, "target_id": target_id, "strength": 1}
    return links


def organizations_links(patents):
    """Build co-occurrence links between organisations applying on the same patent.

    For every patent, the names of the applicants whose ``type`` is
    ``"organisation"`` are collected and deduplicated; each unordered pair of
    names yields one link. A pair found on several patents has its
    ``strength`` incremented once per patent.

    Args:
        patents: Iterable of patent dicts. Only the ``applicants`` key is
            read; it is expected to hold a list of dicts with ``name`` and
            ``type`` keys. A missing or ``None`` list is treated as empty,
            and entries with a missing or empty name are skipped.

    Returns:
        dict mapping ``"<name_a>---<name_b>"`` (names in sorted order) to
        ``{"source_id": name_a, "target_id": name_b, "strength": count}``.
    """
    links = {}
    for patent in patents:
        # `or []` also covers an `applicants` key that is present but explicitly None.
        applicants = patent.get("applicants") or []

        # Sorted so that pairs come out ordered and the insertion order of `links`
        # does not depend on set iteration order.
        organizations = sorted(
            {
                applicant.get("name")
                for applicant in applicants
                if applicant.get("type") == "organisation" and applicant.get("name")
            }
        )

        for source_id, target_id in combinations(organizations, 2):
            link_id = f"{source_id}---{target_id}"
            if link_id in links:
                links[link_id]["strength"] += 1
            else:
                links[link_id] = {"source_id": source_id, "target_id": target_id, "strength": 1}
    return links


def cpc_ss_classe_links(patents):
    """Build co-occurrence links between CPC sub-classes listed on the same patent.

    Each entry of ``patent["cpc"]["ss_classe"]`` is turned into a
    ``"<code>###<label>"`` key. Keys are deduplicated per patent, and each
    unordered pair of keys yields one link. A pair found on several patents
    has its ``strength`` incremented once per patent.

    Args:
        patents: Iterable of patent dicts. Only ``cpc.ss_classe`` is read; it
            is expected to hold a list of dicts with ``code`` and ``label``
            keys. A missing or ``None`` ``cpc`` / ``ss_classe`` is treated as
            an empty list.

    Returns:
        dict mapping ``"<key_a>---<key_b>"`` (keys in sorted order) to
        ``{"source_id": label_a, "target_id": label_b, "strength": count}``.
        Note that ``source_id`` / ``target_id`` carry the label part of the
        key, not the code.
    """
    links = {}
    for patent in patents:
        # `or` fallbacks also cover keys that are present but explicitly None.
        ss_classes = (patent.get("cpc") or {}).get("ss_classe") or []

        # Deduplicate so a sub-class repeated on one patent neither links to itself nor
        # inflates strengths; sort so pairs are ordered and output order is reproducible.
        nodes = sorted({f'{classe.get("code")}###{classe.get("label")}' for classe in ss_classes})

        for source_node, target_node in combinations(nodes, 2):
            link_id = f"{source_node}---{target_node}"
            if link_id in links:
                links[link_id]["strength"] += 1
            else:
                links[link_id] = {
                    "source_id": source_node.split("###")[1],
                    "target_id": target_node.split("###")[1],
                    "strength": 1,
                }

    return links

In [43]:
# Quick visual check of the person links on the first two patents only.
persons_links(patents[0:2])

{'De Monbrison David---Fraysse Nicolas': {'source_id': 'De Monbrison David',
  'target_id': 'Fraysse Nicolas',
  'strength': 1},
 'Beynet Jean Marc---Fraysse Nicolas': {'source_id': 'Beynet Jean Marc',
  'target_id': 'Fraysse Nicolas',
  'strength': 1},
 'Beynet Jean Marc---De Monbrison David': {'source_id': 'Beynet Jean Marc',
  'target_id': 'De Monbrison David',
  'strength': 1},
 'Thorel Jean Noel---Thorel Jean-Noël': {'source_id': 'Thorel Jean Noel',
  'target_id': 'Thorel Jean-Noël',
  'strength': 1}}

In [53]:
# One row per link: with orient="index" the link ids become the index and
# source_id / target_id / strength become the columns.
df = pd.DataFrame.from_dict(cpc_ss_classe_links(patents), orient="index")
# orient="records" writes a JSON array of row objects; the link-id index is not included in the file.
df.to_json("patents_ss_classe_links.json", orient="records")