Function used to extract data from XML.

The `xml.etree` package ships with Python by default.

In [None]:
from pprint import pprint as pp
import xml.etree.ElementTree as ET

# Parse the XML file


def parse_my_file(path: str) -> dict:
    """Parse one patent XML file into a flat dictionary.

    Single-valued fields hold the stripped text of the first matching element,
    or ``None`` when the element is missing or has no text. Multi-valued fields
    hold a list with one entry per matching element, in document order (an
    empty list when nothing matches).

    Args:
        path: Location of the XML file to read.

    Returns:
        A dict with the keys ``PublicationNumber``, ``Title``, ``SubDatabase``,
        ``Inventor``, ``Applicant``, ``RequestedPatent``, ``ApplicationNumber``,
        ``ApplicationDate``, ``PriorityNumber``, ``PriorityDate``, ``IPC``,
        ``NCL``, ``Abstract`` and ``Claims``, inserted in that order.

    Raises:
        xml.etree.ElementTree.ParseError: If the file is not well-formed XML.
        OSError: If the file cannot be opened.
    """
    def get_text(elem):
        # An element can be present but empty (e.g. ``<Title/>``); its ``text``
        # is then None, so it must be checked before calling ``strip()``.
        if elem is None or elem.text is None:
            return None
        return elem.text.strip()

    root = ET.parse(path).getroot()

    # (output key, ElementTree path, collect every match?)
    # Paths without a leading ``.//`` only match direct children of the root.
    field_specs = (
        ('PublicationNumber', 'PublicationNumber', False),
        ('Title', 'Title', False),
        ('SubDatabase', 'SubDatabase', False),
        ('Inventor', './/Inventor/Name', True),
        ('Applicant', './/Applicant/Name', True),
        ('RequestedPatent', 'RequestedPatent', False),
        ('ApplicationNumber', './/ApplicationElem/Number', False),
        ('ApplicationDate', 'ApplicationDate', False),
        ('PriorityNumber', './/PriorityNumber', True),
        ('PriorityDate', './/PriorityDate', True),
        ('IPC', './/IPC/Class', True),
        ('NCL', './/NCL/Class', True),
        ('Abstract', 'Abstract', False),
        ('Claims', './/Claims/P', True),
    )

    data = {}
    for key, xpath, collect_all in field_specs:
        if collect_all:
            data[key] = [get_text(elem) for elem in root.findall(xpath)]
        else:
            data[key] = get_text(root.find(xpath))
    return data



This function uses the `elasticsearch` package to send the data to the ELK server.

You don't need to reformat the data: ES uses JSON, and a Python dict maps directly onto a JSON object.

In [None]:
import os

# Connection settings are read from the environment so that real credentials
# never have to be typed into (and saved with) the notebook. The original
# placeholder values are kept as fallbacks when a variable is not set.
es_endpoint = os.environ.get("ES_ENDPOINT", "https://MYDEPLOYMENT:9243")
# (username, password) pair handed to the client as ``basic_auth``.
basics = (
    os.environ.get("ES_USER", "USER"),
    os.environ.get("ES_PASSWORD", "PASSWORD"),
)

from datetime import datetime
from elasticsearch import Elasticsearch

def upload_document_to_es(data: dict) -> None:
    """Index one parsed patent into the ``patents`` index and print what was stored.

    The document is indexed under its ``PublicationNumber``, read back by that
    id, the index is refreshed, and the result of a ``match_all`` search over
    the index is pretty-printed.

    Args:
        data: Document to index; must contain a non-empty ``PublicationNumber``.

    Raises:
        KeyError: If ``data`` has no ``PublicationNumber`` key.
        ValueError: If ``PublicationNumber`` is empty or ``None``.
    """
    doc_id = data['PublicationNumber']
    if not doc_id:
        # Fail before anything is sent to the server.
        # NOTE(review): without an id the client appears to let Elasticsearch
        # generate one, so the read-back by id below could not find the
        # document that was just indexed — confirm against the client docs.
        raise ValueError("data['PublicationNumber'] is required to index the document")

    es = Elasticsearch(es_endpoint, basic_auth=basics)
    try:
        resp = es.index(index="patents", id=doc_id, document=data)
        print(resp['result'])

        resp = es.get(index="patents", id=doc_id)
        print(resp['_source'])

        # Refresh so the search below can see the document just indexed.
        es.indices.refresh(index="patents")

        resp = es.search(index="patents", query={"match_all": {}})
        pp(resp)
    finally:
        # A new client is created on every call; close it so its connections
        # are released even when one of the requests above raises.
        es.close()


This piece of code was used to parse all the files and send them to the ELK server.

In [None]:
# One-off bulk import: parses every XML file under resources/ and uploads it.
# NOTE(review): left commented out, presumably so that "Run All" does not
# re-upload every file — uncomment to run the import again.
# from glob import glob

# for file in  glob('resources/*.xml'):
#     data = parse_my_file(file)
#     upload_document_to_es(data)

An example of retrieving data from the database.

In [None]:
import os

# Connection settings are read from the environment so that real credentials
# never have to be typed into (and saved with) the notebook. The original
# placeholder values are kept as fallbacks when a variable is not set.
es_endpoint = os.environ.get("ES_ENDPOINT", "https://MYDEPLOYMENT:9243")
# (username, password) pair handed to the client as ``basic_auth``.
basics = (
    os.environ.get("ES_USER", "USER"),
    os.environ.get("ES_PASSWORD", "PASSWORD"),
)

from elasticsearch import Elasticsearch

# Phrase-match "[GB]" against the Inventor field and ask only for the listed
# fields, with the stored _source switched off in the response.
phrase_query = {"match_phrase": {"Inventor": "[GB]"}}
wanted_fields = ["PublicationNumber", "Title", "Inventor"]

es = Elasticsearch(es_endpoint, basic_auth=basics)
resp = es.search(index="patents", query=phrase_query, fields=wanted_fields, source=False)
pp(resp.body)