In [13]:
from typing import List, Union, Dict, Any
import requests
from termcolor import cprint
from transformers import PreTrainedTokenizer
import json

GENERATION_MARKER = "|GENERATION|"


class Jsonformer:
    value: Dict[str, Any] = {}

    def __init__(
        self,
        API_URL:str,
        tokenizer: PreTrainedTokenizer,
        json_schema: Dict[str, Any],
        prompt: str,
        *,
        debug: bool = False,
        max_array_length: int = 10,
        max_number_tokens: int = 6,
        temperature: float = 1.0,
        max_string_token_length: int = 15,
    ):
        self.tokenizer = tokenizer
        self.json_schema = json_schema
        self.prompt = prompt
        self.API_URL = API_URL

        
        self.generation_marker = "|GENERATION|"
        self.debug_on = debug
        self.max_array_length = max_array_length

        self.max_number_tokens = max_number_tokens
        self.temperature = temperature
        self.max_string_token_length = max_string_token_length

    def debug(self, caller: str, value: str, is_prompt: bool = False):
        if self.debug_on:
            if is_prompt:
                cprint(caller, "green", end=" ")
                cprint(value, "yellow")
            else:
                cprint(caller, "green", end=" ")
                cprint(value, "blue")
 
    def generate_string(self) -> str:
        prompt = self.get_prompt() + '"'
        self.debug("[generate_string]", prompt, is_prompt=True)
        
        # API_URL = "https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-8B-Instruct"
        headers = {"Authorization": "Bearer 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"}

        def query(payload):
            response = requests.post(self.API_URL, headers=headers, json=payload)
            return response.json()
          
        response = query({
          "inputs": prompt,
          "parameters":{
              "max_new_tokens":self.max_string_token_length,
              "num_return_sequences":1,
              "temperature":self.temperature,
              "return_full_text":False
          }
        })
        print(response)
        if response.count('"') < 1:
            return response[0]['generated_text'].split('"')[0]
        
        return response.split('"')[0].strip()

    def generate_object(
        self, properties: Dict[str, Any], obj: Dict[str, Any]
    ) -> Dict[str, Any]:
        for key, schema in properties.items():
            self.debug("[generate_object] generating value for", key)
            obj[key] = self.generate_value(schema, obj, key)
        return obj

    def generate_value(
        self,
        schema: Dict[str, Any],
        obj: Union[Dict[str, Any], List[Any]],
        key: Union[str, None] = None,
    ) -> Any:
        schema_type = schema["type"]
        if schema_type == "string":
            if key:
                obj[key] = self.generation_marker
            else:
                obj.append(self.generation_marker)
            return self.generate_string()
        elif schema_type == "object":
            new_obj = {}
            if key:
                obj[key] = new_obj
            else:
                obj.append(new_obj)
            return self.generate_object(schema["properties"], new_obj)
        else:
            raise ValueError(f"Unsupported schema type: {schema_type}")


    def get_prompt(self):
        template = """{prompt}\nOutput result in the following JSON schema format:\n{schema}\nResult: {progress}"""
        progress = json.dumps(self.value)
        gen_marker_index = progress.find(f'"{self.generation_marker}"')
        if gen_marker_index != -1:
            progress = progress[:gen_marker_index]
        else:
            raise ValueError("Failed to find generation marker")

        prompt = template.format(
            prompt=self.prompt,
            schema=json.dumps(self.json_schema),
            progress=progress,
        )

        return prompt

    def __call__(self) -> Dict[str, Any]:
        self.value = {}
        generated_data = self.generate_object(
            self.json_schema["properties"], self.value
        )
        return generated_data

In [14]:
json_schema = {
    "properties": {
        "DATE": {
            "type": "string",
            "description": "The date when the email was forwarded to the scraper"
        },
        "SERVICE": {
            "type": "string",
            "description": "Description of the service mentioned in the email"
        },
        "TYPE": {
            "type": "string",
            "description": "Type of document or release, e.g., 'Media Release'"
        },
        "UNIT": {
            "type": "string",
            "description": "Unit within the organization, e.g., 'Medicine'"
        },
        "DEPARTMENT": {
            "type": "string",
            "description": "Specific department involved, if applicable"
        },
        "LEAD_EXPERT": {
            "type": "string",
            "description": "Lead expert or researcher mentioned in the email"
        },
        "CAMPAIGN": {
            "type": "string",
            "description": "Campaign associated with the content, e.g., 'Teaching, Learning and Research'"
        },
        "COMPLEXITY": {
            "type": "string",
            "description": "Always ouput 3"
        },
        "DIVERSITY": {
            "type": "string",
            "description": "Diversity information, if any."
        },
        "TEAM MEMBER": {
            "type": "string",
            "description": "Person forwarding the email or people mentioned, dynamically tagged to the sender, e.g., 'Kurt Heinrich'."
        },
        "KEY MESSAGING": {
            "type": "string",
            "description": "Key messaging included in the email, can be left empty."
        },
    }
}

In [15]:
# get donload_loader
from llama_index.core import download_loader
# Create a download loader
from llama_index.readers.file import UnstructuredReader

# Initialize the UnstructuredReader
loader = UnstructuredReader()
msg_documents = loader.load_data("data/safe_on_cloud/jj_email_1.eml")
msg_content = msg_documents[0].text
print(msg_content)

Hello Kurt and Andrea,

We have an upcoming paper about to be published that will be something the media might be interested in. I've worked with you both before, so thought I'd reach out re next steps, but please let me know if there is someone else I should be contacting.

The paper will be published in

The work is on predicting the survival of cancer patients using a branch of artificial intelligence called natural language processing, the same stuff that everyone now knows about thanks to ChatGPT. We use the AI (language models) to predict how long a cancer patient will survive based on the first consultation report their medical oncologists writes after their initial appointment. This is a particularly exciting technique because all cancer patients would have this document, so this prediction is accessible. Our work was able to predict survival quite accurately, similar to prior work that has had to use fancy stuff like tumour genetic markers to make these predictions.

I thought

In [18]:
from huggingface_hub import login
config = {
    "huggingface_api": "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"
}
login(config['huggingface_api'])
from transformers import AutoTokenizer
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
API_URL = "https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-8B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)
prompt = "This is the email content:"+msg_content+ "Generate information based on the following schema:"
#prompt = "Generate information based on the following schema:"

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /Users/pengyuyang/.cache/huggingface/token
Login successful


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [19]:
jsonformer = Jsonformer(API_URL,tokenizer, json_schema, prompt)
generated_data = jsonformer()
json_string = json.dumps(generated_data, indent=4)

[{'generated_text': '2023-02-27", "SERVICE": "Predicting cancer patient'}]
1
2023-02-27
[{'generated_text': 'Predicting cancer patient survival using NLP", "TYPE": "Paper",'}]
1
Predicting cancer patient survival using NLP
[{'generated_text': 'Paper", "UNIT": "Medicine", "DEPARTMENT": "M'}]
1
Paper
[{'generated_text': 'Medicine", "DEPARTMENT": "Mood Disorders Centre and BC Cancer'}]
1
Medicine
[{'generated_text': 'Mood Disorders Centre and BC Cancer", "LEAD_EXPERT": "'}]
1
Mood Disorders Centre and BC Cancer
[{'generated_text': 'John-Jose Nuñez", "CAMPAIGN": "", "COM'}]
1
John-Jose Nuñez
[{'generated_text': 'Teaching, Learning and Research", "COMPLEXITY": "3",'}]
1
Teaching, Learning and Research


SSLError: HTTPSConnectionPool(host='api-inference.huggingface.co', port=443): Max retries exceeded with url: /models/meta-llama/Meta-Llama-3-8B-Instruct (Caused by SSLError(SSLEOFError(8, '[SSL: UNEXPECTED_EOF_WHILE_READING] EOF occurred in violation of protocol (_ssl.c:1007)')))

In [83]:
with open("ts.json", "w") as json_file:
    json_file.write(json_string)