In [1]:
import json
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

def predict_NuExtract(model, tokenizer, texts, template, batch_size=1, max_length=10_000, max_new_tokens=4_000):
    """Fill a JSON template from each input text using a NuExtract-style causal LM.

    Args:
        model: Object exposing ``device`` and ``generate(**encodings, max_new_tokens=...)``.
        tokenizer: Callable that encodes a list of prompts and provides ``batch_decode``.
        texts: Iterable of input strings; one prompt is built per text.
        template: Extraction template, either as a JSON string or as an
            already-parsed JSON-serializable object (e.g. a dict).
        batch_size: Number of prompts encoded and generated together (must be >= 1).
        max_length: Maximum prompt length passed to the tokenizer (longer prompts are truncated).
        max_new_tokens: Generation budget passed to ``model.generate``.

    Returns:
        A list with one string per input text: the decoded text that follows the
        ``<|output|>`` marker of the prompt.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1, or if ``template`` is a
            string that is not valid JSON (``json.JSONDecodeError``).
        IndexError: If a decoded sequence does not contain the ``<|output|>`` marker,
            which happens when the prompt was truncated by ``max_length``.
    """
    if batch_size < 1:
        # A negative step would make range() empty and silently return no predictions.
        raise ValueError(f"batch_size must be >= 1, got {batch_size!r}")

    output_marker = "<|output|>"

    # Normalize the template to the 4-space-indented JSON layout used in the prompt.
    if isinstance(template, (str, bytes, bytearray)):
        template = json.loads(template)
    template = json.dumps(template, indent=4)
    prompts = [f"""<|input|>\n### Template:\n{template}\n### Text:\n{text}\n\n<|output|>""" for text in texts]

    # Decoder-only models continue from the last position of every row, so rows of
    # different lengths must be padded on the left. Set it for the duration of this
    # call and restore the caller's setting afterwards.
    previous_padding_side = getattr(tokenizer, "padding_side", None)
    if previous_padding_side is not None:
        tokenizer.padding_side = "left"

    outputs = []
    try:
        with torch.no_grad():
            for i in range(0, len(prompts), batch_size):
                batch_prompts = prompts[i:i+batch_size]
                batch_encodings = tokenizer(batch_prompts, return_tensors="pt", truncation=True, padding=True, max_length=max_length).to(model.device)

                pred_ids = model.generate(**batch_encodings, max_new_tokens=max_new_tokens)
                outputs += tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    finally:
        if previous_padding_side is not None:
            tokenizer.padding_side = previous_padding_side

    predictions = []
    for index, output in enumerate(outputs):
        pieces = output.split(output_marker)
        if len(pieces) < 2:
            # The marker is the last thing in the prompt, so right-truncation removes it.
            raise IndexError(
                f"decoded output for text #{index} has no {output_marker!r} marker; "
                "the prompt was probably truncated - increase max_length"
            )
        predictions.append(pieces[1])
    return predictions

# Hugging Face Hub id of the checkpoint used throughout the notebook.
model_name = "numind/NuExtract-1.5-smol"
# Prefer the GPU, but fall back to CPU so the notebook still runs on a CPU-only runtime.
device = "cuda" if torch.cuda.is_available() else "cpu"
# NOTE(review): trust_remote_code=True allows Python code shipped in the model repo to run
# locally; confirm it is actually required for this checkpoint and consider pinning a revision.
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16, trust_remote_code=True).to(device).eval()
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/722 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.42G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/3.72k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/801k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.52M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/863 [00:00<?, ?B/s]

# Ejemplo Base

In [2]:
# Base example: a single abstract-style passage used as the extraction input.
text = """We introduce Mistral 7B, a 7–billion-parameter language model engineered for
superior performance and efficiency. Mistral 7B outperforms the best open 13B
model (Llama 2) across all evaluated benchmarks, and the best released 34B
model (Llama 1) in reasoning, mathematics, and code generation. Our model
leverages grouped-query attention (GQA) for faster inference, coupled with sliding
window attention (SWA) to effectively handle sequences of arbitrary length with a
reduced inference cost. We also provide a model fine-tuned to follow instructions,
Mistral 7B – Instruct, that surpasses Llama 2 13B – chat model both on human and
automated benchmarks. Our models are released under the Apache 2.0 license.
Code: <https://github.com/mistralai/mistral-src>
Webpage: <https://mistral.ai/news/announcing-mistral-7b/>"""

# JSON template with empty placeholders: "" for single values, [] for lists.
# It must be valid JSON because predict_NuExtract parses it with json.loads.
template = """{
    "Model": {
        "Name": "",
        "Number of parameters": "",
        "Number of max token": "",
        "Architecture": []
    },
    "Usage": {
        "Use case": [],
        "Licence": ""
    }
}"""

# predict_NuExtract takes a list of texts and returns one string per text;
# a single text is passed here, so [0] selects its prediction.
prediction = predict_NuExtract(model, tokenizer, [text], template)[0]
print(prediction)

{
    "Model": {
        "Name": "Mistral 7B",
        "Number of parameters": "7\u00d7\u00d713B",
        "Number of max token": "",
        "Architecture": [
            "grouped-query attention (GQA)",
            "sliding window attention (SWA)"
        ]
    },
    "Usage": {
        "Use case": [
            "reasoning",
            "mathematics",
            "code generation"
        ],
        "Licence": "Apache 2.0"
    }
}
  


In [3]:
# Five sample passages of different genres, used as extraction inputs.
# The order matters: each entry is paired by position with the entry of
# `templates` that was written for it.
examples = [
    # Example 1: Complex Research Paper Abstract
    """
    We present OmegaNet-XL, a revolutionary 70-billion-parameter language model designed for computational linguistics and deep semantic analysis.
    OmegaNet-XL not only achieves state-of-the-art (SOTA) performance in over 25 linguistic benchmarks but also introduces an innovative Layered Dual Attention (LDA) mechanism that facilitates real-time adaptive learning.
    Through the integration of distributed low-rank adaptation (LoRA) and reversible normalization (RevNorm), OmegaNet-XL achieves unprecedented efficiency, cutting computational costs by 30% while maintaining high accuracy.
    The fine-tuned OmegaNet-XL – DialogueExpert variant outshines GPT-4 in long-form conversation tasks, specifically tailored for customer service applications.
    All OmegaNet-XL models are available under the Creative Commons Attribution 4.0 license.
    Research Portal: <https://research.omeganet.ai/omeganet-xl>

    Conference Paper: "OmegaNet-XL: Expanding the Frontiers of Large Language Models" presented at NeurIPS 2024.
    """,

    # Example 2: Technical Product Description
    """
    Introducing HydraAI Vision, a state-of-the-art computer vision framework designed for multi-modal data analysis. HydraAI Vision is capable of integrating image, video, and sensor data to create complex contextual insights.
    The model leverages dynamic feature aggregation (DFA) and parallel convolutional encoders (PCEs) to achieve high accuracy and real-time inference for applications in autonomous vehicles, smart surveillance, and advanced medical imaging.
    The 50B-parameter HydraAI Vision outperforms Vision Transformer (ViT) and YOLO v8 on various tests, including object detection and semantic segmentation.
    The framework supports training sequences up to 4096 tokens, thanks to optimized multi-head attention (OMHA).
    Licensed under MIT License.
    GitHub Repo: <https://github.com/hydraai/vision-framework>
    Documentation: <https://docs.hydraai.com/vision>
    """,

    # Example 3: Multi-Disciplinary Report
    """
    The Atlas Research Foundation has unveiled Novum Alpha 24B, a transformative model tailored for multi-disciplinary research encompassing economics, environmental science, and policy-making.
    Novum Alpha 24B's core innovation lies in its cross-field embedding generator (CFEG), enabling unparalleled contextual correlation across distinct academic disciplines.
    The model exhibits enhanced capacity in predictive economic modeling, outperforming conventional models such as DeepMacro 12B.
    The 24B model can handle up to 8192 tokens per input, offering robust analysis for long-form documents and research papers.
    Released under an open-source BSD-3 license.
    Full Report: <https://atlasresearch.org/novum-alpha24>
    Dataset: The Novum Global DataBank (NGD) v1.3.
    """,

    # Example 4: Legal Case Summary
    """
    Case Summary: In the precedent-setting case "TechGen v. CompSoft", the 2023 ruling by the United States Court of Appeals for the Federal Circuit established new guidelines for the interpretation of software patents involving
    machine learning models. The case revolved around the 17B-parameter ML model named "NeuralShield", developed by TechGen, which CompSoft alleged had infringed on proprietary algorithms protected under the Digital Patent Act
    (DPA). The court ruled in favor of TechGen, citing that the open structure of NeuralShield allowed for lawful adaptation under the principles of fair use.
    The ruling outlined the parameters for permissible adaptation, emphasizing the importance of transparency in ML model design.
    Verdict Date: December 15, 2023
    Reference Document: Federal Circuit Journal Vol. 52, 2024, pp. 112-145.
    """,

    # Example 5: Scientific Study Overview
    """
    The Proteomics Institute has introduced EnzymeGPT-9, a 9-billion-parameter model engineered to enhance biochemical research, with a focus on enzyme-substrate interaction predictions.
    EnzymeGPT-9 employs a hybrid recurrent-transformer architecture to simulate complex binding processes.
    Preliminary tests indicate a 45% reduction in prediction error rates compared to legacy enzyme models such as SubstratumNet-5.
    The model's adaptive token compression (ATC) feature allows for efficient analysis of protein sequences up to 2048 amino acids in length.
    EnzymeGPT-9 and its associated datasets are licensed under the Academic Free License 3.0.
    Preprint: "Analyzing Enzymatic Pathways Using Large Language Models" available on BioRxiv.
    Project Page: <https://proteomics-institute.org/enzymegpt9>
    """
]

# Example-specific templates for information extraction.
# templates[i] is the JSON template for examples[i]. The indentation inside each
# string is not significant: predict_NuExtract re-serializes the template with
# json.loads / json.dumps(indent=4) before building the prompt.

templates = [
    # Template for Example 1
    """{
        "Model": {
            "Name": "",
            "Number of parameters": "",
            "Benchmarks": [],
            "Innovative features": [],
            "Efficiency improvements": "",
            "Variants": []
        },
        "Licensing and Publication": {
            "Licence": "",
            "Research Portal": "",
            "Conference Paper": ""
        }
    }""",

    # Template for Example 2
    """{
        "Framework": {
            "Name": "",
            "Parameter size": "",
            "Supported modalities": [],
            "Key features": []
        },
        "Performance": {
            "Comparison models": [],
            "Max sequence length": ""
        },
        "Documentation": {
            "License": "",
            "GitHub Repo": "",
            "Documentation Page": ""
        }
    }""",

    # Template for Example 3
    """{
        "Model": {
            "Name": "",
            "Parameter count": "",
            "Core innovation": "",
            "Disciplines covered": []
        },
        "Performance": {
            "Comparative models": [],
            "Max input tokens": ""
        },
        "Resources": {
            "License": "",
            "Report URL": "",
            "Datasets used": []
        }
    }""",

    # Template for Example 4
    """{
        "Case": {
            "Name": "",
            "Year": "",
            "Court": "",
            "Key model involved": "",
            "Legislation referenced": []
        },
        "Outcome": {
            "Ruling": "",
            "Reasoning highlights": []
        },
        "Documentation": {
            "Verdict Date": "",
            "Reference Document": ""
        }
    }""",

    # Template for Example 5
    """{
        "Model": {
            "Name": "",
            "Number of parameters": "",
            "Purpose": "",
            "Architecture": []
        },
        "Performance": {
            "Improvement metrics": "",
            "Token handling": ""
        },
        "Resources": {
            "License": "",
            "Preprint URL": "",
            "Project Page": ""
        }
    }"""
]

# Run predictions for each example, pairing every text with the template written for it.
# zip() silently stops at the shorter list, so fail loudly if the two lists drift apart.
assert len(examples) == len(templates), "each example needs exactly one template"
for example_text, template in zip(examples, templates):
    # Templates differ per example, so each text is sent as its own single-item batch.
    prediction = predict_NuExtract(model, tokenizer, [example_text], template)[0]
    print(prediction)


{
    "Model": {
        "Name": "OmegaNet-XL",
        "Number of parameters": "70-billion-parameter",
        "Benchmarks": [
            "over 25 linguistic benchmarks"
        ],
        "Innovative features": [
            "Layered Dual Attention (LDA)",
            "distributed low-rank adaptation (LoRA)",
            "reversible normalization (RevNorm)"
        ],
        "Efficiency improvements": "cutting computational costs by 30%",
        "Variants": [
            "DialogueExpert"
        ]
    },
    "Licensing and Publication": {
        "Licence": "Creative Commons Attribution 4.0",
        "Research Portal": "https://research.omeganet.ai/omeganet-xl",
        "Conference Paper": "OmegaNet-XL: Expanding the Frontiers of Large Language Models"
    }
}
  
{
    "Framework": {
        "Name": "HydraAI Vision",
        "Parameter size": "50B",
        "Supported modalities": [
            "image",
            "video",
            "sensor"
        ],
        "Key features": [

In [6]:
# Minimal case: a short utterance and a template with a single field.
text = """I need an oil change"""

template = """{
    "Car Service Appointment": {
        "Service Type": ""
    }
}"""

# One text in, one prediction out; [0] selects it from the returned list.
prediction = predict_NuExtract(model, tokenizer, [text], template)[0]
print(prediction)

{
    "Car Service Appointment": {
        "Service Type": "oil change"
    }
}
  


In [8]:
# Missing-value probe: the sentence below deliberately omits the phone number so the
# prediction shows what the model returns for a field that is absent from the text.
# Variant with the phone number present (swap it in to compare):
# text = """Mi name is David Usta and my phone number is 990456799"""
text = """Mi name is David Usta and my phone number is """

template = """{
        "Personal Information": {
            "First Name": "",
            "Last Name": "",
            "Phone Number": ""
        }
  }"""

# One text in, one prediction out; [0] selects it from the returned list.
prediction = predict_NuExtract(model, tokenizer, [text], template)[0]
print(prediction)

{
    "Personal Information": {
        "First Name": "David",
        "Last Name": "Usta",
        "Phone Number": ""
    }
}
  


In [18]:
# Scheduling utterance that mentions two candidate days and a time.
text = """what about tommorrow sunday or next monday at 9am"""
# Alternative input with a misspelled day name, kept for manual comparison:
#text = """what about tommorrow sunda or next monday at 9am"""

# NOTE(review): the list-valued fields are pre-filled with candidate values, presumably to
# restrict the model to those choices — confirm against the NuExtract template documentation.
template = """{
    "Appointment": {
        "Day of the Week": ["Monday","Tuesday","Wednesday","Thursday","Friday","Saturday","Sunday"],
        "Date (ISO format)": "",
        "Year": "",
        "Month": "",
        "Day": "",
        "Hour":"",
        "Minute":"",
        "Time of Day": ["Morning","Afternoon","Night"]
    }
}"""

# One text in, one prediction out; [0] selects it from the returned list.
prediction = predict_NuExtract(model, tokenizer, [text], template)[0]
print(prediction)

{
    "Appointment": {
        "Day of the Week": [
            "Sunday",
            "Monday"
        ],
        "Date (ISO format)": "",
        "Year": "",
        "Month": "",
        "Day": "",
        "Hour": "9am",
        "Minute": "",
        "Time of Day": [
            "Morning",
            "Afternoon",
            "Night"
        ]
    }
}
  


https://numind.ai/blog/nuextract-1-5---multilingual-infinite-context-still-small-and-better-than-gpt-4o


https://huggingface.co/numind/NuExtract-1.5-smol