In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import logging
import pandas as pd

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B")
model = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3-8B")

# Add padding token to the tokenizer
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

def generate_system_message() -> str:
    return """
    You are a data scientist working for a company that is building a graph database. Your task is to extract information from data and convert it into a graph database.
    Provide a set of Nodes in the form [ENTITY_ID, TYPE, PROPERTIES] and a set of relationships in the form [ENTITY_ID_1, RELATIONSHIP, ENTITY_ID_2, PROPERTIES]. Make the nodes and relationship format a Python-compatible string.
    It is important that the ENTITY_ID_1 and ENTITY_ID_2 exist as nodes with a matching ENTITY_ID. If you can't pair a relationship with a pair of nodes, don't add it.
    When you find a node or relationship you want to add, try to create a generic TYPE for it that describes the entity. You can also think of it as a label.
    The entity may include person, company, event, etc.

    Example:
    Data: Alice is a lawyer and is 25 years old and Bob is her roommate since 2001. Bob works as a journalist. Alice owns a webpage www.alice.com and Bob owns the webpage www.bob.com.
    Nodes: [["alice", "Person", {"age": 25, "occupation": "lawyer", "name":"Alice"}], ["bob", "Person", {"occupation": "journalist", "name": "Bob"}], ["alice.com", "Webpage", {"url": "www.alice.com"}], ["bob.com", "Webpage", {"url": "www.bob.com"}]]
    Relationships: [["alice", "roommate", "bob", {"start": 2021}], ["alice", "owns", "alice.com", {}], ["bob", "owns", "bob.com", {}]]
    
    Data: The company's history goes back to a small ecommerce shop. There are conflicting reports on SHEIN's origins.
    Founded in 2012, the company's history goes back to a small ecommerce shop. It was launched by Chris Xu (CEO), an entrepreneur, and his ex-colleague, Wang Xiaohu, named Nanjing Dianwei Information Technology (NDIT) in 2008.
    Lily Peng, a part-time consultant, and known parent who is described as a "hardworking SEO whiz," is also known about the entrepreneur. Some reports describe Xu focused on technical parts while leaving business development, finance, and corporate functions to Xiaohu and Peng.
    Xu, who studied at George Washington University, is described by some sources as a Chinese-American.'
    Nodes: [["chris_xu", "Person", {"name": "Chris Xu"}], ["wang_xiaohu", "Person", {"name": "Wang Xiaohu"}], ["SHEIN", "Company", {"name": "SHEIN"}], ["lily_peng", "Person", {"name": "Lily Peng"}], ["tidn", "Company", {"name": "TIDN"}], ["nanjing_information_technology", "Company", {"name": "Nanjing Information Technology"}]]
    Relationships: [["chris_xu", "cofounded_with", "wang_xiaohu", {}], ["chris_xu", "founded", "SHEIN", {}], ["Chris Xu", "focused on", "technical parts ", {}], ["lily_peng", "focused on", "business and finance part", {}]]
    """

def extract_entities_and_relationships(paragraph):
    system_message = generate_system_message()
    prompt = f"{system_message}\n\nData: {paragraph}"
    inputs = tokenizer(prompt, return_tensors='pt', padding=True, truncation=True)
    
    input_ids = inputs.input_ids
    attention_mask = inputs.attention_mask

    with torch.no_grad():
        outputs = model.generate(input_ids, attention_mask=attention_mask, max_length=1024, num_beams=5, early_stopping=True, pad_token_id=tokenizer.eos_token_id)
    
    result = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return result

# Example paragraph input
paragraph = "The quick brown fox jumps over the lazy dog. The dog did not see the fox coming."

# Extract entities and relationships
entities_relationships = extract_entities_and_relationships(paragraph)
print("Extracted Entities and Relationships:\n", entities_relationships)

def get_attention_values(paragraph):
    # Tokenize the input paragraph
    inputs = tokenizer(paragraph, return_tensors='pt', padding=True, truncation=True)
    input_ids = inputs.input_ids
    attention_mask = inputs.attention_mask

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask, output_attentions=True)

    # Extract attentions
    attentions = outputs.attentions  # A tuple containing attention values for all layers
    return attentions, inputs

# Run the test case to get attention values
attentions, inputs = get_attention_values(paragraph)

# Specify the layer and head index
layer_index = -1  # Last layer
head_index = 0    # First attention head

# Attention values shape: (batch_size, num_heads, sequence_length, sequence_length)
selected_attention = attentions[layer_index][0][head_index].detach().cpu().numpy()

# Get token labels and remove special token markers
tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])
tokens = [token.replace('Ġ', '') for token in tokens]

# Predefined extracted entities
entities = ["fox", "dog", "quick", "brown", "lazy", "jumps", "over", "the"]

# Create a DataFrame with entities and qv values
data = []
seq_len = len(tokens)
for i in range(seq_len):
    for j in range(seq_len):
        if tokens[i] in entities and tokens[j] in entities:
            entity_1 = tokens[i]
            entity_2 = tokens[j]
            qv_value = selected_attention[i][j]
            data.append([entity_1, entity_2, qv_value])

# Convert the data to a DataFrame
df = pd.DataFrame(data, columns=['Entity1', 'Entity2', 'QV_Value'])

# Print the DataFrame
print(df)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Extracted Entities and Relationships:
 
    You are a data scientist working for a company that is building a graph database. Your task is to extract information from data and convert it into a graph database.
    Provide a set of Nodes in the form [ENTITY_ID, TYPE, PROPERTIES] and a set of relationships in the form [ENTITY_ID_1, RELATIONSHIP, ENTITY_ID_2, PROPERTIES]. Make the nodes and relationship format a Python-compatible string.
    It is important that the ENTITY_ID_1 and ENTITY_ID_2 exist as nodes with a matching ENTITY_ID. If you can't pair a relationship with a pair of nodes, don't add it.
    When you find a node or relationship you want to add, try to create a generic TYPE for it that describes the entity. You can also think of it as a label.
    The entity may include person, company, event, etc.

    Example:
    Data: Alice is a lawyer and is 25 years old and Bob is her roommate since 2001. Bob works as a journalist. Alice owns a webpage www.alice.com and Bob owns the we

In [2]:
# Predefined extracted entities
relationships = [
    ["fox", "jumps_over", "dog", {}],
    ["fox", "is", "quick", {}],
    ["fox", "is", "brown", {}],
    ["dog", "is", "lazy", {}]
]
entity1 = []
entity2 = []
for rel in relationships:
    entity1.append(rel[0])
    entity2.append(rel[2])



In [3]:
# Create a list to store the results
results = []

# Extract QV values for each entity pair in relationships
for ent1, ent2 in zip(entity1, entity2):
    if ent1 in tokens and ent2 in tokens:
        ent1_index = tokens.index(ent1)
        ent2_index = tokens.index(ent2)
        qv_value = selected_attention[ent1_index][ent2_index]
        results.append([ent1, ent2, qv_value])

# Convert the results to a DataFrame
df = pd.DataFrame(results, columns=['Entity1', 'Entity2', 'QV_Value'])

# Print the DataFrame
print(df)

  Entity1 Entity2  QV_Value
0     fox     dog  0.000000
1     fox   quick  0.025327
2     fox   brown  0.071750
3     dog    lazy  0.045116


In [4]:
# Specify the layer and head index
layer_index = 16  # middle layer 
head_index = 0    # First attention head

# Attention values shape: (batch_size, num_heads, sequence_length, sequence_length)
selected_attention = attentions[layer_index][0][head_index].detach().cpu().numpy()


In [5]:
# Create a list to store the results
results = []

# Extract QV values for each entity pair in relationships
for ent1, ent2 in zip(entity1, entity2):
    if ent1 in tokens and ent2 in tokens:
        ent1_index = tokens.index(ent1)
        ent2_index = tokens.index(ent2)
        qv_value = selected_attention[ent1_index][ent2_index]
        results.append([ent1, ent2, qv_value])

# Convert the results to a DataFrame
df = pd.DataFrame(results, columns=['Entity1', 'Entity2', 'QV_Value'])

# Print the DataFrame
print(df)

  Entity1 Entity2  QV_Value
0     fox     dog  0.000000
1     fox   quick  0.001653
2     fox   brown  0.003033
3     dog    lazy  0.000813


### Fine Tune LLM

In [9]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("google-bert/bert-base-cased", num_labels=5)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("google-bert/bert-base-cased", num_labels=5)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
import numpy as np
import evaluate

metric = evaluate.load("accuracy")

INFO:datasets:PyTorch version 2.3.0 available.


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [13]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [14]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(output_dir="test_trainer", eval_strategy="epoch")

In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    compute_metrics=compute_metrics,
)