In [1]:
import json, os, pandas as pd, numpy as np, csv
import requests
import io
import tarfile
import zipfile
from datasets import load_dataset

### Load data for Hallucination Detection training

In [2]:
# Directory where the CSV files are stored
data_dir = os.path.join(os.path.dirname(os.getcwd()), 'data')

# List all CSV files in the directory
csv_files = [f for f in os.listdir(data_dir) if f.endswith('.csv')]

# Load all CSV files into a single DataFrame
df_list = []
for file in csv_files:
    file_path = os.path.join(data_dir, file)
    df = pd.read_csv(file_path)
    df_list.append(df)

# Concatenate all DataFrames
training_df = pd.concat(df_list, ignore_index=True)

# Display the first few rows of the combined DataFrame
training_df.head()

Unnamed: 0,id,grounding,generated_text,label,cut,dataset_origin
0,91198,Colin Kaepernick . Kaepernick began his profes...,Colin Kaepernick became a starting quarterback...,0,val,Fever
1,194462,Katherine Matilda `` Tilda '' Swinton ( born 5...,Tilda Swinton is a vegan.,0,val,Fever
2,137334,Soul Food is a 1997 American comedy-drama film...,Fox 2000 Pictures released the film Soul Food.,1,val,Fever
3,166626,"Anne Rice . Born in New Orleans , Rice spent m...",Anne Rice was born in New Jersey.,0,test,Fever
4,111897,Telemundo ( [ teleˈmundo ] ) is an American Sp...,Telemundo is a English-language television net...,0,val,Fever


In [3]:
# Get the counts of val and test data
val_test_spit = training_df['cut'].value_counts()

# Display the counts
print(val_test_spit)

print("val can be used for training the model and test can be used for evaluation the performance")

cut
val     84044
test    38332
Name: count, dtype: int64
val can be used for training the model and test can be used for evaluation the performance


In [4]:
# Get the counts by dataset origin
training_df['dataset_origin'].value_counts()

dataset_origin
Vitamin C     63054
HaluEval      20000
Fever         19998
PAWS           8000
XSumFaith      2353
SummEval       1698
FactCC         1434
FRANK          1393
Polytope       1268
Cao22           696
CLIFF           600
TofuEval        534
Wang20          474
samsum          250
qags_xsum       239
qags_cnndm      235
Goyal21         150
Name: count, dtype: int64

In [5]:
# prepare train and test - remove vitamin c & Fever as it is skewing the dataset towards Fact verification
# train lists 
train_data = training_df[(training_df.cut == 'val') & (~training_df['dataset_origin'].isin(['Vitamin C', 'Fever']))]
train_grounding_list = list(train_data['grounding'])
train_generated_list = list(train_data['generated_text'])

# test lists
test_data = training_df[(training_df.cut == 'test') & (~training_df['dataset_origin'].isin(['Vitamin C', 'Fever']))]
test_grounding_list = list(test_data['grounding'])
test_generated_list = list(test_data['generated_text'])

### LLM as judge baseline Approach

In [7]:
import torch
from typing import List, Tuple, Dict
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import json
import os
from groq import Groq

class BatchLLMjudge:
    def __init__(self,
                 #nli_model_name: str = "roberta-large-mnli",
                 batch_size: int = 32,
                 kg_construction_prompt="""You are an expert at determining if a summary is consistent with a source article. Given an article and a summary, determine if all the information in the summary is supported by the article. Answer "yes" if the summary is consistent, and "no" if it is inconsistent.""",
                 ei_format_prompt="""Article: {article}
    Summary: {summary}
    Answer (yes or no):""", #Changed name of prompt
                 kg_tips_prompt="",  #Not needed for this task
                 kg_examples_prompt="", #Not needed for this task
                 llm_model: str = "llama-3.3-70b-versatile"):  #Using groq model
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        #self.tokenizer = AutoTokenizer.from_pretrained(nli_model_name)
        #self.nli_model = AutoModelForSequenceClassification.from_pretrained(nli_model_name).to(self.device)
        self.batch_size = batch_size
        self.kg_construction_prompt = kg_construction_prompt
        self.ei_format_prompt = ei_format_prompt #Changed
        self.kg_tips_prompt = kg_tips_prompt
        self.kg_examples_prompt = kg_examples_prompt
        self.llm_model_name = llm_model  # Store LLM model name
        self.groq_client = Groq(api_key='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',)  #Groq client
        # No pipeline needed for groq api


    def construct_ei_batch(self, articles: List[str], summaries: List[str]) -> List[str]: #ei for entailment inference #Changed name and parameters
        # Builds prompts for entailment inference
        prompts = [self.kg_construction_prompt + "\n" + self.ei_format_prompt.format(article=article, summary=summary) for article, summary in zip(articles, summaries)]  #Changed to ei
        return prompts #Returning list of prompts


    def call_llm_to_extract_kg(self,prompt: str) -> str: #Changed name and return type
      # Wrap the LLM call in a try-except block
        try:
            #Call Groq API
            chat_completion = self.groq_client.chat.completions.create(
                messages=[{"role": "user", "content": prompt}],
                model=self.llm_model_name,
                max_tokens = 10, #Reduced tokens, just need a yes/no
            )
            output = chat_completion.choices[0].message.content
            return output  #Return the direct output, no parsing needed

        except Exception as e:
            print(f"Error calling LLM: {e}")
            return ""  #Return empty string in case of error

    def check_consistency_batch(self, grok_outputs: List[str]) -> List[int]:  #Now the grok outputs is what comes in

        results = []
        for output in grok_outputs:
          output = output.lower()
          if "yes" in output:
            results.append(1)  #Consistent
          elif "no" in output:
            results.append(0)  #Inconsistent
          else:
            results.append(-1)  #Undetermined, handle as needed.

        return results

    def evaluate_batch(self, articles: List[str], summaries: List[str]) -> List[int]: #Added labels
        prompts = self.construct_ei_batch(articles, summaries)
        #Process by batches
        all_grok_outputs = []
        for i in range(0, len(prompts), self.batch_size):
            batch_prompts = prompts[i:i + self.batch_size]
            batch_grok_outputs = [self.call_llm_to_extract_kg(prompt) for prompt in batch_prompts] #Call grok for each prompt
            all_grok_outputs.extend(batch_grok_outputs)


        #Evaluate the outputs
        predicted_labels = self.check_consistency_batch(all_grok_outputs) #Check the concistency
        
        #Calculate metrics (Accuracy, etc).
        #correct_predictions = sum([1 for i in range(len(labels)) if labels[i] == predicted_labels[i]]) #Compare predictions with labels

        #accuracy = correct_predictions / len(labels) if len(labels) > 0 else 0 #Calculate accurancy
        #Create the results
        #results = {
        #    "accuracy": accuracy,
        #    "predicted_labels": predicted_labels,
        #    "actual_labels": labels
        #}
        return predicted_labels

# Usage example
# Ensure GROQ_API_KEY is set in your environment variables
# export GROQ_API_KEY="YOUR_GROQ_API_KEY"
batch_llm_eval = BatchLLMjudge(llm_model="llama-3.3-70b-versatile") #Specify Groq Model

In [None]:
#

###### running LLM as judge on sampled train and test data

In [8]:
# sample train and test data for llm as judge approach due to bottleneck on API call limit

# Randomly sample 20% of the rows
sampled_train_data = train_data.sample(frac=0.2, random_state=1)
sampled_test_data = test_data.sample(frac=0.2, random_state=1)

In [9]:
# check prompt generation
train_grounding_list = list(sampled_train_data['grounding'])
train_generated_list = list(sampled_train_data['generated_text'])
prompt_data = batch_llm_eval.construct_ei_batch(train_grounding_list, train_generated_list)

#### run aynchronous batch processing on Groq Api call

In [None]:
# write code for batch processing - https://console.groq.com/docs/batch
# 1. create json file
# 2. upload json file
# 3. create batch job
# 4. check batch status
# 5. reterive batch results

## Consider splitting very large workloads into multiple smaller batches (e.g. 1000 requests per batch) 
## for a better chance at completion rather than expiration for when we are under heavy load.

In [13]:
# sampled training data run
# Batch size
batch_size = 16


# Loop through DataFrame in batches
predictions = []
for start in range(0, len(sampled_train_data), batch_size):
    print(start)
    end = start + batch_size
    batch = sampled_train_data.iloc[start:end]
    train_grounding_list = list(batch['grounding'])
    train_generated_list = list(batch['generated_text'])
    result = batch_graph_eval.evaluate_batch(train_grounding_list, train_generated_list)
    predictions.extend(result)

# Store the results as a new column in the original DataFrame
sampled_train_data['prediction'] = predictions

0
16
32
48
64
80
96
112
128
144
160
176
192
208
224
240
256
272
288
304
320
336
352
368
384
400
416
432
448
464
480
Error calling LLM: Error code: 429 - {'error': {'message': 'Rate limit reached for model `llama-3.2-1b-preview` in organization `org_01jpw0vz73e8arvgjkbgyppv3y` service tier `on_demand` on tokens per day (TPD): Limit 1000000, Used 1000024, Requested 2394. Please try again in 3m28.980999999s. Need more tokens? Visit https://groq.com/self-serve-support/ to request higher limits.', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}
Error calling LLM: Error code: 429 - {'error': {'message': 'Rate limit reached for model `llama-3.2-1b-preview` in organization `org_01jpw0vz73e8arvgjkbgyppv3y` service tier `on_demand` on tokens per day (TPD): Limit 1000000, Used 1000023, Requested 1039. Please try again in 1m31.808s. Need more tokens? Visit https://groq.com/self-serve-support/ to request higher limits.', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}
Error calling LLM: Error c

Error calling LLM: Error code: 429 - {'error': {'message': 'Rate limit reached for model `llama-3.2-1b-preview` in organization `org_01jpw0vz73e8arvgjkbgyppv3y` service tier `on_demand` on tokens per day (TPD): Limit 1000000, Used 1000002, Requested 954. Please try again in 1m22.6784s. Need more tokens? Visit https://groq.com/self-serve-support/ to request higher limits.', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}
Error calling LLM: Error code: 429 - {'error': {'message': 'Rate limit reached for model `llama-3.2-1b-preview` in organization `org_01jpw0vz73e8arvgjkbgyppv3y` service tier `on_demand` on tokens per day (TPD): Limit 1000000, Used 1000002, Requested 934. Please try again in 1m20.885399999s. Need more tokens? Visit https://groq.com/self-serve-support/ to request higher limits.', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}
Error calling LLM: Error code: 429 - {'error': {'message': 'Rate limit reached for model `llama-3.2-1b-preview` in organization `org_01jpw0vz73

KeyboardInterrupt: 

640

###### Training data results

In [None]:
labels = list(sampled_train_data['label'])
predicted_labels = list(sampled_train_data['prediction'])
correct_predictions = sum([1 for i in range(len(labels)) if labels[i] == predicted_labels[i]]) 

tp = sum([1 for i in range(len(labels)) if labels[i] == predicted_labels[i] and predicted_labels[i] == 1]) 
fp = sum([1 for i in range(len(labels)) if labels[i] != predicted_labels[i] and predicted_labels[i] == 1]) 
tn = sum([1 for i in range(len(labels)) if labels[i] == predicted_labels[i] and predicted_labels[i] == 0]) 
fn = sum([1 for i in range(len(labels)) if labels[i] != predicted_labels[i] and predicted_labels[i] == 0]) 

print("TP :", tp)
print("FP :", fp)
print("TN :", tn)
print("FN :", fn)

print("Accuracy :" (tp+tn)/len(labels))
print("Precision :" (tp)/(tp+fp))
print("Recall :" (tp)/(tp+fn))

In [None]:
# Batch size
batch_size = 16


# Loop through DataFrame in batches
predictions = []
for start in range(0, len(sampled_test_data), batch_size):
    print(start)
    end = start + batch_size
    batch = sampled_test_data.iloc[start:end]
    train_grounding_list = list(batch['grounding'])
    train_generated_list = list(batch['generated_text'])
    result = batch_graph_eval.evaluate_batch(train_grounding_list, train_generated_list)
    predictions.extend(result)

# Store the results as a new column in the original DataFrame
sampled_test_data['prediction'] = predictions

###### Test data results

In [None]:
labels = list(sampled_test_data['label'])
predicted_labels = list(sampled_test_data['prediction'])
correct_predictions = sum([1 for i in range(len(labels)) if labels[i] == predicted_labels[i]]) 

tp = sum([1 for i in range(len(labels)) if labels[i] == predicted_labels[i] and predicted_labels[i] == 1]) 
fp = sum([1 for i in range(len(labels)) if labels[i] != predicted_labels[i] and predicted_labels[i] == 1]) 
tn = sum([1 for i in range(len(labels)) if labels[i] == predicted_labels[i] and predicted_labels[i] == 0]) 
fn = sum([1 for i in range(len(labels)) if labels[i] != predicted_labels[i] and predicted_labels[i] == 0]) 

print("TP :", tp)
print("FP :", fp)
print("TN :", tn)
print("FN :", fn)

print("Accuracy :" (tp+tn)/len(labels))
print("Precision :" (tp)/(tp+fp))
print("Recall :" (tp)/(tp+fn))

### 1. GraphEval Implementation as Baseline Approach

GraphEval is a combination approach of using LLMs to create KGs and check consistency using NLI to detect hallucinations.

The implementation includes the main components of GraphEval as described in the paper:
1.KG construction from the LLM output
2.Consistency checking for each triple using an NLI model
3.Overall evaluation based on the consistency of all triples

Note that the KG construction step (construct_kg method) is a placeholder and should be implemented using an actual LLM in practice. The paper doesn't provide specific details on this step, so you would need to design an appropriate prompt and use an LLM API to generate the KG triples.

The check_consistency method uses a pre-trained RoBERTa model fine-tuned on MNLI for natural language inference. It returns the probability of contradiction between the triple and the context.

The evaluate method puts it all together, constructing the KG, checking each triple for consistency, and returning the overall result along with any inconsistent triples found.

In [100]:
import os
from groq import Groq

client = Groq(
    api_key='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',
)

chat_completion = client.chat.completions.create(
    messages=[
        {
            "role": "user",
            "content": "Explain the importance of fast language models",
        }
    ],
    model="llama-3.3-70b-versatile",
)

print(chat_completion.choices[0].message.content)

Fast language models are crucial in today's technology landscape due to their numerous benefits and applications. Here are some reasons why fast language models are important:

1. **Improved User Experience**: Fast language models enable quick and accurate processing of natural language inputs, allowing users to interact with systems more efficiently. This leads to a better overall user experience, as users can receive responses and results rapidly.
2. **Real-time Applications**: Fast language models are essential for real-time applications, such as:
	* Chatbots and virtual assistants, which require rapid responses to user queries.
	* Sentiment analysis and opinion mining, where speed is critical for timely decision-making.
	* Language translation, where fast processing enables real-time communication across languages.
3. **Efficient Processing of Large Datasets**: Fast language models can handle massive amounts of text data, making them ideal for applications such as:
	* Text classifi

In [None]:
import torch
from typing import List, Tuple, Dict
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import json
import os
from groq import Groq

class BatchGraphEval:
    def __init__(self,
                 nli_model_name: str = "roberta-large-mnli",
                 batch_size: int = 32,
                 kg_construction_prompt="""You are an expert at extracting information in structured formats to build a knowledge graph. 
    Step 1 − Entity detection: Identify all entities in the raw text. Make sure not to miss any out. Entities should be basic and simple, they are akin to Wikipedia nodes. 
    Step 2 − Coreference resolution: Find all expressions in the text that refer to the same entity. Make sure entities are not duplicated. 
    In particular do not include entities that are more specific versions themselves, e.g. "a detailed view of jupiter’s atmosphere" and "jupiter’s atmosphere", only include the most specific version of the entity. 
    Step 3 − Relation extraction: Identify semantic relationships between the entities you have identified.
    Format: Return the knowledge graph as a list of triples, i.e. [ "entity1", "relation1−2", "entity2"], in Python code.""",
                 kg_format_prompt="""Use the given format to extract information from the following input: <input>{input}</input>.
    Skip the preamble and output the result as a list within <python> tags.""",
                 kg_tips_prompt="""Important Tips:
    1. Make sure all information is included in the knowledge graph.
    2. Each triple must only contain three strings! None of the strings should be empty.
    3. Do not split up related information into separate triples because this could change the meaning.
    4. Make sure all brackets and quotation marks are matched.
    5. Before adding a triplet to the knowledge graph, check the concatenated triple makes sense as a sentence. If not, discard it.""",
                 kg_examples_prompt="""Here are some example input and output pairs.
    ## Example 1.
    Input: "The Walt Disney Company, commonly known as Disney, is an American multinational mass media and entertainment conglomerate that is headquartered at the Walt Disney Studios complex in Burbank, California."
    Output: [ [ "The Walt Disney Company", "headquartered at", "Walt Disney Studios complex in Burbank, California" ], [ "The Walt Disney Company", "commonly known as", "Disney" ], [ "The Walt Disney Company", "instance of", "American multinational mass media and entertainment conglomerate" ] ]
    ## Example 2.
    Input: "Amanda Jackson was born in Springfield, Ohio, USA on June 1, 1985. She was a basketball player for the U.S. women’s team."
    Output: [ [ "Amanda Jackson", "born in", "Springfield, Ohio, USA" ], [ "Amanda Jackson", "born on", "June 1, 1985" ], [ "Amanda Jackson", "occupation", "basketball player" ], [ "Amanda Jackson", "played for", "U.S. women’s basketball team" ] ]
    ## Example 3.
    Input: "Music executive Darius Van Arman was born in Pennsylvania. He attended Gonzaga College High School and is a human being."
    Output: [ [ "Darius Van Arman", "occupation", "Music executive" ], [ "Darius Van Arman", "born in", "Pennsylvania" ], [ "Darius Van Arman", "attended", "Gonzaga College High School" ], [ "Darius Van Arman", "instance of", "human being" ] ]
    ## Example 4.
    Input: "Italy had 3.6x times more cases of coronavirus than China."
    Output: [ [ "Italy", "had 3.6x times more cases of coronavirus than", "China" ] ]
    """,
                 llm_model: str = "llama-3.3-70b-versatile"):  #Using groq model
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.tokenizer = AutoTokenizer.from_pretrained(nli_model_name)
        self.nli_model = AutoModelForSequenceClassification.from_pretrained(nli_model_name).to(self.device)
        self.batch_size = batch_size
        self.kg_construction_prompt = kg_construction_prompt
        self.kg_format_prompt = kg_format_prompt
        self.kg_tips_prompt = kg_tips_prompt
        self.kg_examples_prompt = kg_examples_prompt
        self.llm_model_name = llm_model  # Store LLM model name
        self.groq_client = Groq(api_key='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',)  #Groq client
        # No pipeline needed for groq api


    def construct_kg_batch(self, llm_outputs: List[str]) -> List[List[Tuple[str, str, str]]]:
        # Use the prompt with the LLM to construct KGs for multiple outputs
        batch_kgs = []
        for output in llm_outputs:
          input_text = f"{self.kg_construction_prompt} {self.kg_format_prompt.format(input=output)} {self.kg_tips_prompt} {self.kg_examples_prompt}"
          #print(input_text)
            #In practice, you would call an LLM API here with the combined prompt
            #and process the output to extract the KG triples.
            #Replace this with the actual LLM call
          triples = self.call_llm_to_extract_kg(input_text)
          batch_kgs.append(triples)

        return batch_kgs

    def call_llm_to_extract_kg(self,prompt: str) -> List[Tuple[str, str, str]]:
      # Wrap the LLM call in a try-except block
        try:
            #Call Groq API
            chat_completion = self.groq_client.chat.completions.create(
                messages=[{"role": "user", "content": prompt}],
                model=self.llm_model_name,
            )
            output = chat_completion.choices[0].message.content
            

            # Extract the knowledge graph from the output
            # Assumes the LLM returns the KG in a list within <python> tags
            start_tag = output.find('[')
            end_tag = output.rfind(']')
            if start_tag != -1 and end_tag != -1:
                kg_string = output[start_tag:end_tag+1]
                try:
                    kg = eval(kg_string) #use literal_eval for security
                    if isinstance(kg, list):
                        return kg
                    else:
                        print("LLM did not return a list.")
                        return []
                except (SyntaxError, NameError) as e:
                    print(f"Error parsing LLM output: {e}")
                    return []
            else:
                print("Could not find KG in LLM output.")
                return []
        except Exception as e:
            print(f"Error calling LLM: {e}")
            return []

    def check_consistency_batch(self, triples: List[Tuple[str, str, str]], contexts: List[str]) -> List[float]:
        # Combine the triples into sentences
        triple_texts = [f"{t[0]} {t[1]} {t[2]}" for t in triples]

        # Tokenize the inputs
        inputs = self.tokenizer(triple_texts, contexts, return_tensors="pt", truncation=True, max_length=512, padding=True)
        inputs = {k: v.to(self.device) for k, v in inputs.items()}

        # Get the model predictions
        with torch.no_grad():
            outputs = self.nli_model(**inputs)
        probs = outputs.logits.softmax(dim=-1)

        # Return the probabilities of contradiction (index 2 in RoBERTa MNLI model)
        return probs[:, 2].tolist()

    def evaluate_batch(self, batch_kgs: List[List[Tuple[str, str, str]]], contexts: List[str]) -> List[int]:
        #batch_kgs = self.construct_kg_batch(llm_outputs)
        results = []

        for i in range(0, len(llm_outputs), self.batch_size):
            batch_llm_outputs = llm_outputs[i:i+self.batch_size]
            batch_contexts = contexts[i:i+self.batch_size]
            batch_kgs_subset = batch_kgs[i:i+self.batch_size]

            batch_triples = [triple for idx,kg in enumerate(batch_kgs_subset) for triple in kg]

            batch_contexts_expanded = []

            #Iterate over batch of kgs
            for batch_idx, kg in enumerate(batch_kgs_subset):

              #Extend context for each set of triples within a kg
              batch_contexts_expanded.extend([batch_contexts[batch_idx]] * len(kg))

            inconsistency_probs = self.check_consistency_batch(batch_triples, batch_contexts_expanded)

            triple_index = 0

            for batch_idx, kg in enumerate(batch_kgs_subset):

                inconsistent_triples = []

                for triple in kg:
                    inconsistency_prob = inconsistency_probs[triple_index]
                    if inconsistency_prob > 0.5:
                        inconsistent_triples.append((triple, inconsistency_prob))
                    triple_index += 1

                if len(inconsistent_triples) > 0:
                    result.append(0)
                else:
                    result.append(1)
                    1 else 0 end

        return results


In [104]:
batch_graph_eval = BatchGraphEval(llm_model="llama-3.3-70b-versatile")
train_llm_kgs = batch_graph_eval.construct_kg_batch(train_generated_list[0:1000])

Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Error parsing LLM output: invalid syntax (<string>, line 4)
Error parsing LLM output: invalid syntax (<string>, line 7)
Error parsing LLM output: invalid syntax (<string>, line 7)
Error parsing LLM output: invalid syntax (<string>, line 4)
Error parsing LLM output: invalid syntax (<string>, line 5)
Error parsing LLM output: invalid syntax (<string>, line 7)
Error parsing LLM output: invalid syntax (<string>, line 8)
Error parsing LLM output: invalid syntax (<string>, line 7)
Error parsing LLM output: invalid syntax (<string>, line 7)
Error parsing LLM output: unexpected indent (<string>, line 7)
Error parsing LLM output: invalid syntax (<string>, line 7)
Error parsing LLM output: invalid syntax (<string>, line 4)
Error parsing LLM output: invalid syntax (<string>, line 4)
Error parsing LLM output: invalid syntax (<string>, line 7)
Error parsing LLM output: invalid syntax (<string>, line 4)
Error parsing LLM output: unexpected indent (<string>, line 7)
Error parsing LLM output: invalid 

Error parsing LLM output: invalid syntax (<string>, line 7)
Error parsing LLM output: invalid syntax (<string>, line 7)
Error parsing LLM output: invalid syntax (<string>, line 7)
Error parsing LLM output: invalid syntax (<string>, line 7)
Error parsing LLM output: invalid syntax (<string>, line 7)
Error parsing LLM output: invalid syntax (<string>, line 7)
Error parsing LLM output: invalid syntax (<string>, line 7)
Error parsing LLM output: unexpected indent (<string>, line 7)
Error parsing LLM output: invalid syntax (<string>, line 4)
Error parsing LLM output: invalid syntax (<string>, line 7)
Error parsing LLM output: invalid syntax (<string>, line 4)
Error parsing LLM output: invalid syntax (<string>, line 7)
Error parsing LLM output: unexpected indent (<string>, line 7)
Error parsing LLM output: invalid syntax (<string>, line 7)
Error parsing LLM output: invalid syntax (<string>, line 7)
Error parsing LLM output: invalid syntax (<string>, line 7)
Error parsing LLM output: invalid 

In [110]:
# In recursive way, complete the KGs
def process_arrays(arrays, index=0):
    if index >= len(arrays):
        return arrays

    if not arrays[index]:
        arrays[index] = batch_graph_eval.construct_kg_batch(train_generated_list[index:index+1])[0]

    return process_arrays(arrays, index + 1)


# Process the list of arrays
processed_train_llm_kgs = process_arrays(train_llm_kgs)


Error parsing LLM output: invalid syntax (<string>, line 7)
Error parsing LLM output: invalid syntax (<string>, line 5)
Error parsing LLM output: invalid syntax (<string>, line 8)
Error parsing LLM output: invalid syntax (<string>, line 7)
Error parsing LLM output: unmatched ')' (<string>, line 3)
Error parsing LLM output: invalid syntax (<string>, line 9)
Error parsing LLM output: invalid syntax (<string>, line 6)
Error parsing LLM output: invalid syntax (<string>, line 7)
Error parsing LLM output: invalid syntax (<string>, line 7)
Error parsing LLM output: invalid syntax (<string>, line 7)
Error parsing LLM output: invalid syntax (<string>, line 7)
Error parsing LLM output: invalid syntax (<string>, line 7)
Error parsing LLM output: unterminated string literal (detected at line 6) (<string>, line 6)
Error parsing LLM output: invalid syntax (<string>, line 4)
Error parsing LLM output: invalid syntax (<string>, line 7)
Error parsing LLM output: invalid syntax (<string>, line 7)
Error p

In [112]:
# In recursive way, complete the KGs
# Process the list of arrays
re_processed_train_llm_kgs = process_arrays(processed_train_llm_kgs)


Error parsing LLM output: invalid syntax (<string>, line 4)
Error parsing LLM output: unterminated string literal (detected at line 13) (<string>, line 13)
Error parsing LLM output: invalid syntax (<string>, line 7)
Error parsing LLM output: unexpected indent (<string>, line 7)
Error parsing LLM output: invalid syntax (<string>, line 4)
Error parsing LLM output: invalid syntax (<string>, line 7)
Error parsing LLM output: unterminated string literal (detected at line 6) (<string>, line 6)
Error parsing LLM output: invalid syntax (<string>, line 7)
Error parsing LLM output: invalid syntax (<string>, line 4)
Error parsing LLM output: unexpected indent (<string>, line 7)
Error parsing LLM output: invalid syntax (<string>, line 8)
Error parsing LLM output: invalid syntax (<string>, line 7)
Error parsing LLM output: invalid syntax (<string>, line 7)
Error parsing LLM output: invalid syntax (<string>, line 7)
Error parsing LLM output: invalid syntax (<string>, line 7)
Error parsing LLM output

In [114]:
# In recursive way, complete the KGs
# Process the list of arrays
final_processed_train_llm_kgs = process_arrays(re_processed_train_llm_kgs)

Error parsing LLM output: invalid syntax (<string>, line 4)
Error parsing LLM output: invalid syntax (<string>, line 8)
Error parsing LLM output: invalid syntax (<string>, line 4)
Error parsing LLM output: invalid syntax (<string>, line 7)
Error parsing LLM output: invalid syntax (<string>, line 4)
Error parsing LLM output: unterminated string literal (detected at line 6) (<string>, line 6)
Error parsing LLM output: invalid syntax (<string>, line 7)
Error parsing LLM output: unterminated string literal (detected at line 7) (<string>, line 7)
Error parsing LLM output: invalid syntax (<string>, line 9)
Error parsing LLM output: invalid syntax (<string>, line 4)
Error parsing LLM output: invalid syntax (<string>, line 7)


In [None]:
# training results ==> needs to be further fine-tuned
batch_graph_eval = BatchGraphEval(llm_model="llama-3.3-70b-versatile") #Specify Groq Model
train_results = batch_graph_eval.evaluate_batch(final_processed_train_llm_kgs, train_grounding_list[0:1000])

In [122]:
# Example Articles and Summaries (replace with your actual data)
articles = [
    "The Walt Disney Company, commonly known as Disney, is an American multinational mass media and entertainment conglomerate.",
    "Amanda Jackson was born in Springfield, Ohio, USA on June 1, 1985. She was a basketball player for the U.S. women’s team.",
    "Music executive Darius Van Arman was born in Pennsylvania. He attended Gonzaga College High School and is a human being.",
    "Italy had 3.6x times more cases of coronavirus than China."
]
summaries = [
    "Disney is a media conglomerate.",
    "Amanda Jackson was born in Ohio and played basketball.",
    "Darius Van Arman is a music executive born in Pennsylvania",
    "China had less coronavirus than Italy"
]

# Example Labels (1 for consistent, 0 for inconsistent)
labels = [1, 1, 1, 1]


results = batch_graph_eval.evaluate_batch(articles, summaries)

print(results)

[0, 0, 1, -1]


In [137]:
# Store the results as a new column in the original DataFrame
new_train_data = train_data.iloc[0:6112]
new_train_data['prediction'] = predictions


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_train_data['prediction'] = predictions


In [138]:
new_train_data.loc[:, 'prediction'] = new_train_data['prediction'].replace(-1, 0)
labels = list(new_train_data['label'])
predicted_labels = list(new_train_data['prediction'])

predicted_labels = list(new_train_data['prediction'])
correct_predictions = sum([1 for i in range(len(labels)) if labels[i] == predicted_labels[i]])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_train_data['prediction'] = new_train_data['prediction'].replace(-1, 0)


In [139]:
new_train_data.head(10)

Unnamed: 0,id,grounding,generated_text,label,cut,dataset_origin,prediction
0,91198,Colin Kaepernick . Kaepernick began his profes...,Colin Kaepernick became a starting quarterback...,0,val,Fever,0
1,194462,Katherine Matilda `` Tilda '' Swinton ( born 5...,Tilda Swinton is a vegan.,0,val,Fever,0
2,137334,Soul Food is a 1997 American comedy-drama film...,Fox 2000 Pictures released the film Soul Food.,1,val,Fever,1
4,111897,Telemundo ( [ teleˈmundo ] ) is an American Sp...,Telemundo is a English-language television net...,0,val,Fever,0
6,181634,Mogadishu ( [ ˌmɔːɡəˈdiːʃuː ] Muqdisho [ mʉqdɪ...,There is a capital called Mogadishu.,1,val,Fever,0
7,219028,Savages (2012 film) . Savages is a 2012 Americ...,Savages was exclusively a German film.,0,val,Fever,1
9,108281,"Andrew Kevin Walker ( born August 14 , 1964 ) ...",Andrew Kevin Walker is only Chinese.,0,val,Fever,0
10,140846,Shooter (2007 film) . The film follows Force R...,Shooter is about an expert marksman who tries ...,0,val,Fever,1
13,54168,,Murda Beatz's real name is Marshall Mathers.,0,val,Fever,0
14,105095,"Carrie Anne Mathison , played by actress Clair...",Nicholas Brody is a character on Homeland.,1,val,Fever,0


In [143]:
# accuracy
correct_predictions/len(labels)

0.6277814136125655

In [147]:
tp = sum([1 for i in range(len(labels)) if labels[i] == predicted_labels[i] and predicted_labels[i] == 1]) 
fp = sum([1 for i in range(len(labels)) if labels[i] != predicted_labels[i] and predicted_labels[i] == 1]) 
tn = sum([1 for i in range(len(labels)) if labels[i] == predicted_labels[i] and predicted_labels[i] == 0]) 
fn = sum([1 for i in range(len(labels)) if labels[i] != predicted_labels[i] and predicted_labels[i] == 0]) 

In [148]:
precision = tp/(tp+fp)
print(precision)

0.40016433853738703


In [149]:
recall = tp/(tp+fn)
print(recall)

0.23966535433070865


In [7]:
new_train_data = train_data.iloc[0:6112]
labels = list(new_train_data['label'])

In [8]:
positive_label = sum([1 for i in range(len(labels)) if labels[i] == 1])

In [11]:
positive_label/len(labels)

0.3324607329842932