In [1]:
!pip install groq



In [2]:
import json
import random
from groq import Groq
import nltk
import re
from scipy.optimize import linear_sum_assignment
from nltk.translate.meteor_score import meteor_score
from nltk.tokenize import word_tokenize

# Ensure you have the necessary NLTK data files
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt_tab')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [3]:
def read_claims_from_json(file_path):
    try:
        with open(file_path, 'r') as file:
            data = json.load(file)
        return data
    except FileNotFoundError:
        print(f"File not found: {file_path}")
        return []
    except json.JSONDecodeError:
        print(f"Error decoding JSON from file: {file_path}")
        return []

# Load your dataset from JSON files
claims_file_1 = '/content/Modified_true_claims.json'
claims_file_2 = '/content/last_fake_claims.json'

claims_data_1 = read_claims_from_json(claims_file_1)
claims_data_2 = read_claims_from_json(claims_file_2)

# Combine claims from both files
combined_claims = claims_data_1 + claims_data_2
random.shuffle(combined_claims)

In [4]:
import re


# Initialize Groq API client
client = Groq(api_key="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")



def convert_to_json(llm_response):
    responseQuestions = []
    pattern = r'\[(.*?)\]'
    matches = re.findall(pattern, llm_response, re.DOTALL)

    if not matches:
        return responseQuestions  # Return an empty list if no matches are found

    json_str = matches[0]
    questions_llm = re.findall(r'\{(.*?)\}', json_str, re.DOTALL)

    for question in questions_llm:
        question_json = {}
        q = re.findall(r'.?question.?:\s*.?(.*),[\s\n]+.?questionType', question, re.DOTALL)
        if q:
            question_json['question'] = q[0].strip().strip('"')
        qt = re.findall(r'.?questionType.?:\s*.?(.*)[\'\"]', question, re.DOTALL)
        if qt:
            question_json['questionType'] = qt[0].strip().strip('"')
        responseQuestions.append(question_json)

    return responseQuestions


def get_response_llm(claim, questions):
  prompt = f"{questions} \n Claim: {claim}. Provide me only with similar questions relevant for the claim given in similar json format. Provide boolean, Abstractive, Extractive questions."

  try:
      response = client.chat.completions.create(
          messages=[
              {
                  "role": "user",
                  "content": prompt,
              }
          ],
          model="llama3-8b-8192",
          max_tokens=4000,
      )
      response = response.choices[0].message.content.strip()
      response_json = convert_to_json(response)
      return response_json
  except Exception as e:
      print(f"Error processing claim '{claim}': {e}, {response}")
      print("-*"*50)
      return ""

In [27]:
new_data = []

for data in combined_claims[:5008]:
  questions_prompt = []
  claim = data["claim"]
  for question in combined_claims[0]["questions"]:
    q = {"question": question["question"]}
    answer = question["answers"][0]
    #q['answer'] = answer['answer']
    q['questionType'] = answer['answer_type']
    questions_prompt.append(q)
  response = get_response_llm(claim, questions_prompt)
  data = {'claim': claim, 'generatedQuestions': response, 'goldQuestions': questions_prompt}
  new_data.append(data)

In [30]:
new_data

[{'claim': 'BJP workers distributing liquor in Telangana at the party’s meet where PM addressed over one lakh people.',
  'generatedQuestions': [{'question': "Were BJP workers seen distributing liquor during the party event in Telangana?'",
    'questionType': 'Boolean'},
   {'question': "What is the reported eyewitness account of the liquor distribution at the BJP rally in Telangana?'",
    'questionType': 'Abstractive'},
   {'question': "Did the PM address one lakh people at the BJP rally in Telangana, and was liquor distributed during the event?'",
    'questionType': 'Extractive'},
   {'question': "Were there any reports of liquor distribution among attendees at the BJP rally in Telangana?'",
    'questionType': 'Boolean'},
   {'question': "Does the claim mention the exact date and location of the liquor distribution incident at the BJP rally in Telangana?'",
    'questionType': 'Abstractive'},
   {'question': "What was the nature of the items being distributed by BJP workers at th

In [31]:
import numpy as np

In [37]:
hungarian_meteor_scores = []

def calculate_meteor_scores(generated_sequences, reference_sequences):
    """
    Calculate the METEOR scores between generated sequences and reference sequences.

    :param generated_sequences: List of generated sequences (list of strings)
    :param reference_sequences: List of reference sequences (list of strings)
    :return: A 2D list of METEOR scores where scores[i][j] is the score between generated_sequences[i] and reference_sequences[j]
    """
    scores = []
    for gen_seq in generated_sequences:
        row = []
        for ref_seq in reference_sequences:
            score = meteor_score([ref_seq], gen_seq)
            row.append(score)
        scores.append(row)
    return scores

def hungarian_meteor(generated_sequences, reference_sequences):
    """
    Apply the Hungarian Algorithm to find the optimal matching between generated sequences and reference sequences.

    :param generated_sequences: List of generated sequences (list of strings)
    :param reference_sequences: List of reference sequences (list of strings)
    :return: A list of tuples where each tuple contains the index of the generated sequence and the index of the reference sequence it is matched to
    """

    # Tokenize the sequences

    generated_sequences = [word_tokenize(seq) for seq in generated_sequences]
    reference_sequences = [word_tokenize(seq) for seq in reference_sequences]

    # Calculate the METEOR scores
    scores = calculate_meteor_scores(generated_sequences, reference_sequences)

    # Convert the scores to a cost matrix (negative scores because Hungarian Algorithm minimizes cost)
    cost_matrix = [[-score for score in row] for row in scores]

    # Apply the Hungarian Algorithm to find the optimal assignment
    try:
      row_ind, col_ind = linear_sum_assignment(cost_matrix)
    except ValueError as e:
      print(f"ravi,{generated_sequences},{reference_sequences},{scores}")
    # Calculate the total score using the Hungarian algorithm
    total_score = 0
    for i, j in zip(row_ind, col_ind):
        total_score += scores[i][j]

    # Normalize the total score by the number of reference sequences
    total_score = total_score / len(reference_sequences)

    # Best score for each generated question (max score across all references)
    best_scores = [max(row) for row in scores]

    return total_score, best_scores, list(zip(row_ind, col_ind))


for data in new_data:
  generated_sequences = [question['question'] for question in data['generatedQuestions'] if question.get("question")]
  reference_sequences = [question['question'] for question in data['goldQuestions']]
  if not generated_sequences or not reference_sequences:
      continue
  total_score, best_scores, optimal_matching = hungarian_meteor(generated_sequences, reference_sequences)

  # Print the best METEOR score for each generated question
  for i, score in enumerate(best_scores):
      print(f"Generated Question {i+1}: {''.join(generated_sequences[i])}")
      print(f"Best METEOR Score: {score:.4f}")

  # Print the total score (calculated from Hungarian matching)
  hungarian_meteor_scores.append(total_score)
  print(f"Total METEOR Score (normalized): {total_score:.4f}")

  # Optional: If you want to print the optimal matching from the Hungarian algorithm
  print("Optimal Matching from Hungarian Algorithm:")
  for i, j in optimal_matching:
      print(f"Generated Question {i+1} matched with Reference Question {j+1}")
  print("-" * 50)

# Calculate the average of all total METEOR scores
average_meteor_score = sum(hungarian_meteor_scores) / len(hungarian_meteor_scores) if hungarian_meteor_scores else 0
print(f"\nAverage METEOR Score: {average_meteor_score:.4f}")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Best METEOR Score: 0.3384
Generated Question 3: Did the incident occur during a specific event or rally in Amethi?'
Best METEOR Score: 0.3577
Generated Question 4: Is Amethi a location mentioned in the official investigation or police report of the incident?'
Best METEOR Score: 0.2976
Generated Question 5: Are there any eyewitness accounts or recordings that corroborate the claim of the former army captain being beaten to death in Amethi?'
Best METEOR Score: 0.2818
Generated Question 6: Is the former army captain a well-known public figure or has he been involved in any notable incidents before?'
Best METEOR Score: 0.1351
Generated Question 7: Did the authorities issue a statement or release a report confirming the death of the former army captain?'
Best METEOR Score: 0.2466
Total METEOR Score (normalized): 0.2797
Optimal Matching from Hungarian Algorithm:
Generated Question 2 matched with Reference Question 2
Generated Q