In [None]:
# Clone the BARTScore repository
!git clone https://github.com/neulab/BARTScore.git

In [None]:
# Make the cloned repository the working directory; the next cell imports
# `bart_score` from it and later cells unzip the BLEURT checkpoint here.
%cd /content/BARTScore

In [None]:
# Import libraries
from bart_score import BARTScorer
import torch

In [None]:
# Install necessary libraries
!pip install --upgrade pip  # ensures that pip is current
!pip install tensorflow tensorflow_hub transformers
!pip install git+https://github.com/google-research/bleurt.git
!pip install pandas
!pip install groq
!pip install scikit-learn
!pip install matplotlib
!pip install nltk
!pip install rouge
!pip install sacrebleu
!pip install bert-score
!pip install rouge
# Navigate to the repository directory
%cd /content/BARTScore

# Download and unzip BLEURT checkpoint
!wget https://storage.googleapis.com/bleurt-oss-21/BLEURT-20.zip
!unzip BLEURT-20.zip

# Initialize BART-Scorer
bart_scorer = BARTScorer(device='cuda:0' if torch.cuda.is_available() else 'cpu',
                         checkpoint='facebook/bart-large-cnn')

!pip install groq
!pip install sentence-transformers
!pip install chromadb
!pip install pymongo pandas
!pip install transformers torch accelerate bitsandbytes
!pip install --upgrade transformers
!pip install rouge
!pip install google-search-results
!pip install requests beautifulsoup4 lxml

In [None]:
import traceback
import requests
from bs4 import BeautifulSoup
from sklearn.metrics.pairwise import cosine_similarity
import random
import nltk
from groq import Groq
import pandas as pd
import signal
import time
import re
from sentence_transformers import SentenceTransformer

import tensorflow as tf
import tensorflow_hub as hub
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from sacrebleu.metrics import CHRF, TER
from bert_score import score
from bleurt import score as bleurt_score
import json
import random
import warnings
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, f1_score
import matplotlib.pyplot as plt
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge import Rouge
from nltk.translate.meteor_score import meteor_score

# Fetch the NLTK data packages this notebook requests: the 'punkt' and
# 'punkt_tab' tokenizer data and the 'wordnet' corpus.
# NOTE(review): presumably 'wordnet' is what meteor_score relies on; no visible
# cell calls an NLTK tokenizer, so confirm the punkt packages are still needed.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')

In [None]:
import logging
import pandas as pd
import random
import requests
from bs4 import BeautifulSoup
import re
import nltk
import signal
import time
import traceback
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# Suppress specific warning messages: raise the threshold of the
# "transformers.modeling_utils" logger to ERROR, so records below that level
# coming from that logger are no longer emitted.
logging.getLogger("transformers.modeling_utils").setLevel(logging.ERROR)

# Function to read and combine datasets
def load_and_combine_datasets(file_paths):
    """Read several JSON datasets and merge them into one shuffled DataFrame.

    Each path is parsed with ``pd.read_json`` and its rows are collected as
    plain dict records. A file that cannot be read is reported on stdout and
    skipped, so one bad path does not abort the whole load.

    Args:
        file_paths: Iterable of paths to JSON files.

    Returns:
        A ``pd.DataFrame`` holding the records of every readable file, in
        random order (shuffled with the global ``random`` state).
    """
    records = []
    for file_path in file_paths:
        try:
            frame = pd.read_json(file_path)
            records += frame.to_dict(orient='records')
        except Exception as e:
            print(f"Error reading {file_path}: {e}")

    # Mix the rows of the different source files before building the frame.
    random.shuffle(records)
    combined = pd.DataFrame(records)
    return combined

# File paths for datasets
file_paths = ["/content/last_fake_claims.json", "/content/Modified_true_claims.json"]

# Load and combine datasets
df = load_and_combine_datasets(file_paths)

# Initialize the Groq API client.
# SECURITY: the API key must never be stored in the notebook. It is read from
# the GROQ_API_KEY environment variable, falling back to an interactive prompt.
# The key that was previously hardcoded at this spot has been exposed and
# should be revoked.
import os
from getpass import getpass

api_key = os.environ.get("GROQ_API_KEY") or getpass("Groq API key: ")
client = Groq(api_key=api_key)

# Label map.
# get_claim_predictions looks labels up *after* stripping every non-letter and
# lower-casing the model's first line, so a key can only ever match if it is
# written in that normalised form. The normalised variants are listed first;
# the original spellings are kept for any code that looks up raw strings.
label_map = {
    'true': 'true',
    'fake': 'fake',
    "correct": "true",
    "false": "fake",
    "incorrect": "fake",
    "nottrue": "fake",
    "answerfake": "fake",
    "answertrue": "true",
    "not true": "fake",
    "Answer: FAKE": "fake",
    "Answer: TRUE": "true",
}

class TimeoutException(Exception):
    """Raised by the SIGALRM handler when a model call exceeds its time budget."""

def timeout_handler(signum, frame):
    """Signal handler that converts a delivered signal into a TimeoutException.

    Both arguments are supplied by the signal machinery and are not used.
    """
    raise TimeoutException()

# Set the timeout handler: route SIGALRM to timeout_handler, so an alarm armed
# with signal.alarm() interrupts the running code with a TimeoutException.
signal.signal(signal.SIGALRM, timeout_handler)

def get_claim_predictions(claim, contextQuestions, timeout=120, retries=3, retry_delay=30):
    """Ask the Groq-hosted model to label a claim and justify that label.

    The prompt requests 'TRUE' or 'FAKE' on the first line of the reply and a
    justification after it. The first line is reduced to its letters,
    lower-cased and looked up in the module-level ``label_map``; a value that
    is not found there becomes ``"unknown"``. The remaining lines are joined
    with spaces, stripped and lower-cased to form the justification.

    Args:
        claim: Text of the claim to classify.
        contextQuestions: Context string embedded in the prompt.
        timeout: Seconds allowed per attempt, enforced with ``signal.alarm``
            (relies on the SIGALRM handler raising ``TimeoutException``).
        retries: Maximum number of attempts. Only timeouts are retried.
        retry_delay: Seconds to wait after a timed-out attempt before the
            next one.

    Returns:
        ``(prompt, predicted_label, predicted_justification)`` on success, or
        ``("", "", "")`` when every attempt timed out or any other error
        occurred (the error and its traceback are printed).
    """
    for attempt in range(1, retries + 1):
        try:
            # Set the alarm for the timeout
            signal.alarm(timeout)
            try:
                prompt = f"Label the following claim as 'true' or 'fake' from the given context. Answer with either 'TRUE' or 'FAKE' only in the first line. Also add another line for justification from the given context only. Context: {contextQuestions} . Claim:{claim}"
                print(f"Input token length: {len(prompt.split())}")

                # Call the model with the prompt
                chat_completion = client.chat.completions.create(
                    messages=[{"role": "user", "content": prompt}],
                    model="gemma2-9b-it",
                    max_tokens=8000,
                )

                # Get the classification result from the model
                predicted_result = chat_completion.choices[0].message.content.splitlines()
                print(predicted_result)
                predicted_label = re.sub(r'[^a-zA-Z]+', '', predicted_result[0]).lower()
                predicted_label = label_map.get(predicted_label, "unknown")
                predicted_justification = " ".join(predicted_result[1:]).strip().lower()
            finally:
                # Always disarm the alarm, on success *and* on failure. If it
                # were left armed after an API error it would fire later, in
                # unrelated code outside any handler for TimeoutException.
                signal.alarm(0)

            return (prompt, predicted_label, predicted_justification)

        except TimeoutException:
            print(f"Timeout occurred for claim: {claim}, attempt {attempt}")
            if attempt < retries:
                print(f"Retrying after {retry_delay} seconds...")
                time.sleep(retry_delay)
        except Exception as e:
            # Non-timeout failures are not retried; report and give up.
            print(f"Error processing claim '{claim}': {e}")
            traceback.print_exc()
            return ("", "", "")

    return ("", "", "")

# Initialize lists for results
results = []
actual_labels = []
predicted_labels = []
predicted_justifications = []
actual_justifications = []

# Initialize counters for tracking model performance
correct_predictions = 0
n_claims = 5006  # Adjust this to the desired number of claims to process
total_processed_claims = 0  # claims that produced a prediction and were scored
skipped_claims = 0          # claims dropped because the model call failed

# label_map is deliberately not redefined here: the mapping created together
# with the Groq client is the single source of truth for label normalisation.

for index, row in df.head(n_claims).iterrows():
    claim = row['claim']
    # Convert to str *before* normalising, so a non-string cell cannot raise
    # AttributeError on .strip().
    actual_label = str(row['label']).strip().lower()
    actual_justification = str(row['justification']).strip().lower()

    # Build the context: each question (numbered from 1) followed by its
    # answers. The text layout is unchanged; join() only avoids repeated
    # string concatenation.
    questions = row['questions']
    contextQuestions = "".join(
        str(i + 1) + q["question"] + "\nAnswers:" + str(q["answers"])
        for i, q in enumerate(questions)
    )

    prompt, predicted_label, predicted_justification = get_claim_predictions(claim, contextQuestions)

    # Empty strings mean the model call failed. Such claims have no prediction,
    # so they are kept out of the lists *and* out of the accuracy denominator;
    # this keeps accuracy over the same claims as the confusion matrix.
    if (predicted_label, predicted_justification) == ("", ""):
        skipped_claims += 1
        continue

    total_processed_claims += 1

    actual_labels.append(actual_label)
    actual_justifications.append(actual_justification)
    predicted_labels.append(predicted_label)
    predicted_justifications.append(predicted_justification)

    if predicted_label == actual_label:
        correct_predictions += 1

    print(f"Claim #{total_processed_claims}")
    print(f"Prompt: {prompt}")
    print(f"Claim: {claim}")
    print(f"Predicted Label: {predicted_label}")
    print(f"Actual Label: {actual_label}")
    print(f"Predicted Justification: {predicted_justification}")
    print(f"Actual Justification: {actual_justification}")
    print('-' * 50)

# Calculate the overall accuracy of the model (over scored claims only)
accuracy = (correct_predictions / total_processed_claims) * 100 if total_processed_claims > 0 else 0
print(f"Model Accuracy: {accuracy:.2f}%")
print(f"Claims skipped because the model call failed: {skipped_claims}")

# Calculate the confusion matrix
cm = confusion_matrix(actual_labels, predicted_labels, labels=["true", "fake"])

# Display the confusion matrix
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=["true", "fake"])
disp.plot(cmap=plt.cm.Blues)
plt.show()

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Metrics initialization
rouge = Rouge()
smoothing_function = SmoothingFunction().method4
chrf = CHRF()
ter = TER()
# "BLEURT-20" is a relative path: it resolves against the working directory,
# which is where the setup cell unzipped the checkpoint.
bleurt_scorer = bleurt_score.BleurtScorer("BLEURT-20")

# Filter out pairs that cannot be scored: the reference as well as the
# prediction must contain text, because the per-pair scorers (ROUGE in
# particular) reject empty input and would abort the whole evaluation.
filtered_justifications = [
    (actual, predicted)
    for actual, predicted in zip(actual_justifications, predicted_justifications)
    if actual.strip() and predicted.strip()
]

# Calculate metrics
def calculate_metrics_extended(justifications):
    """Score predicted justifications against references with several metrics.

    Args:
        justifications: Iterable of ``(reference, prediction)`` string pairs.

    Returns:
        A tuple of eight lists with one entry per pair, in this order:
        BLEU, ROUGE (the dict returned by ``rouge.get_scores(..., avg=True)``),
        METEOR, ChrF, TER, BERTScore F1, BLEURT and BARTScore.
        All lists are empty when no pair is given.
    """
    # Materialise once so a generator can be used both by the per-pair loop
    # and by the batched BERTScore call below.
    pairs = list(justifications)

    bleu_scores = []
    rouge_scores = []
    meteor_scores = []
    chrf_scores = []
    ter_scores = []
    bert_scores = []
    bleurt_scores = []
    bart_scores = []

    for ref, pred in pairs:
        # BLEU score
        bleu = sentence_bleu([ref.split()], pred.split(), smoothing_function=smoothing_function)
        bleu_scores.append(bleu)

        # ROUGE score
        rouge_score = rouge.get_scores(pred, ref, avg=True)
        rouge_scores.append(rouge_score)

        # METEOR score
        meteor = meteor_score([ref.split()], pred.split())
        meteor_scores.append(meteor)

        # ChrF
        chrf_scores.append(chrf.corpus_score([pred], [[ref]]).score)

        # TER
        ter_scores.append(ter.corpus_score([pred], [[ref]]).score)

        # BLEURT
        bleurt_scores.append(bleurt_scorer.score(references=[ref], candidates=[pred])[0])

        # BARTScore
        bart_scores.append(bart_scorer.score([pred], [ref])[0])

    # BERTScore: one batched call instead of one call per pair. Calling
    # score() inside the loop repeats its model set-up for every pair; the
    # batched call returns one F1 value per (prediction, reference) pair.
    if pairs:
        references = [ref for ref, _ in pairs]
        predictions = [pred for _, pred in pairs]
        _, _, f1 = score(predictions, references, lang="en", device=device)
        bert_scores = f1.tolist()

    return bleu_scores, rouge_scores, meteor_scores, chrf_scores, ter_scores, bert_scores, bleurt_scores, bart_scores

# Calculate metrics
bleu_scores, rouge_scores, meteor_scores, chrf_scores, ter_scores, bert_scores, bleurt_scores, bart_scores = calculate_metrics_extended(filtered_justifications)


def _mean(values):
    """Return the arithmetic mean of `values`, or NaN when there are none.

    Returning NaN keeps the summary printable when no justification pair
    could be scored, instead of stopping with ZeroDivisionError.
    """
    values = list(values)
    return sum(values) / len(values) if values else float("nan")


# Summarize and print results
# (the loop variable is named `entry` so it does not reuse the name of the
# imported bert_score `score` function)
avg_rouge_1 = _mean(entry['rouge-1']['f'] for entry in rouge_scores)
avg_rouge_2 = _mean(entry['rouge-2']['f'] for entry in rouge_scores)
avg_rouge_l = _mean(entry['rouge-l']['f'] for entry in rouge_scores)

print(f"Average BLEU Score: {_mean(bleu_scores):.4f}")
print(f"Average METEOR Score: {_mean(meteor_scores):.4f}")
print(f"Average ChrF Score: {_mean(chrf_scores):.4f}")
print(f"Average TER Score: {_mean(ter_scores):.4f}")
print(f"Average BERTScore F1: {_mean(bert_scores):.4f}")
print(f"Average BLEURT Score: {_mean(bleurt_scores):.4f}")
print(f"Average BARTScore: {_mean(bart_scores):.4f}")
print(f"Average ROUGE-1 F1 Score: {avg_rouge_1:.4f}")
print(f"Average ROUGE-2 F1 Score: {avg_rouge_2:.4f}")
print(f"Average ROUGE-L F1 Score: {avg_rouge_l:.4f}")

# Calculate accuracy
accuracy = (correct_predictions / total_processed_claims) * 100 if total_processed_claims > 0 else 0
print(f"Model Accuracy: {accuracy:.2f}%")

# Calculate the F1 score (weighted over the two real classes; any other
# predicted value, e.g. "unknown", is not one of the scored labels)
f1 = f1_score(actual_labels, predicted_labels, average='weighted', labels=["true", "fake"])
print(f"F1 Score Average: {f1}")

# Confusion matrix
cm = confusion_matrix(actual_labels, predicted_labels, labels=["true", "fake"])
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=["true", "fake"])
disp.plot(cmap=plt.cm.Blues)
plt.show()