Imports and installs

In [None]:
!pip install langchain==0.1.9 --quiet

In [None]:
!pip install -U langchain-community

In [None]:
!pip install pydantic==1.10.8

In [None]:
!pip install wikiextractor

In [None]:
!pip install ragatouille

In [None]:
!pip install chromadb

In [None]:
!pip install datasets

In [None]:
!unzip nlp_proj.zip

In [None]:
!pip install evaluate

In [None]:
!pip install rouge_score

In [None]:
import numpy as np
from evaluate import load

In [None]:
# Mount Google Drive at /content/drive so later cells can read files stored
# there (the `google.colab` module is only importable inside a Colab runtime).
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import os
import json
import pandas as pd
import numpy as np
import csv
import datasets
from datetime import datetime
from typing import Optional
from sklearn.metrics.pairwise import cosine_similarity
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain_core.vectorstores import VectorStore
from transformers import T5Tokenizer, T5ForConditionalGeneration
from langchain.llms import HuggingFacePipeline
from transformers import GenerationConfig, pipeline
from huggingface_hub import InferenceClient
from tqdm.auto import tqdm
from langchain.vectorstores import Chroma
from ragatouille import RAGPretrainedModel
from langchain.schema.retriever import BaseRetriever
from langchain.prompts import PromptTemplate
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain

In [None]:
def parse_dict(d):
    """Flatten a (possibly nested) mapping into one space-separated string.

    Each entry is rendered as ``"<key> <value>"``:

    * a ``dict`` value is flattened recursively,
    * a ``list`` value has its items joined with single spaces,
    * any other value is converted with ``str()``.

    Entries are joined with a single space so that the value of one entry
    never runs into the key of the next one.

    Args:
        d: Mapping to flatten.

    Returns:
        The flattened text; an empty string for an empty mapping.
    """
    parts = []
    for k, v in d.items():
        # The three cases are mutually exclusive: a dict must not also fall
        # through to the generic str() branch, or it would be emitted twice.
        if isinstance(v, dict):
            rendered = parse_dict(v)
        elif isinstance(v, list):
            rendered = " ".join(f"{i}" for i in v)
        else:
            rendered = str(v)
        parts.append(f"{k} {rendered}")
    return " ".join(parts)

In [None]:
def process_json(json_file):
  """Turn one JSON dump into a list of ``Document`` objects.

  The file is expected to hold an iterable of records, each with an
  ``'infobox'`` mapping and a ``'timestamp'`` string.  For every record the
  infobox is flattened with ``parse_dict`` to form the page content, and the
  first ten characters of the timestamp are stored as ``date_created``
  metadata.

  Args:
    json_file: Path of the JSON file to read.

  Returns:
    One ``Document`` per record, in file order.
  """
  with open(json_file) as handle:
    records = json.load(handle)

  return [
    Document(
      page_content=parse_dict(record['infobox']),
      metadata={'date_created': record['timestamp'][:10]},
    )
    for record in records
  ]

In [None]:
# Splitter used before indexing: chunk_size=1024 with chunk_overlap=128 between
# neighbouring chunks (sizes are measured by the splitter's length function).
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=128)

In [None]:
# Embedding model shared by the vector store and the retriever.  No model name
# is passed, so HuggingFaceEmbeddings falls back to its own default; the model
# is placed on the GPU ('cuda').
model_kwargs = {'device': 'cuda'}
embeddings = HuggingFaceEmbeddings(model_kwargs=model_kwargs)

In [None]:
# Chroma store backed by the /content/nlp_proj directory; `embeddings` is the
# function it uses to embed documents that are added to it.
vectorstore = Chroma(persist_directory='/content/nlp_proj', embedding_function=embeddings)

The code below is only needed for the first-time setup of the vector store; once the store has been persisted, we simply load it from the persist directory.

In [None]:
"""vectorstore = Chroma(persist_directory='/content/nlp_proj', embedding_function=embeddings)

In [None]:
"""rootdir = ('/content/text')

text_info = []
for subdir, dirs, files in os.walk(rootdir):
    for file in files:
        filepath = subdir + os.sep + file
        text_info = text_info + process_json(filepath)"""

In [None]:
# Directory (on the mounted Drive) that the ingestion loop walks for JSON files.
rootdir = '/content/drive/MyDrive/jsons'

In [None]:
# Ingest every file found under `rootdir`: parse it into documents, split the
# documents into chunks, add the chunks to the vector store and persist the
# store after each file so progress survives an interrupted run.
for folder, _subfolders, filenames in os.walk(rootdir):
    for filename in filenames:
        print("Preparing file: ", filename)
        parsed_documents = process_json(folder + os.sep + filename)
        chunks = text_splitter.split_documents(parsed_documents)
        vectorstore.add_documents(documents=chunks)
        vectorstore.persist()

Retriever

In [None]:
from langchain.vectorstores import VectorStore
from langchain.chat_models import ChatOpenAI
from langchain.schema.retriever import BaseRetriever
from scipy.spatial.distance import cdist
import time
import datetime

class CustomVectorStoreRetriever(BaseRetriever):
    """Brute-force retriever with an optional recency bonus.

    Every chunk stored in ``vector_store`` is scored by cosine similarity to
    the embedded query.  When ``include_temp`` is true, a recency score derived
    from each chunk's ``date_created`` metadata is rescaled to the mean and
    standard deviation of the similarity scores and added on top.  The
    ``top_k`` best chunks are returned.
    """

    def time_score(self, metadatas, query_t, v_mean, v_std, alpha=0.2):
        """Return one recency score per document, rescaled to a target distribution.

        The raw score of a document is ``alpha / (query time - document time)``
        with both times in seconds.  The raw scores are standardised (zero
        mean, unit standard deviation) and then mapped to mean ``v_mean`` and
        standard deviation ``v_std``.

        Args:
            metadatas: One metadata dict per document, each holding a
                ``'date_created'`` string in ``YYYY-MM-DD`` form; a ``None``
                entry is treated as dated 1970-01-01.
            query_t: Reference date of the query, ``YYYY-MM-DD``.
            v_mean: Mean the returned scores should have.
            v_std: Standard deviation the returned scores should have.
            alpha: Numerator of the raw recency score.

        Returns:
            A list of floats aligned with ``metadatas`` (empty for no documents).
        """
        date_format = "%Y-%m-%d"

        def to_epoch(day):
            return time.mktime(datetime.datetime.strptime(day, date_format).timetuple())

        q_time = to_epoch(query_t)
        d_time = np.array(
            [to_epoch('1970-01-01' if meta is None else meta['date_created']) for meta in metadatas],
            dtype=float,
        )
        if d_time.size == 0:
            return []

        elapsed = q_time - d_time
        # Dates only have day resolution: a document dated on the query day is
        # treated as one day old instead of dividing by zero.
        # NOTE(review): documents dated after query_t still get a negative raw
        # score, as before - confirm whether they should be clamped as well.
        elapsed[elapsed == 0] = 86400.0
        ts = alpha / elapsed

        spread = ts.std()
        if spread == 0:
            # All documents are equally recent: the recency term carries no
            # ranking information, so every document gets the target mean.
            return [float(v_mean)] * int(ts.size)

        # Standardise with (x - mean) / std, then map onto the target mean/std.
        return ((ts - ts.mean()) / spread * v_std + v_mean).tolist()

    def cosine_sim_score(self, query_vector, doc_vector):
        """Return the cosine similarity between a query and one or more documents.

        Args:
            query_vector: 1-D embedding of the query.
            doc_vector: Either a single 1-D document embedding or a 2-D array
                with one document embedding per row.

        Returns:
            A 2-D array of shape ``(1, number of documents)``.
        """
        query_matrix = np.asarray(query_vector).reshape(1, -1)
        doc_matrix = np.atleast_2d(np.asarray(doc_vector))
        # cdist returns the cosine *distance*; similarity is its complement.
        return 1. - cdist(query_matrix, doc_matrix, 'cosine')

    # Store whose contents are ranked; it must offer ``get(include=[...])``.
    vector_store: VectorStore
    # Kept for backward compatibility: alias of ``time_score``.
    vector_temp_score_fn = time_score
    # Reference date (YYYY-MM-DD) against which document recency is measured.
    query_date: str = '2025-01-12'
    # Number of documents to return.
    top_k: int
    # Whether to add the recency score to the cosine similarity.
    include_temp: bool = False

    def _get_relevant_documents(self, query):
        """Return the ``top_k`` stored chunks ranked for ``query``.

        The query is embedded with the module-level ``embeddings`` object.  The
        returned documents carry the chunk text only (no metadata).
        """
        query_vector = embeddings.embed_query(query)

        # One round trip for everything, so the three lists are guaranteed to
        # be aligned with each other.
        stored = self.vector_store.get(include=['embeddings', 'documents', 'metadatas'])
        doc_embeddings = stored['embeddings']
        doc_texts = stored['documents']
        metadatas = stored['metadatas']

        if doc_embeddings is None or len(doc_embeddings) == 0:
            return []

        scores = self.cosine_sim_score(query_vector, doc_embeddings)[0]

        if self.include_temp:
            time_vals = self.time_score(metadatas, self.query_date, np.mean(scores), np.std(scores))
            scores = scores + np.asarray(time_vals)

        # Highest score first; the stable sort keeps store order among ties.
        best = np.argsort(-scores, kind='stable')[:self.top_k]
        return [Document(page_content=doc_texts[i]) for i in best]


Model

In [None]:
# Prompt sent to the language model.  `{context}` and `{question}` are the two
# placeholders filled in by the chain; the remaining markers (<|system|>,
# <|user|>, <|assistant|>, </s>) are literal text of the template.
prompt_template = """
<|system|></s>
<|user|>
Additional context:
{context}
---
Here is the question you need to answer.

Question: {question}
</s>
<|assistant|>
"""

# Template object declaring the two variables used in the text above.
prompt = PromptTemplate(template=prompt_template, input_variables=["context", "question"])

In [None]:
from functools import lru_cache

from langchain.chains import RetrievalQA


@lru_cache(maxsize=1)
def _load_llm(model_name):
    """Build the text2text-generation pipeline for ``model_name``, wrapped for LangChain.

    The result is cached so that answering many questions with the same model
    does not reload the tokenizer and weights for every question.
    """
    tokenizer = T5Tokenizer.from_pretrained(model_name)
    model = T5ForConditionalGeneration.from_pretrained(model_name)
    gen_cfg = GenerationConfig.from_pretrained(model_name)

    pipe = pipeline(
        task="text2text-generation",
        model=model,
        tokenizer=tokenizer,
        generation_config=gen_cfg,
    )
    return HuggingFacePipeline(pipeline=pipe)


def answer_question(question, model_name = "google/flan-t5-large", k = 1, prompt = prompt, reranker: Optional[RAGPretrainedModel] = None):
    """Answer ``question`` with retrieval-augmented generation.

    Args:
        question: The question to answer.
        model_name: Hugging Face id of the checkpoint to load with the T5
            tokenizer/model classes.
        k: Currently unused - the retriever is always built with ``top_k=10``.
        prompt: Prompt template handed to the "stuff" chain.
        reranker: Currently unused.

    Returns:
        The dictionary produced by the chain; the generated answer is stored
        under the ``'result'`` key.
    """
    # NOTE(review): `k` and `reranker` are accepted but not wired in; top_k
    # stays at 10 so existing results remain reproducible.
    llm = _load_llm(model_name)

    chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=CustomVectorStoreRetriever(vector_store=vectorstore, top_k=10, include_temp=True),
        chain_type_kwargs={"prompt": prompt},
    )
    # Use the function's own argument rather than a module-level variable.
    return chain.invoke(question)

Obtaining results

In [None]:
# Load the evaluation questions.  The context manager closes the file even if
# parsing fails, and JSON is read explicitly as UTF-8.
with open('test_TLQA.json', encoding='utf-8') as f:
    data = json.load(f)

# One question per record, in file order.
questions_base = [entry['question'] for entry in data]

In [None]:
# Answer every question in order, echoing and collecting the stripped answers.
# The loop variable is deliberately named `query`: it stays visible at module
# level after (and during) the loop.
results = []
for query in questions_base:
  answer = answer_question(query)['result'].strip()
  print(answer)
  results.append(answer)

In [None]:
# Persist the answers, exactly one per line.  Line breaks inside an answer are
# collapsed to spaces: the file is later read back line by line, so an embedded
# newline would shift every following answer by one position.
with open('results_k10_yes.txt', 'w') as file:
    for item in results:
        file.write(" ".join(item.splitlines()) + "\n")

In [None]:
import json
# Load the reference answers.  The context manager closes the file even if
# parsing fails, and JSON is read explicitly as UTF-8.
# NOTE(review): the records are assumed to be in the same order as the
# questions loaded from 'test_TLQA.json' - confirm.
with open('test_processed.json', encoding='utf-8') as f:
    data = json.load(f)

# One reference answer per record, in file order.
actual_answers = [entry['output'] for entry in data]

In [None]:
# Read the saved predictions back: one answer per line, with surrounding
# whitespace (including the trailing newline) removed.
predicted_answers = []
with open('results_k10_yes.txt', 'r') as file:
    predicted_answers.extend(line.strip() for line in file)

Evaluation

In [None]:
def parse_entities_and_timelines(output):
    """Split an answer string into ``(entity, timeline)`` pairs.

    The string is cut at every comma.  A piece that contains both ``(`` and
    ``)`` is divided at its last ``(``: the text before it (stripped) is the
    entity and the text after it, with leading/trailing ``)`` characters
    removed, is the timeline.  Any other piece becomes ``(piece, None)``.

    Args:
        output: Answer text such as ``"A (2001-2003), B"``.

    Returns:
        A list of ``(entity, timeline)`` tuples in order of appearance.
    """
    def split_piece(raw):
        text = raw.strip()
        if "(" not in text or ")" not in text:
            return (text, None)
        name, _, span = text.rpartition("(")
        return (name.strip(), span.strip(")"))

    return [split_piece(piece) for piece in output.split(",")]

In [None]:
# Thin wrapper around the `evaluate` BLEU and ROUGE metrics
class TLQAMetrics:
    def evaluate_predictions(self, predictions, references):
        """Score `predictions` against `references` with BLEU and ROUGE.

        Both metrics are loaded through `load` and computed on the same
        inputs.  The return value is a dict with the raw BLEU result under
        "BLEU" and the raw ROUGE result under "ROUGE".
        """
        # Load both metrics first, then compute them in the same order.
        loaded = {label: load(metric_id) for label, metric_id in (("BLEU", 'bleu'), ("ROUGE", 'rouge'))}

        scores = {}
        for label, metric in loaded.items():
            scores[label] = metric.compute(predictions=predictions, references=references)
        return scores

# Shared evaluator instance used by the evaluation script
metrics = TLQAMetrics()

In [None]:
# Entity/timeline evaluation of the predictions against the reference answers.
#
# For every sample the answer strings are parsed into (entity, timeline) pairs,
# entity-level precision/recall/F1 are computed, and timelines are compared for
# the entities found in both.  Per-sample details plus macro averages and the
# BLEU/ROUGE scores go to the "macro" file, pooled (micro) metrics to the
# "micro" file.

# Output locations, defined once so the final message always names the files
# that were actually written.
macro_output_path = "k10_yes_evaluation_output_macro.txt"
micro_output_path = "k10_yes_evaluation_output_micro.txt"

# Fail with a readable message instead of an IndexError halfway through.
if len(predicted_answers) < len(actual_answers) or len(questions_base) < len(actual_answers):
    raise ValueError(
        f"Need a question and a prediction for each of the {len(actual_answers)} reference answers, "
        f"got {len(questions_base)} questions and {len(predicted_answers)} predictions."
    )

total_true_positives = 0
total_false_positives = 0
total_false_negatives = 0
total_timeline_matches = 0
total_timeline_mismatches = 0
total_ground_truth_entities = 0

# Lists to store sample-wise (macro) metrics
sample_precisions = []
sample_recalls = []
sample_f1s = []
timeline_matches = []
timeline_mismatches = []

# Lists for BLEU and ROUGE references and predictions
references = []
sample_predictions = []

# Prepare output lines for sample-wise and global results
macro_output_lines = []
micro_output_lines = []


for i, (actual_answer, predicted_answer) in enumerate(zip(actual_answers, predicted_answers)):
    # Parse ground truth and prediction
    ground_truth = set(parse_entities_and_timelines(actual_answer))
    predicted = set(parse_entities_and_timelines(predicted_answer))

    # Extract entities and timelines separately
    ground_truth_entities = {entity for entity, _ in ground_truth}
    predicted_entities = {entity for entity, _ in predicted}

    # Every timeline predicted for each entity.  An entity may be predicted
    # more than once; comparing against all of its timelines keeps the result
    # independent of set iteration order.
    predicted_timelines = {}
    for entity, timeline in predicted:
        predicted_timelines.setdefault(entity, set()).add(timeline)

    # Add reference and prediction for BLEU/ROUGE
    references.append(actual_answer)
    sample_predictions.append(predicted_answer)

    # Calculate matches
    true_positives = ground_truth_entities & predicted_entities
    false_positives = predicted_entities - ground_truth_entities
    false_negatives = ground_truth_entities - predicted_entities

    # Precision, recall, F1 for the current sample
    precision = len(true_positives) / (len(true_positives) + len(false_positives)) if len(true_positives) + len(false_positives) > 0 else 0
    recall = len(true_positives) / (len(true_positives) + len(false_negatives)) if len(true_positives) + len(false_negatives) > 0 else 0
    f1 = 2 * (precision * recall) / (precision + recall) if precision + recall > 0 else 0

    # Append sample-wise metrics
    sample_precisions.append(precision)
    sample_recalls.append(recall)
    sample_f1s.append(f1)

    # Evaluate timelines for matched entities
    sample_timeline_matches = 0
    sample_timeline_mismatches = 0
    for entity, timeline in ground_truth:
        if entity in predicted_timelines:
            if timeline in predicted_timelines[entity]:
                sample_timeline_matches += 1
            else:
                sample_timeline_mismatches += 1

    timeline_matches.append(sample_timeline_matches)
    timeline_mismatches.append(sample_timeline_mismatches)

    # Aggregate metrics for micro-averaging
    total_true_positives += len(true_positives)
    total_false_positives += len(false_positives)
    total_false_negatives += len(false_negatives)
    total_timeline_matches += sample_timeline_matches
    total_timeline_mismatches += sample_timeline_mismatches
    total_ground_truth_entities += len(ground_truth_entities)

    # Prepare sample-wise evaluation details for macro output
    macro_output_lines.append(f"Sample {i + 1}:")
    macro_output_lines.append(f"Input: {questions_base[i]}")
    macro_output_lines.append(f"Ground Truth: {ground_truth}")
    macro_output_lines.append(f"Prediction: {predicted}")
    macro_output_lines.append(f"Precision: {precision:.4f}")
    macro_output_lines.append(f"Recall: {recall:.4f}")
    macro_output_lines.append(f"F1-Score: {f1:.4f}")
    macro_output_lines.append(f"Timeline Matches: {sample_timeline_matches}")
    macro_output_lines.append(f"Timeline Mismatches: {sample_timeline_mismatches}")
    macro_output_lines.append("")

# Macro averages; an empty run reports 0 (like the other guarded ratios below)
# instead of NaN.
macro_precision = np.mean(sample_precisions) if sample_precisions else 0
macro_recall = np.mean(sample_recalls) if sample_recalls else 0
macro_f1 = np.mean(sample_f1s) if sample_f1s else 0

# Calculate overall timeline accuracy for macro results
# NOTE(review): this pools the counts over all samples, so it equals the micro
# timeline accuracy computed below - confirm that this is intended.
macro_timeline_accuracy = sum(timeline_matches) / (sum(timeline_matches) + sum(timeline_mismatches)) if sum(timeline_matches) + sum(timeline_mismatches) > 0 else 0

# Append macro-averaged metrics to macro output
macro_output_lines.append("Global Macro Metrics:")
macro_output_lines.append(f"Macro Precision (Entities): {macro_precision:.4f}")
macro_output_lines.append(f"Macro Recall (Entities): {macro_recall:.4f}")
macro_output_lines.append(f"Macro F1-Score (Entities): {macro_f1:.4f}")
macro_output_lines.append(f"Macro Timeline Accuracy: {macro_timeline_accuracy:.4f}")

# Calculate global (micro) metrics
micro_precision = total_true_positives / (total_true_positives + total_false_positives) if total_true_positives + total_false_positives > 0 else 0
micro_recall = total_true_positives / (total_true_positives + total_false_negatives) if total_true_positives + total_false_negatives > 0 else 0
micro_f1 = 2 * (micro_precision * micro_recall) / (micro_precision + micro_recall) if micro_precision + micro_recall > 0 else 0

micro_timeline_accuracy = total_timeline_matches / (total_timeline_matches + total_timeline_mismatches) if total_timeline_matches + total_timeline_mismatches > 0 else 0
completeness = total_true_positives / total_ground_truth_entities if total_ground_truth_entities > 0 else 0

# Append global metrics to micro output
micro_output_lines.append("Global Micro Metrics:")
micro_output_lines.append(f"Micro Precision (Entities): {micro_precision:.4f}")
micro_output_lines.append(f"Micro Recall (Entities): {micro_recall:.4f}")
micro_output_lines.append(f"Micro F1-Score (Entities): {micro_f1:.4f}")
micro_output_lines.append(f"Micro Timeline Accuracy: {micro_timeline_accuracy:.4f}")
micro_output_lines.append(f"Completeness: {completeness:.4f}")

# Evaluate BLEU and ROUGE scores
bleu_rouge_results = metrics.evaluate_predictions(sample_predictions, references)

# Append BLEU and ROUGE to macro output
macro_output_lines.append("Global BLEU and ROUGE Metrics:")
macro_output_lines.append(f"BLEU: {bleu_rouge_results['BLEU']}")
macro_output_lines.append(f"ROUGE: {bleu_rouge_results['ROUGE']}")

# Save macro results to a file
with open(macro_output_path, "w", encoding="utf-8") as f:
    f.write("\n".join(macro_output_lines))

# Save micro results to a separate file
with open(micro_output_path, "w", encoding="utf-8") as f:
    f.write("\n".join(micro_output_lines))

print(f"Evaluation results saved to '{macro_output_path}' and '{micro_output_path}'.")