In [6]:
# Load API keys securely
from dotenv import load_dotenv
import os
load_dotenv()

os.environ["RITS_API_KEY"] = os.getenv("RITS_API_KEY")
print (os.environ["RITS_API_KEY"])
# Core libraries
import json
import os
import re
import sys
import io
import time
import random
import requests
import operator
import numpy as np
import pandas as pd
from typing import TypedDict, Annotated, List
from concurrent.futures import ThreadPoolExecutor
from multiprocessing import Pool
import math
# NLP + date parsing
import dateparser
import tiktoken

# PDF processing
import fitz  # PyMuPDF
from pdfminer.high_level import extract_text

# HTML parsing
from bs4 import BeautifulSoup
import ast
# Text preprocessing
import swifter
from langchain.text_splitter import RecursiveCharacterTextSplitter

# LangChain: modular imports (latest versions)
from langchain_openai import ChatOpenAI, OpenAI
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langgraph.graph import StateGraph, END
from serpapi import GoogleSearch
import pickle


315a8d1abd944e888591cb7a058a50ae


In [1]:
from langchain.output_parsers import ResponseSchema
from langchain.output_parsers import StructuredOutputParser

In [7]:
# %%bash
# echo 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 > .huggingface

In [None]:
from huggingface_hub import login

# Paste your Hugging Face token here


In [9]:
import tiktoken




# Hi this is my text

In [10]:
rits_api_key = os.environ["RITS_API_KEY"]
TIMEOUT = 40

model_name = 'deepseek-ai/DeepSeek-V3'
# find the map of huggingface model name
def get_rits_model_list():
    url = "https://rits.fmaas.res.ibm.com/ritsapi/inferenceinfo"
    response = requests.get(url, headers={"RITS_API_KEY": rits_api_key})
    if response.status_code == 200:
        return {m["model_name"]: m["endpoint"] for m in response.json()}
    else:
        raise Exception(f"Failed getting RITS model list:\n\n{response.text}")


# basic model definitions
print("getting models")
# model_id = 'deepseek-ai/DeepSeek-V3'
# model_id='meta-llama/llama-4-scout-17b-16e'
# #model_id='meta-llama/llama-3-3-70b-instruct-embeddings'
# model_id="ibm-granite/granite-3.1-8b-instruct"
# model_id='mistralai/mixtral-8x22B-instruct-v0.1'

model_urls = get_rits_model_list()

# url = 'https://inference-3scale-apicast-production.apps.rits.fmaas.res.ibm.com/deepseek-v3'
url = model_urls[model_id]
url

getting models


NameError: name 'model_id' is not defined

In [None]:
from transformers import AutoTokenizer#, LlamaTokenizer

encoding = AutoTokenizer.from_pretrained("mistralai/Mixtral-8x22B-Instruct-v0.1")
os.environ["TOKENIZERS_PARALLELISM"] = "true"


In [None]:


# === Define the state schema ===

class State(TypedDict):
    text: str
    topic: str
    questions: Annotated[list[str], operator.add]
    claim: str
    date: str
    search_query: str    
    search_results: List[str]
    summaries: str
    num_sentences: int
    relevant_sentences: List[str]
    verifier_result: str
    generated_evidence: List[str]
    qa_pairs: str
    
# === Initialize the language model ===
# === LLM setup ===
llm = ChatOpenAI(
    model=model_id,
    temperature=0,
    api_key="/",
    base_url=f'{url}/v1',
    default_headers={'RITS_API_KEY': os.getenv("RITS_API_KEY")},
    timeout=TIMEOUT
)


# === Define the fixed list of topics ===

# === Classifier chain ===
classifier_prompt = PromptTemplate(
    input_variables=["text", "topic_list"],
    template=(
        "You are a topic classifier. Given the text below, classify it "
        "into one of these topics: {topic_list}.\n\n"
        "Put the result in **, as in **result**"
        "Text: \"{text}\"\n"
        "Topic:"
    ),
)
classifier_chain = classifier_prompt | llm



In [None]:
# === Analyzer chains: break claim into verifiable questions ===
def make_question_generator_chain(topic):
    prompt = PromptTemplate(
        input_variables=["text"],
        template=(
            f"You are an expert in {topic}. Given the following claim, break it down into individual verifiable questions. "
            f"Every phrase or assertion should become a question that can be fact-checked.\n\n"
            f"Do not generate questions that inquire about information that is not mentioned in the question.\n\n"
            f"Do not generate questions that cannot be used to verify or refute the claim.\n\n"
            f"Claim: \"{{text}}\"\n\n"
            f"Questions:"
        )
    )
    return prompt | llm  

analyzer_chains = {topic: make_question_generator_chain(topic) for topic in topics}

# === Define LangGraph nodes ===
def classifier_node(state: State) -> dict:
    text = state['text']
    result = classifier_chain.invoke({"text": text, "topic_list": ", ".join(topics)})
    result = result.content.strip()
    results = re.findall(r'\*\*(.*)\*\*', result)
    if results:
        topic = results[-1]
    else:
        topic = 'Unspecified topic'
    
    return {"topic": topic}

def analyzer_node(state: State) -> dict:
    topic = state['topic']
    text = state['text']
    analyzer_chain = analyzer_chains.get(topic)
    if analyzer_chain:
        questions_output = analyzer_chain.invoke({"text": text}).content.strip() 
        questions = [q.strip() for q in questions_output.split('\n') if q.strip()]
        return {"questions": questions}
    else:
        return {"questions": [f"No analyzer found for topic: {topic}"]}


In [None]:
def parse_dates(x):
    try:
        return dateparser.parse(str(x)).strftime("%m/%d/%Y")
    except:
        return x

# Obtain evidence from web 

In [None]:
media_extensions = [".jpg", ".jpeg", ".png", ".gif", ".mp4", ".mov", ".avi"]


In [None]:
def extract_pdf_text(pdf_url):
    response = requests.get(pdf_url, timeout=(3,10))
    
    if response.status_code == 200:
        content_type = response.headers.get('Content-Type', '')
        # print("Content-Type:", content_type)
        # print("Content-Length:", len(response.content))
        
        # if 'pdf' not in content_type.lower():
        #     print("Warning: This does not look like a PDF file!")
        pdf_data = response.content    
        doc = fitz.open(stream=pdf_data, filetype="pdf")
        
        full_text = ""
        for page in doc:
            full_text += page.get_text()
        
        return full_text
    #else:
        
        #print(f"Failed to download. Status code: {response.status_code}")
   

In [None]:
# Load the search tool using SerpAPI
def process_results(res):
    title = res.get("title", "")
    link = res.get("link", "")
    #print (link)
    if not ("Just a moment" in title or "Enable JavaScript" in title):
        try:
            if res['link'].endswith(".pdf"):
                text = extract_pdf_text(res['link'])
                #print (text[:50])
                search_result_2 = re.sub(r'\n\s*\n','\n\n',text.strip())
                search_result_3 = re.sub(r'\n\s*\w+(\s+\w+)?\s*\n','',search_result_2)
                return search_result_3

            elif not (any(res['link'].endswith(ext) for ext in media_extensions)):                        
                response = requests.get(res['link'],timeout=(3, 10))
                # Parse the HTML content
                soup = BeautifulSoup(response.text, 'html.parser')
                # Get all text from the page
                text = soup.get_text()
                #print (text[:50])
                search_result_2 = re.sub(r'\n\s*\n','\n\n',text.strip())
                search_result_3 = re.sub(r'\n\s*\w+(\s+\w+)?\s*\n','',search_result_2)
                return search_result_3
        except Exception as e:
            print (e)
            print (res['link'])
            return ''
def search_query(query,from_str):
    if verbose:
        print ('query', query)
    params = {
      "q": query,       # Your search query
      "hl": "en",                       # Language
      "gl": "us",                       # Region
      "api_key": "710dff5ac2a065d1016d49de2018adce28bd01a913e40279d8bd9a059c9923f6",   # Replace with your actual API key
       "num": max_hits_per_query,
      "tbs": f"cdr:1,cd_min:{from_str},cd_max:12/31/2099",
      'engine':search_engine
    }
    # print (params,query)
    search = GoogleSearch(params)
    all_results = []

    try:
        results = search.get_dict()
        # Print top organic results
        #print (results)
        organic_results =  results.get("organic_results", [])
        
        # Send a GET request
        if verbose:
            print ("len results", len(organic_results))

        if len(organic_results) > 0:
            pool = Pool(processes=min(len(organic_results),max_hits_per_query))
            all_results = pool.map(process_results,organic_results)
                #print (res)
    except Exception as e:
        #print (e)
        pass
        
                
    return all_results

In [None]:
# === State definition ===


# === LLM setup ===
first_attempt_llm = ChatOpenAI(
    model=model_id,
    temperature=0.0,
    api_key="/",
    base_url=f'{url}/v1',
    default_headers={'RITS_API_KEY': os.getenv("RITS_API_KEY")},
    timeout=TIMEOUT
)

second_attempt_llm = ChatOpenAI(
    model=model_id,
    temperature=0.1,
    api_key="/",
    base_url=f'{url}/v1',
    default_headers={'RITS_API_KEY': os.getenv("RITS_API_KEY")},
    timeout=TIMEOUT
)

# === Step 1: Query Builder ===
query_first_attempt_prompt = PromptTemplate(
    template=(
        "You are a search query expert. Given the claim:\n\"{claim}\"\n"
        "generate a Google search query that will help find evidence to verify or refute this claim. \n"
        "don't put quotes on the entire query, we are not looking for exact match for the query.\n"
        "Don't use everything if the claim is long, summarize it.\n"
        "put commas between the keyphrases you generated"
        "Return a json with only one field 'query', no explanation in the output\n"
        "\n\nJSON:"
    )
)
query_second_attempt_prompt = PromptTemplate(
    template=(
        "You are a search query expert. Given the claim:\n\"{claim}\"\n"
        "generate a Google search query that will help find evidence to verify or refute this claim. \n"
        "don't put quotes on the entire query, we are not looking for exact match for the query.\n"
        "Don't use everything if the claim is long, summarize it.\n"
        "The query must be at most 7 tokens long.\n"
        "put commas between the keyphrases you generated"
        "do not generate this query: \"{search_query}\""
        "Return a json with only one field 'query', no explanation in the output\n"
        "\n\nJSON:"
    )
)
query_builder_first_attempt_chain = query_first_attempt_prompt | first_attempt_llm
query_builder_second_attempt_chain = query_second_attempt_prompt | second_attempt_llm

def query_builder_node(state: State) -> dict:
    
    state['questions'] = set(state['questions'])
    query = ''
    #print ("In query builder\n", state)
    if state['search_query'] == '':
        result  = query_builder_first_attempt_chain.invoke({
            "claim": state['claim'],
            'search_query':state['search_query']
        })
    else:
        result  = query_builder_second_attempt_chain.invoke({
            "claim": state['claim'],
            'search_query':state['search_query']
        })    
    try:
        query = result.content.strip()
        query = query.strip('`').replace("json\n", "", 1).strip()
        query = re.match(r'.*(\{.+\}).*', query,re.DOTALL).group(1)
        query = re.match(r'\{.*?\:(.*)\}', query,re.DOTALL).group(1)
        # parsed = json.loads(query)
        # query = parsed['query']
    except Exception as e:
        print (e)
        print (query)
        pass
    return {"search_query": query}

# === Step 2: Search Execution (SerpAPI) ===
#from langchain_community.utilities import SerpAPIWrapper

#search_tool = SerpAPIWrapper()

def search_node(state: State) -> dict:
    state['questions'] = set(state['questions'])
    query = state['search_query']
    from_str = parse_dates(state['date'])
    results = search_query(query,from_str)  
    return {"search_results": results}


        

In [None]:

def get_chunks_to_summarize(long_text) : 
    # Initialize tokenizer (for GPT-4o)
    
    # Helper: count tokens in text
    def count_tokens(text):
        return len(encoding.encode(text))
    
    # Define safe token window (adjust for your summarizer)
 # safe per summarizer call
     # overlap for coherence
    
    # Convert token targets to rough character sizes (~4 chars/token conservative)
    chunk_size_chars = max_chunk_tokens * 4
    chunk_overlap_chars = desired_overlap_tokens * 4
    
    # Initialize recursive splitter
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size_chars,
        chunk_overlap=chunk_overlap_chars
    )

    try:
        # Step 1: Character-based split
        char_chunks = splitter.split_text(long_text)
        
        # Step 2: Validate and trim by token count (if needed)
        final_chunks = []
        for chunk in char_chunks:
            token_count = count_tokens(chunk)
            if token_count <= max_chunk_tokens:
                final_chunks.append(chunk)
            else:
                #print(f"Warning: Oversized chunk ({token_count} tokens), trimming...")
                # Optionally trim further by tokens
                tokens = encoding.encode(chunk)
                trimmed = encoding.decode(tokens[:max_chunk_tokens])
                final_chunks.append(trimmed)
        
        # Report
        # print(f"Prepared {len(final_chunks)} chunks for summarization.")
        total_tokens = sum([count_tokens(c) for i, c in enumerate(final_chunks)])
        return final_chunks, total_tokens
    except Exception as e:
        # print (e)
        return [],0




# === Step 3: Summarize node ===
summarize_prompt = PromptTemplate(
    input_variables=["claim", "chunk", "num_tokens", 'questions'],
    template=(
        "You are an summarizer. When you summairze keep this in mind: \n"
        "Given a claim and questions about the claim "
        "there are web search results in which we will look for evidence to "
        "refute or verify the claim through finding answer to the questions. "
        "The results can be very long, so they have to be summarized. "
        "So, given this claim:\n\"{claim}\"\n"
        "and the following questions about the claim {questions}\n, go through this search result:\n"
        "{chunk}\"\n"
        "summarize the result to {num_tokens}."
        "Do not try to verify of refute the claim, only summarize it without dropping information that"
        " can be used to answer these questions.\n"
        "If document is corrupted and cannot be summaried, return empty string. "
        "\n\nSummary:"
    )
)

def summarize_chunks(chunks,percent_reduce,result,state):
    try:
        if len(chunks) == 0:
            return ""
        llm = ChatOpenAI(
            model=model_id,
            temperature=0,
            api_key="/",
            base_url=f'{url}/v1',
            default_headers={'RITS_API_KEY': os.getenv("RITS_API_KEY")},
            timeout=TIMEOUT
        )
        if percent_reduce == 1:
            return result

        time.sleep(5/random.randint(1, 10))
        summarizer_chain = summarize_prompt | llm
        summaries = []
        percent_reduce = min(percent_reduce,1)
        #print ("len chunks to summarize", len(chunks))
        for chunk in chunks:
            len_tokens  = len(encoding.encode(chunk))
            num_tokens = min(max(int(len_tokens * percent_reduce),10),len_tokens)
            #print ("num_tokens", num_tokens, "len_tokens", len_tokens, "percent_reduce", percent_reduce)
            
            result = summarizer_chain.invoke({
            "claim": state['claim'],
            "chunk": chunk,
            "num_tokens": num_tokens ,
            "questions": "\n".join(set(state['questions'])) # Pass as string if needed
            })
            summary = result.content.strip()
            summaries.append(summary)
        return "\n".join(summaries)   
    except Exception as e:
        print (e)
        return ''

def summarizer_node(state: State) -> dict:
    state['questions'] = set(state['questions'])
    results = state['search_results']
    all_chunks = []
    all_total_tokens = 0 
    percent_reduce = np.inf
    token_count = []
    for result in results:
        chunks,total_tokens = get_chunks_to_summarize(result)
        if total_tokens > 0:
            all_total_tokens += total_tokens
            token_count.append(total_tokens)
            all_chunks.append(chunks)   
            if all_total_tokens > 0:
                percent_reduce = max_token_count / all_total_tokens
            if percent_reduce < 0.1:
                break
        
            
        #print ("len(all_chunks) ", len(all_chunks), "max_chunk_tokens ", \
        #   max_chunk_tokens, "percent_reduce" , percent_reduce)
    token_count =  np.array(token_count)
    token_percentages_per_result = np.array(token_count) / all_total_tokens

#     import numpy as np

# token_count = np.array([800, 1200, 500, 1500])  # example token counts per result
# all_total_tokens = token_count.sum()
# max_token_count = 1000  # your allowed total after reduction
    if all_total_tokens == 0:
        return {"summaries": "", 'num_sentences':0}
    # Step 1: Compute relative weights — more aggressive for longer chunks
    if verbose:
        print ("token_count", token_count)
    #print ("token_percentages_per_result",token_percentages_per_result)    
    inverse_lengths = 1 / np.emath.logn(max_chunk_tokens, token_count)
    weights = inverse_lengths / inverse_lengths.sum()
    #print ("weights",weights)
    target_tokens_per_result = weights * max_token_count
    # Step 2: Compute how many tokens each chunk gets (out of max_token_count)
    #print ("target_tokens_per_result",target_tokens_per_result)
    #print ("token_count",token_count)
    target_tokens_per_result =  np.min(np.vstack((target_tokens_per_result,token_count)),axis=0)
    #print ("target_tokens_per_result",target_tokens_per_result)
    leftout_tokens = max_token_count - target_tokens_per_result.sum()
    #print ("leftout_tokens",leftout_tokens)
    to_add_tokens = np.maximum(np.subtract(token_count,target_tokens_per_result),0)
    if to_add_tokens.sum() > 0:
        to_add_tokens = (to_add_tokens / to_add_tokens.sum()) * leftout_tokens
    #print ("to_add_tokens",to_add_tokens)
    target_tokens_per_result = (target_tokens_per_result + to_add_tokens).astype(int)
    if verbose:
        print ("target_tokens_per_result",target_tokens_per_result)

        
    #print ("target_tokens_per_result",target_tokens_per_result)
    # Step 3: Convert that to a reduction ratio (per result)
    percent_reduce_per_result = target_tokens_per_result / token_count
    #print ("percent_reduce_per_result",percent_reduce_per_result)
    # Optional: clip between [0.05, 1.0] to avoid extreme reductions
    percent_reduce_per_result = np.clip(percent_reduce_per_result, 0.01, 1.0)
    #print ("percent_reduce_per_result",percent_reduce_per_result)

    
    all_summaries = []
    if percent_reduce_per_result.min() < 1 and len(all_chunks) > 0:
        pool = Pool(processes=num_processes)
        all_summaries = pool.starmap(summarize_chunks,[(c,p,r,state) \
                                                       for c,p,r in zip(all_chunks,percent_reduce_per_result,results)])
        all_summaries = "\n".join(all_summaries)
    else:
        all_summaries = "\n".join([" ".join(c) for c in all_chunks])
    num_tokens = len(encoding.encode(all_summaries))
    num_sentences = int(min(num_tokens / max_token_count,1) * 100)
    #print ("num_sentences",num_sentences, "num_tokens",num_tokens,"MAX_OVERALL_TOKENS",MAX_OVERALL_TOKENS)
    return {"summaries": all_summaries, 'num_sentences':num_sentences}

In [None]:
# === Step 3: Relevant Sentence Extractor ===
extractor_prompt = PromptTemplate(
    input_variables=["claim", "summaries", "num_sentences" 'questions'],
    template=(
        "You are an evidence extractor. Given the claim:\n\"{claim}\"\n"
        "and the following questions about the claim \n\"{questions}\"\n, go through these search results:\n\"{summaries}\"\n"
        "extract the most relevant sentences that directly address or provide evidence about the claim."
        "extract at most {num_sentences} sentences, you don't have to extract anything if there is nothing relevant"
        "sort sentences in terms of relenavce so that most relevant comes first, and least comes last"
        "\n\nRelevant Sentences:"
    )
)
llm = ChatOpenAI(
    model=model_id,
    temperature=0,
    api_key="/",
    base_url=f'{url}/v1',
    default_headers={'RITS_API_KEY': os.getenv("RITS_API_KEY")},
    timeout=TIMEOUT
)

extractor_chain = extractor_prompt | llm

def extractor_node(state: State) -> dict:
    state['questions'] = set(state['questions'])
    result = extractor_chain.invoke({
        "claim": state['claim'],
        "summaries": str(state['summaries']),
        "num_sentences": state['num_sentences'] ,
        "questions": "\n".join(state['questions']) # Pass as string if needed
    })
    sentences = result.content.strip().split('\n')
    return {"relevant_sentences": [s.strip() for s in sentences if s.strip()]}

In [None]:
# === Step 3: Relevant Sentence Extractor ===
evidence_gen_prompt = PromptTemplate(
    input_variables=["claim", 'questions'],
    template=(
        "You are an evidence generator. Given the claim:\n\"{claim}\"\n"
        "and the following questions about the claim {questions}\n"
        "you will search your knowldge base for statements that can be used to verify or refute the claim.\n"
        "Only provide answers to the questions. If you don't have an aswer in your knowledge base, you don't have to generate an answer.\n"
        "Only generate sentences that you are very strongly confident are true.\n"
        "If there is no evidence in your knowledge base that strongly supports or refutes the claim, return \"NONE\". \n"
        "Don't forget, lack of evidence does not mean the claim is false or refuted. \n"
        "\nEvidence:"
    )
)
llm = ChatOpenAI(
    model=model_id,
    temperature=0,
    api_key="/",
    base_url=f'{url}/v1',
    default_headers={'RITS_API_KEY': os.getenv("RITS_API_KEY")},
    timeout=TIMEOUT
)

evidence_gen_chain = evidence_gen_prompt | llm

def evidence_gen_node(state: State) -> dict:
    state['questions'] = set(state['questions'])
    result = evidence_gen_chain.invoke({
        "claim": state['claim'],
        "questions": "\n".join(state['questions']) # Pass as string if needed
    })
    sentences = result.content.strip().split('\n')
    return {"generated_evidence": [s.strip() for s in sentences if s.strip()]}

In [None]:
# === Step 3: Relevant Sentence Extractor ===
qa_prompt = PromptTemplate(
    input_variables=["questions",'relevant_sentences', "generated_evidence",],
    template=(
        "Given a list of questions and relevant sentences,\n"
        "you will find a mapping between questions and sentences, and generate question-answer pairs. \n"
        "You will provide an answer to each question using the evidence provided. \n"
        "Do not generate answers that are not provided in the evidence. Use only the provided evidence\n"
        "if a question cannot be answered using the provided evidence, the answer should be \"No answer found\". \n"
        "if a question has multiple answers in the given list, pick only the most relevant answer. \n"
        "Split the question and its answer by three tabs. For example:\n"
        "\"Did Obama die in 2018?\t\t\tObama gave a speech in 2021, therefore he was still alive in 2018.\" \n"
        "Here are the questions: \n{questions}\n"
        "Here is the evidence :  \n{relevant_sentences}, {generated_evidence}\n"
        "Put your result in a json, NOT prose\n"
        "JSON: "
    )
)
llm = ChatOpenAI(
    model=model_id,
    temperature=0,
    api_key="/",
    base_url=f'{url}/v1',
    default_headers={'RITS_API_KEY': os.getenv("RITS_API_KEY")},
    timeout=TIMEOUT
)

qa_chain = qa_prompt | llm

def qa_node(state: State) -> dict:
    state['questions'] = set(state['questions'])
    result = qa_chain.invoke({
        "questions": "\n".join(state['questions']), # Pass as string if needed
        "relevant_sentences": "\n".join(state['relevant_sentences']),
        "generated_evidence": "\n".join(state['generated_evidence'])
    })
    result = result.content.strip()
    result = result.strip('`').replace("json\n", "", 1).strip()
    return {"qa_pairs": result}

In [None]:
# === Step 3: Relevant Sentence Extractor ===
verifier_prompt = PromptTemplate(
    input_variables=["claim", "qa_pairs"],
    template=(
        "You are a claim verifier. Given the claim:\n\"{claim}\"\n"
        "and the following question answer pairs seperated by tabs \n\"{qa_pairs}\"\n\n"
        "Classify whether the evidence supports the claim, whether it refutes the claim, whethere there is not enough evidence for either, or there is conflicting evidence.\n"
        " A single contradiction is enough to refute the claim, while full support requires consistent verification of all sub-parts. "
        "If the same questions has conflicting answers, then there is conflicting evidence.\n"
        "Result in JSON with fields result and explanation.\n"
        "The result will be either one of these tokens = supports, refutes, not_enough_evidence, conflicting_evidence.\n"      
        "JSON: "
    )
)
llm = ChatOpenAI(
    model=model_id,
    temperature=0.0,
    api_key="/",
    base_url=f'{url}/v1',
    default_headers={'RITS_API_KEY': os.getenv("RITS_API_KEY")},
    timeout=TIMEOUT
)

verifier_chain = verifier_prompt | llm

def verifier_node(state: State) -> dict:
    state['questions'] = set(state['questions'])
    result = verifier_chain.invoke({
        "claim": state['claim'],
        "qa_pairs":state['qa_pairs']
    })
    result = result.content.strip()
    return {"verifier_result": result}

In [None]:
def search_loop_router(state: State) -> dict:
    if state['search_results'] == []:
        return "search_again"
    else:
        return "summarize"
def summarize_loop_router(state: State) -> dict:
    if state['summaries'] == "":
        return "search_again"
    else:
        return "extract"

In [None]:
        
graph_builder = StateGraph(State)
graph_builder.add_node("classifier", classifier_node)
graph_builder.add_node("analyzer", analyzer_node)
graph_builder.add_node("query_builder", query_builder_node)
graph_builder.add_node("search", search_node)
graph_builder.add_node("search_loop_node", lambda x: x)
graph_builder.add_node("summarize_loop_node", lambda x: x)

graph_builder.add_node("summarizer", summarizer_node)
#graph_builder.add_node("evidence_gen", evidence_gen_node)
graph_builder.add_node("extractor", extractor_node)
graph_builder.add_node("qa", qa_node)
graph_builder.add_node("verifier", verifier_node)

graph_builder.add_edge("classifier", "analyzer")
graph_builder.add_edge("analyzer", "query_builder")
graph_builder.add_edge("query_builder", "search")
graph_builder.add_edge("search", "search_loop_node")
graph_builder.add_conditional_edges(
    "search_loop_node",   # node name (as string)
    search_loop_router,   # router function
    {
        "search_again": "query_builder",
        "summarize": "summarizer",
    }
)
graph_builder.add_edge("summarizer", "summarize_loop_node")

graph_builder.add_conditional_edges(
    "summarize_loop_node",   # node name (as string)
    summarize_loop_router,   # router function
    {
        "search_again": "query_builder",
        "extract": "extractor",
    }
)

graph_builder.add_edge("extractor",'qa')
#graph_builder.add_edge("evidence_gen",'qa')
graph_builder.add_edge("qa",'verifier')

# graph_builder.set_finish_point("search")
#graph_builder.set_finish_point("summarizer")
graph_builder.set_entry_point("classifier")
graph_builder.set_finish_point("verifier")

graph = graph_builder.compile()

In [None]:
def get_search_results(row):
    #print (row)
    initial_state = {
        "text" : row['claim'],
        "claim": row['claim'],
        "topic":"",
        "date": row['claim_date'],
        "questions": [],#re.search(r'questions(.*)$',row['topics_questions']).group(1).strip('{}[]'),
        'search_query': '',
        "search_results": [],
        "num_sentences": 0,
        "summaries": "",
        "relevant_sentences": [],
        "generated_evidence": [],
        "verifier_result":''

    }
    return State(initial_state)
