In [3]:
pip install langchain langchain_groq faiss-cpu transformers sentence-transformers




In [7]:
pip install -U langchain-community

Collecting langchain-community
  Downloading langchain_community-0.3.27-py3-none-any.whl.metadata (2.9 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.10.1-py3-none-any.whl.metadata (3.4 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.1-py3-none-any.whl.metadata (9.4 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading marshmallow-3.26.1-py3-none-any.whl.metadata (7.3 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading typing_inspect-0.9.0-py3-none-any.whl.metadata (1.5 kB)
Collecting python-dotenv>=0.21.0 (from pydantic-settings<3.0.0,>=2.4.0->langchain-community)
  Downloading python_dotenv-1.1.1-py3-none-any.whl.metadata (24 k

In [17]:
import pandas as pd

# Load LIAR dataset
import pandas as pd

df = pd.read_csv("train.tsv", sep='\t', header=None, names=[
    "id",                # 0
    "label",             # 1
    "statement",         # 2
    "subject",           # 3
    "speaker",           # 4
    "speaker_title",     # 5
    "speaker_state",     # 6
    "speaker_party",     # 7
    "before_true",       # 8
    "before_false",      # 9
    "before_barely_true",# 10
    "before_half_true",  # 11
    "before_mostly_true",# 12
    "before_pants_on_fire", # 13
    "context"            # 14
])


# Map 6-class labels to binary
label_map = {
    "true": "true",
    "mostly-true": "true",
    "half-true": "true",
    "barely-true": "false",
    "false": "false",
    "pants-fire": "false"
}
df["binary_label"] = df["label"].map(label_map)
df["statement_text"] = df["statement"]

# Save only needed columns
data = df[[
    "id",
    "label",
    "binary_label",
    "statement",
    "statement_text",
    "subject",
    "speaker",
    "speaker_title",
    "speaker_state",
    "speaker_party",
    "before_true",
    "before_false",
    "before_barely_true",
    "before_half_true",
    "before_mostly_true",
    "before_pants_on_fire",
    "context"
]]



In [18]:
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.docstore.document import Document

# Use sentence transformer embeddings
embedding = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# Convert each row to a Document with full metadata
documents = [
    Document(
        page_content=row["statement"],  # Main text for similarity search
        metadata={
            "id": row["id"],
            "label": row["label"],
            "subject": row["subject"],
            "speaker": row["speaker"],
            "speaker_title": row["speaker_title"],
            "speaker_state": row["speaker_state"],
            "speaker_party": row["speaker_party"],
            "before_true": row["before_true"],
            "before_false": row["before_false"],
            "before_barely_true": row["before_barely_true"],
            "before_half_true": row["before_half_true"],
            "before_mostly_true": row["before_mostly_true"],
            "before_pants_on_fire": row["before_pants_on_fire"],
            "context": row["context"]
        }
    )
    for _, row in data.iterrows()
]



# Store in FAISS Vector DB
vector_db = FAISS.from_documents(documents, embedding)
vector_db.save_local("liar_vector_db")

In [33]:
from langchain_groq import ChatGroq
from langchain.prompts import PromptTemplate
from langchain.vectorstores import FAISS

def detect_fake_news(
    new_claim: str,
    embedding,
    vector_db_path: str,
    api_key: str,
    k: int = 3,
    claim_metadata: dict = None
) -> str:
    # Load Vector DB
    vector_db = FAISS.load_local(
        folder_path=vector_db_path,
        embeddings=embedding,
        allow_dangerous_deserialization=True
    )

    # Initialize Groq LLM
    llm = ChatGroq(api_key=api_key, model="llama3-8b-8192")

    # Retrieve similar statements
    docs = vector_db.similarity_search(new_claim, k=k)

    # Build context from similar statements
    context = "\n\n".join([
        f"""Statement: {doc.page_content}
Label: {doc.metadata['label']}
Speaker: {doc.metadata['speaker']} ({doc.metadata['speaker_party']} - {doc.metadata['speaker_state']})
Subject: {doc.metadata['subject']}
Prior Ratings: True={doc.metadata['before_true']}, False={doc.metadata['before_false']}, Half-True={doc.metadata['before_half_true']}, Pants-on-Fire={doc.metadata['before_pants_on_fire']}
Context: {doc.metadata['context']}"""
        for doc in docs
    ])

    # Add new claim metadata to the prompt
    claim_info = ""
    if claim_metadata:
        claim_info = f"""
New Claim Metadata:
Speaker: {claim_metadata.get('speaker')} ({claim_metadata.get('speaker_party')} - {claim_metadata.get('speaker_state')})
Subject: {claim_metadata.get('subject')}
Context: {claim_metadata.get('context')}
"""

    # Prompt
    prompt = PromptTemplate.from_template("""You are a fake news detector. Based on the following examples and metadata, classify the new claim with 4 line justification.

Examples:
{context}

{claim_info}
New Claim:
"{query}"

Answer only 'True' or 'False' with a brief justification.
""")

    query = prompt.format(context=context, claim_info=claim_info, query=new_claim)

    response = llm.invoke(input=query)

    return response.content





In [34]:
import pandas as pd

# Load LIAR dataset
df = pd.read_csv("train.tsv", sep='\t', header=None, names=[
    "id",                # 0
    "label",             # 1
    "statement",         # 2
    "subject",           # 3
    "speaker",           # 4
    "speaker_title",     # 5
    "speaker_state",     # 6
    "speaker_party",     # 7
    "before_true",       # 8
    "before_false",      # 9
    "before_barely_true",# 10
    "before_half_true",  # 11
    "before_mostly_true",# 12
    "before_pants_on_fire", # 13
    "context"            # 14
])

# Map 6-class labels to binary
label_map = {
    "true": "true",
    "mostly-true": "true",
    "half-true": "true",
    "barely-true": "false",
    "false": "false",
    "pants-fire": "false"
}
df["binary_label"] = df["label"].map(label_map)
df["statement_text"] = df["statement"]
# Save only needed columns
data = df[["statement_text", "binary_label"]]


In [35]:
# Example usage:
# Set your API key
API_KEY = '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'

row = data.iloc[3]
claim = data.iloc[0]["statement_text"]
label = data.iloc[0]["binary_label"]

print(claim, "  ", label)

result = detect_fake_news(
    new_claim=claim,
    embedding=embedding,
    vector_db_path="liar_vector_db",
    api_key=API_KEY,
    k=3,
    claim_metadata=row.to_dict()
)
print(result)

Says the Annies List political group supports third-trimester abortions on demand.    false
Answer: False

Justification: The metadata does not indicate the speaker or the context, but the statement is identical to one of the examples provided, which was labeled as False.


In [40]:
import os
import pandas as pd

# Select row and get claim + label
row = data.iloc[343]
claim = row["statement_text"]
label = row["binary_label"]


# Run fake news detection
result = detect_fake_news(
    new_claim=label,
    embedding=embedding,
    vector_db_path="liar_vector_db",
    api_key=API_KEY,
    k=3,
    claim_metadata=row.to_dict()
)

print(result)

# Path to the results CSV
output_csv = "fake_news_results.csv"

# Create result row
new_entry = {
    "claim": "claim",
    "label": label,
    "justification": result.strip()
}

**False**

Justification:
The claim "gold was found in epita college" lacks any credible source or evidence to support it. Additionally, a quick search did not yield any information about Epita College having any connection to gold deposits or discoveries. The claim appears to be unfounded and lacks any basis in reality, which is a characteristic of "pants-fire" claims.


In [41]:
# If the file exists, append; otherwise, create with headers
if os.path.exists(output_csv):
    results_df = pd.read_csv(output_csv)
    results_df = pd.concat([results_df, pd.DataFrame([new_entry])], ignore_index=True)
else:
    results_df = pd.DataFrame([new_entry])

# Save updated results
results_df.to_csv(output_csv, index=False)