In [None]:
import os
from dotenv import load_dotenv
import langchain
import langsmith
import chromadb
import asyncio
import string
import random
import re

# Populate the process environment from a .env file (python-dotenv) so the
# os.getenv(...) lookups for API tokens in later cells can find their values.
load_dotenv()


In [None]:
# Hugging Face repo id of the chat model; passed as `repo_id` to HuggingFaceEndpoint.
MODEL_ID = "mistralai/Mistral-7B-Instruct-v0.3"

### **Text Extraction**

In [None]:
from pdfminer.high_level import extract_text

def extract_and_clean_pdf(file_path, min_chunk_length=10, min_alpha_ratio=0.6):
    """Extract text from a PDF and return its paragraphs as cleaned strings.

    The raw text is de-hyphenated across line breaks, split into paragraphs on
    blank lines, whitespace-normalised, and filtered to drop fragments that
    look like headers, footers, table-of-contents rows or metadata lines.

    Args:
        file_path: Path of the PDF; forwarded unchanged to ``extract_text``.
        min_chunk_length: Paragraphs shorter than this many characters (after
            cleaning) are discarded. Empty paragraphs are always discarded,
            even when this is 0 or negative.
        min_alpha_ratio: Minimum share of alphabetic characters a paragraph
            must have to be kept (default 0.6).

    Returns:
        list[str]: Cleaned paragraphs in document order. An empty list is
        returned when extraction fails; the error is printed, not raised.
    """
    try:
        raw_text = extract_text(file_path)
    except Exception as e:
        # Deliberate best-effort: report the problem and let the caller continue.
        print(f"Error extracting text from {file_path}: {e}")
        return []

    # 1. Re-join words that were hyphenated across a line break ("aver-\nages").
    text = re.sub(r'(\w)-\s*\n\s*(\w)', r'\1\2', raw_text)

    cleaned_chunks = []
    # 2. Treat blank-line-separated runs as candidate paragraphs.
    for chunk in text.split('\n\n'):
        # 3. Collapse every whitespace run (including single newlines) to one space.
        cleaned_chunk = re.sub(r'\s+', ' ', chunk).strip()

        # 4a. Drop empty chunks and short ones (likely headers, footers, noise).
        #     The explicit emptiness check keeps '' out of the result even when
        #     min_chunk_length <= 0 and protects the ratio below from dividing by 0.
        if not cleaned_chunk or len(cleaned_chunk) < min_chunk_length:
            continue

        # 4b. Drop table-of-contents style rows (e.g. "Introduction ..... 5").
        if re.search(r'\.{5,}|_{5,}', cleaned_chunk):
            continue

        # 4c. Drop chunks that are mostly digits/punctuation (page numbers, rules).
        alpha_ratio = sum(c.isalpha() for c in cleaned_chunk) / len(cleaned_chunk)
        if alpha_ratio < min_alpha_ratio:
            continue

        # 4d. Drop common bibliographic metadata lines.
        if re.match(r'^(DOI|ISBN|ISSN):', cleaned_chunk, re.IGNORECASE):
            continue

        cleaned_chunks.append(cleaned_chunk)

    return cleaned_chunks

In [None]:
# Run the extraction on the source paper and preview up to the first 100 cleaned paragraphs.
data_path = "data/bams-d-11-00197.1.pdf"
pdf_text = extract_and_clean_pdf(file_path=data_path)
pdf_text[:100]

### **Text Chunking**

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

splitter = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=50)
# Re-join paragraphs with a blank line rather than '' so the last word of one
# paragraph is not fused onto the first word of the next, and so the splitter
# still sees the paragraph boundaries it can split on.
chunks = splitter.split_text('\n\n'.join(pdf_text))

### **Text Embeddings and Vector Database**

In [None]:
import uuid
import chromadb
from chromadb.utils import embedding_functions
from sentence_transformers import SentenceTransformer


In [None]:
# Embedding function handed to the Chroma collection when it is created.
embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)

In [None]:
# Default client; can be swapped for PersistentClient.
chroma_client = chromadb.Client()

# get_or_create=True lets this cell be re-run without failing on an existing collection.
collection = chroma_client.create_collection(
    name="weather_client_nomral_documents",
    embedding_function=embedding_function,
    get_or_create=True,
)

In [None]:
# One random UUID per chunk, plus each chunk's position in `chunks` as its metadata.
ids = [str(uuid.uuid4()) for _ in range(len(chunks))]
meta_data = [{"chunk_number": position} for position, _ in enumerate(chunks)]

In [None]:
# Store every chunk with its id and metadata; the three lists are index-aligned.
collection.add(ids=ids, documents=chunks, metadatas=meta_data)

In [None]:
# Spot-check: fetch the stored embedding of one randomly chosen chunk.
collection.get(
    ids=random.choice(ids),
    include=["embeddings"],
)

### **Query Refinement**

In [None]:
import os
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_huggingface import HuggingFaceEndpoint, ChatHuggingFace
from langchain_core.runnables import RunnableLambda
import re

In [None]:
def parse_and_clean_output(text: str) -> list[str]:
    """Remove ``<think>...</think>`` spans and return the remaining non-blank lines.

    Args:
        text: Raw model output.

    Returns:
        The lines of ``text`` (split on ``"\\n"``) with surrounding whitespace
        stripped, skipping lines that are empty after stripping.
    """
    # DOTALL lets a single <think> span cover several lines.
    without_reasoning = re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL)

    lines = []
    for raw_line in without_reasoning.strip().split("\n"):
        line = raw_line.strip()
        if line:
            lines.append(line)
    return lines

In [None]:
# Multi-query prompt: asks the model for five rephrasings of {question}, separated
# by newlines, which is the format parse_and_clean_output splits on.
template = """You are an AI language model assistant. Your task is to generate five
different versions of the given user question to retrieve relevant documents from a vector
database. By generating multiple perspectives on the user question, your goal is to help
the user overcome some of the limitations of the distance-based similarity search.
Provide these alternative questions separated by newlines.

Original question: {question}"""

# Prompt template with the single input variable `question`.
prompt_perspectives = PromptTemplate.from_template(template)

In [None]:
# Token is read from the environment; os.getenv returns None when the variable is unset.
HUGGING_FACE_API_TOKEN = os.getenv("HUGGING_FACE_HUB_TOKEN")

# Endpoint for MODEL_ID.
# NOTE(review): task="conversational" is presumably what the chat wrapper below needs — confirm.
llm_endpoint = HuggingFaceEndpoint(
    repo_id=MODEL_ID,
    huggingfacehub_api_token=HUGGING_FACE_API_TOKEN,
    task="conversational" 
)

# Wrap the endpoint in a chat-model interface.
chat_model = ChatHuggingFace(llm=llm_endpoint)

# Query-generation pipeline (LCEL): prompt -> chat model -> plain string -> list of cleaned lines.
generate_queries = (
    prompt_perspectives
    | chat_model
    | StrOutputParser()
    | RunnableLambda(parse_and_clean_output)
)

# Smoke-test the pipeline on a sample question.
# NOTE(review): this sample asks about coral reefs, while the evaluation dataset in this
# notebook is about NOAA climate normals — confirm the question matches the indexed PDF.
question = "What percentage of coral reefs are projected to decline at a global warming level of 1.5°C?"
response = generate_queries.invoke({"question": question})

print(response)

### **Document Retrieval**

In [None]:
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_classic.retrievers import MultiQueryRetriever

In [None]:
# LangChain-side embedding wrapper. It names the same model as the embedding function the
# Chroma collection was created with, presumably so that query and document vectors are
# comparable — confirm both wrappers produce identical embeddings.
langchain_embedding_function = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

In [None]:
# Expose the already-populated Chroma collection through LangChain's vector-store API,
# reusing the same client so no data is copied.
langchain_chroma_store = Chroma(
    collection_name="weather_client_nomral_documents",
    embedding_function=langchain_embedding_function,
    client=chroma_client,
)

# Plain similarity retriever with the store's default search settings.
base_retriever = langchain_chroma_store.as_retriever()
print("LangChain retriever created.")

In [None]:
# Multi-query retriever: uses `generate_queries` to rephrase the question, runs each
# rephrasing through `base_retriever`, and returns the combined documents.
retriever = MultiQueryRetriever(
    retriever=base_retriever,
    llm_chain=generate_queries
)

question = "What percentage of coral reefs are projected to decline at a global warming level of 1.5°C?"
# Retrievers take the raw query string. MultiQueryRetriever wraps it as {"question": ...}
# itself before calling `llm_chain`; passing a dict here would put the dict's repr
# into the prompt instead of the question text.
response_docs = retriever.invoke(question)

print("\n--- Retrieved Documents ---")
print(response_docs)

### **Ranking of Documents**

In [None]:
from langchain_classic.load import dumps, loads

def rank_documents(results: list[list], k=60):
    """Fuse several ranked result lists into one ranking (reciprocal rank fusion).

    Each document contributes ``1 / (rank + k)`` for every list it appears in,
    where ``rank`` is its zero-based position in that list. Documents are
    identified by their ``dumps`` serialisation, so the same document returned
    for different queries accumulates a single score.

    Args:
        results: One ranked list of documents per query.
        k: Smoothing constant in the denominator; larger values flatten the
            difference between top and lower ranks.

    Returns:
        list[tuple]: ``(document, fused_score)`` pairs sorted by score, highest
        first. Ties keep first-seen order. Empty input yields an empty list.
    """
    fused_scores = {}
    for docs in results:
        for rank, doc in enumerate(docs):
            # Serialise so the document can serve as a dictionary key.
            doc_str = dumps(doc)
            fused_scores[doc_str] = fused_scores.get(doc_str, 0) + 1 / (rank + k)

    ranked = sorted(fused_scores.items(), key=lambda item: item[1], reverse=True)
    return [(loads(doc_str), score) for doc_str, score in ranked]

# Rank fusion needs one ranked list per generated query, so map the plain similarity
# retriever over the queries. Mapping the MultiQueryRetriever (`retriever`) here would
# send every generated query through the LLM again for a second round of rephrasing.
retrieval_chain = generate_queries | base_retriever.map() | rank_documents
docs = retrieval_chain.invoke(question)

### **LLM Query**

In [None]:
from operator import itemgetter
from langchain_core.runnables import RunnablePassthrough

# Answer-generation prompt with two inputs: {context} and {question}.
# Note: this rebinds `template`, which earlier held the multi-query prompt text;
# `prompt_perspectives` was already built from that earlier value.
template = """Answer the following question based on this context:

{context}

Question: {question}
"""

# Create templatized prompt
prompt = PromptTemplate.from_template(template)

# Build the prompt inputs from the {"question": ...} dict the chain is invoked with:
#   - "context":  the dict goes to retrieval_chain, whose first step is a prompt keyed
#                 on "question"; the result is the fused list of (document, score) pairs.
#   - "question": only the question text is extracted. RunnablePassthrough() would
#                 forward the whole dict, so the prompt would show "{'question': ...}".
# The prompt then goes to the LLM, and the reply is cleaned into a list of lines.
final_rag_chain = (
    {"context": retrieval_chain, "question": itemgetter("question")}
    | prompt
    | chat_model
    | StrOutputParser()
    | RunnableLambda(parse_and_clean_output)
)

final_rag_chain.invoke({"question":question})

### **RAG Evaluation**

In [None]:
from langsmith import Client
from langsmith.schemas import Run, Example
from langsmith.evaluation import evaluate

In [None]:
# Turn on LangSmith tracing for the chains run below.
os.environ["LANGCHAIN_TRACING_V2"] = "true"

# Fail early with a readable message when no API key is configured. Re-assigning
# os.environ[k] = os.getenv(k) does nothing when the key is set and raises an
# opaque TypeError when it is not.
if not (os.getenv("LANGCHAIN_API_KEY") or os.getenv("LANGSMITH_API_KEY")):
    raise RuntimeError(
        "No LangSmith API key found: set LANGCHAIN_API_KEY (or LANGSMITH_API_KEY) "
        "in the environment or in your .env file before running the evaluation."
    )


# Evaluation questions about NOAA's 1981-2010 Climate Normals.
# dataset_inputs[i] is paired with dataset_outputs[i] when the examples are created.
dataset_inputs = [
    "What are Climate Normals?",
    "What is the new set of NOAA's climate normals?",
    "What are the overarching goals of NOAA's 1981-2010 U.S. Climate Normals?",
    "What are the three major product lines in the new normals?",
    "What is the history of NOAA's climate normals?",
    "When did NOAA's NCDC become the official archive for weather records?",
    "Who is NCDC's official source for calculations of U.S. normals?",
    "When are climatological standard normals computed?",
    "What are the products included in the 1981-2010 Climate Normals?",
    "What are the temperature-related normals?",
    "What are precipitation-related climate normals?",
    "What is the methodological overview for the 1981-2010 climate normals?",
    "How are higher-quality monthly data achieved in the 1981-2010 normals?",
    "What are quasi normals for short-record stations?"
]

# Reference answers, one per question above, each stored under the "answer" key.
# NOTE(review): the must_mention evaluator in this notebook reads
# example.outputs["must_mention"], a key these outputs do not define — confirm how
# the examples are expected to be scored.
dataset_outputs = [
    {"answer": "Climate normals are 30-year averages of meteorological conditions, such as air temperature and precipitation. They characterize the background state of the climate."},
    {"answer": "The new set of NOAA's climate normals is the 1981-2010 set, which replaces the 1971-2000 normals."},
    {"answer": "The goals include producing high-quality normals for many U.S. stations, being representative of the 1981-2010 period, reflecting station locations and observing practices at the end of 2010, adding new products, developing new statistical techniques, and providing timely access."},
    {"answer": "The three major product lines are temperature-related, precipitation-related, and hourly normals."},
    {"answer": "NOAA's NCDC is responsible for recording U.S. climatic conditions, stemming from the Organic Act of 1890. The WMO set guidelines for 30-year periods, and NOAA has been computing decennial 30-year normals since the 1921-50 period."},
    {"answer": "The Federal Records Act of 1950 established NOAA's NCDC as the official archive for weather records."},
    {"answer": "NOAA's National Climatic Data Center (NCDC) is the official source for calculations of U.S. normals."},
    {"answer": "Climatological standard normals are computed every 30 years as part of an international effort led by the WMO. Standard normals for 1901-30, 1931-60, and 1961-90 have been distributed."},
    {"answer": "Products include station-based temperature, precipitation, snowfall, and snow depth normals at daily, monthly, seasonal, and annual scales, as well as degree days and threshold exceedance frequencies."},
    {"answer": "Temperature-related normals are based on daily observations of maximum temperature (Tmax) and minimum temperature (Tmin). They include normals for Tmax, Tmin, mean temperature (Tavg), diurnal temperature range (DTR), heating degree days (HDDs), cooling degree days (CDDs), and threshold exceedance frequencies."},
    {"answer": "Precipitation-related normals (precipitation, snowfall, snow depth) are based on daily observations. They include monthly, seasonal, and annual averages, month-to-date and year-to-date normals, threshold exceedance frequencies, and percentiles."},
    {"answer": "The values come from the Global Historical Climatology Network-Daily (GHCN-Daily) dataset, which undergoes extensive quality assurance. Data flagged as erroneous are treated as missing. A station needs at least 10 'sufficiently complete' months for each month of the year."},
    {"answer": "The 1981-2010 normals use monthly temperature data (Tmax and Tmin) that undergo robust QA and homogenization using a pairwise comparison technique, which is then passed down to the daily time scale."},
    {"answer": "Quasi normals are estimated normals for active short-record stations (at least 2 years of complete months) that fail the 10-year completeness criterion. They are estimated using linear combinations of normals from neighboring longer-record stations."}
]


# Questions and reference answers are paired by position, so a length mismatch would
# silently mis-align or drop examples.
if len(dataset_inputs) != len(dataset_outputs):
    raise ValueError(
        f"dataset_inputs has {len(dataset_inputs)} items but "
        f"dataset_outputs has {len(dataset_outputs)}."
    )

client = Client()

# Creating a dataset whose name already exists fails, so reuse it on re-runs to keep
# this cell idempotent. The reuse branch assumes the earlier run also uploaded the
# examples; delete the dataset in LangSmith to rebuild it from scratch.
if client.has_dataset(dataset_name="NOAA Climate Normals Questions"):
    dataset = client.read_dataset(dataset_name="NOAA Climate Normals Questions")
    print(f"Reusing existing dataset '{dataset.name}'.")
else:
    dataset = client.create_dataset(
        dataset_name="NOAA Climate Normals Questions",
        description="NOAA Climate Normals questions for RAG pipeline evaluation.",
    )

    client.create_examples(
        inputs=[{"question": q} for q in dataset_inputs],
        outputs=dataset_outputs,
        dataset_id=dataset.id,
    )

    print(f"Successfully created dataset '{dataset.name}' with {len(dataset_inputs)} examples.")

print(f"View it in LangSmith: {dataset.url}")

In [None]:
# --- 4. Define Custom Evaluator ---
def must_mention(run: Run, example: Example) -> dict:
    """Score 1 if the prediction contains ANY of the example's required phrases.

    Matching is a case-insensitive substring test.

    Args:
        run: Evaluated run; the prediction is read from ``run.outputs["output"]``
            and may be a string or a list/tuple of lines.
        example: Dataset example; required phrases are read from
            ``example.outputs["must_mention"]`` (a list of strings, or one string).

    Returns:
        dict: ``{"key": "must_mention", "score": 0 or 1}``. When the example
        defines no phrases the score is 0 and a ``"comment"`` entry explains
        that nothing could be checked.
    """
    prediction = (run.outputs or {}).get("output") or ""
    # A chain that ends in a line-splitting parser yields a list of lines, which has
    # no .lower(); flatten it to one string before matching.
    if isinstance(prediction, (list, tuple)):
        prediction = "\n".join(str(part) for part in prediction)
    prediction = str(prediction).lower()

    required = (example.outputs or {}).get("must_mention") or []
    if isinstance(required, str):
        # A bare string would otherwise be iterated character by character.
        required = [required]

    if not required:
        # Say why the score is 0 instead of reporting a silent miss.
        return {
            "key": "must_mention",
            "score": 0,
            "comment": "Example defines no 'must_mention' phrases; nothing to check.",
        }

    score = any(str(phrase).lower() in prediction for phrase in required)
    return {"key": "must_mention", "score": int(score)}


In [None]:
# Chain under test and the LangSmith dataset it is evaluated against.
runner = final_rag_chain
dataset_name = "NOAA Climate Normals Questions"


def query_wrapper(query_dict: dict) -> dict:
    """Target for ``evaluate``: run the chain on one example's inputs.

    Args:
        query_dict: Inputs of a dataset example, forwarded unchanged to ``runner``.

    Returns:
        dict: The chain result under the ``"output"`` key.
    """
    return {"output": runner.invoke(query_dict)}


evaluators = [must_mention]

print(f"Starting evaluation on dataset: {dataset_name}...")

# Run every dataset example through query_wrapper and grade it with each evaluator;
# results are recorded under the "noaa-rag-pipeline" experiment prefix.
experiment_results = evaluate(
    query_wrapper,
    data=dataset_name,
    evaluators=evaluators,
    experiment_prefix="noaa-rag-pipeline",
    client=client,
)

print("\n--- Evaluation Complete ---")
print(experiment_results)