# Single document topic suggestions

This notebook looks at topic suggestion for documents, trying both the first few pages of a document and its description.

**The first few pages** is better, as for UNFCCC documents descriptions are prefilled.


In [1]:
%load_ext autoreload
%autoreload 2

In [22]:
import json
import os
import sys
from collections import Counter
from string import Template
from pathlib import Path

import psycopg2
import boto3
import pandas as pd
from tqdm.auto import tqdm

from dotenv import load_dotenv, find_dotenv

tqdm.pandas()

sys.path.append("../..")

from src.online.inference import get_llm
from src.controllers.LibraryManager import LibraryManager

In [3]:
load_dotenv(find_dotenv(), override=True)

True

In [8]:
if "LABS_RDS_DB_CREDS" in os.environ:
    secret = os.environ['LABS_RDS_DB_CREDS']
else:
    # getting the secrets from the parameter store relating to the RDS database
    session = boto3.Session()
    ssm = session.client('ssm')
    response = ssm.get_parameter(Name="/RAG/LABS_RDS_DB_CREDS", WithDecryption=True)
    secret = response['Parameter']['Value']
    
conn = psycopg2.connect(**json.loads(secret))


In [9]:
# having a look at the tables in the database

cur = conn.cursor()

cur.execute("""SELECT * FROM information_schema.tables WHERE table_schema = 'public';""")

tables = cur.fetchall()

for table in tables:
    print(table)

('rag-labs', 'public', 'dbquery', 'BASE TABLE', None, None, None, None, None, 'YES', 'NO', None)
('rag-labs', 'public', 'feedback', 'BASE TABLE', None, None, None, None, None, 'YES', 'NO', None)
('rag-labs', 'public', 'qapair', 'BASE TABLE', None, None, None, None, None, 'YES', 'NO', None)
('rag-labs', 'public', 'notebook', 'BASE TABLE', None, None, None, None, None, 'YES', 'NO', None)


checking the table headers for both of these:

In [10]:
cur.execute("""SELECT DISTINCT document_id FROM dbquery;""")

generation_db_response = cur.fetchall()

documents_in_db = [i[0] for i in generation_db_response]

print(f"Number of documents in the database: {len(documents_in_db)}")

Number of documents in the database: 649


## Load documents from s3 bucket

In [11]:
# Note from Kalyan: I still can't get s3 to work in a notebook, so the code loads from 
# the local filesystem for now. The s3 dir below contains the data you need to get this
# running.
# documents_s3_path = "s3://project-rag/data/cpr_embeddings_output"

documents_path = Path("~/Documents/cpr/data/s3-buckets-prod/opensearch_input_05_20_2024/").expanduser()

assert documents_path.exists()

In [12]:
documents_by_id = {}

for document_id in tqdm(documents_in_db):
    try:
        documents_by_id[document_id] = json.loads((documents_path / f"{document_id}.json").read_text())
    except Exception as e:
        print(f"Failed to load {document_id}: {e}")

  0%|          | 0/649 [00:00<?, ?it/s]

In [13]:
def get_description(document: dict) -> str:
    return document["document_metadata"]["description"]

def get_page_text(document: dict, pages: list[int]) -> list[str]:
    """Get text from specific pages, or all pages if the document is an HTML document."""
    
    if document["pdf_data"] is not None:
        blocks = document["pdf_data"]["text_blocks"]
        return [" ".join(block["text"]) for block in blocks if block["page_number"] in pages]
    elif document["html_data"] is not None:
        blocks = document["html_data"]["text_blocks"]
        return [" ".join(block["text"]) for block in blocks]
    else:
        return []
    
    
get_page_text(documents_by_id[documents_in_db[0]], [0, 1, 2])    

['FEDERAL MINISTRY OF ENVIRONMENT DEPARTMENT OF CLIMATE CHANGE',
 'NATIONAL CLIMATE CHANGE POLICY',
 'FOR NIGERIA',
 '2021 - 2030',
 'FORWARD',
 'Climate change is perhaps the biggest challenge facing humanity. It is complex and dynamic and requires dimensional and multi-sectoral mitigation and adaptation initiatives within a dynamic policy framework to properly tackle it. Government recognizes this and is committee to tackling any presumed threat to its national sustainable development.',
 'Since the development of the National Climate Change Policy and Response Strategy (NCCPRS) in 2012, the global discourse on climate change has evolved leading to the adoption of new initiatives that have been domesticated to guide national response to reducing the impact and adapting to the challenge.',
 'Dr. Mohammad Mahmood Abubakar Honourable Minister Federal Ministry of Environment',
 "Specifically, a major milestone for Nigeria's effective response to the challenges of climate change, is the s

In [34]:
document_text_data = [
    {
    "document_id": document_id,
    "description": get_description(documents_by_id[document_id]),
    "start_text": " ".join(get_page_text(documents_by_id[document_id], [0, 1, 2])) 
    }
    for document_id in documents_in_db
]

document_text_df = pd.DataFrame(document_text_data)

document_text_df.head()

Unnamed: 0,document_id,description,start_text
0,CCLW.executive.1515.5045,The National Policy on Climate Change is a str...,FEDERAL MINISTRY OF ENVIRONMENT DEPARTMENT OF ...
1,CCLW.executive.1280.2594,Grenada's National Energy Policy (GNEP) lays d...,GRENADA THE NATIONAL ENERGY POLICY OF GRENADA ...
2,CCLW.executive.1793.4130,This document was implemented through Decision...,
3,CCLW.executive.10532.6491,The Millennium Challenge Corporation Climate A...,Climate Action Plan Tina Neumann Chief Sustain...
4,CCLW.executive.4924.2051,This decree sets the methods for managing poll...,"DECREE Nº01 -397/P-RM OF SEPT 06, 2001 SETTING..."


In [15]:
empty_text = (document_text_df["start_text"].apply(len) == 0).sum()
print(f"Number of documents with empty text: {empty_text}/{len(document_text_df)}")

document_text_df = document_text_df[document_text_df["start_text"].apply(len) > 0]
print(f"Keeping {len(document_text_df)} nonempty documents.")

document_text_df["start_text_n_words"] = document_text_df["start_text"].apply(lambda x: len(x.split()))
MIN_WORDS_IN_FIRST_PAGES = 100
print(f"Number of documents with less than {MIN_WORDS_IN_FIRST_PAGES} words in the first few pages: {(document_text_df['start_text_n_words'] < MIN_WORDS_IN_FIRST_PAGES).sum()}")

document_text_df = document_text_df[document_text_df["start_text_n_words"] >= MIN_WORDS_IN_FIRST_PAGES]
print(f"Keeping {len(document_text_df)} documents with {MIN_WORDS_IN_FIRST_PAGES} words in the first few pages.")

Number of documents with empty text: 119/649
Keeping 530 nonempty documents.
Number of documents with less than 100 words in the first few pages: 59
Keeping 471 documents with 100 words in the first few pages.


## Come up with a method for generating topics from response text and try it in a template

`climate_specific` below is the best template

In [35]:
PROMPT_TEMPLATES = {
    # "simple": Template("Provide two of the main topics in this text which are interesting to provoke further research. Each topic should be a maximum of $max_words words. Return the topics with a newline character between them. Return the topics only. TEXT: $text"),
    # "climate_policy": Template("Provide two of the main topics in this text related to climate policy. Each topic should be a maximum of $max_words words. Return the topics with a newline character between them. Return the topics only. TEXT: $text"),
    "climate_specific": Template("Provide two of the main topics in this text related to climate policy. The topics should be specific to the text, and more specific than general terms like 'climate policy'. Each topic should be a maximum of $max_words words. Return the topics with a newline character between them. Return the topics only. TEXT: $text"),
}

In [36]:
nemo_model = get_llm(
    type="vertexai",
    model="mistral-nemo",
    unfiltered=False,
)

nemo_model("What is the capital of France?")

I0000 00:00:1724772952.074019 102953146 check_gcp_environment_no_op.cc:29] ALTS: Platforms other than Linux and Windows are not supported
I0000 00:00:1724772952.338828 102953146 check_gcp_environment_no_op.cc:29] ALTS: Platforms other than Linux and Windows are not supported
  warn_deprecated(
I0000 00:00:1724772952.604536 102953146 check_gcp_environment_no_op.cc:29] ALTS: Platforms other than Linux and Windows are not supported
I0000 00:00:1724772952.605394 102953146 check_gcp_environment_no_op.cc:29] ALTS: Platforms other than Linux and Windows are not supported


' Paris'

In [37]:
SAMPLE_SIZE = 50
test_sample = document_text_df.sample(SAMPLE_SIZE, random_state=42)

test_sample.head(2)

Unnamed: 0,document_id,description,start_text
636,CCLW.executive.9540.rtl_169,These Regulations establish a regime for limit...,CANADA CONSOLIDATION Regulations Limiting Carb...
220,CCLW.executive.11100.6443,<p>This plan notably tackles the adverse impac...,"Ministry of Environment, Energy and Climate Ch..."


In [38]:
MAX_WORDS = 4

def get_nemo_response(text: str, template: Template, max_words: int = 4) -> str:
    max_words_in_text = 2000
    if len(text) > max_words_in_text:
        text = " ".join(text.split()[:max_words_in_text])
    
    try:
        return nemo_model(template.substitute(text=text, max_words=max_words))
    except:
        return ""

for column_name in {"description", "start_text"}:
    print(f"Processing {column_name}...")
    for template_name, template in PROMPT_TEMPLATES.items():
        test_sample[f"nemo_response_{column_name}_{template_name}"] = test_sample[column_name].progress_apply(
            lambda x: get_nemo_response(x, template, MAX_WORDS)
        )

Processing start_text...


  0%|          | 0/50 [00:00<?, ?it/s]

Processing description...


  0%|          | 0/50 [00:00<?, ?it/s]

In [39]:
with pd.option_context("display.max_colwidth", 100):
    display(test_sample.sort_values("start_text_n_words"))

KeyError: 'start_text_n_words'

### See what proportion of samples returned a valid response

In [None]:
nemo_response_cols = [col for col in test_sample.columns if "nemo_response" in col]

def response_is_valid(response: str) -> bool:
    return len(response.strip().split("\n")) == 2

for col in nemo_response_cols:
    test_sample[f"{col}_valid"] = test_sample[col].apply(lambda x: response_is_valid(x))

valid_cols = [col for col in test_sample.columns if "_valid" in col]
for col in valid_cols:
    print(col)
    print(test_sample[col].value_counts())

nemo_response_start_text_climate_specific_valid
True     49
False     1
Name: nemo_response_start_text_climate_specific_valid, dtype: int64
nemo_response_description_climate_specific_valid
True    50
Name: nemo_response_description_climate_specific_valid, dtype: int64


In [None]:
test_sample[test_sample["nemo_response_start_text_climate_specific_valid"] == False]

Unnamed: 0,document_id,description,start_text,start_text_n_words,nemo_response_start_text_climate_specific,nemo_response_description_climate_specific,nemo_response_start_text_climate_specific_valid,nemo_response_description_climate_specific_valid
239,CCLW.executive.10393.5088,This plan defines the government's plan for th...,å›½åŠ¡é™¢åŠžå…¬åŽ…å…³äºŽå °å &#39;è¦ ç´ å¸‚åœº...,1587,,Carbon pricing\nGreen accounting,False,True


In [40]:
all_suggestions = test_sample["nemo_response_start_text_climate_specific"].apply(lambda t: t.strip().split("\n")).tolist()

all_suggestions = [item for sublist in all_suggestions for item in sublist]

Counter(all_suggestions).most_common(10)

[('Carbon pricing', 15),
 ('Climate finance', 9),
 ('Renewable energy', 5),
 ('Energy efficiency', 4),
 ('Climate change', 4),
 ('Disaster risk reduction', 2),
 ('Carbon dioxide emissions', 1),
 ('Natural gas-fired generation', 1),
 ('Coastal management', 1),
 ('Disaster resilience', 1)]