In [1]:
# Directory (relative to the notebook's working directory) that is scanned for
# the case-study *.json files loaded below.
path = "data-download/Climate-ADAPT case studies"

In [2]:
import os
import json

def process_json_files(directory, callback):
    """Call ``callback`` with an open text handle for each ``*.json`` file in ``directory``.

    Only regular files directly inside ``directory`` whose name ends with
    ``.json`` are visited (no recursion). Files are visited in sorted filename
    order so that repeated runs hand the documents to ``callback`` in the same
    order.

    Args:
        directory: Path of the folder to scan.
        callback: Callable that receives the open, read-only file object.

    Raises:
        ValueError: If ``directory`` does not exist or is not a directory.

    Any exception raised while opening a file or inside ``callback`` is
    reported with ``print`` and the scan continues with the next file.
    """
    # Ensure the directory exists
    if not os.path.isdir(directory):
        raise ValueError(f"The directory '{directory}' does not exist or is not a directory.")

    # os.listdir() returns entries in arbitrary order; sort for reproducible runs.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith(".json"):  # Filter for JSON files
            continue
        filepath = os.path.join(directory, filename)
        if not os.path.isfile(filepath):  # Skip directories that happen to end in .json
            continue
        try:
            # JSON text is UTF-8; do not depend on the platform's default encoding.
            with open(filepath, 'r', encoding='utf-8') as file:
                callback(file)
        except Exception as e:
            # Best effort: one bad file must not abort the whole scan.
            print(f"Error processing {filename}: {e}")

In [3]:
# Accumulates the text of every case study; filled by read_content().
documents = []

def read_content(fd):
    """Parse one JSON file and append its ``fields.content`` text to ``documents``.

    Args:
        fd: Open text file object containing a JSON object with a ``fields`` mapping.

    A missing ``content`` key or an explicit JSON ``null`` is stored as an empty
    string so that ``documents`` only ever receives values usable as text.
    A missing ``fields`` key raises ``KeyError``, as before.
    """
    data = json.load(fd)
    content = data['fields'].get('content')
    # `.get(key, '')` alone would still let a JSON null through as None.
    documents.append('' if content is None else content)

# Load every JSON file under `path` into `documents`, then show how many were read.
process_json_files(path, read_content)
len(documents)

1025

In [4]:
from sentence_transformers import SentenceTransformer

# Pre-calculate embeddings once with the all-MiniLM-L6-v2 sentence-transformer;
# the resulting array is handed to topic_model.fit_transform() in a later cell.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = embedding_model.encode(documents, show_progress_bar=True)

Batches:   0%|          | 0/33 [00:00<?, ?it/s]

In [5]:
from bertopic import BERTopic
from hdbscan import HDBSCAN
from bertopic.representation import KeyBERTInspired
from sklearn.cluster import KMeans
import os
import openai

# OpenAI-compatible client pointed at the Together endpoint. The key is read
# from the TOGETHER_API_KEY environment variable so it never has to be pasted
# into (and saved with) the notebook; it falls back to the previous empty value.
client = openai.OpenAI(
  api_key=os.environ.get("TOGETHER_API_KEY", ""),
  base_url="https://api.together.xyz/v1",
)

# Alternative clustering back-end that was tried: KMeans(n_clusters=20)
cluster_model = HDBSCAN(min_cluster_size=2, metric='euclidean', cluster_selection_method='eom', prediction_data=True)

# KeyBERTInspired supplies the keyword representation of each topic.
representation_model = KeyBERTInspired()
topic_model = BERTopic(representation_model=representation_model, embedding_model=embedding_model, hdbscan_model=cluster_model)
# Fit on the documents, passing the embeddings that were computed earlier.
topics, probs = topic_model.fit_transform(documents, embeddings)

In [6]:
# Print one comma-separated keyword line per topic.
for topic_id, topics_data in topic_model.topic_representations_.items():
    topic = ", ".join(entry[0] for entry in topics_data)
    print(topic)

climate, heatwave, temperatures, temperature, heatwaves, heat, health, contingency, adaptation, benefits
municipalities, climate, westphalia, stakeholder, governance, roadmapping, adaptation, workshops, regions, regional
translation, translated, berlin, geographic, european, europe, kassel, transnational, language, köln
crops, crop, soil, cultivated, farming, climate, cultivation, farmer, farm, wheat
municipality, rainwater, municipal, environment, municipalities, ecological, groundwater, precipitation, austria, climate
climate, paris, sustainability, parisians, bonds, bond, budget, adaptation, france, greening
wetlands, wetland, ecosystem, attica, ecosystems, climate, drought, environmental, conservation, biodiversity
slovakia, watershed, košice, runoff, ecosystems, restoration, groundwater, ecosystem, landscapes, revitalization
barcelona, sustainability, gentrification, environmental, municipal, infrastructure, biodiversity, greening, neighbourhoods, greenery
alps, climate, switzerla

In [7]:
# User-message template. get_questions() substitutes the "{topic}" and
# "{documents}" placeholders with str.replace (not str.format).
prompt = """Topic: {topic}

Content of documents: 
{documents}

Generated questions:
"""

# Character budget used by get_questions() for system message + prompt + document text.
# The trailing 4096 is presumably an earlier value that was tried -- TODO confirm.
max_prompt = 10000 # 4096
# Model identifiers that were tried (the active one is set in the dataset cell):
#   "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo"
#   "meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo"

def make_llm_call(sys_message, text, model):
    """Send one system + user message pair to the chat endpoint and return the reply as lines.

    Uses the module-level ``client``.

    Args:
        sys_message: Content of the system message.
        text: Content of the user message.
        model: Model identifier passed to the API.

    Returns:
        The content of the first choice split on ``'\\n'``. A reply whose
        content is ``None`` is treated as an empty string, giving ``['']``.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": sys_message},
            {"role": "user", "content": text}
        ],
        stream=False
    )
    # `content` is optional in the response schema; avoid AttributeError on None.
    content = response.choices[0].message.content or ""
    return content.split('\n')

def get_questions(topic_model, topic_id, sys_message, model):
    """Ask the LLM for questions about one topic, using its representative documents.

    Uses the module-level ``prompt`` template, ``max_prompt`` budget and
    ``make_llm_call``.

    Args:
        topic_model: Object exposing ``topic_representations_`` and
            ``representative_docs_`` mappings keyed by topic id.
        topic_id: Key of the topic to process.
        sys_message: System message sent to the LLM.
        model: Model identifier forwarded to ``make_llm_call``.

    Returns:
        A tuple ``(questions, docs, topic)``: the non-empty, stripped reply
        lines; the (possibly truncated) document text placed in the prompt;
        and the comma-separated topic keywords.
    """
    topics_data = topic_model.topic_representations_[topic_id]
    topic = ", ".join([x[0] for x in topics_data])
    docs = topic_model.representative_docs_[topic_id]
    local_prompt = prompt.replace("{topic}", topic)
    # Characters already spent on the system message and the template itself.
    overhead = len(sys_message) + len(local_prompt.replace("{documents}", ""))
    # Clamp at zero: a negative slice end would only trim the tail of the text
    # instead of enforcing the budget.
    max_docs_chars = max(0, max_prompt - overhead)
    docs = '\n\n'.join(docs)[:max_docs_chars]
    local_prompt = local_prompt.replace("{documents}", docs)
    questions = [q.strip() for q in make_llm_call(sys_message, local_prompt, model) if q.strip()]
    return (questions, docs, topic)

In [8]:
# System message for the question-generation call. The dataset cell relies on
# the reply format requested here: one question per line and a final
# "Topic: <topic>" line.
sys_message = """I'm building a dataset of representative questions that a website visitor might ask. 
We're using our documents to build a set of "topics keywords" that represent our documents. 
I will provide a topic and the text for the document, your task is to generate a question 
that a user might ask, related to that topic, that may find its answer in our document. 
Make the question as human as possible and keep it short and not too specific, even if it's not comprehensive, 
as the users don't like to type a lot. 
It is important to keep the questions centered around the given topic keywords. 
Don't generate questions that are really specific to a place or project.
Generate maximum 5 questions. 
The answer should be simple text, no introduction, just one question per line. 
Don't use dashes at the beginning of lines. 
On the last line, extract a topic that summarizes the provided keywords, in the format: Topic: <topic>
"""

# Scratch copy of an instruction line that is already part of sys_message above:
# Don't generate questions that are really specific to a place or project.

In [9]:
model = "meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo"
dataset = []

# Generate questions for every topic and collect one record per topic.
for topic_id in topic_model.topic_representations_.keys():
    questions, docs, topic_keywords = get_questions(topic_model, topic_id, sys_message, model)
    # Reset per topic: if the reply has no "Topic:" line, the record must not
    # silently reuse the topic from the previous iteration or an earlier cell.
    topic = ""
    qs = []
    for line in questions:
        if line.startswith("Topic:"):
            # Strip only the leading marker; the summary itself may contain "Topic: ".
            topic = line[len("Topic:"):].strip()
        else:
            qs.append(line)
    record = {"keywords": topic_keywords, "questions": qs, "topic": topic}
    dataset.append(record)
    print("Topic id: %s" % topic_id)
    print("Topic keywords: %s" % topic_keywords)
    print("Topic: %s" % topic)

    for q in qs:
        print(q)
    print("\n")

Topic id: -1
Topic keywords: climate, heatwave, temperatures, temperature, heatwaves, heat, health, contingency, adaptation, benefits
Topic: Heatwave Contingency Planning
What are the health risks associated with heatwaves?
How can heatwaves be prevented or mitigated?
What are the benefits of having a heatwave contingency plan?
How do heatwaves affect vulnerable populations such as the elderly?
What measures can be taken to reduce the impact of heatwaves on public health?


Topic id: 0
Topic keywords: municipalities, climate, westphalia, stakeholder, governance, roadmapping, adaptation, workshops, regions, regional
Topic: Climate Adaptation in Municipalities
How can small rural municipalities adapt to climate change?
What is the role of multilevel governance in climate adaptation?
How can regional stakeholders work together to address climate change?
What methods can be used to develop practical climate adaptation plans?
How can municipalities integrate climate change adaptation into t

In [11]:
import json
import os

# Persist the generated records. Create the output folder first so that a
# missing "datasets" directory cannot discard the results of the LLM run.
output_path = "datasets/Climate-ADAPT case studies-2.json"
os.makedirs(os.path.dirname(output_path), exist_ok=True)
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(dataset, f)