In [36]:
import os
import json
from openai import OpenAI
from dotenv import load_dotenv
load_dotenv()

client = OpenAI(
  api_key=os.environ['OPENAI_API_KEY'],  # this is also the default, it can be omitted
  base_url = "https://api.perplexity.ai"
  )
deployment="sonar"

In [2]:
def load_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        file_content = file.read()
    return file_content

file_content = load_file(file_path = 'GIEC_REPORT.txt')

In [3]:
# Function to create chunks of fixed size, regardless of words

def fixed_chunk(content,chunk_size):
    chunks = []
    length_file = len(file_content)
    for index in range(int(length_file/chunk_size)):
        chunk = file_content[index*chunk_size:(index+1)*chunk_size]
        chunks.append(chunk)
    return chunks

fixed_chunk(content = file_content, chunk_size = 50)

['ipcc INERCOMRANMENTAL PANEL ON climate change CLIM',
 'ATE CHANGE 2023 Synthesis Report Summary for Polic',
 'ymakers A Report of the Intergovernmental Panel on',
 ' Climate Change D WMO UNEP\n\n\n\nCLIMATE CHANGE 2023 ',
 'Synthesis Report Summary for Policymakers Edited b',
 'y Hoesung Lee Chairman IPCC The Core Writing Team ',
 'Synthesis Report IPCC Jose Romero Head, Technical ',
 'Support Unit IPCC Core Writing Team Hoesung Lee (C',
 'hair), Katherine Calvin (USA), Dipak Dasgupta (Ind',
 'ia/USA), Gerhard Krinner (France/Germany), Aditi M',
 'ukherji (India), Peter Thorne (Ireland/United King',
 'dom), Christopher Trisos (South Africa), Jose Rome',
 'ro (Switzerland), Paulina Aldunce (Chile), Ko Barr',
 'ett (USA), Gabriel Blanco (Argentina), William W. ',
 'L. Cheung (Canada), Sarah L. Connors (France/Unite',
 'd Kingdom), Fatima Denton (The Gambia), Aida Diong',
 'ue-Niang (Senegal), David Dodman (Jamaica/United K',
 'ingdomNetherlands), Matthias Garschagen (Germany),',
 ' Oli

In [4]:
# Function to clean text 

def clean_txt(text):
    cleaned_text = text.replace("\n", " ") # remove new lines
    cleaned_text = cleaned_text.replace("  ", " ") #remove double spaces
    return cleaned_text


# Function to create chunks of sentences with a max length size

def semantic_chunks(text, target_size):
    sentences = text.split('.')
    chunks, current_chunk = [], []
    current_length = 0

    for sentence in sentences:
        if current_length + len(sentence) <= target_size:
            current_chunk.append(sentence)
            current_length += len(sentence)
        else:
            chunks.append(' '.join(current_chunk))
            current_chunk = [sentence]
            current_length = len(sentence)
    
    if current_chunk:
        chunks.append(' '.join(current_chunk))
    
    return chunks

chunks = semantic_chunks(file_content, 500)
chunks

['',
 'ipcc INERCOMRANMENTAL PANEL ON climate change CLIMATE CHANGE 2023 Synthesis Report Summary for Policymakers A Report of the Intergovernmental Panel on Climate Change D WMO UNEP\n\n\n\nCLIMATE CHANGE 2023 Synthesis Report Summary for Policymakers Edited by Hoesung Lee Chairman IPCC The Core Writing Team Synthesis Report IPCC Jose Romero Head, Technical Support Unit IPCC Core Writing Team Hoesung Lee (Chair), Katherine Calvin (USA), Dipak Dasgupta (India/USA), Gerhard Krinner (France/Germany), Aditi Mukherji (India), Peter Thorne (Ireland/United Kingdom), Christopher Trisos (South Africa), Jose Romero (Switzerland), Paulina Aldunce (Chile), Ko Barrett (USA), Gabriel Blanco (Argentina), William W',
 ' L  Cheung (Canada), Sarah L',
 ' Connors (France/United Kingdom), Fatima Denton (The Gambia), Aida Diongue-Niang (Senegal), David Dodman (Jamaica/United KingdomNetherlands), Matthias Garschagen (Germany), Oliver Geden (Germany), Bronwyn Hayward (New Zealand), Christopher Jones (United

In [12]:
import pandas as pd
data = pd.DataFrame(data=chunks, columns=['text_original'])
data['text_cleaned'] = data['text_original'].apply(clean_txt)
data

Unnamed: 0,text_original,text_cleaned
0,,
1,ipcc INERCOMRANMENTAL PANEL ON climate change ...,ipcc INERCOMRANMENTAL PANEL ON climate change ...
2,"L Cheung (Canada), Sarah L","L Cheung (Canada), Sarah L"
3,"Connors (France/United Kingdom), Fatima Dento...","Connors (France/United Kingdom), Fatima Dento..."
4,"L Otto (United Kingdom/Germany), Minal Pathak...","L Otto (United Kingdom/Germany), Minal Pathak..."
...,...,...
128,Annual wettest-day precipitation change Annua...,Annual wettest-day precipitation change Annua...
129,Projected (a) annual maximum daily temperatur...,Projected (a) annual maximum daily temperatur...
130,Standard deviation is a widely used metric in...,Standard deviation is a widely used metric in...
131,2) Climate Change Impacts and Climate-Related ...,2) Climate Change Impacts and Climate-Related ...


In [5]:
from sentence_transformers import SentenceTransformer
import pandas as pd
import numpy as np

THRESHOLD = 0.2

model = SentenceTransformer("google/embeddinggemma-300m")


In [13]:

def knowledge_database_creation(file_content):
    text_vector = semantic_chunks(file_content, 500)
    data = pd.DataFrame(data=text_vector, columns=['text_original'])
    data['text_cleaned'] = data['text_original'].apply(clean_txt)
    data['text_embedding'] = data['text_cleaned'].apply(model.encode)
    return data

knowledge_database = knowledge_database_creation(file_content)
knowledge_database

Unnamed: 0,text_original,text_cleaned,text_embedding
0,,,"[-0.20974083, -0.010558142, 0.016756052, 0.003..."
1,ipcc INERCOMRANMENTAL PANEL ON climate change ...,ipcc INERCOMRANMENTAL PANEL ON climate change ...,"[-0.05934814, 0.031217434, 0.014482567, 0.1036..."
2,"L Cheung (Canada), Sarah L","L Cheung (Canada), Sarah L","[-0.16852814, -0.021272032, 0.018463343, 0.003..."
3,"Connors (France/United Kingdom), Fatima Dento...","Connors (France/United Kingdom), Fatima Dento...","[-0.069903485, 0.017567385, 0.031000912, 0.027..."
4,"L Otto (United Kingdom/Germany), Minal Pathak...","L Otto (United Kingdom/Germany), Minal Pathak...","[-0.09554054, 0.003522916, 0.017366618, 0.0513..."
...,...,...,...
128,Annual wettest-day precipitation change Annua...,Annual wettest-day precipitation change Annua...,"[-0.012093302, 0.027562045, 0.0052514174, 0.07..."
129,Projected (a) annual maximum daily temperatur...,Projected (a) annual maximum daily temperatur...,"[-0.030373925, 0.03135759, 0.029551886, 0.0796..."
130,Standard deviation is a widely used metric in...,Standard deviation is a widely used metric in...,"[-0.018170446, 0.055054322, 0.0035435043, 0.02..."
131,2) Climate Change Impacts and Climate-Related ...,2) Climate Change Impacts and Climate-Related ...,"[-0.02391352, 0.024457606, 0.04017558, 0.08228..."


In [23]:
# Create a search index and reranking

from sklearn.neighbors import NearestNeighbors

embeddings = knowledge_database['text_embedding'].tolist()

nbrs = NearestNeighbors(n_neighbors=5, algorithm='ball_tree').fit(embeddings)

distances, indices = nbrs.kneighbors(embeddings)

knowledge_database['indices'] = indices.tolist()
knowledge_database['distances'] = distances.tolist()

knowledge_database

Unnamed: 0,text_original,text_cleaned,text_embedding,indices,distances
0,,,"[-0.20974083, -0.010558142, 0.016756052, 0.003...","[0, 2, 59, 19, 45]","[0.0, 0.6827952970690118, 0.7487983828041609, ..."
1,ipcc INERCOMRANMENTAL PANEL ON climate change ...,ipcc INERCOMRANMENTAL PANEL ON climate change ...,"[-0.05934814, 0.031217434, 0.014482567, 0.1036...","[1, 17, 23, 21, 85]","[0.0, 0.6050331501932041, 0.6826149912509397, ..."
2,"L Cheung (Canada), Sarah L","L Cheung (Canada), Sarah L","[-0.16852814, -0.021272032, 0.018463343, 0.003...","[2, 0, 59, 10, 57]","[0.0, 0.6827952970690118, 0.7460568891255738, ..."
3,"Connors (France/United Kingdom), Fatima Dento...","Connors (France/United Kingdom), Fatima Dento...","[-0.069903485, 0.017567385, 0.031000912, 0.027...","[3, 9, 4, 5, 8]","[0.0, 0.6565222022935341, 0.6727821563125954, ..."
4,"L Otto (United Kingdom/Germany), Minal Pathak...","L Otto (United Kingdom/Germany), Minal Pathak...","[-0.09554054, 0.003522916, 0.017366618, 0.0513...","[4, 10, 8, 5, 11]","[0.0, 0.6401450513280752, 0.6462015939295254, ..."
...,...,...,...,...,...
128,Annual wettest-day precipitation change Annua...,Annual wettest-day precipitation change Annua...,"[-0.012093302, 0.027562045, 0.0052514174, 0.07...","[128, 126, 27, 116, 33]","[0.0, 0.7012276093490585, 0.7576487775722293, ..."
129,Projected (a) annual maximum daily temperatur...,Projected (a) annual maximum daily temperatur...,"[-0.030373925, 0.03135759, 0.029551886, 0.0796...","[129, 127, 70, 124, 114]","[0.0, 0.7633592065064285, 0.7772017305410005, ..."
130,Standard deviation is a widely used metric in...,Standard deviation is a widely used metric in...,"[-0.018170446, 0.055054322, 0.0035435043, 0.02...","[130, 85, 114, 61, 91]","[0.0, 0.7530619084656806, 0.7673685245943401, ..."
131,2) Climate Change Impacts and Climate-Related ...,2) Climate Change Impacts and Climate-Related ...,"[-0.02391352, 0.024457606, 0.04017558, 0.08228...","[131, 107, 121, 85, 120]","[0.0, 0.5899185883970794, 0.5945843795932313, ..."


In [57]:
def cosine_similarity(a, b):
    if len(a) > len(b):
        b = np.pad(b, (0, len(a) - len(b)), 'constant')
    if len(a) < len(b):
        a = np.pad(a, (0,len(b) - len(a)), 'constant')
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

In [63]:


def chatbot(query):

    query_embedding = model.encode(query)

    knowledge_database['similarity'] = knowledge_database['text_embedding'].apply(lambda x: cosine_similarity(x,query_embedding))

    context = knowledge_database.sort_values(by="similarity", ascending=False).head(3)['text_cleaned'].tolist()

    display(knowledge_database.sort_values(by="similarity", ascending=False))

    system_prompt = f"""
    You are an AI assistant that answers to questions about the 2023 GIEC Report.
    The following relevant information has been retrieved from the 2023 GIEC Report: 
    {"".join(context)}
    Answer to the user question by only using the above information.
    Answer with 2 bullet points with short sentences of maximum 200 characters.
    If you do not know the answer, say you do not know.
    """

    print(system_prompt)

    messages=[
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": query}
    ]

    # use chat completion to generate a response
    response = client.chat.completions.create(
        model=deployment,
        max_tokens=800,
        messages=messages
    )

    
    return response.choices[0].message.content

query = '7degC ?'

chatbot(query)

Unnamed: 0,text_original,text_cleaned,text_embedding,indices,distances,similarity
0,,,"[-0.20974083, -0.010558142, 0.016756052, 0.003...","[0, 2, 59, 19, 45]","[0.0, 0.6827952970690118, 0.7487983828041609, ...",0.843295
59,"acidification, sea level rise and salinizatio...","acidification, sea level rise and salinizatio...","[-0.12083647, -0.00027591625, -0.004141687, 0....","[59, 2, 0, 45, 49]","[0.0, 0.7460568891255738, 0.7487983828041609, ...",0.740727
2,"L Cheung (Canada), Sarah L","L Cheung (Canada), Sarah L","[-0.16852814, -0.021272032, 0.018463343, 0.003...","[2, 0, 59, 10, 57]","[0.0, 0.6827952970690118, 0.7460568891255738, ...",0.716049
45,6 billion people live in contexts that are hig...,6 billion people live in contexts that are hig...,"[-0.10764667, 0.041722216, 0.047197677, 0.0054...","[45, 121, 47, 63, 78]","[0.0, 0.7017535840929491, 0.7584016134481995, ...",0.600942
121,"4 With further warming, every region is projec...","4 With further warming, every region is projec...","[-0.06570289, 0.012309538, -0.00021501878, 0.0...","[121, 131, 120, 51, 85]","[0.0, 0.5945843795932313, 0.6544299076417796, ...",0.582307
...,...,...,...,...,...,...
69,1 Progress in adaptation planning and implemen...,1 Progress in adaptation planning and implemen...,"[-0.06164551, 0.03222882, 0.041117266, 0.07775...","[69, 74, 21, 61, 75]","[0.0, 0.8036118564478827, 0.8162465765362387, ...",0.288042
48,Hundreds of local losses of species have been...,Hundreds of local losses of species have been...,"[-0.048287995, 0.01851485, -0.006699248, 0.059...","[48, 42, 55, 108, 79]","[0.0, 0.7758087040504821, 0.8035087924479364, ...",0.270375
72,Adaptation options such as disaster risk mana...,Adaptation options such as disaster risk mana...,"[-0.062308557, 0.055753157, 0.034870613, 0.054...","[72, 68, 75, 69, 74]","[0.0, 0.8093629721266958, 0.8415921940120822, ...",0.266112
9,"Armour (USA), Birgit Bednar-Friedl (Austria),...","Armour (USA), Birgit Bednar-Friedl (Austria),...","[-0.02968663, 0.021927161, 0.052483086, 0.0712...","[9, 3, 4, 5, 11]","[0.0, 0.6565222022935341, 0.7170055587405989, ...",0.262613



    You are an AI assistant that answers to questions about the 2023 GIEC Report.
    The following relevant information has been retrieved from the 2023 GIEC Report: 
     acidification, sea level rise and salinization (2 1 L Cheung (Canada), Sarah L
    Answer to the user question by only using the above information.
    Answer with 2 bullet points with short sentences of maximum 200 characters.
    If you do not know the answer, say you do not know.
    


'7 degrees Celsius equals 44.6 degrees Fahrenheit using the formula °F = °C × (9/5) + 32[1].  \n\nThis conversion is commonly used for temperature measurements, especially in weather and science contexts[1].'