In [1]:
import pandas as pd
from tqdm import tqdm

from dotenv import load_dotenv
# Load variables from a local .env file into the process environment
# (the OPENAI_API_KEY read later via os.getenv is expected to come from there).
load_dotenv()

# Location of the Cleantech Media dataset CSV, relative to the notebook's working directory.
DATASET_CSV = 'data/Cleantech Media Dataset/cleantech_media_dataset_v2_2024-02-23.csv'
df = pd.read_csv(DATASET_CSV)
df.head()

Unnamed: 0.1,Unnamed: 0,title,date,author,content,domain,url
0,1280,Qatar to Slash Emissions as LNG Expansion Adva...,2021-01-13,,"[""Qatar Petroleum ( QP) is targeting aggressiv...",energyintel,https://www.energyintel.com/0000017b-a7dc-de4c...
1,1281,India Launches Its First 700 MW PHWR,2021-01-15,,"[""• Nuclear Power Corp. of India Ltd. ( NPCIL)...",energyintel,https://www.energyintel.com/0000017b-a7dc-de4c...
2,1283,New Chapter for US-China Energy Trade,2021-01-20,,"[""New US President Joe Biden took office this ...",energyintel,https://www.energyintel.com/0000017b-a7dc-de4c...
3,1284,Japan: Slow Restarts Cast Doubt on 2030 Energy...,2021-01-22,,"[""The slow pace of Japanese reactor restarts c...",energyintel,https://www.energyintel.com/0000017b-a7dc-de4c...
4,1285,NYC Pension Funds to Divest Fossil Fuel Shares,2021-01-25,,"[""Two of New York City's largest pension funds...",energyintel,https://www.energyintel.com/0000017b-a7dc-de4c...


In [2]:
import ast

def _paragraphs_to_text(raw):
    """Convert one serialized paragraph list into a single text string.

    ``raw`` is the string stored in the ``content`` column, written as a
    Python list literal (e.g. ``'["first paragraph", "second paragraph"]'``).
    Every parsed item is coerced to ``str`` and the items are joined with a
    blank line between them.
    """
    paragraphs = ast.literal_eval(raw)
    return '\n\n'.join(str(paragraph) for paragraph in paragraphs)


# NOTE: this overwrites the column in place, so the cell is not re-runnable
# without reloading the CSV first (the joined text is no longer a list literal).
df['content'] = df['content'].apply(_paragraphs_to_text)

df['content'].head()

0    Qatar Petroleum ( QP) is targeting aggressive ...
1    • Nuclear Power Corp. of India Ltd. ( NPCIL) s...
2    New US President Joe Biden took office this we...
3    The slow pace of Japanese reactor restarts con...
4    Two of New York City's largest pension funds s...
Name: content, dtype: object

In [4]:
from src.embedding_strategy import EmbeddingStrategy
strategy = EmbeddingStrategy.get_default_strategy()

# Flat list of all chunks produced from every article in the dataframe.
documents = []
# iterrows() is a generator with no length, so pass `total` explicitly to get
# a real progress bar (percentage / ETA) instead of a bare iteration counter.
for _, row in tqdm(df.iterrows(), total=len(df)):
    # Article-level metadata handed to the chunker together with the text.
    # NOTE(review): `author` is NaN for rows without a byline (see the head()
    # output of the first cell) — confirm the processor / vector store
    # tolerates missing values.
    metadata = {
        "url": row['url'],
        "domain": row['domain'],
        "title": row['title'],
        "author": row['author'],
        "date": row['date']
    }

    # Chunk the *cleaned* text. Previously the cleaned result was computed and
    # then ignored, and the raw content was chunked instead.
    cleaned_content = strategy.processor.clean(row['content'])
    documents.extend(strategy.processor.chunk(cleaned_content, metadata))

9593it [00:01, 8539.38it/s]


In [4]:

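# One-time indexing step, left commented out on purpose: the recorded run
# below took roughly 33 minutes. Presumably the vector store persists between
# sessions, so this only needs to be re-run when rebuilding it — confirm.
# strategy.vector_store.add_documents(documents, verbose=True)
#
# Quick sanity check of the populated store:
# strategy.vector_store.similarity_search("The company is also aiming to reduce gas flaring?")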

100%|██████████| 2/2 [32:46<00:00, 983.45s/it] 


[Document(page_content='The company in 2020 announced its intention to reduce operational GHG emissions intensity by 35% to 45% by 2030, a revision from the previous goal of 5% to 15%, and to reach net zero emissions by 2050.\n\nThe path to net zero will involve tools such as emissions offsets and carbon capture, utilization and sequestration, Lance said.\n\nWithin the ESG initiatives, Lance and Sheffield also stressed the importance of curbing routine natural gas flaring, which remains a stubborn challenge in the Permian despite progress made on reducing flaring intensity.\n\nOther measures such as electrifying fracture fleets, and better monitoring of methane leaks, will be crucial as well, they said.', metadata={'date': '2021-03-05', 'domain': 'naturalgasintel', 'title': 'ConocoPhillips, Pioneer Natural CEOs Preach Discipline as Permian Activity Rebounds', 'url': 'https://www.naturalgasintel.com/conocophillips-pioneer-natural-ceos-preach-discipline-as-permian-activity-rebounds/'}),


In [5]:
from src.generator import Generator
import os

# Build the answer generator on top of the default strategy's vector store and
# retriever. The API key is read from the environment; os.getenv yields None
# when the variable is not set.
gen = Generator(
    openai_api_key=os.getenv("OPENAI_API_KEY"),
    vectorstore=strategy.vector_store,
    retriever=strategy.retriever,
)
gen.ask("Who was in Paris?")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Answer: World leaders, ministers, negotiators, representatives from civil society, businesses, international organizations, and the media were in Paris for the COP21 where the Paris Agreement was signed.


'World leaders, ministers, negotiators, representatives from civil society, businesses, international organizations, and the media were in Paris for the COP21 where the Paris Agreement was signed.'

In [6]:
from langchain_openai import OpenAI
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.chains.query_constructor.base import AttributeInfo

# Metadata schema based on the columns of the CSV that are attached to each
# document. Every field is declared to the query constructor as a string.
_FIELD_DESCRIPTIONS = (
    ("url", "Url of the document"),
    ("title", "Title of the document"),
    ("date", "Date of the document"),
    ("author", "Author of the document"),
    ("domain", "Domain of the document, closely related to the source of the document"),
)
metadata_field_info = [
    AttributeInfo(name=field_name, description=field_description, type="string")
    for field_name, field_description in _FIELD_DESCRIPTIONS
]
document_content_description = "Article content"

# Configure the retriever: the LLM (temperature 0) is given the schema above
# together with the description of the document contents.
llm = OpenAI(temperature=0)
retriever = SelfQueryRetriever.from_llm(
    llm,
    strategy.vector_store.vector_store,
    document_content_description,
    metadata_field_info,
    verbose=True,
)


In [7]:
# Ask the self-query retriever the same question that was put to the generator.
paris_docs = retriever.invoke("Who was in Paris?")
paris_docs

[Document(page_content="By clicking `` Allow All '' you agree to the storing of cookies on your device to enhance site navigation, analyse site usage and support us in providing free open access scientific content. More info.\n\nParis takes the crown as Europe’ s ‘ greenest’ capital, with the highest number of green spaces per 100,000 people. Luxembourg City and Dublin take second and third respectively.\n\nNew research has revealed the greenest European capitals, with Paris coming out on top.\n\nThe study by drinking straws specialist Drinking Straw analysed the number of parks, gardens, wildlife areas, forests, playgrounds and bodies of water compared to the size of the population to see which areas have the most green spaces per 100,000 people.\n\nIt found that France’ s capital of Paris is the greenest of any capital, with 325 total green spaces made up of 171 parks, 127 gardens, one wildlife area, 16 playgrounds and ten bodies of water. When accounting for population, it has 7.62 

In [ ]:
# Based on rag evaluation set
# Use `invoke` (as the previous cell already does) rather than the deprecated
# `get_relevant_documents`.
result = retriever.invoke("Who was in Paris?")

# The retriever may return no documents (e.g. when a generated metadata filter
# matches nothing), so guard the index access instead of raising IndexError.
if result:
    top_doc = result[0]
    print(top_doc.page_content)
    print(top_doc.metadata)
else:
    print("No documents retrieved")
