In [1]:
import os
import ast
import json
import time
import pandas as pd
from dotenv import load_dotenv
import prompt_templates_summarization as pts
from langchain.schema import BaseOutputParser
from langchain.prompts.chat import ChatPromptTemplate
from langchain_google_genai import (
    ChatGoogleGenerativeAI,
    HarmBlockThreshold,
    HarmCategory
)
from langchain_openai import ChatOpenAI
from json.decoder import JSONDecodeError
from google.generativeai.types import BlockedPromptException
from google.generativeai.types.generation_types import StopCandidateException

In [2]:
country = "Croatia"
path2SP = "/Users/ctoruno/OneDrive - World Justice Project/EU Subnational"

## Loading API key

In [3]:
load_dotenv()
GoogleAI_key = os.getenv("googleAI_API_key")
OpenAI_key   = os.getenv("openai_key")
os.environ['GOOGLE_API_KEY'] = GoogleAI_key
os.environ['OPENAI_API_KEY'] = OpenAI_key

## Reading the data

In [4]:
def gatherFiles(country, p):
    """
    This function takes a country as input and returns a list with all the news articles associated to that specific pillar.
    """
    data_path = f"{path2SP}/EU-S Data/Automated Qualitative Checks/Data/data-summarization/{country}/pillar_{p}"
    sets = [pd.read_parquet(f"{data_path}/{x}") for x in os.listdir(data_path)]
    pillar_data = pd.concat(sets)
    pillar_data["associated_pillar"] = f"Pillar {p}"
    print(f"Pillar {p}: {len(pillar_data)} articles")

    return pillar_data

In [5]:
country_data = [gatherFiles(country, p) for p in range(1,9)]

Pillar 1: 4370 articles
Pillar 2: 3061 articles
Pillar 3: 528 articles
Pillar 4: 2803 articles
Pillar 5: 1534 articles
Pillar 6: 784 articles
Pillar 7: 1583 articles
Pillar 8: 4383 articles


## Saving gathered data

In [6]:
target_vars = [
    "id", "link", "domain_url", "published_date",
    "title_trans", "description_trans", "content_trans", "summary", "impact_score",
    "pillar_1", "pillar_2", "pillar_3", "pillar_4", "pillar_5", "pillar_6", "pillar_7", "pillar_8",
    "associated_pillar"
]
master = pd.concat(country_data).loc[:,target_vars]
master.to_parquet(f"{path2SP}/EU-S Data/Automated Qualitative Checks/Data/data-summarization/{country}/{country}_master.parquet.gzip", compression="gzip")

## Defining chain

In [7]:
safety_settings = {
    HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE,
    HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE,
    HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE,
    HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE
}

In [8]:
class JSONOutputParser(BaseOutputParser):
    def parse(self, text: str):
        """
        Parse the output of an LLM call to a valid JSON format.
        """
        return json.loads(text.replace('```json', '').replace('```', ''), strict=False)

In [9]:
def summarize_pillar(summaries, pillar):
    """
    This function takes a list of summaries and it sends a call to Google's Gemini asking for a list of
    important events to take into account for a specific pillar of the Rule of Law.
    """
    
    idx = str(pillar)

    # Setting up the Prompt Template
    chat_prompt = ChatPromptTemplate.from_messages([
                    ("human", pts.pillar_summary_prompt)
                ])

    # Defining our chain
    chain_gemini = chat_prompt | ChatGoogleGenerativeAI(model = "gemini-pro",
                                                        temperature = 0.25, 
                                                        safety_settings = safety_settings,
                                                        convert_system_message_to_human = True) | JSONOutputParser()
    
    try: 
        llm_response = chain_gemini.invoke({
            "summaries"      : summaries,
            "pillar_name"    : pts.pillar_names[idx],
            "pillar_bullets" : pts.pillar_bullets[idx]
        })
        status = True
        time.sleep(1)   # We need to slow down the calls. given that the Gemini API has a limit of 60 calls per second

    # The API can still block some of our prompts due to undefined reasons. Sadly, we can't do anything about it, so we
    # predefine the outcome    
    except (BlockedPromptException, StopCandidateException):
        print("Prompt BLOCKED")
        status = False
    
    except JSONDecodeError:
        print("Decode error... trying again...")
        try: 
            llm_response = chain_gemini.invoke({
                "summaries"      : summaries,
                "pillar_name"    : pts.pillar_names[idx],
                "pillar_bullets" : pts.pillar_bullets[idx]
            })
            status = True
            time.sleep(1)

        except JSONDecodeError:
            print("Decode error... trying again...")
            try: 
                llm_response = chain_gemini.invoke({
                    "summaries"      : summaries,
                    "pillar_name"    : pts.pillar_names[idx],
                    "pillar_bullets" : pts.pillar_bullets[idx]
                })
                status = True
                time.sleep(1)

            except JSONDecodeError:
                print("Failed. Skipping...")
                status = False

    # We use the STATUS variable to throw an outcome to our call depending if our prompt was blocked or not
    if status == True:
        outcome = [llm_response["list_of_events"]]

    else:
        outcome = "Skipped list"

    return outcome

In [10]:
def summarize_pillar_GPT(summaries, pillar):
    """
    This function takes a list of summaries and it sends a call to OpenAI's GPT asking for a list of
    important events to take into account for a specific pillar of the Rule of Law.
    """
    
    idx = str(pillar)

    # Setting up the Prompt Template
    chat_prompt = ChatPromptTemplate.from_messages([
                    ("system", pts.pillar_summary_context),
                    ("human", pts.pillar_summary_instructions)
                ])

    # Defining our chain
    chain_gpt = chat_prompt | ChatOpenAI(model_name="gpt-4-turbo-preview", temperature=0.2) | JSONOutputParser()
    
    try: 
        llm_response = chain_gpt.invoke({
            "summaries"      : summaries,
            "pillar_name"    : pts.pillar_names[idx],
            "pillar_bullets" : pts.pillar_bullets[idx]
        })
        status = True
    
    except JSONDecodeError:
        print("Decode error... trying again...")
        try: 
            llm_response = chain_gpt.invoke({
                "summaries"      : summaries,
                "pillar_name"    : pts.pillar_names[idx],
                "pillar_bullets" : pts.pillar_bullets[idx]
            })
            status = True

        except JSONDecodeError:
            print("Decode error... trying again...")
            try: 
                llm_response = chain_gpt.invoke({
                    "summaries"      : summaries,
                    "pillar_name"    : pts.pillar_names[idx],
                    "pillar_bullets" : pts.pillar_bullets[idx]
                })
                status = True

            except JSONDecodeError:
                print("Failed. Skipping...")
                status = False

    # We use the STATUS variable to throw an outcome to our call depending if our prompt was blocked or not
    if status == True:
        outcome = [llm_response["list_of_events"]]

    else:
        outcome = "Skipped list"

    return outcome

## Sending calls

In [11]:
def split_summaries(summaries, llm = "GPT"):
    """
    This function takes a list of summaries and provides a text compiling all of them but taking into account the 
    token limit for using the Gemini Pro 1.0 API
    """

    idx = 0
    segments = [[]]
    total_count = 0

    if llm == "GPT":
        tkn_limit = 98500
    if llm == "Gemini":
        tkn_limit = 22500

    for text in summaries:
        text_length = len(text.split())

        if total_count + text_length < tkn_limit:
            segments[idx].append(text)
            total_count = total_count + text_length
        else:
            segments.append([])
            idx = idx + 1
            total_count = 0
            segments[idx].append(text)
            total_count = total_count + text_length

    outcome = ["- "+"\n- ".join(segment) for segment in segments]

    return outcome


In [12]:
summaries = {}
for p in range(1,9):
    print(f"PILLAR {p}")
    article_list   = country_data[p-1]["summary"].to_list()
    limited_chunks = split_summaries(article_list)
    print(f"Total number of chunks: {len(limited_chunks)}")
    bullet_points  = [summarize_pillar_GPT(x, p) for x in limited_chunks]
    pillar_summ    = "/n".join([item for sublist_1 in bullet_points for sublist_2 in sublist_1 for item in sublist_2])
    summaries.update({f"Pillar {p}" : pillar_summ})

PILLAR 1
Total number of chunks: 5
PILLAR 2
Total number of chunks: 3
PILLAR 3
Total number of chunks: 1
PILLAR 4
Total number of chunks: 3
PILLAR 5
Total number of chunks: 2
PILLAR 6
Total number of chunks: 1
PILLAR 7
Total number of chunks: 2
PILLAR 8
Total number of chunks: 4


## Saving summaries as JSON files

In [13]:
file_path = f"{path2SP}/EU-S Data/Automated Qualitative Checks/Data/data-summarization/Overview/{country}_overviewSummaries.json"
with open(file_path, "w") as json_file:
    json.dump(summaries, json_file)

In [14]:
summaries

{'Pillar 1': "The Croatian Parliament debates the dismissal of Economy Minister Davor Filipović after allegations of corruption involving his advisor offering advertising space from state-owned companies to a local television network in exchange for payment./nCroatian police arrest nine individuals wanted by Greece for their alleged involvement in fan riots, highlighting concerns about the independence of the judiciary and the effectiveness of oversight institutions in international legal cooperation./nThe Croatian government proposes amendments to the Law on Foreigners to improve the accommodation conditions for foreign workers, emphasizing the need for better living standards and protection of workers' rights./nThe Croatian Football Association and FARE network condemn the display of the Ustasha flag at a football match, demonstrating efforts to combat racism and discrimination in sports and uphold the principles of fair play and equality./nThe Croatian government allocates grants to