In [1]:
# Country whose news articles are summarized in this run.
country = "Belgium"
# Root folder of the shared project; the input and output paths used later in this notebook are built from it.
# NOTE(review): absolute, user-specific path — the notebook only runs unchanged on a machine with this folder.
path2SP = "/Users/ctoruno/OneDrive - World Justice Project/EU Subnational"

## Libraries and API key

In [2]:
import os
import math
import time
import json
import pandas as pd
from dotenv import load_dotenv
import google.generativeai as genai
import prompt_templates_summarization as pts
from google.generativeai.types import HarmCategory, HarmBlockThreshold

In [3]:
# Load environment variables through python-dotenv before the API key is read.
load_dotenv()
# os.environ[...] raises KeyError when GEMINI_API_KEY is not defined.
genai.configure(api_key=os.environ["GEMINI_API_KEY"])

## Reading the data

In [4]:
def gatherFiles(country, p):
    """
    Load every article file stored for one pillar of a country into a single data frame.

    Parameters
    ----------
    country : str
        Name of the country folder inside the data-summarization directory.
    p : int or str
        Pillar number; files are read from the `pillar_{p}` sub-folder.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of all files found, with an extra `associated_pillar`
        column set to "Pillar {p}". The original row indexes of each file are kept.

    Raises
    ------
    FileNotFoundError
        If the pillar folder does not exist (raised by `os.listdir`).
    ValueError
        If the pillar folder contains no data files.
    """
    data_path = f"{path2SP}/EU-S Data/Automated Qualitative Checks/Data/data-summarization/{country}/pillar_{p}"

    # Hidden entries (e.g. macOS ".DS_Store") are not parquet files and would make read_parquet fail.
    # Sorting makes the article order independent of the arbitrary order returned by os.listdir.
    file_names = sorted(x for x in os.listdir(data_path) if not x.startswith("."))
    if not file_names:
        raise ValueError(f"No data files found in {data_path}")

    sets = [pd.read_parquet(f"{data_path}/{x}") for x in file_names]
    pillar_data = pd.concat(sets)
    pillar_data["associated_pillar"] = f"Pillar {p}"
    print(f"Pillar {p}: {len(pillar_data)} articles")

    return pillar_data

In [5]:
# One data frame per pillar, ordered from pillar 1 to pillar 8 (list position = pillar number - 1).
country_data = [gatherFiles(country, pillar_number) for pillar_number in range(1, 9)]

Pillar 1: 4007 articles
Pillar 2: 2341 articles
Pillar 3: 583 articles
Pillar 4: 2911 articles
Pillar 5: 1331 articles
Pillar 6: 625 articles
Pillar 7: 1695 articles
Pillar 8: 3620 articles


## Defining prompts

In [6]:
# System instruction for the model: it is the default `context` of send_call.
# NOTE(review): the text contains a typo ("sucessfully"); it is left untouched here because the string is sent verbatim to the API.
prompt_system = """
You are a specialized assistant whose role is to meticulously read and analyze a list of brief news articles and provide a summary of the most important topics related to a specific theme.  I will provide you with further details about the news article summaries and the specific theme that I would like you to focus on. You will have to carefully read the information I will provide to you and identify the most relevant issues or events related to the theme that I will specify. You will have to use your knowledge on politics, law, and social sciences to sucessfully perform this task. Try to provide a limited list of topics short enough for a reader to grasp the full picture of the Rule of Law in a given country. Ideally, I would like a maximum of 20 events but feel free to provide less topics if the list of summaries does not cover that many events.
"""

# User-message template. It is filled with str.format_map and expects the keys
# pillar_name, pillar_bullets, summaries, impact_score and country.
# Any literal curly brace added to this text would have to be doubled ({{ }}) to survive formatting.
prompt_template = """
We will now work with the {pillar_name} theme, which encompasses the following aspects:
{pillar_bullets}

The individual articles that we have at hand are the following:
{summaries}

All articles mentioned above have been classified to have a {impact_score} impact for the Rule of Law in {country}.

Taking into account the theme and articles that I provided above, please come up with a summary of the most important topics related to this theme. Make sure to include references to the most relevant events or issues covered in the articles in each topic. Please use the theme aspects to determine how relevant is each one of the events or issues that you identified in the news articles. Use your knowledge on the specified country and focus on why these events could have a {impact_score} impact when elaborating your summary.

When performing this task, please take into account the following things:
- Please use the thematic aspects provided above to identify the most relevant issues or events.
- Feel free to group multiple articles depending on the issue that they are covering. Avoid repeating events.
- Limit your list to a MAXIMUM of 20 topics. Try to focus ONLY in the most important ones.
- The description of each event or issue should be between 100 and 2000 words. Please include references to specific events narrated in the list of news articles that I provided. Please do not include references to events that were NOT MENTIONED in the list of news articles that was provided. Also, please use keywords to refer to specific articles, DO NOT use numbers for references.
"""

## Getting formatted prompts

In [7]:
def split_summaries(summaries, max_words = 950000):
    """
    Compile a list of summaries into one or more bulleted text chunks, each kept under a word budget.

    Parameters
    ----------
    summaries : iterable of str
        Individual article summaries, in the order they should appear.
    max_words : int, optional
        Budget per chunk, measured in whitespace-separated words. A chunk is closed as soon as
        adding the next summary would reach this value. The default leaves a margin below the
        1.048 million token input limit, but words are only a rough proxy for tokens.
        NOTE(review): tokens usually outnumber words, so the real margin is smaller than it looks.

    Returns
    -------
    list of str
        One string per chunk, formatted as "- summary" lines joined by newlines.
        An empty input yields a single chunk containing just "- ".
    """

    segments = [[]]
    word_count = 0

    for text in summaries:
        text_length = len(text.split())

        # Start a new chunk only when the current one already holds something. Otherwise a first
        # summary larger than the budget would leave an empty leading chunk (an empty prompt).
        if segments[-1] and word_count + text_length >= max_words:
            segments.append([])
            word_count = 0

        segments[-1].append(text)
        word_count += text_length

    return ["- " + "\n- ".join(segment) for segment in segments]


In [8]:
# Replacing numeric impact scores with their text labels
impact_dict = {
    5 : "Very Positive",
    4 : "Positive",
    3 : "Neutral",
    2 : "Negative",
    1 : "Very Negative",
    0 : "Undefined"
}

# Values that are not keys of impact_dict (e.g. labels from a previous run of this cell) are left as they are
for pillar_frame in country_data:
    pillar_frame["impact_score"] = pillar_frame["impact_score"].replace(impact_dict)

In [9]:
# prompts[pillar label][impact label] -> list of formatted prompts (one per chunk of summaries)
prompts = {}
tokens_counter = 0      # Running total of whitespace-separated words across all chunks

# Labels to loop over, in impact_dict order; score 0 ("Undefined") is left out
impact_labels = [impact for score, impact in impact_dict.items() if score != 0]

for p in range(1, 9):
    pillar_key = f"Pillar {p}"
    prompts[pillar_key] = {}
    pillar_data = country_data[p-1]

    for impact_score in impact_labels:

        # Articles of this pillar that carry the current impact label
        data_subset = pillar_data.loc[pillar_data["impact_score"] == impact_score]

        # Getting news summaries with URL reference
        # data_subset["summary_linked"] = data_subset["summary"].str.cat(data_subset["link"], sep = "/nURL: ")
        # article_list   = data_subset["summary_linked"].to_list()
        article_list = data_subset["summary"].to_list()

        # Oversized lists of summaries are split into several chunks
        limited_chunks = split_summaries(article_list)

        # Words counter
        tokens_counter += sum(len(chunk.split()) for chunk in limited_chunks)

        # One formatted prompt per chunk
        prompts[pillar_key][f"{impact_score}"] = [
            prompt_template.format_map(
                {
                    "pillar_name"    : pts.pillar_names[str(p)],
                    "pillar_bullets" : pts.pillar_bullets[str(p)],
                    "country"        : country,
                    "summaries"      : chunk,
                    "impact_score"   : impact_score
                }
            )
            for chunk in limited_chunks
        ]

In [10]:
# List every pillar / impact combination whose summaries were split into more than one prompt
for pillar, score_list in prompts.items():
    for score, prompt_list in score_list.items():
        n_prompts = len(prompt_list)
        if n_prompts > 1:
            print(f"Pillar: {pillar} --- {score} --- Total Prompts: {n_prompts}")

In [11]:
# # Previewing prompts
# pillar = 4
# print(
#     prompts[f"Pillar {pillar}"]["Very Positive"][0],
#     prompts[f"Pillar {pillar}"]["Positive"][0],
#     prompts[f"Pillar {pillar}"]["Neutral"][0],
#     prompts[f"Pillar {pillar}"]["Negative"][0],
#     prompts[f"Pillar {pillar}"]["Very Negative"][0]
# )

## Sending calls to Gemini API

In [12]:
# Generation settings; used as the default `gconfig` of send_call.
generation_config = {
  "temperature": 0.25,
  "top_p": 0.95,
  "top_k": 64,
  # Upper bound on the length of each response.
  # NOTE(review): the prompt asks for up to 20 topics of 100-2000 words each, which may not fit in this budget — confirm.
  "max_output_tokens": 8192,
  "response_mime_type": "text/plain",
}

# Safety settings; used as the default `sconfig` of send_call.
# Every listed harm category is set to BLOCK_NONE.
# NOTE(review): presumably so that news about crime or violence is not filtered out — confirm.
safety_settings = {
  HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE,
  HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE,
  HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE,
  HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE
}



In [13]:
def send_call(message, context = prompt_system, gconfig = generation_config, sconfig = safety_settings,
              model_name = "gemini-1.5-pro-exp-0827"):
    """
    Send a single message to a Gemini model and return the text of its answer.

    A new model object and a new chat session with empty history are created on every call,
    so calls do not share any conversation state.

    Parameters
    ----------
    message : str
        Prompt sent as the user message.
    context : str, optional
        System instruction given to the model.
    gconfig : dict, optional
        Generation configuration passed to the model.
    sconfig : dict, optional
        Safety settings passed to the model.
    model_name : str, optional
        Identifier of the Gemini model to use. The default is an experimental release.
        NOTE(review): experimental models can be retired; pass another name here if that happens.

    Returns
    -------
    str
        The `text` attribute of the response.
        NOTE(review): accessing `text` presumably fails when the response has no text part
        (e.g. a blocked answer) — confirm against the SDK documentation.
    """

    model = genai.GenerativeModel(
        model_name         = model_name,
        generation_config  = gconfig,
        safety_settings    = sconfig,
        system_instruction = context
    )
    chat_session = model.start_chat(
        history = []
    )

    response = chat_session.send_message(message)

    return response.text

In [None]:
# pillar_summaries[pillar label][impact label] -> summary text returned by the model
pillar_summaries = {}
for pillar, sentiments in prompts.items():
    print(pillar)
    pillar_summaries[pillar] = {}
    for sentiment, prompt_list in sentiments.items():
        print("-------")
        print(sentiment)
        # Only the first prompt of each list is sent; any further chunks are not used here.
        response = send_call(prompt_list[0])
        pillar_summaries[pillar][sentiment] = response
        # Pause between calls: 1.5 minutes plus one minute per started block of 32,000 words in the prompt
        pause_time = (math.ceil(len(prompt_list[0].split())/32000)+1.5)*60
        print(f"Pause time: {pause_time}")
        # time.sleep has to be called; assigning to it skips the pause and overwrites the function
        time.sleep(pause_time)
    print("===================")

In [None]:

# Save all summaries in a single indented JSON file named after the lower-cased country
output_path = f"{path2SP}/EU-S Data/Automated Qualitative Checks/Data/data-summarization-1/{country.lower()}.json"
with open(output_path, "w") as json_file:
    json.dump(pillar_summaries, json_file, indent=4)