In [20]:
import os
import json
import time
import pandas as pd
from dotenv import load_dotenv
import prompt_templates_classification as ptc
from langchain.schema import BaseOutputParser
from langchain.prompts.chat import ChatPromptTemplate
from langchain_google_genai import (
    ChatGoogleGenerativeAI,
    HarmBlockThreshold,
    HarmCategory
)
from google.generativeai.types import BlockedPromptException

## Loading API key

In [21]:
# Loading API KEY from environment (.env file is read by load_dotenv)
load_dotenv()
GoogleAI_key = os.getenv("googleAI_API_key")

# os.getenv returns None when the variable is missing, and assigning None into
# os.environ fails with an unhelpful "str expected, not NoneType" TypeError.
# Fail early with an actionable message instead.
if not GoogleAI_key:
    raise RuntimeError(
        "Environment variable 'googleAI_API_key' is not set. "
        "Define it in your environment or in the .env file before running this notebook."
    )

# The Google client reads the key from GOOGLE_API_KEY
os.environ['GOOGLE_API_KEY'] = GoogleAI_key

## Loading data

In [22]:
# Read the translated Irish news articles that are ready for classification
country_data = pd.read_parquet(
    "../data/data-extraction-1/ready4class/Ireland_translated.parquet.gzip"
)

# Preview the first five rows
country_data.head()

Unnamed: 0,id,link,domain_url,published_date,title,description,content,language,is_opinion,country,title_trans,description_trans,content_trans
0,f17e7c2d673b055957afa10158678f2f,https://www.independent.ie/world-news/middle-e...,independent.ie,2024-01-08 12:31:02,Iranian police whip woman 74 times for refusin...,'A woman in Iran was whipped 74 times by the p...,'A woman in Iran was whipped 74 times by the p...,en,False,Ireland,Iranian police whip woman 74 times for refusin...,'A woman in Iran was whipped 74 times by the p...,'A woman in Iran was whipped 74 times by the p...
1,6b81f20ed916ccb2034dd515959e9edd,https://www.independent.ie/regionals/wicklow/a...,independent.ie,2024-01-08 11:44:49,Bomb squad called as hand grenade found on Wic...,The army bomb disposal unit was called to Arkl...,Home\n \n \n>\n \n \n \n \nRegionals\n \n \n>\...,en,False,Ireland,Bomb squad called as hand grenade found on Wic...,The army bomb disposal unit was called to Arkl...,Home\n \n \n>\n \n \n \n \nRegionals\n \n \n>\...
2,1f501542d5b25ae29e9de9bb73da80ec,https://www.independent.ie/world-news/court-re...,independent.ie,2024-01-08 11:11:26,Court restores life sentences for 11 men who g...,India's top court has restored life prison sen...,A woman protests against remission of sentence...,en,False,Ireland,Court restores life sentences for 11 men who g...,India's top court has restored life prison sen...,A woman protests against remission of sentence...
3,bc20a80b6cd427b376f53c30b6a4244d,https://www.independent.ie/irish-news/judge-co...,independent.ie,2024-01-08 10:31:46,Judge convicted of sex abuse of six young men ...,A Circuit Court judge who resigned from the be...,Gerard O'Brien pictured at his home in Thurles...,en,False,Ireland,Judge convicted of sex abuse of six young men ...,A Circuit Court judge who resigned from the be...,Gerard O'Brien pictured at his home in Thurles...
4,a172ef12da89dad091b248021460e2e9,https://www.independent.ie/regionals/cork/news...,independent.ie,2024-01-08 10:26:31,Cork Councillors demand reliable ATM following...,The non-availability of the AIB ATM for large ...,The non-availability of the AIB ATM for large ...,en,False,Ireland,Cork Councillors demand reliable ATM following...,The non-availability of the AIB ATM for large ...,The non-availability of the AIB ATM for large ...


## Defining Chain

In [23]:
# Every harm category below is mapped to BLOCK_NONE; the resulting dict is passed
# to the Gemini chat model as its `safety_settings`.
safety_settings = {
    harm_category: HarmBlockThreshold.BLOCK_NONE
    for harm_category in (
        HarmCategory.HARM_CATEGORY_HARASSMENT,
        HarmCategory.HARM_CATEGORY_HATE_SPEECH,
        HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
        HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
    )
}

In [24]:
class JSONOutputParser(BaseOutputParser):
    """Output parser that turns the raw text of an LLM answer into a Python object."""

    def parse(self, text: str):
        """
        Remove Markdown code-fence markers from the LLM output and decode what is left as JSON.

        `strict=False` lets the decoder accept control characters (e.g. raw newlines)
        inside JSON strings.
        """
        payload = text
        # Order matters: the opening '```json' marker has to go before the bare '```' one
        for fence in ('```json', '```'):
            payload = payload.replace(fence, '')
        return json.loads(payload, strict=False)

In [76]:
def classify_article(headline, summary, body, stage_1 = True, relation = None):
    """
    Classify a news article by sending its headline, summary and content to Google's Gemini.

    There are two different classifications:

    - Stage 1 (stage_1 = True): the model is asked whether the article is related or unrelated
      to the Rule of Law.
    - Stage 2 (stage_1 = False): the model is asked to rate how closely related the article is
      to each one of the eight pillars of the Rule of Law. Articles that were not flagged as
      "Related" in Stage 1 are skipped without calling the model.

    Parameters
    ----------
    headline : headline of the article (fills the "headline" slot of the prompt).
    summary  : summary of the article (fills the "summary" slot of the prompt).
    body     : content of the article (fills the "body" slot of the prompt).
    stage_1  : True for the Stage 1 classification, False for Stage 2.
    relation : Stage 1 outcome of the article; only used when stage_1 is False. Anything that
               is not a string containing "Related" (including None or NaN) is treated as
               not related.

    Returns
    -------
    str
        - Stage 1: "<rol_related> <> <country>", built from the model's JSON answer.
        - Stage 2: the "pillars_relation" entry of the model's answer dumped as a JSON string,
          or "Unrelated" when the article was skipped.
        - "Blocked Prompt" when the API raised a BlockedPromptException.

    Raises
    ------
    KeyError
        If the parsed answer lacks the expected keys. Errors raised while parsing the model
        output are not handled here and are expected to propagate as well.
    """

    # For Stage 2, we don't want to pass articles that were already classified as "UNRELATED",
    # so we pre-define the outcome. This check runs first so skipped articles never pay for
    # building the prompt and the Gemini client. A missing relation (None/NaN) used to raise a
    # TypeError on the membership test; it is now handled as "not related".
    if not stage_1 and not (isinstance(relation, str) and "Related" in relation):
        return "Unrelated"

    # Defining the prompt according to which stage are we calling the function for
    if stage_1:
        system_prompt = ptc.context_stage_1
        human_prompt  = ptc.instructions_stage_1
    else:
        system_prompt = ptc.context_stage_2
        human_prompt  = ptc.instructions_stage_2

    # Setting up the Prompt Template
    chat_prompt = ChatPromptTemplate.from_messages([
                    ("system", system_prompt),
                    ("human", human_prompt),
                ])

    # Defining our chain: prompt -> Gemini -> JSON parser
    chain_gemini = chat_prompt | ChatGoogleGenerativeAI(model = "gemini-pro",
                                                        temperature     = 0.2,
                                                        safety_settings = safety_settings,
                                                        convert_system_message_to_human = True) | JSONOutputParser()

    try:
        llm_response = chain_gemini.invoke({
            "headline": headline,
            "summary" : summary,
            "body"    : body,
        })

    # The API can still block some of our prompts due to undefined reasons. Sadly, we can't do
    # anything about it, so we predefine the outcome
    except BlockedPromptException:
        print("BLOCKED")
        return "Blocked Prompt"

    # We need to slow down the calls: the Gemini API quota is expressed per minute
    # (60 requests per minute at the time of writing -- TODO confirm for the current plan),
    # so one second between successful calls keeps us under it.
    time.sleep(1)

    # The shape of the outcome depends on the stage we are calling the function for
    if stage_1:
        return f"{llm_response['rol_related']} <> {llm_response['country']}"

    return json.dumps(llm_response["pillars_relation"])

In [62]:
# Inspect the dimensions (rows, columns) of the loaded dataset
country_data.shape

(48409, 13)

In [None]:
# Classify the whole dataset in fixed-size batches.
# The number of batches is derived from the data so no rows are silently left out when the
# dataset grows, and every classified batch is stored instead of being overwritten by the next one.
batch_size         = 5000
n_batches          = -(-len(country_data) // batch_size)   # ceiling division
classified_batches = []

for batch_number in range(1, n_batches + 1):

    print(f"Sending batch number: {batch_number}")

    # Subsetting data (slice first, then copy: only the batch rows are duplicated)
    starting_row = (batch_number-1)*batch_size
    print(starting_row)
    end_row      = starting_row+batch_size
    print(end_row)
    batch_subset = country_data.iloc[starting_row:end_row].copy()

    # Applying classifiers
    # Stage 1: related / unrelated to the Rule of Law
    batch_subset["rol_related"] = batch_subset.apply(lambda row: classify_article(
        row["title_trans"],
        row["description_trans"],
        row["content_trans"],
        stage_1 = True
    ), axis = 1)

    # Stage 2: pillar scores, using the Stage 1 outcome to skip unrelated articles
    batch_subset["pillars_score"] = batch_subset.apply(lambda row: classify_article(
        row["title_trans"],
        row["description_trans"],
        row["content_trans"],
        relation = row["rol_related"],
        stage_1  = False
    ), axis = 1)

    # Keep the results of this batch
    classified_batches.append(batch_subset)

# Full classified dataset; pd.concat raises on an empty list, so guard the no-data case
classified_data = pd.concat(classified_batches) if classified_batches else country_data.copy()


In [77]:
# Ten-article sample for a quick test of the classifiers.
# Slice first and copy afterwards so only these ten rows are duplicated
# (the previous order copied the whole dataset just to keep ten rows).
batch_subset = country_data.iloc[0:10].copy()

In [81]:
# Applying classifiers
# Stage 1: is the article related to the Rule of Law?
batch_subset["rol_related"] = batch_subset.apply(
    lambda article: classify_article(
        headline = article["title_trans"],
        summary  = article["description_trans"],
        body     = article["content_trans"],
        stage_1  = True,
    ),
    axis = "columns",
)

# Stage 2: pillar scores, fed with the Stage 1 outcome of each article
batch_subset["pillars_score"] = batch_subset.apply(
    lambda article: classify_article(
        headline = article["title_trans"],
        summary  = article["description_trans"],
        body     = article["content_trans"],
        stage_1  = False,
        relation = article["rol_related"],
    ),
    axis = "columns",
)

In [82]:
# Display the ten-article sample together with both classification columns.
# NOTE(review): in the saved output, rows labelled "Related" carry pillars_score == "Unrelated"
# while "Unrelated" rows carry pillar scores, which does not match the skip rule in
# classify_article as currently written -- the output looks stale; re-run after a kernel
# restart to confirm.
batch_subset

Unnamed: 0,id,link,domain_url,published_date,title,description,content,language,is_opinion,country,title_trans,description_trans,content_trans,rol_related,pillars_score
0,f17e7c2d673b055957afa10158678f2f,https://www.independent.ie/world-news/middle-e...,independent.ie,2024-01-08 12:31:02,Iranian police whip woman 74 times for refusin...,'A woman in Iran was whipped 74 times by the p...,'A woman in Iran was whipped 74 times by the p...,en,False,Ireland,Iranian police whip woman 74 times for refusin...,'A woman in Iran was whipped 74 times by the p...,'A woman in Iran was whipped 74 times by the p...,Related <> Iran,Unrelated
1,6b81f20ed916ccb2034dd515959e9edd,https://www.independent.ie/regionals/wicklow/a...,independent.ie,2024-01-08 11:44:49,Bomb squad called as hand grenade found on Wic...,The army bomb disposal unit was called to Arkl...,Home\n \n \n>\n \n \n \n \nRegionals\n \n \n>\...,en,False,Ireland,Bomb squad called as hand grenade found on Wic...,The army bomb disposal unit was called to Arkl...,Home\n \n \n>\n \n \n \n \nRegionals\n \n \n>\...,Unrelated <> Ireland,"[{""1. Constraints on Government Powers"": 0}, {..."
2,1f501542d5b25ae29e9de9bb73da80ec,https://www.independent.ie/world-news/court-re...,independent.ie,2024-01-08 11:11:26,Court restores life sentences for 11 men who g...,India's top court has restored life prison sen...,A woman protests against remission of sentence...,en,False,Ireland,Court restores life sentences for 11 men who g...,India's top court has restored life prison sen...,A woman protests against remission of sentence...,Related <> India,Unrelated
3,bc20a80b6cd427b376f53c30b6a4244d,https://www.independent.ie/irish-news/judge-co...,independent.ie,2024-01-08 10:31:46,Judge convicted of sex abuse of six young men ...,A Circuit Court judge who resigned from the be...,Gerard O'Brien pictured at his home in Thurles...,en,False,Ireland,Judge convicted of sex abuse of six young men ...,A Circuit Court judge who resigned from the be...,Gerard O'Brien pictured at his home in Thurles...,Justice <> Ireland,"[{""1. Constraints on Government Powers"": 10}, ..."
4,a172ef12da89dad091b248021460e2e9,https://www.independent.ie/regionals/cork/news...,independent.ie,2024-01-08 10:26:31,Cork Councillors demand reliable ATM following...,The non-availability of the AIB ATM for large ...,The non-availability of the AIB ATM for large ...,en,False,Ireland,Cork Councillors demand reliable ATM following...,The non-availability of the AIB ATM for large ...,The non-availability of the AIB ATM for large ...,Unrelated <> Ireland,"[{""1. Constraints on Government Powers"": 1}, {..."
5,468440cb1acde5c256fdc4f8ac1a6465,https://www.independent.ie/regionals/cork/news...,independent.ie,2024-01-08 10:26:31,Cork Councillors demand reliable ATM following...,The non-availability of the AIB ATM for large ...,The non-availability of the AIB ATM for large ...,en,False,Ireland,Cork Councillors demand reliable ATM following...,The non-availability of the AIB ATM for large ...,The non-availability of the AIB ATM for large ...,Unrelated <> Ireland,"[{""1. Constraints on Government Powers"": 0}, {..."
6,9caf64325d854a7b80de6c3eb145c926,https://www.independent.ie/irish-news/taoiseac...,independent.ie,2024-01-08 09:17:04,Taoiseach ‘expects arrests in coming weeks' fo...,Taoiseach Leo Varadkar said he expects arrests...,Taoiseach Leo Varadkar. Photo: Brian Lawless/P...,en,False,Ireland,Taoiseach ‘expects arrests in coming weeks' fo...,Taoiseach Leo Varadkar said he expects arrests...,Taoiseach Leo Varadkar. Photo: Brian Lawless/P...,Related <> Ireland,Unrelated
7,be152467c788f3a521bf38ec8709f910,https://www.independent.ie/irish-news/gerry-th...,independent.ie,2024-01-08 07:21:48,Gerry ‘The Monk' Hutch all smiles as he rings ...,Gerry ‘The Monk' Hutch smiles for the camera a...,Gerry ‘The Monk' Hutch smiles for the camera a...,en,False,Ireland,Gerry ‘The Monk' Hutch all smiles as he rings ...,Gerry ‘The Monk' Hutch smiles for the camera a...,Gerry ‘The Monk' Hutch smiles for the camera a...,Related <> Ireland,Unrelated
8,7b41c90642f2e3dc8c217c4cf223c100,https://www.independent.ie/regionals/wicklow/a...,independent.ie,2024-01-08 07:00:00,NTA's ‘watery' response to Wicklow train overc...,Councillors have vented their frustration at t...,Education\n\nPrivate school fees rising due to...,en,False,Ireland,NTA's ‘watery' response to Wicklow train overc...,Councillors have vented their frustration at t...,Education\n\nPrivate school fees rising due to...,Unrelated <> Ireland,"[{""1. Constraints on Government Powers"": 1}, {..."
9,df6674acea0b1ea630fae485eaacfbeb,https://www.independent.ie/regionals/kerry/tra...,independent.ie,2024-01-08 06:15:00,Tralee housing development appealed to An Bórd...,"Impacting privacy, adding to traffic congestio...","Impacting privacy, adding to traffic congestio...",en,False,Ireland,Tralee housing development appealed to An Bórd...,"Impacting privacy, adding to traffic congestio...","Impacting privacy, adding to traffic congestio...",Unrelated <> Ireland,"[{""1. Constraints on Government Powers"": 7}, {..."
