In [1]:
# packages

import warnings
warnings.filterwarnings('ignore')

import numpy as np
import os
import csv
import json
import re
import pandas as pd
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import  LlamaForCausalLM, LlamaTokenizer, pipeline

import torch
from torch import cuda, bfloat16
from langchain import PromptTemplate
from langchain.chains import LLMChain
from langchain.chains import SequentialChain
from langchain.llms import HuggingFacePipeline

from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, cohen_kappa_score
from sklearn.metrics import classification_report

In [2]:
import gc
gc.collect()
torch.cuda.empty_cache()

In [5]:
os.getcwd()

# go one level up in the directory
huggingface_cache_dir = 'model_mistral'

# change huggingface cache
os.environ['TRANSFORMERS_CACHE'] = huggingface_cache_dir

## Load the Model from Huggingface

In [None]:
torch.manual_seed(0)

model_id = "mistralai/Mistral-7B-Instruct-v0.2"
device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'
print(device)
# set model access token for huggingface
hf_token = 'hf_XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX'

model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    torch_dtype=bfloat16,
    device_map='auto',
    token=hf_token,
    cache_dir=huggingface_cache_dir)
model.eval()

torch.manual_seed(0)
tokenizer = transformers.AutoTokenizer.from_pretrained(model_id, token=hf_token, cache_dir=huggingface_cache_dir)

print(f"Model loaded on {device}")

In [None]:
torch.manual_seed(0)
generate_text = transformers.pipeline(
    model=model, tokenizer=tokenizer,
    task='text-generation',
    pad_token_id=tokenizer.eos_token_id,
    temperature=0,  # 'randomness' of outputs, 0.0 is not possible, so use a very small number
    max_new_tokens=512,  # max number of tokens to generate in the output
    repetition_penalty=1.1  
)

llm = HuggingFacePipeline(pipeline=generate_text)

In [9]:
def generated_text_to_df(generated_text):
    # Initialize lists to store extracted data
    article_ids = []
    entity_names = []
    entity_types = []
    quotes = []

    # Join the lines into a single string
    data_str = '\n'.join(generated_text)

    # Define regex patterns to extract the necessary information
    article_id_pattern = re.compile(r'"article_id":\s*"(\d+)"')
    entity_pattern = re.compile(r'\{(.*?)\}', re.DOTALL)
    entity_name_pattern = re.compile(r'"entity_name":\s*"([^"]+)"')
    entity_type_pattern = re.compile(r'"entity_type":\s*"([^"]+)"')
    quote_pattern = re.compile(r'"quote":\s*"([^"]+)"')

    # Split the data string into individual article sections
    # article_sections = re.split(r'\}\s*{', data_str)
    article_sections = re.split(r'(?="article_id")', data_str)

    for article_section in article_sections:
        article_id_match = article_id_pattern.search(article_section)
        if not article_id_match:
            continue
        article_id = article_id_match.group(1)

        # Find all entitys within the article section
        entitys_matches = entity_pattern.findall(article_section)
        if not entitys_matches:
            entity_name = None
            entity_type = None
            quote = None
            article_ids.append(article_id)
            entity_names.append(entity_name)
            entity_types.append(entity_type)
            quotes.append(quote)
        else:
            for entity_match in entitys_matches:
                entity_name_match = entity_name_pattern.search(entity_match)
                entity_type_match = entity_type_pattern.search(entity_match)
                quote_match = quote_pattern.search(entity_match)

                article_ids.append(article_id)
                entity_names.append(entity_name_match.group(1) if entity_name_match else None)
                entity_types.append(entity_type_match.group(1) if entity_type_match else None)
                quotes.append(quote_match.group(1) if quote_match else None)

    # Create DataFrame
    df = pd.DataFrame({
        'article_id': article_ids,
        'entity_name': entity_names,
        'entity_type': entity_types,
        'quotes': quotes,
    })

    return df

In [None]:
def zero_shot_prompt_messages(main_prompt):
    messages = [
        {"role": "user", "content": main_prompt},
    ]
    prompt = generate_text.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    return prompt

def count_words(sentences_list):
    # Join the list into a single string
    if sentences_list not in [None, np.nan]:
        full_text = ' '.join(sentences_list)
        # Split the string into words and count them
        word_count = len(full_text.split())
    else:
        word_count = 0
    return word_count

# Retrieve Actor DF

In [None]:
# retrieve researcher annotated articles

df = pd.read_csv('path_to_annotated_articles/reliability_actors_final_cleaned_elif.csv',
                 sep = ';', encoding = 'utf-8', quoting=csv.QUOTE_NONNUMERIC)
df['article_id'] = df['article_id'].astype(int)

In [None]:
df_topics = pd.read_csv('path_to_annotated_articles/reliability_topics_elif.csv',
                        sep = ';', encoding = 'utf-8', quoting=csv.QUOTE_NONNUMERIC)

df_topics['article_id'] = df_topics['article_id'].astype(int)
df_topics = df_topics[df_topics['coder'] == 'Elif Kilik'] 
df_topics = df_topics[['article_id', 'about_covid', 'actors_present']].drop_duplicates()

df_topics.head()

df = pd.merge(df, df_topics, on = 'article_id', how = 'left')

In [None]:
articles_df = pd.read_csv('path_to_annotated_articles/final_nosarticles.csv',
                 sep = ';', encoding = 'utf-8', quoting=csv.QUOTE_NONNUMERIC)

# change page_id to integer
articles_df['page_id'] = articles_df['page_id'].astype(int)

articles_df = articles_df[['page_id', 'Text']].drop_duplicates()
# rename page_id to article_id
articles_df.rename(columns = {'page_id': 'article_id'}, inplace = True)

# remove line break
articles_df['Text'] = articles_df['Text'].str.replace('[LINE_BREAK]', '\n ')

In [None]:
df = pd.merge(df, articles_df, on = 'article_id', how = 'left')

In [None]:
# limit the df to the researcher coder Elif Kilik
df = df[df['coder'] == 'Elif Kilik']

# remove if geopolitical entity coded
df = df[df.actor_type != 'Geopolitieke entiteit']

# filter to only covid articles
df = df[df.about_covid == 1]

# if name is not none then actors_present==1
df['actors_present'] = np.where(df['actor_name'].notnull(), 1, 0)

In [16]:
# merge the two dataframes
unique_articles = df[['article_id', 'Text']].drop_duplicates()
unique_articles = unique_articles.sort_values('article_id')
# shuffle the dataframe
unique_articles = unique_articles.sample(frac=1, random_state=0).reset_index(drop=True)
print(len(unique_articles))


76


# Actor Extraction

In [26]:
system_prompt = """
You are a helpful AI assistant. You will be provided with a news article in Dutch and your task is to identify and extract all direct and indirect quotes along with information about the entities who provided these quotes. The entities can be person (individuals or multiple people) or an organization/institution. 
"""

input_prompt = """
Read the following article with the ID {article_id}: {text}
"""

main_prompt = """
### Definitions:
- Direct Quotes: Verbatim statements from the entity, enclosed in quotation marks.
  - Example: "We are pleased that the government has decided to reopen the hospitality industry," said the chairman of the restaurant industry association.
- Indirect Quotes: Paraphrased statements attributed to a entity using signal words, without quotation marks.
  - Example Signal Words: volgens, beweert, zegt, meldt, verklaart, vindt dat, stelt, etc.
  - Example: The ministry announced that the restrictions will be lifted next month.

### Instructions:
1. Take time to understand the context and the statements.
2. Identify and extract all direct and indirect quotes from the article.
3. Provide the following information about the entity of each quote:
   - entity name or description: The name or description of the entity.
   - entity type: Type of the entity which can be either a person/people or an organization/instution.
4. Use the following JSON format for your output. Include one entry per quote. Do not include any additional comments or explanations in the output.

### Example Output (JSON format):
{{
  "article_id": "1234567",
  "quotes": [
    {{
      "quote": "We zijn blij dat de maatregelen versoepeld worden.", zei Rutte.
      "entity_name": "Rutte",
      "entity_type": "person"
    }},
    {{
      "quote": RIVM zegt dat de besmettingscijfers in de komende weken zullen dalen.
      "entity_name": "RIVM",
      "entity_type": "organization"
    }}
  ]
}}
"""


In [27]:
zero_shot_prompt = zero_shot_prompt_messages(system_prompt, input_prompt, main_prompt)
print(zero_shot_prompt)

<s> [INST] 
You are a helpful AI assistant. You will be provided with a news article in Dutch and your task is to identify and extract all direct and indirect quotes along with information about the entities who provided these quotes. The entities can be person (individuals or multiple people) or an organization/institution. 



Read the following article with the ID {article_id}: {text}

### Definitions:
- Direct Quotes: Verbatim statements from the entity, enclosed in quotation marks.
  - Example: "We are pleased that the government has decided to reopen the hospitality industry," said the chairman of the restaurant industry association.
- Indirect Quotes: Paraphrased statements attributed to a entity using signal words, without quotation marks.
  - Example Signal Words: volgens, beweert, zegt, meldt, verklaart, vindt dat, stelt, etc.
  - Example: The ministry announced that the restrictions will be lifted next month.

### Instructions:
1. Take time to understand the context and the 

In [28]:
prompt_template = PromptTemplate(
    input_variables=["article_id", "Text"],
    template=zero_shot_prompt
)

In [29]:
chain_one = LLMChain(llm = llm, prompt = prompt_template)
chain_one

LLMChain(prompt=PromptTemplate(input_variables=['article_id', 'text'], template='<s> [INST] \nYou are a helpful AI assistant. You will be provided with a news article in Dutch and your task is to identify and extract all direct and indirect quotes along with information about the entities who provided these quotes. The entities can be person (individuals or multiple people) or an organization/institution. \n\n\n\nRead the following article with the ID {article_id}: {text}\n\n### Definitions:\n- Direct Quotes: Verbatim statements from the entity, enclosed in quotation marks.\n  - Example: "We are pleased that the government has decided to reopen the hospitality industry," said the chairman of the restaurant industry association.\n- Indirect Quotes: Paraphrased statements attributed to a entity using signal words, without quotation marks.\n  - Example Signal Words: volgens, beweert, zegt, meldt, verklaart, vindt dat, stelt, etc.\n  - Example: The ministry announced that the restrictions 

In [30]:
generated_text_zeroshot = []

In [None]:
%%time
torch.manual_seed(0)
for index, row in unique_articles.iterrows():  
    article_id = row['article_id']
    text = row['Text']

    input_variables = {
            "article_id": article_id,
            "text": text}
    # Generate text using the chain
    generated_text = chain_one.run(input_variables)
    print(generated_text)
    generated_text_zeroshot.append(generated_text)   

# CPU times: user 27min 13s, sys: 3.12 s, total: 27min 16s
# Wall time: 27min 16s  

## Clean and Save the Generated Data

This is a long cleaning process. The LLMs sometimes output actors such as article, article author who are not supposed to be extracted. Cleaning should be done carefully and therefore includes a lot of checks and removing duplicates. 

In [None]:
df_generated = generated_text_to_df(generated_text_zeroshot)
# rename entity_name to source_name and entity_type to source_type
df_generated.rename(columns = {'entity_name': 'source_name', 'entity_type': 'source_type'}, inplace = True)
df_generated['source_name'] = df_generated['source_name'].str.strip()

In [205]:
print(df_generated.article_id.nunique())
print(unique_articles.article_id.nunique())

76
76


In [None]:
# are there duplicates? 
duplicated_df = df_generated[df_generated.duplicated(subset=['article_id', 'source_name', 'quotes'], keep=False)]
print(len(duplicated_df))
# check
# duplicated_df

In [207]:
# drop duplicates
df_generated = df_generated.drop_duplicates(subset=['article_id', 'source_name', 'quotes'], keep='first')

In [208]:
df_generated.source_type.value_counts(dropna=False)

source_type
person                      334
organization                192
group                         8
N/A                           8
person/people                 6
person/organization           6
person/group                  4
government                    3
country                       3
group_of_people               3
not_applicable                2
not_specified                 2
countries                     2
None                          2
city                          2
method                        1
trend                         1
group_of_countries            1
person, organization          1
person_group                  1
source_of_information         1
organization/institution      1
study                         1
person/unspecified            1
document                      1
person / organization         1
event                         1
institution                   1
Name: count, dtype: int64

In [None]:
# check
# df_generated[df_generated['source_type'].isin(['N/A', 'not_applicable', 'not_specified', 'city', 'unknown', 'onbekend', 'unspecified'])]

In [None]:
df_generated.source_name.value_counts(dropna=False)

# check if source name has the following string pieces: niet, onbekend, onbekende, niet bekend, article, artikel
# df_generated[df_generated['source_name'].str.contains(r'niet|onbekend|article|artikel|N/A|specif|unknown|author', case=False, na=False)]

In [None]:

# if source name contains these string pieces then source_name, source_type and quotes are set to None
df_generated.loc[df_generated['source_name'].str.contains(r'niet|onbekend|article|artikel|N/A|specif|unknown|author', case=False, na=False), ['source_name', 'source_type', 'quotes']] = [None, None, None]
df_generated.source_name.value_counts(dropna=False)

In [None]:
duplicated_df = df_generated[df_generated.duplicated(subset=['article_id', 'source_name', 'quotes'], keep=False)]
df_generated = df_generated.drop_duplicates(subset=['article_id', 'source_name', 'quotes'], keep='first')

In [None]:

df_generated.source_type.value_counts(dropna=False)

source_type
person                   326
organization             173
None                      16
group                      8
person/people              6
person/organization        6
person/group               4
country                    3
government                 3
group_of_people            3
countries                  2
city                       2
group_of_countries         1
person, organization       1
method                     1
trend                      1
document                   1
source_of_information      1
person_group               1
study                      1
person / organization      1
event                      1
institution                1
Name: count, dtype: int64

In [213]:
df_generated.source_type.value_counts(dropna=False)
# make source type lowercase, and remove leading and trailing spaces
df_generated['source_type'] = df_generated['source_type'].str.lower()
df_generated['source_type'] = df_generated['source_type'].str.strip()

df_generated.source_type.value_counts(dropna=False)

source_type
person                   326
organization             173
None                      16
group                      8
person/people              6
person/organization        6
person/group               4
country                    3
government                 3
group_of_people            3
countries                  2
city                       2
group_of_countries         1
person, organization       1
method                     1
trend                      1
document                   1
source_of_information      1
person_group               1
study                      1
person / organization      1
event                      1
institution                1
Name: count, dtype: int64

In [215]:
organization_names = ['De Kroon / Chicago Social Club', 'Het kabinet', 'het kabinet', 'het ministerie van Volksgezondheid', 'Tweede Kamer']

df_generated.loc[(df_generated['source_name'].isin(organization_names)), 'source_type'] = 'organization'

In [216]:
# drop if source name is Nederland, Duitsland, Frankrijk en Italië make it none
locations = ['Netherlands', 'China', 'Peking', 'Westerse landen','Nederland, Duitsland, Frankrijk en Italië',
'deze vier landen', 'Guiyang']
df_generated.loc[df_generated.source_name.isin(locations), 'source_name'] = None
df_generated.loc[(df_generated['source_name'].isnull()), 'source_type'] = None
df_generated.loc[(df_generated['source_name'].isnull()), 'quotes'] = None

duplicated_df = df_generated[df_generated.duplicated(subset=['article_id', 'source_name', 'quotes'], keep=False)]
print(len(duplicated_df))
duplicated_df

# drop duplicates
df_generated = df_generated.drop_duplicates(subset=['article_id', 'source_name', 'quotes'], keep='first')

9


In [None]:
# df_generated[~df_generated.source_type.isin(['person', 'organization']) & ~df_generated.source_type.isnull()].source_type.unique()

In [218]:
# wrong types
wrong_types = ['study', 'event', 'document', 'city', 'trend','method']

df_generated.loc[df_generated['source_type'].isin(wrong_types), 'source_name'] = None
df_generated.loc[df_generated['source_type'].isin(wrong_types), 'quotes'] = None
df_generated.loc[df_generated['source_type'].isin(wrong_types), 'source_type'] = None


duplicated_df = df_generated[df_generated.duplicated(subset=['article_id', 'source_name', 'quotes'], keep=False)]
print(len(duplicated_df))
duplicated_df


4


Unnamed: 0,article_id,source_name,source_type,quotes
2,2394513,,,
5,2394513,,,
430,2432686,,,
436,2432686,,,


In [None]:
# df_generated[~df_generated.source_type.isin(['person', 'organization']) & ~df_generated.source_type.isnull()].source_name.unique()

In [220]:
# if source type not in person, organization then it is person
df_generated.loc[~df_generated['source_type'].isin(['person', 'organization']), 'source_type'] = 'person'

In [221]:
duplicated_df = df_generated[df_generated.duplicated(subset=['article_id', 'source_name', 'quotes'], keep=False)]
print(len(duplicated_df))
duplicated_df

# drop duplicates
df_generated = df_generated.drop_duplicates(subset=['article_id', 'source_name', 'quotes'], keep='first')

4


In [222]:
df_generated.source_type.value_counts(dropna=False)

source_type
person          377
organization    178
Name: count, dtype: int64

In [223]:
df_generated[df_generated['source_type'].isnull()]

Unnamed: 0,article_id,source_name,source_type,quotes


In [225]:
# if source_type is person or group of people, change it to Persoon
df_generated.loc[df_generated.source_type.isin(['person']), 'source_type'] = 'Persoon'
df_generated.loc[df_generated.source_type == 'organization', 'source_type'] = 'Organisatie'
df_generated.source_type.value_counts(dropna=False)

source_type
Persoon        377
Organisatie    178
Name: count, dtype: int64

In [None]:
# df_generated[df_generated.source_type.isnull()]['source_name'].value_counts(dropna=False)

In [227]:
df_generated['nr_words_quotes'] = df_generated['quotes'].apply(count_words)

In [228]:
df_generated['nr_words_quotes'].describe()

count    555.000000
mean      75.654054
std       40.919168
min        0.000000
25%       49.000000
50%       70.000000
75%       98.500000
max      262.000000
Name: nr_words_quotes, dtype: float64

In [229]:
df_generated[df_generated['nr_words_quotes']==0].source_name.value_counts(dropna=False)

source_name
None    21
Name: count, dtype: int64

In [None]:
df_generated['article_id'] = df_generated['article_id'].astype(int)
# change column names
df_generated = df_generated.rename(columns={'source_name': 'actor_name', 'source_type': 'actor_type', 'source_function': 'actor_function'})

In [231]:
# limit the article_ids of df_aggregated to the article_ids of df
print(df_generated.shape, df.shape)
df_generated = df_generated[df_generated['article_id'].isin(df['article_id'])]
df = df[df['article_id'].isin(df_generated['article_id'])]
print(df_generated.shape, df.shape)

(555, 5) (303, 88)
(555, 5) (303, 88)


In [None]:
df_generated.to_csv('extracted_quotes_Mistral.csv', sep=';', index=False, encoding='utf-8', quoting=csv.QUOTE_NONNUMERIC)

In [233]:
def join_unique(values):
    return ' '.join(set(values))

In [None]:
# aggregate the df by article_id, source_name, source_type, source_function and add quotes to each other and sum nr_words_quotes
df_aggregated = df_generated.groupby(['article_id', 'actor_name', 'actor_type']).agg({'quotes': join_unique, 'nr_words_quotes': 'sum'}).reset_index()

In [235]:
print(len(df_aggregated.article_id.unique()))
print(len(df_generated.article_id.unique()))

76
76


In [None]:
# are there any article_ids from df_generated that is not in df_aggregated
df_noactors = df_generated[~df_generated['article_id'].isin(df_aggregated['article_id'])]

In [237]:
# add df_noactors to df_aggregated
df_aggregated = pd.concat([df_aggregated, df_noactors], axis=0)
print(df_aggregated.shape)
df_aggregated = df_aggregated.fillna("")

(314, 5)


In [238]:
df_duplicates = df_aggregated[df_aggregated.duplicated(subset=['article_id', 'actor_name', 'quotes'], keep=False)]
print(len(df_duplicates))

0


In [None]:
# get unique article_ids from unique_articles
unique_article_ids = unique_articles.article_id.unique()
print(len(unique_article_ids))

# merge with df_aggregated on article_id left
df_aggregated_merged = pd.merge(unique_articles, df_aggregated, on='article_id', how='left')


In [None]:
# save the dataframe
df_aggregated.to_csv('extracted_quotes_aggregated_Mistral.csv', sep=';', index=False, encoding='utf-8', quoting=csv.QUOTE_NONNUMERIC)

In [241]:
# clean actor_name in df_aggregated and df
df_aggregated['actor_name'] = df_aggregated['actor_name'].str.strip()
df['actor_name'] = df['actor_name'].str.strip()

df_aggregated['actor_name'] = df_aggregated['actor_name'].str.replace('  ', ' ')
df['actor_name'] = df['actor_name'].str.replace('  ', ' ')

# title the first letter of each word
df_aggregated['actor_name'] = df_aggregated['actor_name'].str.title()
df['actor_name'] = df['actor_name'].str.title()

# clean actor_name in df_aggregated and df
df_aggregated['actor_name'] = df_aggregated['actor_name'].str.strip()
df['actor_name'] = df['actor_name'].str.strip()

# Categorize Actor Functions

In [None]:
df = pd.read_csv('path_to_df/actors_with_sentences.csv',
                 sep = ';', encoding = 'utf-8', quoting=csv.QUOTE_NONNUMERIC)

# change article_id to integer
df['article_id'] = df['article_id'].astype(int)
df = df[df.actor_type != 'Geopolitieke entiteit']
df = df[df.about_covid == 1]
df = df[df.actors_present == 1]

In [101]:
df.columns
df.actor_function.value_counts(dropna=False)

actor_function
Wetenschappelijke/medische organisaties en onderzoekers                                                                                                             72
NL - Nationale regering - executive / uitvoerende macht                                                                                                             37
Openbare en semiopenbare instellingen                                                                                                                               36
Niet-governementele organisaties (NGO), maatschappelijke organisaties, en hun leden                                                                                 35
Publiek en leden van het publiek, publieke opiniepeilingen en hun respondenten                                                                                      27
NL - Nationaal parlement en nationale partijen – wetgevende macht                                                                                     

In [102]:
df['actor_function_new'] = df['actor_function'].str.strip().copy()
# change actor function categories
df.loc[df.actor_function == 'NL - Nationale regering - executive / uitvoerende macht', 'actor_function_new'] = 'A'
df.loc[df.actor_function == 'NL - Nationaal parlement en nationale partijen – wetgevende macht', 'actor_function_new'] = 'A'
df.loc[df.actor_function == 'NL - Nationale regionale en lokale politieke organisaties en hun ambtenaren', 'actor_function_new'] = 'A'
df.loc[df.actor_function == 'NL - Nationale staatsorganisaties en hun ambtenaren', 'actor_function_new'] = 'A'
df.loc[df.actor_function == 'NL - Nationale koninklijke familie en haar leden', 'actor_function_new'] = 'A'
df.loc[df.actor_function == 'Regeringen/regeringsleiders/regeringsleden en/of andere politici in een ander land dan NL, op nationaal of lokaal niveau OF staatsorganisaties en hun ambtenaren', 'actor_function_new'] = 'A'
df.loc[df.actor_function == 'EU-instellingen en Internationale overheidsorganisaties (ook IGO’s) en hun leden', 'actor_function_new'] = 'A'
df.loc[df.actor_function == 'Nationale en internationale rechterlijke macht', 'actor_function_new'] = 'A'
df.loc[df.actor_function == 'Wetenschappelijke/medische organisaties en onderzoekers', 'actor_function_new'] = 'B'
df.loc[df.actor_function == 'Openbare en semiopenbare instellingen', 'actor_function_new'] = 'B'
df.loc[df.actor_function == 'Zakelijke organisaties en hun werknemers', 'actor_function_new'] = 'B'
df.loc[df.actor_function == 'Bekende mediapersonen (anders dan journalisten)', 'actor_function_new'] = 'B'
df.loc[df.actor_function == 'Journalisten anders dan de schrijver van het huidige artikel of nieuwsorganisaties anders dan de nieuwsorganisatie van het huidige artikel.', 'actor_function_new'] = 'B'
df.loc[df.actor_function == 'Niet-governementele organisaties (NGO), maatschappelijke organisaties, en hun leden', 'actor_function_new'] = 'C'
df.loc[df.actor_function == 'Religieuze instellingen en hun leden (ook gelovigen)', 'actor_function_new'] = 'C'
df.loc[df.actor_function == 'Publiek en leden van het publiek, publieke opiniepeilingen en hun respondenten', 'actor_function_new'] = 'D'

# see where actor_function_new is NaN
df[df['actor_function_new'].isna() == True]

Unnamed: 0,coder,article_id,actor_type,actor_function,actor_function_text,actor_pp,directly_quoted,indirectly_quoted,nr_words,talks_covid_measures,...,unique_id,about_covid,actors_present,Text,name_variations,actor_name,Text_y,relevant_sentences,relevant_sentences_string,actor_function_new


In [103]:
df.actor_function_new.value_counts(dropna=False)

actor_function_new
B    138
A     95
C     36
D     27
Name: count, dtype: int64

In [105]:
# get only article_id, actor_name and relevant_sentences
df_selected = df[['article_id', 'actor_name', 'actor_function_new', 'relevant_sentences_string']].drop_duplicates()
print(df_selected.shape)

# sort by article_id, actor_name
df_selected = df_selected.sort_values(by=['article_id', 'actor_name'])
df = df.sort_values(by=['article_id', 'actor_name'])

(296, 4)


### Prompt for actor function categorization

In [27]:
prompt = """
You are a helpful AI assistant. You will receive names or descriptions of persons or organizations, along with sentences from news articles where these entities are mentioned. Your task is to read all of this information about the entity and decide which function category the entity belongs to.

### Function Categories:
A. Government and politics: includes the following entities:
    - The Dutch government, governmental organizations, and members and representatives of these organizations.
    - Dutch parliament, parliement members and political parties and members and representatives of these organizations.
    - National, regional and local political organizations and members and representatives of these organizations.
    - Dutch state organizations, law enforcement, and members and representatives of these organizations.
    - Civil servants and other governmental officials.
    - Foreign national, regional and local politicians, political organizations and members and representatives of these organizations.
    - International governmental organizations and members and representatives of these organizations.
    - National and international judiciary.
    - Dutch royal family and its members.
    - Other national, foreign or international political figures or organizations.
B. Professionals and experts: includes the following entities:
    - Scientists, medical organizations, and researchers.
    - Public and semi-public institutions.
    - Corporate organizations and their employees.
    - Journalists and news organizations.
    - Prominent media personalities and celebrities.
    - Other professionals and experts.
C. Civil society organizations: includes the following entities:
    - Non-governmental organizations (NGOs), civil society organizations, and members and representatives of these organizations.
    - Interest groups, business associations, sports associations, trade unions and members and representatives of these organizations.
    - Religious institutions and their members (also believers).
    - Other social non-governmental organizations, interest groups and their representatives.
D. Citizens and members of the public:
    - The general public, public opinion polls, and their respondents.

Follow the instructions below to complete the task:

1. Carefully review the following extracted information from the news article with ID {article_id} about the entity: {actor_name}. The entity is mentioned in the following sentences: {relevant_sentences_string}. 

2. Determine the function category of the given entity based on the information provided. Do not extract additional entities. 

3. Provide the information in the JSON format below. Do not include additional information or explanations.

### Example Output (JSON format):
{{
  "article_id": "2000000",
  "actor_name": "Mark Rutte",
  "actor_function_category": "A"
}}
{{
  "article_id": "2000001",
  "actor_name": "De Volkskrant",
  "actor_function_category": "B"
}}
{{
  "article_id": "2000002", 
  "actor_name": "RIVM",
  "actor_function_category": "B"
}}
"""

In [28]:
zero_shot_prompt = zero_shot_prompt_messages(prompt)
print(zero_shot_prompt)

<s> [INST] 
You are a helpful AI assistant. You will receive names or descriptions of persons or organizations, along with sentences from news articles where these entities are mentioned. Your task is to read all of this information about the entity and decide which function category the entity belongs to.

### Function Categories:
A. Government and politics: includes the following entities:
    - The Dutch government, governmental organizations, and members and representatives of these organizations.
    - Dutch parliament, parliement members and political parties and members and representatives of these organizations.
    - National, regional and local political organizations and members and representatives of these organizations.
    - Dutch state organizations, law enforcement, and members and representatives of these organizations.
    - Civil servants and other governmental officials.
    - Foreign national, regional and local politicians, political organizations and members and 

In [29]:
prompt_template = PromptTemplate(
    input_variables=["article_id", "actor_name", "relevant_sentences_string"],
    template=zero_shot_prompt
)

In [30]:
chain_one = LLMChain(llm = llm, prompt = prompt_template)
chain_one

LLMChain(prompt=PromptTemplate(input_variables=['actor_name', 'article_id', 'relevant_sentences_string'], template='<s> [INST] \nYou are a helpful AI assistant. You will receive names or descriptions of persons or organizations, along with sentences from news articles where these entities are mentioned. Your task is to read all of this information about the entity and decide which function category the entity belongs to.\n\n### Function Categories:\nA. Government and politics: includes the following entities:\n    - The Dutch government, governmental organizations, and members and representatives of these organizations.\n    - Dutch parliament, parliement members and political parties and members and representatives of these organizations.\n    - National, regional and local political organizations and members and representatives of these organizations.\n    - Dutch state organizations, law enforcement, and members and representatives of these organizations.\n    - Civil servants and o

In [31]:
generated_text_zeroshot_actors = []

In [None]:
%%time
torch.manual_seed(0)
for index, row in df_selected.iterrows():  
    article_id = row['article_id']
    actor_name = row['actor_name']
    relevant_sentences_string = row['relevant_sentences_string']

    input_variables = {
            "article_id": article_id,
            "actor_name": actor_name,
            "relevant_sentences_string": relevant_sentences_string}
    # Generate text using the chain
    generated_text = chain_one.run(input_variables)
    print(generated_text)
    generated_text_zeroshot_actors.append(generated_text)  

# CPU times: user 9min 7s, sys: 2min 5s, total: 11min 12s
# Wall time: 11min 13s

In [33]:
def generated_functions_extractor(generated_text):
    # Initialize lists to store extracted data
    article_ids = []
    actor_names = []
    actor_functions = []

    # Join the lines into a single string
    data_str = '\n'.join(generated_text)

    # clean the data_str from \
    data_str = data_str.replace('\\', '')

    # Define regex patterns to extract the necessary information
    article_id_pattern = re.compile(r'"article_id":\s*"(\d+)"')
    actor_name_pattern = re.compile(r'"actor_name":\s*"([^"]+)"')
    actor_function_pattern = re.compile(r'"actor_function_category":\s*"([^"]+)"')

    # Split the data string into individual article sections
    # article_sections = re.split(r'\}\s*{', data_str)
    article_sections = re.split(r'(?="article_id")', data_str)

    for article_section in article_sections:
        article_id_match = article_id_pattern.search(article_section)
        if not article_id_match:
            continue
        article_id = article_id_match.group(1)

        actor_name_match = actor_name_pattern.search(article_section)
        actor_function_match = actor_function_pattern.search(article_section)

        article_ids.append(article_id)
        actor_names.append(actor_name_match.group(1) if actor_name_match else None)
        actor_functions.append(actor_function_match.group(1) if actor_function_match else None)

    # Create DataFrame
    df = pd.DataFrame({
        'article_id': article_ids,
        'actor_name': actor_names,
        'actor_function_category': actor_functions
    })

    return df

In [None]:
generated_functions_df = generated_functions_extractor(generated_text_zeroshot_actors)

In [107]:
# drop if actor name is RIVM and article id is 2355910
generated_functions_df = generated_functions_df[~((generated_functions_df['actor_name'] == 'RIVM') & (generated_functions_df['article_id'] == '2355910'))]
# reset index
generated_functions_df = generated_functions_df.reset_index(drop=True)
print(generated_functions_df.shape)


(296, 3)


In [109]:
print(generated_functions_df.shape)
print(df_selected.shape)

(296, 3)
(296, 4)


In [110]:
df_selected.actor_function_new.value_counts(dropna=False)

actor_function_new
B    138
A     95
C     36
D     27
Name: count, dtype: int64

In [111]:
generated_functions_df.actor_function_category.value_counts(dropna=False)

actor_function_category
B    109
C     87
A     85
D     15
Name: count, dtype: int64

In [112]:
# make article_id integer
df_selected['actor_function_mistral'] = generated_functions_df['actor_function_category']
df_selected['actor_name_mistral'] = generated_functions_df['actor_name']

In [114]:
# drop if input_text_lower is null
df_selected = df_selected.dropna(subset = ['relevant_sentences_string'])
print(df_selected.shape)

(294, 6)


In [115]:
# show the crosstab
pd.crosstab(df_selected.actor_function_new, df_selected.actor_function_mistral)

actor_function_mistral,A,B,C,D
actor_function_new,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
A,73,4,17,1
B,10,95,30,1
C,1,5,30,0
D,1,4,10,12


In [117]:
print(classification_report(df_selected['actor_function_new'], df_selected['actor_function_mistral']))

              precision    recall  f1-score   support

           A       0.86      0.77      0.81        95
           B       0.88      0.70      0.78       136
           C       0.34      0.83      0.49        36
           D       0.86      0.44      0.59        27

    accuracy                           0.71       294
   macro avg       0.74      0.69      0.67       294
weighted avg       0.81      0.71      0.74       294

