In [None]:
# packages

import warnings
warnings.filterwarnings('ignore')

import numpy as np
import os
import csv
import re
import pandas as pd
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM

import torch
from torch import cuda, bfloat16
from langchain import PromptTemplate
from langchain.chains import LLMChain
from langchain.chains import SequentialChain
from langchain.llms import HuggingFacePipeline
from sklearn.metrics import classification_report

In [2]:
# Free memory left over from earlier runs in this kernel: run a Python
# garbage-collection pass, then ask PyTorch to empty its CUDA cache.
import gc
gc.collect()
torch.cuda.empty_cache()

# Load the Model from Huggingface

In [None]:
# Directory used to cache the downloaded model files. The path is relative,
# so it resolves against the current working directory of the kernel.
huggingface_cache_dir = 'model_mistral'

# Point the Hugging Face cache environment variable at the same directory.
# NOTE(review): transformers has already been imported by the time this runs,
# so the library itself may not pick this value up; the explicit `cache_dir`
# argument passed to `from_pretrained` is what this notebook relies on.
os.environ['TRANSFORMERS_CACHE'] = huggingface_cache_dir

In [None]:
torch.manual_seed(0)

model_id = "mistralai/Mistral-7B-Instruct-v0.2"
device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'
print(device)

# Read the Hugging Face access token from the environment instead of keeping
# a credential in the notebook. When HF_TOKEN is not set this is None and the
# hub client falls back to whatever login is stored on the machine.
hf_token = os.environ.get('HF_TOKEN')

# Mistral is supported natively by transformers, so no repository-provided
# code has to be executed (`trust_remote_code` is left at its default).
# `device_map='auto'` lets the library place the weights on the available
# devices; weights are loaded in bfloat16.
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=bfloat16,
    device_map='auto',
    token=hf_token,
    cache_dir=huggingface_cache_dir)
model.eval()  # inference only

tokenizer = transformers.AutoTokenizer.from_pretrained(
    model_id, token=hf_token, cache_dir=huggingface_cache_dir)

In [10]:
torch.manual_seed(0)

# Text-generation pipeline around the loaded model.
generate_text = transformers.pipeline(
    model=model, tokenizer=tokenizer,
    task='text-generation',
    pad_token_id=tokenizer.eos_token_id,  # pad with the end-of-sequence token
    do_sample=False,  # greedy decoding: deterministic output, temperature is not used
    max_new_tokens=3000,  # max number of tokens to generate in the output
    repetition_penalty=1.1
)

# Expose the pipeline as a LangChain LLM so it can be used inside a chain.
llm = HuggingFacePipeline(pipeline=generate_text)

In [None]:
def zero_shot_prompt_messages(prompt_text):
    """Render `prompt_text` as a single user turn in the model's chat format.

    The text is wrapped in a one-message conversation and passed through the
    chat template of the tokenizer attached to the notebook-level
    `generate_text` pipeline. The result is returned as a string (not token
    ids) with the generation prompt appended.
    """
    conversation = [dict(role="user", content=prompt_text)]
    return generate_text.tokenizer.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=True,
    )
    
def count_words(sentences_list):
    """Count the whitespace-separated words in a collection of sentences.

    Parameters
    ----------
    sentences_list : iterable of str, str, None or float NaN
        Sentences to count. A plain string is treated as one sentence.
        Missing values (None or any float NaN) count as zero words.

    Returns
    -------
    int
        Total number of words across all sentences.
    """
    # Missing value: None, or a NaN float (not only the np.nan singleton).
    if sentences_list is None:
        return 0
    if isinstance(sentences_list, float) and np.isnan(sentences_list):
        return 0

    # A bare string must not be joined character by character.
    if isinstance(sentences_list, str):
        return len(sentences_list.split())

    # Join the sentences into one string, then split on whitespace.
    full_text = ' '.join(sentences_list)
    return len(full_text.split())

def regex_extract_to_dataframe(strings):
    """Parse the model's JSON-like answers into a DataFrame.

    Each input string is expected to contain the assistant answer after the
    `[/INST]` marker, starting with a `{...}` object. The four fields are
    pulled out with regular expressions rather than a JSON parser, so quoted
    and unquoted digits are both accepted.

    Parameters
    ----------
    strings : iterable of str
        Generated texts, one per classified source.

    Returns
    -------
    pandas.DataFrame
        Columns `source_name`, `measures_mentioned`, `supportive_stance`,
        `critical_stance`. There is exactly one row per input string, in
        input order. A string with no recognisable answer (or a missing
        field) yields missing values instead of being dropped, so the result
        stays positionally aligned with the inputs.
    """
    # Everything from the "{" right after "[/INST]" up to the last "}".
    assistant_output_pattern = r'\[/INST]\s*({.*})'

    # One pattern per field; insertion order defines the column order.
    field_patterns = {
        "source_name": r'"source_name"\s*:\s*"?([^"]+)"?',
        "measures_mentioned": r'"measures_mentioned"\s*:\s*"?(\d)"?',
        "supportive_stance": r'"supportive_stance"\s*:\s*"?(\d)"?',
        "critical_stance": r'"critical_stance"\s*:\s*"?(\d)"?',
    }
    extracted = {column: [] for column in field_patterns}

    for string_data in strings:
        # Extract the structured answer that follows "[/INST]".
        match = re.search(assistant_output_pattern, string_data, re.DOTALL)
        # Strip backslashes so escaped quotes do not break the field patterns.
        json_data = match.group(1).replace("\\", "") if match else None

        for column, pattern in field_patterns.items():
            field_match = re.search(pattern, json_data) if json_data is not None else None
            if field_match is None:
                value = None  # keep the row, mark the field as missing
            elif column == "source_name":
                value = field_match.group(1)
            else:
                value = int(field_match.group(1))
            extracted[column].append(value)

    return pd.DataFrame(extracted)


# Retrieve Actor DF

In [None]:
# Load the researcher-coded test set, cast article_id to integer and keep
# only the rows where the actor is quoted directly or indirectly.
df = (
    pd.read_csv(
        'path_to_test_data_researcher_codings',
        sep=';',
        encoding='utf-8',
        quoting=csv.QUOTE_NONNUMERIC,
    )
    .assign(article_id=lambda frame: frame['article_id'].astype(int))
    .loc[lambda frame: (frame['directly_quoted'] == 1) | (frame['indirectly_quoted'] == 1)]
)

In [13]:
# Keep only the columns used below: the identifiers (article_id, actor_name),
# the text passed to the prompt (input_text_corrected) and the researcher
# codings that the predictions are compared against.
df = df[['article_id', 'actor_name', 'talks_covid_measures', 'input_text_corrected',
       'talks_covid_corrected', 'measures_positive_corrected',
       'measures_negative_corrected', 'measures_neutral_corrected']]

In [None]:
# Rename actor_name to source_name (the name used by the prompt variables),
# sort by article_id and source_name, and renumber the rows 0..n-1.
# The reset matters: the parsed model output carries a fresh 0..n-1 index,
# and it is later merged / assigned back by index. Without the reset the
# filtered-and-sorted labels would pair predictions with the wrong rows.
df_selected = (
    df.rename(columns={'actor_name': 'source_name'})
      .sort_values(by=['article_id', 'source_name'])
      .reset_index(drop=True)
)

# Does the Actor Talk about Covid Measures? 

In [23]:
# Zero-shot instruction for the stance classification task.
# Placeholders filled per row: {article_id}, {source_name}, {input_text_corrected}.
# The doubled braces around the example objects are escapes, so they render
# as literal braces once the template is formatted.
# NOTE(review): the prompt text contains a few typos ("iS", "existance",
# a missing "if" in the critical-stance bullet). They are left untouched
# because editing the prompt changes the model input and would no longer
# match the recorded results.
prompt_text = """
You are a helpful AI assistant. Sources in a news article are people or organizations that are quoted in the article. 
You will be provided with the names of these sources and the sentences from the news article where these sources are mentioned. Your task is to determine if the source discusses any COVID-19 measures and, if so, assess the source's overall stance towards these measures.

Some examples of COVID-19 Measures:
1. General Coronavirus Measures: Broad discussions about pandemic measures and advice without specifying details.
2. Curfew, Lockdown, Quarantine Rules: References to staying home, lockdowns, quarantines, and related rules.
3. COVID-19 Testing: Discussions about testing requirements and access cards or travel passes related to testing (e.g., Testing for Access (1G)).
4. Vaccination and Procedures: Discussions about vaccines, vaccination processes, side effects, and access cards or travel passes related to vaccines (e.g., 2G or 3G passes).
5. Measures about Sports, Events and Gatherings: Regulations about events, sports, gatherings, and social distancing.
6. Closing of Schools, Restaurants, Shops: References to closures of schools, restaurants, shops, or other public places.
7. Other advice and Recommendations: Wearing face masks, working from home, social distancing, limiting visits, and other health advice.

Read the extracted information from the news article with ID {article_id} about the source: {source_name}. The following text begins with the source name and provides the sentences the source is mentioned in the article: {input_text_corrected}.

1. Carefully review the information provided.

2. Determine if the source mentions any COVID-19 measures:
   - If yes, set measures_mentioned to 1.
   - If no, set measures_mentioned to 0.

3. If measures_mentioned is 1, classify the source's stance towards the COVID-19 measures with two binary flags. 
   - Supportive stance: Set supportive_stance to 1 if the source iS positive about the measures, expresses general support or approval towards measures, or generally agrees with the existance of measures.
   - Critical stance: Set critical_stance to 1 the source is negative about the measures,expresses general opposition or criticism towards measures, discusses negative aspects, or disagrees with the existance of  measures.
   - Set supportive_stance and critical_stance to 0 if the source does not mention COVID-19 measures at all or does not express a supportive or critical stance.
   - Keep in mind that the source may express both supportive and critical stances, so both flags can be 1.

4. Provide the result in the following JSON format. Do not include any additional information or explanation.

### Example Output (JSON format):
{{
  "article_id": "2000000",
  "source_name": "Herman Kroneman",
  "measures_mentioned": 1,
  "supportive_stance": 1,
  "critical_stance": 0
}}
{{
  "article_id": "2000001",
  "source_name": "De Volkskrant",
  "measures_mentioned": 0,
  "supportive_stance": 0,
  "critical_stance": 0
}}
{{
  "article_id": "2000002",
  "source_name": "RIVM",
  "measures_mentioned": 1,
  "supportive_stance": 0,
  "critical_stance": 1
}}
"""

In [24]:
# Wrap the instruction in the model's chat format and print it for a visual
# check; the placeholders are still unfilled at this point.
zero_shot_prompt = zero_shot_prompt_messages(prompt_text=prompt_text)
print(zero_shot_prompt)

<s> [INST] 
You are a helpful AI assistant. Sources in a news article are people or organizations that are quoted in the article. 
You will be provided with the names of these sources and the sentences from the news article where these sources are mentioned. Your task is to determine if the source discusses any COVID-19 measures and, if so, assess the source's overall stance towards these measures.

Some examples of COVID-19 Measures:
1. General Coronavirus Measures: Broad discussions about pandemic measures and advice without specifying details.
2. Curfew, Lockdown, Quarantine Rules: References to staying home, lockdowns, quarantines, and related rules.
3. COVID-19 Testing: Discussions about testing requirements and access cards or travel passes related to testing (e.g., Testing for Access (1G)).
4. Vaccination and Procedures: Discussions about vaccines, vaccination processes, side effects, and access cards or travel passes related to vaccines (e.g., 2G or 3G passes).
5. Measures abou

In [29]:
# LangChain template over the chat-formatted prompt; the three input
# variables correspond to the placeholders inside the prompt text.
prompt_template = PromptTemplate(
    input_variables=["article_id", "source_name", "input_text_corrected"],
    template=zero_shot_prompt
)

In [30]:
# Chain that fills the template and sends the result to the local model.
# The bare name on the last line displays the chain configuration.
chain_one = LLMChain(llm = llm, prompt = prompt_template)
chain_one

LLMChain(prompt=PromptTemplate(input_variables=['article_id', 'input_text_corrected', 'source_name'], template='<s> [INST] \nYou are a helpful AI assistant. Sources in a news article are people or organizations that are quoted in the article. \nYou will be provided with the names of these sources and the sentences from the news article where these sources are mentioned. Your task is to determine if the source discusses any COVID-19 measures and, if so, assess the source\'s overall stance towards these measures.\n\nSome examples of COVID-19 Measures:\n1. General Coronavirus Measures: Broad discussions about pandemic measures and advice without specifying details.\n2. Curfew, Lockdown, Quarantine Rules: References to staying home, lockdowns, quarantines, and related rules.\n3. COVID-19 Testing: Discussions about testing requirements and access cards or travel passes related to testing (e.g., Testing for Access (1G)).\n4. Vaccination and Procedures: Discussions about vaccines, vaccination

In [31]:
# Collects one generated text per row of df_selected, in iteration order.
generated_text_zeroshot_actors = []

In [None]:
%%time
torch.manual_seed(0)
for index, row in df_selected.iterrows():  
    article_id = row['article_id']
    source_name = row['source_name']
    input_text_corrected = row['input_text_corrected']
    input_variables = {
            "article_id": article_id,
            "source_name": source_name,
            "input_text_corrected": input_text_corrected}
    # Generate text using the chain
    generated_text = chain_one.run(input_variables)
    print(generated_text)
    generated_text_zeroshot_actors.append(generated_text)  

# CPU times: total: 12min 25s
# Wall time: 12min 54s  

In [None]:
# Parse the generated texts into one row of predictions per source.
df_generated = regex_extract_to_dataframe(generated_text_zeroshot_actors)

# The predictions are joined back to df_selected by row, so the two frames
# must have the same number of rows; fail loudly here instead of silently
# evaluating misaligned data.
assert len(df_generated) == len(df_selected), (
    f"parsed {len(df_generated)} predictions for {len(df_selected)} sources"
)

In [49]:
# Distribution of the predicted measures_mentioned flag (missing values included).
df_generated.measures_mentioned.value_counts(dropna=False)

measures_mentioned
0    180
1    115
Name: count, dtype: int64

In [50]:
# Distribution of the predicted supportive_stance flag (missing values included).
df_generated.supportive_stance.value_counts(dropna=False)

supportive_stance
0    264
1     31
Name: count, dtype: int64

In [51]:
# Distribution of the predicted critical_stance flag (missing values included).
df_generated.critical_stance.value_counts(dropna=False)

critical_stance
0    287
1      8
Name: count, dtype: int64

In [None]:
# merge the generated data with the original data on index
# (rows are paired by index label; columns present in both frames receive
# pandas' default _x / _y suffixes)
# NOTE(review): df_merged is not referenced again in the cells shown below.
df_merged = df_selected.merge(df_generated, left_index=True, right_index=True)

In [None]:
# Add the model's predictions to the researcher-coded frame.
# Values are assigned by position (via .to_numpy()) rather than by index
# label, because df_generated is numbered 0..n-1 in generation order while
# df_selected may carry other labels; a length mismatch raises instead of
# silently producing shifted or missing predictions.
for predicted_column in ('measures_mentioned', 'supportive_stance', 'critical_stance'):
    df_selected[predicted_column] = df_generated[predicted_column].to_numpy()

# Source name as echoed back by the model, kept next to the original
# source_name so the two can be compared as a sanity check.
df_selected['actor_name_starling'] = df_generated['source_name'].to_numpy()

In [54]:
# Show the rows where source_name and actor_name_starling differ;
# an empty result means the two columns agree everywhere.
df_selected[df_selected['source_name'] != df_selected['actor_name_starling']]

Unnamed: 0,article_id,source_name,talks_covid_measures,input_text_corrected,talks_covid_corrected,measures_positive_corrected,measures_negative_corrected,measures_neutral_corrected,measures_mentioned,supportive_stance,critical_stance,actor_name_starling


In [57]:
# Crosstab of the researcher coding (talks_covid_measures, rows) against the
# model prediction (measures_mentioned, columns).
pd.crosstab(df_selected['talks_covid_measures'], df_selected['measures_mentioned'])

measures_mentioned,0,1
talks_covid_measures,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,111,35
1.0,69,80


In [59]:
# Researcher coding measures_positive_corrected (rows) against the predicted
# supportive_stance (columns), over all sources.
pd.crosstab(df_selected['measures_positive_corrected'], df_selected['supportive_stance'])

supportive_stance,0,1
measures_positive_corrected,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,242,13
1.0,22,18


In [60]:
# Researcher coding measures_negative_corrected (rows) against the predicted
# critical_stance (columns), over all sources.
pd.crosstab(df_selected['measures_negative_corrected'], df_selected['critical_stance'])

critical_stance,0,1
measures_negative_corrected,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,262,4
1.0,25,4


In [61]:
# Classification report for the measures_mentioned prediction; the first
# argument is the reference labels, the second the predictions.
print(classification_report(df_selected['talks_covid_measures'], df_selected['measures_mentioned']))

              precision    recall  f1-score   support

         0.0       0.62      0.76      0.68       146
         1.0       0.70      0.54      0.61       149

    accuracy                           0.65       295
   macro avg       0.66      0.65      0.64       295
weighted avg       0.66      0.65      0.64       295



In [62]:
# Restrict the stance evaluation to sources that the researchers coded as
# talking about COVID-19 measures.
talks_covid = df_selected[df_selected['talks_covid_measures'] == 1]

In [63]:
# Supportive stance: researcher coding vs. model prediction on the talks_covid subset.
print(classification_report(talks_covid['measures_positive_corrected'], talks_covid['supportive_stance']))

              precision    recall  f1-score   support

         0.0       0.84      0.89      0.87       113
         1.0       0.59      0.47      0.52        36

    accuracy                           0.79       149
   macro avg       0.71      0.68      0.70       149
weighted avg       0.78      0.79      0.78       149



In [64]:
# Critical stance: researcher coding vs. model prediction on the talks_covid subset.
print(classification_report(talks_covid['measures_negative_corrected'], talks_covid['critical_stance']))

              precision    recall  f1-score   support

         0.0       0.83      0.99      0.91       121
         1.0       0.80      0.14      0.24        28

    accuracy                           0.83       149
   macro avg       0.82      0.57      0.57       149
weighted avg       0.83      0.83      0.78       149



In [None]:
# Save the researcher codings together with the model predictions as a
# semicolon-separated UTF-8 CSV, without writing the index column.
df_selected.to_csv('path_to_save_test_df_with_predictions/stance_Mistral.csv',
          sep = ';', encoding = 'utf-8', index = False)