In [None]:
# packages

import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import os
import csv
import re
import pandas as pd
import transformers
import torch

from torch import cuda, bfloat16
from transformers import AutoTokenizer

from langchain import PromptTemplate
from langchain.chains import LLMChain
from langchain.llms import HuggingFacePipeline

from sklearn.metrics import classification_report, cohen_kappa_score


In [None]:
import gc

# Run Python garbage collection first, then ask PyTorch to release cached GPU memory.
gc.collect()
cuda.empty_cache()

In [None]:
# Switch to the storage volume. The relative paths used in this notebook
# (model cache directory, NOS data files) are resolved against this directory.
os.chdir("/data/500gbstorage/")

# Directory (relative to the working directory set above) used as the
# Hugging Face download cache; it is also passed explicitly as `cache_dir`
# when the model and tokenizer are loaded.
huggingface_cache_dir = 'model_mistral'

# NOTE(review): `transformers` is imported before this assignment, so the
# library has most likely already read its cache setting — confirm. The
# explicit `cache_dir=` arguments are what reliably select the cache location.
os.environ['TRANSFORMERS_CACHE'] = huggingface_cache_dir

# Load the model from Hugging Face

In [None]:
torch.manual_seed(0)

model_id = "mistralai/Mistral-7B-Instruct-v0.2"

# `device` is only reported here; actual weight placement is delegated to
# `device_map='auto'` below.
device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'
print(device)

# Read the Hugging Face access token from the environment instead of storing
# it in the notebook, where it would leak through version control or sharing.
# When HF_TOKEN is unset this is None and no explicit token is passed.
hf_token = os.environ.get('HF_TOKEN')

model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    torch_dtype=bfloat16,
    device_map='auto',
    token=hf_token,
    cache_dir=huggingface_cache_dir)
# Switch to inference mode; as the last expression this also displays the model.
model.eval()

cuda:0


Loading checkpoint shards: 100%|██████████| 3/3 [00:02<00:00,  1.09it/s]


MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralSdpaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): MistralRotaryEmbedding()
        )
        (mlp): MistralMLP(
          (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): MistralRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): MistralRMSNorm((4096,), eps=1e-05)
     

In [None]:
torch.manual_seed(0)

# Tokenizer matching `model_id`, fetched with the same token and cache directory as the model.
tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    cache_dir=huggingface_cache_dir,
    token=hf_token,
)

print(f"Model loaded on {device}")

Model loaded on cuda:0


In [None]:
# Report which GPU is in use and how much memory PyTorch has allocated on it.
device = torch.device('cuda')
bytes_per_gib = 1024 ** 3

allocated_gib = torch.cuda.memory_allocated(device) / bytes_per_gib
peak_allocated_gib = torch.cuda.max_memory_allocated(device) / bytes_per_gib

print("GPU Name:", torch.cuda.get_device_name(device))
print("Memory Usage:", allocated_gib, "GB")
print("Max Memory Usage:", peak_allocated_gib, "GB")

GPU Name: NVIDIA A100 80GB PCIe
Memory Usage: 13.488792419433594 GB
Max Memory Usage: 13.488792419433594 GB


In [None]:
torch.manual_seed(0)

# Text-generation pipeline used for every classification prompt.
generate_text = transformers.pipeline(
    task='text-generation',
    model=model,
    tokenizer=tokenizer,
    return_full_text=True,
    pad_token_id=tokenizer.eos_token_id,
    # Deterministic (greedy) decoding. This replaces `temperature=0.0`, which
    # is not a valid sampling temperature and is only consulted when sampling
    # is enabled.
    do_sample=False,
    max_new_tokens=512,  # max number of tokens to generate in the output
    repetition_penalty=1.1,  # how much to avoid repeating the same word
)

# Expose the pipeline to LangChain so it can be used inside an LLMChain.
llm = HuggingFacePipeline(pipeline=generate_text)

# Helper functions

In [None]:
def regex_extract_to_dataframe(strings):
    """Parse raw model responses into a table of article ids and subtopic labels.

    Each response is searched for the first ``"article_id": <digits>`` and the
    first ``"subtopic_discussed": <single digit>`` entry; the values may be
    wrapped in double quotes. Escaped underscores (``\\_``) are converted back
    to plain underscores before matching.

    Parameters
    ----------
    strings : iterable of str
        Raw generated texts, one per article.

    Returns
    -------
    pandas.DataFrame
        One row per input string with the columns ``article_id`` and
        ``subtopic_discussed``. A field that cannot be found is ``None``.
    """
    id_regex = re.compile(r'"article_id"\s*:\s*"?(\d+)"?')
    label_regex = re.compile(r'"subtopic_discussed"\s*:\s*"?(\d)"?')

    def _first_int(regex, text):
        # First captured number as int, or None when the field is absent.
        found = regex.search(text)
        return int(found.group(1)) if found else None

    # Undo the backslash-escaping of underscores so the key names match.
    cleaned = [raw.replace('\\_', '_') for raw in strings]

    return pd.DataFrame({
        "article_id": [_first_int(id_regex, text) for text in cleaned],
        "subtopic_discussed": [_first_int(label_regex, text) for text in cleaned],
    })


In [None]:
def zero_shot_prompt_messages(prompt_text):
    """Render ``prompt_text`` as a single user turn using the pipeline tokenizer's chat template.

    The template is applied without tokenizing and with the generation prompt
    appended, so the return value is the formatted prompt produced by
    ``apply_chat_template``.
    """
    conversation = [dict(role="user", content=prompt_text)]
    return generate_text.tokenizer.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=True,
    )

# Data Prep

In [None]:
# Annotation columns: the COVID relevance flag plus the subtopics topic_a ... topic_n
topic_vars = ['about_covid', *(f'topic_{letter}' for letter in 'abcdefghijklmn')]

# NOS articles annotated by the researcher: cast the id and every annotation
# column to integer, then keep only the articles coded as being about COVID.
df = (
    pd.read_csv(
        'NOS/nos_analysis/topic_tests_reliability_data/reliability_topics_elif.csv',
        sep=';',
        encoding='utf-8',
        quoting=csv.QUOTE_NONNUMERIC,
    )
    .astype({column: int for column in ['article_id', *topic_vars]})
    .loc[lambda frame: frame['about_covid'] == 1]
)

In [None]:
# Full article table (text, category, keywords)
articles_df = pd.read_csv(
    'NOS/nos_analysis/data/final_nosarticles.csv',
    sep=';',
    encoding='utf-8',
    quoting=csv.QUOTE_NONNUMERIC,
)
print(articles_df.shape)

# Keep the id and content columns, de-duplicate, and expose the integer
# page id under the name `article_id` so it lines up with the annotations.
articles_df = (
    articles_df[['page_id', 'Text', 'Category', 'Keywords']]
    .drop_duplicates()
    .astype({'page_id': int})
    .rename(columns={'page_id': 'article_id'})
)

In [None]:
# Attach text, category and keywords to every annotation row (left join on article_id).
df = df.merge(articles_df, how='left', on='article_id')

In [None]:
# One row per distinct annotated article; these are the inputs sent to the model.
article_columns = ['article_id', 'Keywords', 'Category', 'Text']
unique_articles = df.loc[:, article_columns].drop_duplicates()
print(unique_articles.shape)

# Prompt Topic M

In [22]:
# Instruction text for the zero-shot classification of the "political
# discussions" subtopic (labelled "Topic M" in this notebook).
# - {article_id}, {text}, {category} and {keywords} are placeholders that are
#   filled in per article.
# - The doubled braces around the JSON skeleton are escapes, so that the
#   template formatter emits literal "{" and "}".
# NOTE(review): the example "article_id" inside the JSON skeleton is the fixed
# literal "2000000", not the {article_id} placeholder — confirm this is intended.
prompt_text = """
As a helpful AI assistant, your task is to determine whether the news article after <<<>>> about the COVID-19 pandemic substantially discusses the subtopic "Political discussions about the coronavirus/COVID-19 pandemic and the effects of the pandemic on politics".
Substantial discussion of a subtopic means that the article discusses one or more aspects of the subtopic in at least two sentences. 

Discussion of "Political discussions about the coronavirus/COVID-19 pandemic and the effects of the pandemic on politics" may focus on one or more of the following aspects:

- Changes in political priorities and agendas in response to the coronavirus pandemic.
- Parliamentary discussions about the coronavirus pandemic and coronavirus pandemic-related policies.
- Allocation of government resources and investments in healthcare due to the coronavirus pandemic.
- Re-evaluation of policy areas such as public health and crisis management.
- Discussions about the effectiveness of political leaders' responses to the coronavirus pandemic.
- Communication with the public during the crisis.
- Accountability of governments and policymakers during the crisis.

Read the following news article with the ID {article_id}: <<< {text} >>>

This article falls under the categories: {category} and contains the keywords: {keywords}. 

Take a moment to understand the article. 
Remember, for a subtopic to be substantially discussed, the article must discuss one or more aspects of the subtopic in at least two sentences.

Carefully analyze if the article contains substantial discussion of "Political discussions about the coronavirus/COVID-19 pandemic and the effects of the pandemic on politics" based on the definition above.
 
Assign a value of 1 if the article substantially discusses the subtopic, and a value of 0 if the article does not substantially discuss the subtopic.

Your response should be in the following JSON format, without any additional explanations or notes:

{{
    "article_id": "2000000",
  "subtopic_discussed": 0/1
}}
"""

# Zero-Shot - Topic M

In [27]:
# Wrap the instruction text in the model's chat format and show the result for inspection.
zero_shot_prompt = zero_shot_prompt_messages(prompt_text=prompt_text)
print(zero_shot_prompt)

<s> [INST] 
As a helpful AI assistant, your task is to determine whether the news article after <<<>>> about the COVID-19 pandemic substantially discusses the subtopic "Political discussions about the coronavirus/COVID-19 pandemic and the effects of the pandemic on politics".
Substantial discussion of a subtopic means that the article discusses one or more aspects of the subtopic in at least two sentences. 

Discussion of "Political discussions about the coronavirus/COVID-19 pandemic and the effects of the pandemic on politics" may focus on one or more of the following aspects:

- Changes in political priorities and agendas in response to the coronavirus pandemic.
- Parliamentary discussions about the coronavirus pandemic and coronavirus pandemic-related policies.
- Allocation of government resources and investments in healthcare due to the coronavirus pandemic.
- Re-evaluation of policy areas such as public health and crisis management.
- Discussions about the effectiveness of politic

In [28]:
# Turn the chat-formatted prompt into a template with one slot per article field.
template_fields = ["article_id", "text", "category", "keywords"]
prompt_template = PromptTemplate(
    template=zero_shot_prompt,
    input_variables=template_fields,
)

In [29]:
# Chain that fills the prompt template and sends it to the local LLM.
chain_one = LLMChain(
    prompt=prompt_template,
    llm=llm,
    output_key="article_id, subtopic_discussed",
)
chain_one

LLMChain(prompt=PromptTemplate(input_variables=['article_id', 'category', 'keywords', 'text'], template='<s> [INST] \nAs a helpful AI assistant, your task is to determine whether the news article after <<<>>> about the COVID-19 pandemic substantially discusses the subtopic "Political discussions about the coronavirus/COVID-19 pandemic and the effects of the pandemic on politics".\nSubstantial discussion of a subtopic means that the article discusses one or more aspects of the subtopic in at least two sentences. \n\nDiscussion of "Political discussions about the coronavirus/COVID-19 pandemic and the effects of the pandemic on politics" may focus on one or more of the following aspects:\n\n- Changes in political priorities and agendas in response to the coronavirus pandemic.\n- Parliamentary discussions about the coronavirus pandemic and coronavirus pandemic-related policies.\n- Allocation of government resources and investments in healthcare due to the coronavirus pandemic.\n- Re-evalua

In [None]:
# Accumulator for the raw model responses of the zero-shot run for topic M;
# the generation loop appends one response per processed article.
generated_text_zeroshot_m = []

In [None]:
%%time
torch.manual_seed(0)
for index, row in unique_articles.iterrows():  
    article_id = row['article_id']
    text = row['Text']
    category = row['Category']
    keywords = row['Keywords']


    input_variables = {
            "article_id": article_id,
            "text": text,
            "category": category,
            "keywords": keywords
        }
    # Generate text using the chain
    generated_text = chain_one.run(input_variables)
    print(generated_text)
    generated_text_zeroshot_m.append(generated_text)     

# CPU times: user 2min 43s, sys: 4.63 s, total: 2min 47s
# Wall time: 2min 47s
# output cleaned

In [None]:
# Parse the raw responses into (article_id, subtopic_discussed) rows ...
json_list = regex_extract_to_dataframe(generated_text_zeroshot_m)
# ... and discard responses where either field could not be extracted.
df_topic_m = json_list.dropna()

In [None]:
# Human reference labels: topic_m as coded by a single annotator.
df_elif = df[df['coder'] == 'Elif Kilik']
df_coded_elif = df_elif[['article_id', 'topic_m']]

# regex_extract_to_dataframe names the prediction column 'subtopic_discussed'.
# Rename it to 'topic_m_pred' before merging; without this the column lookup
# below raised a KeyError because 'topic_m_pred' never existed.
df_topic_m_pred = df_topic_m.rename(columns={'subtopic_discussed': 'topic_m_pred'})

# Left-join predictions onto the human codes, drop articles without a usable
# prediction, and store the prediction as an integer label.
df_zeroshot_merged_elif = (
    pd.merge(df_coded_elif, df_topic_m_pred, how='left', on='article_id')
    .dropna()
    .astype({'topic_m_pred': int})
)

In [42]:
# Agreement between the human codes (reference) and the model predictions.
y_true = df_zeroshot_merged_elif['topic_m']
y_pred = df_zeroshot_merged_elif['topic_m_pred']

print(classification_report(y_true, y_pred))
print("Cohen's Kappa:", cohen_kappa_score(y_true, y_pred))

              precision    recall  f1-score   support

           0       0.95      0.83      0.89        72
           1       0.20      0.50      0.29         6

    accuracy                           0.81        78
   macro avg       0.58      0.67      0.59        78
weighted avg       0.89      0.81      0.84        78

Cohen's Kappa: 0.19753086419753096
