In [None]:
# packages

import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import os
import csv
import re
import pandas as pd
import transformers
import torch

from torch import cuda, bfloat16
from transformers import AutoTokenizer

from langchain import PromptTemplate
from langchain.chains import LLMChain
from langchain.llms import HuggingFacePipeline

from sklearn.metrics import classification_report, cohen_kappa_score


In [2]:
import gc
# Run a full garbage-collection pass first, then ask PyTorch to release its
# cached CUDA memory, so the GPU is as empty as possible before the model load.
gc.collect()
torch.cuda.empty_cache()

In [4]:
# NOTE(review): the return value is discarded and, not being the last expression
# of the cell, is never displayed -- this call has no effect.
os.getcwd()

# Switch the working directory to the storage volume (absolute path). The
# relative paths used later in the notebook (model cache, CSV files) are
# resolved against this directory.
os.chdir("/data/500gbstorage/")

# Model cache directory, relative to the working directory set above
huggingface_cache_dir = 'model_mistral'

# Point the Hugging Face cache environment variable at the same directory.
# NOTE(review): transformers has already been imported at this point, so the
# library may not pick this value up; the loaders further down pass cache_dir
# explicitly -- confirm whether this assignment is still needed.
os.environ['TRANSFORMERS_CACHE'] = huggingface_cache_dir

# Load the Model from Hugging Face

In [None]:
# Seed torch's global RNG so repeated runs start from the same state.
torch.manual_seed(0)

model_id = "mistralai/Mistral-7B-Instruct-v0.2"
# Device label used for logging only; the actual weight placement is decided by
# device_map='auto' below.
device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'
print(device)
# Hugging Face access token. It is read from the HF_TOKEN environment variable
# rather than being hardcoded, so the secret never ends up in the saved
# notebook. When the variable is unset this is None and the Hugging Face
# libraries fall back to locally stored login credentials, if any.
hf_token = os.environ.get('HF_TOKEN')

# NOTE(review): trust_remote_code=True allows execution of Python code shipped
# with the model repository. Confirm it is actually required for this model.
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    torch_dtype=bfloat16,
    device_map='auto',
    token=hf_token,
    cache_dir=huggingface_cache_dir)
# Inference only: switch off dropout and other training-time behaviour.
model.eval()

torch.manual_seed(0)
tokenizer = transformers.AutoTokenizer.from_pretrained(model_id, token=hf_token, cache_dir=huggingface_cache_dir)

print(f"Model loaded on {device}")

cuda:0


Loading checkpoint shards: 100%|██████████| 3/3 [00:02<00:00,  1.20it/s]


Model loaded on cuda:0


In [12]:
# Report which GPU is in use and how much memory torch has allocated on it,
# both currently and at its peak, converted from bytes to GiB.
device = torch.device('cuda')
print("GPU Name:", torch.cuda.get_device_name(device))
for label, read_bytes in (
    ("Memory Usage:", torch.cuda.memory_allocated),
    ("Max Memory Usage:", torch.cuda.max_memory_allocated),
):
    print(label, read_bytes(device) / 1024 ** 3, "GB")

GPU Name: NVIDIA A100 80GB PCIe
Memory Usage: 13.488792419433594 GB
Max Memory Usage: 13.488792419433594 GB


In [13]:
torch.manual_seed(0)
# Text-generation pipeline around the already loaded model and tokenizer.
generate_text = transformers.pipeline(
    model=model, tokenizer=tokenizer,
    task='text-generation',
    # Use EOS as the padding token id.
    pad_token_id=tokenizer.eos_token_id,
    # Deterministic greedy decoding. A temperature of 0 is not a valid sampling
    # temperature, so sampling is switched off explicitly instead of passing
    # temperature=0.
    do_sample=False,
    max_new_tokens=512,  # max number of tokens to generate in the output
    repetition_penalty=1.1
)

# LangChain wrapper so the pipeline can be used as the LLM of a chain.
llm = HuggingFacePipeline(pipeline=generate_text)

# Helper functions

In [14]:
def regex_extract_to_dataframe(strings):
    """Parse raw model responses into a two-column DataFrame.

    Every response is first stripped of all backslashes (so an escaped key such
    as ``article\\_id`` becomes ``article_id``). It is then searched for the
    first ``"article_id": <digits>`` pair and the first
    ``"subtopic_discussed": <single digit>`` pair; the values may or may not
    be wrapped in double quotes.

    Args:
        strings: Iterable of response strings.

    Returns:
        pandas.DataFrame with one row per input string, in input order, and the
        columns ``article_id`` and ``topic_discussed``. A value is the matched
        integer, or ``None`` when the response contains no match.
    """
    id_regex = r'"article_id"\s*:\s*"?(\d+)"?'
    verdict_regex = r'"subtopic_discussed"\s*:\s*"?(\d)"?'

    def first_int(regex, text):
        # First captured number as int, or None when the pattern is absent.
        found = re.search(regex, text)
        return int(found.group(1)) if found else None

    # Remove every backslash before matching.
    cleaned = [raw.replace('\\', '') for raw in strings]

    return pd.DataFrame({
        "article_id": [first_int(id_regex, text) for text in cleaned],
        "topic_discussed": [first_int(verdict_regex, text) for text in cleaned],
    })


In [15]:
def zero_shot_prompt_messages(prompt_text):
    """Render ``prompt_text`` as a single-turn chat prompt.

    The text is wrapped as one ``user`` message and passed through the chat
    template of the tokenizer attached to the notebook-level ``generate_text``
    pipeline.

    Args:
        prompt_text: Instruction text for the user turn.

    Returns:
        The templated prompt as returned by ``apply_chat_template`` with
        ``tokenize=False``.
    """
    single_turn = [dict(role="user", content=prompt_text)]
    chat_tokenizer = generate_text.tokenizer
    return chat_tokenizer.apply_chat_template(
        single_turn,
        tokenize=False,
        add_generation_prompt=True,
    )

# Data Prep

In [None]:
# Researcher-annotated NOS articles (semicolon-separated CSV)
df = pd.read_csv(
    'NOS/nos_analysis/topic_tests_reliability_data/reliability_topics_elif.csv',
    sep=';',
    encoding='utf-8',
    quoting=csv.QUOTE_NONNUMERIC,
)

# Annotation columns: the COVID relevance flag followed by subtopics a..n
topic_vars = ['about_covid'] + [f'topic_{letter}' for letter in 'abcdefghijklmn']

# Cast the article ID and every annotation column to integer in one step
df = df.astype({column: int for column in ['article_id', *topic_vars]})

# Keep only the rows annotated as being about COVID
df = df.loc[df['about_covid'] == 1]

In [None]:
# Full article table (text, category, keywords, ...)
articles_df = pd.read_csv(
    'NOS/nos_analysis/data/final_nosarticles.csv',
    sep=';',
    encoding='utf-8',
    quoting=csv.QUOTE_NONNUMERIC,
)
print(articles_df.shape)

# Keep the ID plus text/category/keywords, drop exact duplicate rows, make the
# ID an integer and rename it to article_id so it matches the annotation frame.
articles_df = (
    articles_df[['page_id', 'Text', 'Category', 'Keywords']]
    .drop_duplicates()
    .astype({'page_id': int})
    .rename(columns={'page_id': 'article_id'})
)

In [None]:
# Attach text, category and keywords to every annotation row (left join on the
# article ID, so annotation rows without a matching article are kept).
df = df.merge(articles_df, how='left', on='article_id')

In [None]:
# One row per distinct (ID, keywords, category, text) combination among the
# annotated articles; these rows are the model's inputs.
article_columns = ['article_id', 'Keywords', 'Category', 'Text']
unique_articles = df[article_columns].drop_duplicates()
print(unique_articles.shape)

(82, 4)


# Prompt Topic A

In [26]:
# Instruction prompt for subtopic A. It is a str.format-style template with the
# placeholders {article_id}, {text}, {category} and {keywords}; the doubled
# braces around the JSON example come out as literal single braces once the
# template is formatted.
# The backslash in "coronavirus\COVID-19" is written as an escaped backslash so
# that it is no longer an invalid escape sequence; the resulting prompt text is
# unchanged. NOTE(review): the backslash itself looks like a typo for "/".
prompt_text = """
You are a helpful AI assistant trained to analyze news articles. Your task is to determine whether the given news article after <<<>>> substantially discusses the subtopic "Current status on the spread of the COVID-19 pandemic and pandemic statistics". 
Substantial discussion means the article discusses one or more aspects of the subtopic in at least two sentences. 

The discussion on the subtopic "Current status on the spread of the COVID-19 pandemic and pandemic statistics" may focus on one or more of the following aspects:

- The current status and spread of the Coronavirus/COVID-19 pandemic.
- Any discussion on the number/s or percentage/s of:
    - confirmed coronavirus/COVID-19 cases, people who got infected with coronavirus, 
    - people who got tested for coronavirus, positive/negative coronavirus test results, 
    - COVID-19/corona patients, hospitalized patients, recovered patients, percentage of intensive care beds used, IC capacity, 
    - people who died because of coronavirus\\COVID-19, death rates, etc.
    - Regional, national, or international statistical data regarding coronavirus or COVID-19 cases. 
- Discussions around the decreasing/increasing corona numbers or percentages.
- Discussions about the reproduction number (Reproductie/R getal).
- Any other relevant national or international statistics/data about the Coronavirus/COVID-19 pandemic.

Read the following news article with the ID {article_id}: <<< {text} >>> \n
This article falls under the categories: {category} and contains the keywords: {keywords}. 

Take a moment to understand the article. 

Remember, for a subtopic to be substantially discussed, the article must discuss one or more aspects of the subtopic in at least two sentences.

Carefully analyze if the article contains substantial discussion of "Current status on the spread of the COVID-19 pandemic and pandemic statistics" based on the definition above. 

If yes, respond with "1". If no, respond with "0". 

Your response should be in the following JSON format, without any additional explanations or notes:

{{
  "article_id": "2000000",
  "subtopic_discussed": 0/1
}}
"""

# Zero-Shot - Topic A

In [30]:
# Wrap the topic-A instruction text in the model's chat template and print the
# result so the final prompt can be inspected.
zero_shot_prompt_a = zero_shot_prompt_messages(prompt_text)
print(zero_shot_prompt_a)

<s> [INST] 
You are a helpful AI assistant trained to analyze news articles. Your task is to determine whether the given news article after <<<>>> substantially discusses the subtopic "Current status on the spread of the COVID-19 pandemic and pandemic statistics". 
Substantial discussion means the article discusses one or more aspects of the subtopic in at least two sentences. 

The discussion on the subtopic "Current status on the spread of the COVID-19 pandemic and pandemic statistics" may focus on one or more of the following aspects:

- The current status and spread of the Coronavirus/COVID-19 pandemic.
- Any discussion on the number/s or percentage/s of:
    - confirmed coronavirus/COVID-19 cases, people who got infected with coronavirus, 
    - people who got tested for coronavirus, positive/negative coronavirus test results, 
    - COVID-19/corona patients, hospitalized patients, recovered patients, percentage of intensive care beds used, IC capacity, 
    - people who died beca

In [31]:
# LangChain template over the chat-formatted prompt; these four placeholders
# are filled in per article.
prompt_template = PromptTemplate(
    template=zero_shot_prompt_a,
    input_variables=["article_id", "text", "category", "keywords"],
)

In [32]:
# Chain for topic A: fills the prompt template and sends it to the LLM.
chain_a = LLMChain(
    prompt=prompt_template,
    llm=llm,
    output_key="article_id, subtopic_discussed",
)
chain_a

LLMChain(prompt=PromptTemplate(input_variables=['article_id', 'category', 'keywords', 'text'], template='<s> [INST] \nYou are a helpful AI assistant trained to analyze news articles. Your task is to determine whether the given news article after <<<>>> substantially discusses the subtopic "Current status on the spread of the COVID-19 pandemic and pandemic statistics". \nSubstantial discussion means the article discusses one or more aspects of the subtopic in at least two sentences. \n\nThe discussion on the subtopic "Current status on the spread of the COVID-19 pandemic and pandemic statistics" may focus on one or more of the following aspects:\n\n- The current status and spread of the Coronavirus/COVID-19 pandemic.\n- Any discussion on the number/s or percentage/s of:\n    - confirmed coronavirus/COVID-19 cases, people who got infected with coronavirus, \n    - people who got tested for coronavirus, positive/negative coronavirus test results, \n    - COVID-19/corona patients, hospital

In [33]:
# Collects the raw model response for each article (topic A, zero-shot).
# NOTE(review): the generation loop only appends to this list, so re-running
# the loop without re-running this cell first accumulates duplicate responses.
generated_text_zeroshot_a = []

In [None]:
%%time
torch.manual_seed(0)

# Prompt placeholder -> column of unique_articles that supplies its value
prompt_columns = {
    "article_id": "article_id",
    "text": "Text",
    "category": "Category",
    "keywords": "Keywords",
}

# Run the chain once per article, echo the raw response and keep it for parsing.
for _, article in unique_articles.iterrows():
    chain_inputs = {
        placeholder: article[column]
        for placeholder, column in prompt_columns.items()
    }
    response = chain_a.run(chain_inputs)
    print(response)
    generated_text_zeroshot_a.append(response)

# CPU times: user 2min 2s, sys: 2.35 s, total: 2min 4s
# Wall time: 2min 4s
# output cleaned

In [None]:
# Parse the raw responses into article_id / topic_discussed columns.
# NOTE(review): despite its name, json_list is already a DataFrame (the return
# value of regex_extract_to_dataframe); the second line wraps it in a new one.
json_list = regex_extract_to_dataframe(generated_text_zeroshot_a)
df_topic_a = pd.DataFrame(json_list)

In [None]:
# Rename the prediction column for topic A. Responses the regex could not parse
# have missing values, and casting those to int raises, so such rows are
# dropped before the integer casts instead of crashing the cell.
df_topic_a = (
    df_topic_a
    .rename(columns={'topic_discussed': 'topic_a_pred'})
    .dropna(subset=['article_id', 'topic_a_pred'])
    .astype({'article_id': int, 'topic_a_pred': int})
)
# NOTE(review): this column is just a copy of topic_a_pred and its name looks
# like a leftover from an about_covid analysis; kept so existing consumers of
# the frame still find it -- confirm whether it can be removed.
df_topic_a['about_covid_pred'] = df_topic_a['topic_a_pred']

In [None]:
# Annotations made by this coder only
df_elif = df.loc[df['coder'] == 'Elif Kilik']
# Human label for topic A, keyed by article
df_coded_elif = df_elif[['article_id', 'topic_a']]
# Join the model predictions onto the human labels, discard rows with any
# missing value, and make sure the prediction column is integer typed.
df_zeroshot_merged_elif = (
    df_coded_elif
    .merge(df_topic_a, how='left', on="article_id")
    .dropna()
    .astype({'topic_a_pred': int})
)


In [45]:
# Agreement between the human annotation and the model prediction for topic A
y_true = df_zeroshot_merged_elif['topic_a']
y_pred = df_zeroshot_merged_elif['topic_a_pred']
print(classification_report(y_true, y_pred))
print("Cohen's Kappa:", cohen_kappa_score(y_true, y_pred))

              precision    recall  f1-score   support

           0       0.94      0.88      0.91        57
           1       0.72      0.86      0.78        21

    accuracy                           0.87        78
   macro avg       0.83      0.87      0.85        78
weighted avg       0.88      0.87      0.88        78

Cohen's Kappa: 0.6926713947990544
