# Load Packages & Set Working Directory

In [None]:
# packages

import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import os
import csv
import re
import pandas as pd
import transformers
import torch

from torch import cuda, bfloat16
from transformers import AutoTokenizer

from langchain import PromptTemplate
from langchain.chains import LLMChain
from langchain.llms import HuggingFacePipeline

from sklearn.metrics import classification_report, cohen_kappa_score


In [2]:
# Housekeeping before loading the model: run a full garbage-collection pass over Python
# objects, then ask PyTorch to release the CUDA memory it is holding in its cache.
import gc
gc.collect()
torch.cuda.empty_cache()

In [4]:
# Work from the data volume so that the relative paths used in this notebook (model cache,
# NOS CSV files) resolve. The location can be overridden with the NOS_DATA_ROOT environment
# variable; the default is the original server path.
data_root = os.environ.get("NOS_DATA_ROOT", "/data/500gbstorage/")
os.chdir(data_root)

# Directory (relative to the working directory set above) holding the downloaded model files.
huggingface_cache_dir = 'model_mistral'

# Export the cache location for library code that consults the environment.
# NOTE(review): transformers appears to read TRANSFORMERS_CACHE when it is imported, and the
# import happens before this cell runs, so the explicit cache_dir= arguments passed to
# from_pretrained are what actually take effect -- confirm before relying on this variable.
os.environ['TRANSFORMERS_CACHE'] = huggingface_cache_dir

# Load the Model from Huggingface

In [None]:
torch.manual_seed(0)

model_id = "mistralai/Mistral-7B-Instruct-v0.2"
device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'

# Read the Hugging Face access token from the environment rather than embedding a secret in
# the notebook (notebooks get shared and committed together with their source). When HF_TOKEN
# is unset this is None and token=None is passed to from_pretrained.
# NOTE(review): with token=None the hub client is expected to fall back to a locally stored
# login (`huggingface-cli login`) -- confirm for the installed version.
hf_token = os.environ.get("HF_TOKEN")

print(device)

# Load the weights in bfloat16; device_map='auto' delegates device placement to the library.
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    torch_dtype=bfloat16,
    device_map='auto',
    token=hf_token,
    cache_dir=huggingface_cache_dir)
# Inference only: switch off training-time behaviour such as dropout.
model.eval()

torch.manual_seed(0)
tokenizer = transformers.AutoTokenizer.from_pretrained(model_id, token=hf_token, cache_dir=huggingface_cache_dir)

# `device` is the label computed above; actual placement was decided by device_map='auto'.
print(f"Model loaded on {device}")

cuda:0


Downloading shards: 100%|██████████| 3/3 [00:00<00:00,  9.12it/s]
Loading checkpoint shards: 100%|██████████| 3/3 [10:27<00:00, 209.01s/it]


Model loaded on cuda:0


In [11]:
# Report which GPU is in use and how much memory PyTorch has allocated on it (current and peak).
device = torch.device('cuda')
bytes_per_gb = 1024 ** 3

print("GPU Name:", torch.cuda.get_device_name(device))

allocated_gb = torch.cuda.memory_allocated(device) / bytes_per_gb
print("Memory Usage:", allocated_gb, "GB")

peak_allocated_gb = torch.cuda.max_memory_allocated(device) / bytes_per_gb
print("Max Memory Usage:", peak_allocated_gb, "GB")

GPU Name: NVIDIA A100 80GB PCIe
Memory Usage: 13.488792419433594 GB
Max Memory Usage: 13.488792419433594 GB


In [12]:
torch.manual_seed(0)

# Text-generation pipeline used for every annotation call.
generate_text = transformers.pipeline(
    task='text-generation',
    model=model,
    tokenizer=tokenizer,
    pad_token_id=tokenizer.eos_token_id,  # pad with the end-of-sequence token id
    # Greedy decoding: always pick the most likely next token, so the output is deterministic.
    # This replaces `temperature=0`; temperature only applies when sampling is enabled and a
    # value of 0 is not a valid sampling temperature.
    do_sample=False,
    max_new_tokens=512,  # max number of tokens to generate in the output
    repetition_penalty=1.1,
)

# Expose the pipeline through LangChain's LLM interface so it can be used in an LLMChain.
llm = HuggingFacePipeline(pipeline=generate_text)

# Helper functions

In [13]:
def regex_extract_to_dataframe(strings):
    """Pull ``article_id`` and ``about_covid`` out of raw model responses.

    Each response is searched (after removing every backslash) for the first
    ``"article_id": <digits>`` and ``"about_covid": <digit>`` pair; the value may
    or may not be wrapped in double quotes.

    Parameters
    ----------
    strings : iterable of str
        Raw generated texts, one per article.

    Returns
    -------
    pandas.DataFrame
        One row per input string, in input order, with the columns ``article_id``
        and ``about_covid``. A field that was not found is stored as ``None``.
    """
    id_regex = re.compile(r'"article_id"\s*:\s*"?(\d+)"?')
    covid_regex = re.compile(r'"about_covid"\s*:\s*"?(\d)"?')

    def _first_int(regex, raw):
        # First capture group of the first match as an int, or None when nothing matches.
        found = regex.search(raw)
        return int(found.group(1)) if found else None

    # Escaped quotes (\") would stop the patterns from matching, so strip all backslashes first.
    cleaned = [raw.replace("\\", "") for raw in strings]

    return pd.DataFrame({
        "article_id": [_first_int(id_regex, raw) for raw in cleaned],
        "about_covid": [_first_int(covid_regex, raw) for raw in cleaned],
    })


In [14]:
def zero_shot_prompt_messages(prompt_text):
    """Render ``prompt_text`` as a single-turn chat prompt.

    The text is wrapped in one ``user`` message and passed through the chat template of the
    tokenizer attached to the notebook-level ``generate_text`` pipeline, which must already
    exist when this function is called.

    Returns the formatted prompt as a string (``tokenize=False``), not as token ids.

    NOTE(review): the rendered string may already contain the tokenizer's special tokens; if
    the pipeline adds them again when it tokenizes the prompt they would be duplicated --
    confirm against the installed transformers version.
    """
    conversation = [{"role": "user", "content": prompt_text}]
    return generate_text.tokenizer.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=True,
    )

# Data Prep

In [None]:
# Get the NOS articles annotated by the researcher
# Load the NOS articles that were annotated by the researcher.
df = pd.read_csv(
    'NOS/nos_analysis/topic_tests_reliability_data/reliability_topics_elif.csv',
    sep=';',
    encoding='utf-8',
    quoting=csv.QUOTE_NONNUMERIC,
)

# Columns holding the annotated topic indicators.
topic_vars = [
    'about_covid',
    'topic_a', 'topic_b', 'topic_c', 'topic_d', 'topic_e', 'topic_f', 'topic_g',
    'topic_h', 'topic_i', 'topic_j', 'topic_k', 'topic_l', 'topic_m', 'topic_n',
]

# Cast the article id and every topic indicator to int in a single pass.
df = df.astype({column: int for column in ['article_id', *topic_vars]})

In [None]:
# articles df including text, category, keywords 
# Full article table: text, category and keywords for every NOS article.
articles_df = pd.read_csv(
    'NOS/nos_analysis/data/final_nosarticles.csv',
    sep=';',
    encoding='utf-8',
    quoting=csv.QUOTE_NONNUMERIC,
)
print(articles_df.shape)

# Keep only the columns needed for prompting, drop exact duplicate rows, make the id an
# integer and rename it so it matches the key used in the annotation table.
articles_df = (
    articles_df[['page_id', 'Text', 'Category', 'Keywords']]
    .drop_duplicates()
    .astype({'page_id': int})
    .rename(columns={'page_id': 'article_id'})
)

In [None]:
# merge articles_df with df
# Attach text, category and keywords to each annotation row; the left join keeps every
# annotation row even when no matching article is found.
df = df.merge(articles_df, how='left', on='article_id')

In [None]:
# unique articles from researcher annotations
# One row per distinct (id, keywords, category, text) combination among the annotated
# articles; these are the articles sent to the model.
unique_articles = df.loc[:, ['article_id', 'Keywords', 'Category', 'Text']].drop_duplicates()
print(unique_articles.shape)

(120, 4)


# Prompt building

In [24]:
# Zero-shot instruction for the binary "about COVID" classification.
# {article_id}, {text}, {category} and {keywords} are placeholders to be filled in per article.
# The doubled braces around the JSON example are presumably escapes that yield literal braces
# once the template is formatted -- verify against the template format in use.
prompt_text = """
As a helpful AI assistant, your task is to determine the main topic of news articles. Articles may focus on either the "Coronavirus and/or the COVID-19 pandemic" or some other topic.
A main topic is the overarching theme discussed in the majority of the news article. For an article to have the main topic of the "Coronavirus and/or the COVID-19 pandemic", it should predominantly discuss these subjects.

Read the following article with the ID {article_id}: <<< {text} >>>

This article falls under the categories: {category} and contains the keywords: {keywords}. 

Take a moment to understand the article. 
Remember, for a topic to be a main topic of the news article, it should be discussed in the majority of the article. 

Based on the information provided, determine if the main topic of this news article is the "Coronavirus and/or the COVID-19 pandemic" or another subject. 
Assign a value of 1 if the main topic is the "Coronavirus and/or the COVID-19 pandemic", and a value of 0 if the main topic is another subject.

Your response should be in the following JSON format, without any additional explanations or notes:

{{
    "article_id": "2000000",
    "about_covid": "1"
}}
"""

# Annotating Articles About Covid

In [29]:
# Wrap the instruction in the model's chat format; the {placeholders} are still unfilled at
# this point. Print the result to inspect the exact text that will serve as the template.
zero_shot_prompt = zero_shot_prompt_messages(prompt_text)
print(zero_shot_prompt)

<s> [INST] 
As a helpful AI assistant, your task is to determine the main topic of news articles. Articles may focus on either the "Coronavirus and/or the COVID-19 pandemic" or some other topic.
A main topic is the overarching theme discussed in the majority of the news article. For an article to have the main topic of the "Coronavirus and/or the COVID-19 pandemic", it should predominantly discuss these subjects.

Read the following article with the ID {article_id}: <<< {text} >>>

This article falls under the categories: {category} and contains the keywords: {keywords}. 

Take a moment to understand the article. 
Remember, for a topic to be a main topic of the news article, it should be discussed in the majority of the article. 

Based on the information provided, determine if the main topic of this news article is the "Coronavirus and/or the COVID-19 pandemic" or another subject. 
Assign a value of 1 if the main topic is the "Coronavirus and/or the COVID-19 pandemic", and a value of 

In [30]:
# Turn the chat-formatted prompt into a template whose four placeholders are filled per article.
prompt_template = PromptTemplate(
    template=zero_shot_prompt,
    input_variables=["article_id", "text", "category", "keywords"],
)

In [31]:
# Chain that fills the prompt template and sends it to the model.
# The output key is one string ("article_id, about_covid"), i.e. a single output field.
chain_one = LLMChain(
    prompt=prompt_template,
    llm=llm,
    output_key="article_id, about_covid",
)
chain_one

LLMChain(prompt=PromptTemplate(input_variables=['article_id', 'category', 'keywords', 'text'], template='<s> [INST] \nAs a helpful AI assistant, your task is to determine the main topic of news articles. Articles may focus on either the "Coronavirus and/or the COVID-19 pandemic" or some other topic.\nA main topic is the overarching theme discussed in the majority of the news article. For an article to have the main topic of the "Coronavirus and/or the COVID-19 pandemic", it should predominantly discuss these subjects.\n\nRead the following article with the ID {article_id}: <<< {text} >>>\n\nThis article falls under the categories: {category} and contains the keywords: {keywords}. \n\nTake a moment to understand the article. \nRemember, for a topic to be a main topic of the news article, it should be discussed in the majority of the article. \n\nBased on the information provided, determine if the main topic of this news article is the "Coronavirus and/or the COVID-19 pandemic" or anothe

In [32]:
# Collects the raw model response for each article, in the order the articles are processed.
generated_text_zeroshot = []

In [None]:
%%time
torch.manual_seed(0)

for index, row in unique_articles.iterrows():  
    article_id = row['article_id']
    text = row['Text']
    category = row['Category']
    keywords = row['Keywords']


    input_variables = {
            "article_id": article_id,
            "text": text,
            "category": category,
            "keywords": keywords
        }
    # Generate text using the chain
    generated_text = chain_one.run(input_variables)
    print(generated_text)
    generated_text_zeroshot.append(generated_text)    

# CPU times: user 2min 3s, sys: 748 ms, total: 2min 4s
# Wall time: 2min 13s
# output cleaned

In [None]:
# Parse article_id / about_covid out of every raw generation.
# Despite its name, json_list is already a DataFrame (regex_extract_to_dataframe returns one);
# df_zeroshot is a second DataFrame built from it.
json_list = regex_extract_to_dataframe(generated_text_zeroshot)
df_zeroshot = pd.DataFrame(json_list)

In [38]:
# rename about_covid to about_covid_pred
# rename about_covid to about_covid_pred
df_zeroshot = df_zeroshot.rename(columns={'about_covid': 'about_covid_pred'})

# A generation that could not be parsed leaves a missing article_id and/or prediction, and
# casting a missing value to int raises. Drop (and report) such rows before casting.
unparsed = df_zeroshot[['article_id', 'about_covid_pred']].isna().any(axis=1)
if unparsed.any():
    print(f"Dropping {int(unparsed.sum())} generation(s) without a parsable article_id/about_covid")

df_zeroshot = df_zeroshot.loc[~unparsed].astype({'article_id': int, 'about_covid_pred': int})

df_zeroshot.head()

Unnamed: 0,article_id,about_covid_pred
0,2322751,1
1,2325123,1
2,2326923,1
3,2328119,1
4,2328418,1


In [None]:
# Annotations made by this coder, reduced to the id and the human "about covid" label.
df_elif = df.loc[df['coder'] == 'Elif Kilik']
df_coded_elif = df_elif.loc[:, ['article_id', 'about_covid']]

# Left join so every human-coded article is kept, with or without a model prediction.
df_zeroshot_merged_elif = df_coded_elif.merge(df_zeroshot, how='left', on="article_id")
print(len(df_zeroshot_merged_elif))

# Drop rows with any missing value, then store the prediction as int.
df_zeroshot_merged_elif = (
    df_zeroshot_merged_elif
    .dropna()
    .astype({'about_covid_pred': int})
)

In [48]:
# Agreement between the human coding (reference) and the model prediction.
human_labels = df_zeroshot_merged_elif['about_covid']
model_labels = df_zeroshot_merged_elif['about_covid_pred']

print(classification_report(human_labels, model_labels))
print("Cohen's Kappa:", cohen_kappa_score(human_labels, model_labels))

              precision    recall  f1-score   support

           0       0.95      0.95      0.95        42
           1       0.97      0.97      0.97        78

    accuracy                           0.97       120
   macro avg       0.96      0.96      0.96       120
weighted avg       0.97      0.97      0.97       120

Cohen's Kappa: 0.9267399267399268
