In [None]:
# packages

import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import os
import csv
import re
import pandas as pd
import transformers
import torch

from torch import cuda, bfloat16
from transformers import AutoTokenizer

from langchain import PromptTemplate
from langchain.chains import LLMChain
from langchain.llms import HuggingFacePipeline

from sklearn.metrics import classification_report, cohen_kappa_score


In [2]:
# Run a garbage-collection pass and ask PyTorch to empty its CUDA cache
# before the model is loaded further down.
import gc
gc.collect()
torch.cuda.empty_cache()

In [4]:
# The return value is discarded here because this is not the cell's last expression.
os.getcwd()

# Switch the working directory to the storage volume; the relative paths used
# later in the notebook resolve against it.
os.chdir("/data/500gbstorage/")

# Directory (relative to the new working directory) used as the Hugging Face cache.
huggingface_cache_dir = 'model_starling'

# Point the transformers cache at that directory.
# NOTE(review): transformers is already imported at this point - confirm the
# variable is still honoured when set this late.
os.environ['TRANSFORMERS_CACHE'] = huggingface_cache_dir

# Load the Model from Hugging Face

In [5]:
# Seed PyTorch's random number generator.
torch.manual_seed(0)

model_id = 'berkeley-nest/Starling-LM-7B-alpha'

# Used only for the status message printed below; the actual placement of the
# model is decided by device_map='auto'.
device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'

# set quantization configuration to load large model with less GPU memory
# this requires the `bitsandbytes` library
bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=bfloat16
)

# Initialise the Hugging Face objects (config, model, tokenizer).
# No auth token is passed to any of these calls.
model_config = transformers.AutoConfig.from_pretrained(
    model_id,
    cache_dir=huggingface_cache_dir
)

# NOTE(review): trust_remote_code=True permits running code shipped with the
# model repository - confirm it is actually required for this model.
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    config=model_config,
    quantization_config=bnb_config,
    device_map='auto',
    cache_dir=huggingface_cache_dir
)
# Put the model in evaluation mode.
model.eval()

# NOTE(review): this prints the `device` string computed above, not the
# placement chosen by device_map='auto'.
print(f"Model loaded on {device}")

tokenizer = transformers.AutoTokenizer.from_pretrained(
    model_id,
    cache_dir=huggingface_cache_dir)

Loading checkpoint shards: 100%|██████████| 3/3 [00:04<00:00,  1.38s/it]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Model loaded on cuda:0


In [6]:
# Report which GPU is in use and how much memory PyTorch has allocated on it.
# Note that this rebinds `device` to a torch.device object.
device = torch.device('cuda')
bytes_per_gib = 1024 ** 3

print("GPU Name:", torch.cuda.get_device_name(device))
print("Memory Usage:", torch.cuda.memory_allocated(device) / bytes_per_gib, "GB")
print("Max Memory Usage:", torch.cuda.max_memory_allocated(device) / bytes_per_gib, "GB")

GPU Name: NVIDIA A100 80GB PCIe
Memory Usage: 3.96816349029541 GB
Max Memory Usage: 3.968377113342285 GB


In [7]:
# Seed PyTorch's random number generator before building the pipeline.
torch.manual_seed(0)

# Text-generation pipeline around the quantized model.
# Decoding is greedy: `do_sample=False` is stated explicitly instead of passing
# `temperature=0`, which is not a valid sampling temperature and is not used
# when sampling is disabled.
generate_text = transformers.pipeline(
    task='text-generation',
    model=model,
    tokenizer=tokenizer,
    pad_token_id=tokenizer.eos_token_id,  # pad with the end-of-sequence token id
    do_sample=False,  # deterministic (greedy) decoding
    max_new_tokens=512,  # max number of tokens to generate in the output
    repetition_penalty=1.1,
)

# LangChain wrapper so the pipeline can be used as the LLM of an LLMChain.
llm = HuggingFacePipeline(pipeline=generate_text)

## Helper functions

In [8]:
def regex_extract_to_dataframe(strings):
    """Extract ``article_id`` and ``subtopic_discussed`` from raw model outputs.

    Each element of ``strings`` is searched for the first occurrence of
    ``"article_id": <digits>`` and ``"subtopic_discussed": <digit>``. Keys and
    values may be wrapped in double quotes, single quotes, or (for values)
    no quotes at all.

    Parameters
    ----------
    strings : iterable
        Generated texts, one per article. Elements that are not ``str``
        (e.g. ``None``) are treated as containing no match.

    Returns
    -------
    pandas.DataFrame
        One row per input element, in input order, with the columns
        ``article_id`` and ``subtopic_discussed``. A value that could not be
        found is left missing for that row.
    """
    # Either quote style is accepted around keys and values.
    quote = r"""["']"""
    article_id_pattern = re.compile(
        rf"{quote}article_id{quote}\s*:\s*{quote}?(\d+)"
    )
    # Only a single digit is captured for the label.
    subtopic_discussed_pattern = re.compile(
        rf"{quote}subtopic_discussed{quote}\s*:\s*{quote}?(\d)"
    )

    def _first_int(pattern, text):
        """Return the first captured integer in ``text``, or None if absent."""
        if not isinstance(text, str):
            return None
        match = pattern.search(text)
        return int(match.group(1)) if match else None

    # Materialise once so generators can be traversed for both columns.
    texts = list(strings)
    article_ids = [_first_int(article_id_pattern, text) for text in texts]
    subtopic_discussed_values = [
        _first_int(subtopic_discussed_pattern, text) for text in texts
    ]

    return pd.DataFrame({
        "article_id": article_ids,
        "subtopic_discussed": subtopic_discussed_values
    })


In [9]:
def zero_shot_prompt_messages(system_prompt, input_prompt, main_prompt):
    """Render a system + user conversation as a single prompt string.

    The user message is ``input_prompt`` followed directly by ``main_prompt``.
    The conversation is passed to the chat template of the tokenizer attached
    to the global ``generate_text`` pipeline, untokenized and with the
    generation prompt appended.
    """
    user_content = input_prompt + main_prompt
    conversation = [
        dict(role="system", content=system_prompt),
        dict(role="user", content=user_content),
    ]
    return generate_text.tokenizer.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=True,
    )

# Data Prep

In [None]:
# Researcher-annotated NOS articles (semicolon-separated CSV).
df = pd.read_csv('NOS/nos_analysis/topic_tests_reliability_data/reliability_topics_elif.csv',
                 sep = ';', encoding = 'utf-8', quoting=csv.QUOTE_NONNUMERIC)

topic_vars = ['about_covid',  'topic_a', 'topic_b', 'topic_c', 'topic_d', 'topic_e', 'topic_f', 'topic_g', 'topic_h', 
              'topic_i', 'topic_j', 'topic_k', 'topic_l', 'topic_m', 'topic_n']

# Cast the article id and every topic indicator to int in a single step.
df = df.astype({column: int for column in ['article_id', *topic_vars]})

# Keep only the rows annotated as being about COVID.
df = df.loc[df['about_covid'].eq(1)]

In [None]:
# Full article table: text, category and keywords per page.
articles_df = pd.read_csv('NOS/nos_analysis/data/final_nosarticles.csv',
                          sep = ';', encoding = 'utf-8', quoting=csv.QUOTE_NONNUMERIC)
print(articles_df.shape)

# Keep the four columns needed downstream, drop exact duplicate rows,
# make the page id an integer and expose it under the name `article_id`.
articles_df = (
    articles_df[['page_id', 'Text', 'Category', 'Keywords']]
    .drop_duplicates()
    .astype({'page_id': int})
    .rename(columns={'page_id': 'article_id'})
)

In [None]:
# Attach text, category and keywords to each annotation row (left join on article_id).
df = df.merge(articles_df, how='left', on='article_id')

In [None]:
# One row per distinct (article_id, Keywords, Category, Text) combination
# among the annotated articles.
article_columns = ['article_id', 'Keywords', 'Category', 'Text']
unique_articles = df[article_columns].drop_duplicates()
print(unique_articles.shape)

# Prompt Topic C

In [19]:
# System message: states the task and defines what counts as "substantial discussion".
system_prompt = """
You are a helpful AI assistant trained to analyze news articles. Your task is to determine whether the given news article substantially discusses the subtopic "Coronavirus (COVID-19) tests and testing procedures".  
Substantial discussion means the article discusses one or more aspects of the subtopic in at least two sentences. 

"""

# First part of the user message. {article_id}, {text}, {category} and {keywords}
# are left as placeholders here; they are the input variables of the
# PromptTemplate built later in the notebook.
input_prompt = """
Read the following article with the ID {article_id}: {text}. \n
This article falls under the categories: {category} and contains the keywords: {keywords}.
"""

# Second part of the user message: instructions plus an example of the expected output.
# The example uses doubled braces ({{ }}), presumably so they come out as
# literal braces when the template is formatted - TODO confirm.
main_prompt = """
Take a moment to understand the article. 

Remember, for a subtopic to be substantially discussed, the article must discuss one or more aspects of the subtopic in at least two sentences.

Carefully analyze if the article contains substantial discussion of "Coronavirus (COVID-19) tests and testing procedures" based on the definition above.
 
Assign a value of 1 if the article substantially discusses the subtopic, and a value of 0 if the article does not substantially discuss the subtopic.

Output your results in a JSON format with keys 'article_id' and 'subtopic_discussed', where the id of the article and and your answer are the values. 
Follow the example output format provided. Do not include any additional information or explanation.

Example Output (JSON format):
{{
    "article_id": "2000000",
    "subtopic_discussed": "1"
}}
"""

# Zero-Shot - Topic C

In [22]:
# Render the three prompt parts through the chat template; the {placeholders}
# are still unfilled in the resulting string.
zero_shot_prompt_c = zero_shot_prompt_messages(system_prompt, input_prompt, main_prompt)
print(zero_shot_prompt_c)

<s>GPT4 Correct System: 
You are a helpful AI assistant trained to analyze news articles. Your task is to determine whether the given news article substantially discusses the subtopic "Coronavirus (COVID-19) tests and testing procedures".  
Substantial discussion means the article discusses one or more aspects of the subtopic in at least two sentences. 

<|end_of_turn|>GPT4 Correct User: 
Read the following article with the ID {article_id}: {text}. 

This article falls under the categories: {category} and contains the keywords: {keywords}.

Take a moment to understand the article. 

Remember, for a subtopic to be substantially discussed, the article must discuss one or more aspects of the subtopic in at least two sentences.

Carefully analyze if the article contains substantial discussion of "Coronavirus (COVID-19) tests and testing procedures" based on the definition above.
 
Assign a value of 1 if the article substantially discusses the subtopic, and a value of 0 if the article does 

In [23]:
# Wrap the rendered chat prompt in a PromptTemplate with four input variables,
# which are supplied per article when the chain is run.
prompt_template = PromptTemplate(
    input_variables=["article_id", "text", "category", "keywords"],
    template=zero_shot_prompt_c
)

In [24]:
# Chain that fills the Topic C prompt template and sends it to the LLM.
# NOTE(review): output_key is one string that contains a comma, not two
# separate keys - confirm this is intended.
chain_c = LLMChain(llm = llm, prompt = prompt_template, output_key="article_id, subtopic_discussed")
chain_c

LLMChain(prompt=PromptTemplate(input_variables=['article_id', 'category', 'keywords', 'text'], template='<s>GPT4 Correct System: \nYou are a helpful AI assistant trained to analyze news articles. Your task is to determine whether the given news article substantially discusses the subtopic "Coronavirus (COVID-19) tests and testing procedures".  \nSubstantial discussion means the article discusses one or more aspects of the subtopic in at least two sentences. \n\n<|end_of_turn|>GPT4 Correct User: \nRead the following article with the ID {article_id}: {text}. \n\nThis article falls under the categories: {category} and contains the keywords: {keywords}.\n\nTake a moment to understand the article. \n\nRemember, for a subtopic to be substantially discussed, the article must discuss one or more aspects of the subtopic in at least two sentences.\n\nCarefully analyze if the article contains substantial discussion of "Coronavirus (COVID-19) tests and testing procedures" based on the definition a

In [25]:
# Collects the raw generated text for each article, in iteration order.
generated_text_zeroshot_c = []

In [None]:
%%time
# Seed PyTorch's random number generator before generating.
torch.manual_seed(0)

# Start from an empty list so re-running this cell does not append duplicates.
generated_text_zeroshot_c = []

for _, row in unique_articles.iterrows():
    # Values for the four placeholders of the prompt template.
    input_variables = {
        "article_id": row['article_id'],
        "text": row['Text'],
        "category": row['Category'],
        "keywords": row['Keywords'],
    }

    # Generate text using the Topic C chain. This previously called
    # `chain_one`, a name that is never defined in this notebook.
    generated_text = chain_c.run(input_variables)
    print(generated_text)
    generated_text_zeroshot_c.append(generated_text)

# CPU times: 3min 24s, sys: 19.2 s, total: 3min 44s
# Wall time: 3min 44s 
# output cleaned

In [None]:
# Parse article_id / subtopic_discussed out of every generated text.
# regex_extract_to_dataframe already returns a DataFrame (despite the name
# `json_list`), so the second line only re-wraps it.
json_list = regex_extract_to_dataframe(generated_text_zeroshot_c)
df_topic_c = pd.DataFrame(json_list)

In [None]:
# Rename the parsed label column to `topic_c_pred`. The column produced by
# regex_extract_to_dataframe is called `subtopic_discussed`; renaming the
# non-existent `topic_discussed` left the frame unchanged, so selecting
# `topic_c_pred` afterwards failed.
# Rows where either value could not be parsed are dropped before the integer
# cast, which cannot represent missing values.
df_topic_c = (
    df_topic_c
    .rename(columns={'subtopic_discussed': 'topic_c_pred'})
    .dropna(subset=['article_id', 'topic_c_pred'])
    .astype({'article_id': int, 'topic_c_pred': int})
)

In [None]:
# Annotations from this one coder, reduced to the id and the Topic C label.
df_elif = df.loc[df['coder'] == 'Elif Kilik']
df_coded_elif = df_elif[['article_id', 'topic_c']]

# Left-join the model predictions, drop rows with any missing value and
# make the prediction column an integer.
df_zeroshot_merged_elif = (
    df_coded_elif
    .merge(df_topic_c, how='left', on="article_id")
    .dropna()
    .astype({'topic_c_pred': int})
)

In [38]:
# Compare the coder's Topic C labels with the model's predictions.
labels_true = df_zeroshot_merged_elif['topic_c']
labels_pred = df_zeroshot_merged_elif['topic_c_pred']

print(classification_report(labels_true, labels_pred))
print("Cohen's Kappa:", cohen_kappa_score(labels_true, labels_pred))

              precision    recall  f1-score   support

           0       0.90      1.00      0.95        66
           1       1.00      0.42      0.59        12

    accuracy                           0.91        78
   macro avg       0.95      0.71      0.77        78
weighted avg       0.92      0.91      0.89        78

Cohen's Kappa: 0.5472636815920398
