# Load Packages & Set Working Directory

In [1]:
# packages

import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import os
import csv
import json
import re
import simpledorff
import pandas as pd
import transformers
from transformers import AutoTokenizer
from transformers import  LlamaForCausalLM, LlamaTokenizer, pipeline
import transformers

import torch
from torch import cuda, bfloat16, manual_seed

from langchain import PromptTemplate
from langchain.chains import LLMChain
from langchain.chains import SequentialChain
from langchain.llms import HuggingFacePipeline

from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, cohen_kappa_score
from sklearn.metrics import classification_report


In [2]:
# Free unreferenced Python objects, then release PyTorch's cached GPU memory.
# NOTE(review): `gc` is imported here rather than in the imports cell above.
import gc
gc.collect()
torch.cuda.empty_cache()

In [3]:
# Clear PyTorch's autocast cache before the model is loaded.
torch.clear_autocast_cache()

In [4]:
# NOTE(review): this value is discarded (it is not the cell's last expression).
os.getcwd()

# Switch to the project root. The path is absolute and machine-specific; every relative
# path used in this notebook ('model', 'data/...') is resolved against it.
os.chdir("/data/storage100gb5/NOS")

# Directory, relative to the project root, passed as cache_dir when loading the model.
huggingface_cache_dir = 'model'

# Point the Hugging Face cache at the same directory.
# NOTE(review): `transformers` is already imported at this point, so this variable may be
# set too late to have any effect — TODO confirm.
os.environ['TRANSFORMERS_CACHE'] = huggingface_cache_dir

# Load the Model from Huggingface

In [5]:
# Fix the torch RNG seed before loading.
torch.manual_seed(0)

# Hugging Face Hub id of the model used for all classifications in this notebook.
model_id = 'berkeley-nest/Starling-LM-7B-alpha'

# Descriptive string only: it is printed below, while the actual placement of the model
# is delegated to device_map='auto'.
device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'

# set quantization configuration to load large model with less GPU memory
# this requires the `bitsandbytes` library
# (4-bit NF4 weights with double quantization; bfloat16 is the compute dtype)
bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=bfloat16
)

# Begin initializing the Hugging Face objects.
# NOTE(review): an earlier comment said an auth token is needed, but none is passed here.
model_config = transformers.AutoConfig.from_pretrained(
    model_id,
    cache_dir=huggingface_cache_dir
)

# NOTE(review): trust_remote_code=True presumably permits code shipped with the checkpoint
# to be executed — confirm this is required for this model.
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    config=model_config,
    quantization_config=bnb_config,
    device_map='auto',
    cache_dir=huggingface_cache_dir
)
# Inference only: switch the module to evaluation mode.
model.eval()

print(f"Model loaded on {device}")

# Tokenizer of the same checkpoint; it is also used to render the chat template.
tokenizer = transformers.AutoTokenizer.from_pretrained(
    model_id,
    cache_dir=huggingface_cache_dir)

Loading checkpoint shards: 100%|██████████| 3/3 [01:06<00:00, 22.23s/it]


Model loaded on cuda:0


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [6]:
# Report the GPU model and current / peak allocated memory, converted from bytes to GiB.
# NOTE(review): this re-binds `device` (a string in the model-loading cell) to a
# torch.device and assumes CUDA is available.
device = torch.device('cuda')
print("GPU Name:", torch.cuda.get_device_name(device))
print("Memory Usage:", torch.cuda.memory_allocated(device) / 1024 ** 3, "GB")
print("Max Memory Usage:", torch.cuda.max_memory_allocated(device) / 1024 ** 3, "GB")

GPU Name: NVIDIA A10
Memory Usage: 2.3108439445495605 GB
Max Memory Usage: 2.329235076904297 GB


In [7]:
# Wrap the quantized model in a text-generation pipeline and expose it to LangChain.
torch.manual_seed(0)
generate_text = transformers.pipeline(
    model=model, tokenizer=tokenizer,
    task='text-generation',
    pad_token_id=tokenizer.eos_token_id,
    temperature=0.5,  # sampling temperature; NOTE(review): do_sample is not set here, so this may have no effect — TODO confirm
    max_new_tokens=512,  # max number of tokens to generate in the output
    repetition_penalty=1.1  # values above 1 discourage repeated tokens
)

# LangChain LLM wrapper around the pipeline; used by the LLMChain objects below.
llm = HuggingFacePipeline(pipeline=generate_text)

## Helper functions

In [8]:
def regex_extract_to_dataframe(strings):
    """Pull ``article_id`` and ``about_covid`` out of raw model generations.

    Each generation is searched for a ``"article_id": <digits>`` pair and a
    ``"about_covid": <digit>`` pair; the value may or may not be wrapped in
    double quotes. Only the first occurrence of each key is used.

    Parameters
    ----------
    strings : iterable of str
        Raw text generated by the model, one entry per article.

    Returns
    -------
    pandas.DataFrame
        One row per input string with columns ``article_id`` and
        ``about_covid``. A key that cannot be found yields a missing value
        in the corresponding cell.
    """
    id_regex = r'"article_id"\s*:\s*"?(\d+)"?'
    label_regex = r'"about_covid"\s*:\s*"?(\d)"?'

    def _first_int(regex, raw):
        # First captured number as an int, or None when the key is absent.
        found = re.search(regex, raw)
        return None if found is None else int(found.group(1))

    outputs = list(strings)
    return pd.DataFrame({
        "article_id": [_first_int(id_regex, raw) for raw in outputs],
        "about_covid": [_first_int(label_regex, raw) for raw in outputs],
    })


In [9]:
def zero_shot_prompt_messages(system_prompt, input_prompt, main_prompt):
    """Render a system turn plus one user turn with the tokenizer's chat template.

    The user turn is ``input_prompt`` and ``main_prompt`` joined by a newline.
    Relies on the notebook-level ``generate_text`` pipeline for the tokenizer.

    Returns the rendered prompt as a string (not tokenized), ending with the
    generation prompt for the assistant turn.
    """
    user_turn = input_prompt + "\n" + main_prompt
    conversation = [
        dict(role="system", content=system_prompt),
        dict(role="user", content=user_turn),
    ]
    return generate_text.tokenizer.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=True,
    )

In [10]:
def few_shot_prompt_messages(system_prompt, input_prompt, main_prompt, examples):
    """Render a few-shot conversation with the tokenizer's chat template.

    The conversation is: the system turn, then one user/assistant pair per
    entry of ``examples`` (each entry is a ``(user_text, assistant_text)``
    pair; a newline is appended to the assistant text), then the final user
    turn made of ``input_prompt`` and ``main_prompt`` joined by a newline.
    Relies on the notebook-level ``generate_text`` pipeline for the tokenizer.

    Returns the rendered prompt as a string (not tokenized), ending with the
    generation prompt for the assistant turn.
    """
    conversation = [{"role": "system", "content": system_prompt}]
    for shot_question, shot_answer in examples:
        conversation.extend([
            {"role": "user", "content": shot_question},
            {"role": "assistant", "content": shot_answer + "\n"},
        ])
    conversation.append({"role": "user", "content": input_prompt + "\n" + main_prompt})

    return generate_text.tokenizer.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=True,
    )

# Data Prep

## Get Annotated NOS Articles DF

In [11]:
# Load the human-annotated NOS articles (semicolon-separated CSV).
df = pd.read_csv('data/nos_analysis/nos_llm_analysis_final.csv',
                 sep = ';', encoding = 'utf-8', quoting=csv.QUOTE_NONNUMERIC)

# Binary annotation columns (about_covid plus topics a–o).
topic_vars = ['about_covid',  'topic_a', 'topic_b', 'topic_c', 'topic_d', 'topic_e', 'topic_f', 'topic_g', 'topic_h',
              'topic_i', 'topic_j', 'topic_k', 'topic_l', 'topic_m', 'topic_n', 'topic_o']

# Store the article id and all annotation columns as integers in a single cast.
df = df.astype({column: int for column in ['article_id'] + topic_vars})

# Turn the literal "[LINE_BREAK]" marker into a real line break. regex=False makes the
# marker a plain string on every pandas version; read as a regular expression it would be
# a character class matching single letters.
df['Text'] = df['Text'].str.replace('[LINE_BREAK]', '\n ', regex=False)

print(df.shape)
df.head()


(659, 23)


Unnamed: 0,article_id,Text,about_covid,topic_a,topic_b,topic_c,topic_d,topic_e,topic_f,topic_g,...,topic_k,topic_l,topic_m,topic_n,topic_o,other_country_binary,actors_present,Category,Keywords,country_name
0,2321843,Minister Bruins voor Medische Zorg vindt het v...,1,0,1,0,0,0,0,0,...,0,0,1,0,0,0.0,1.0,Politiek,"Bruno Bruins, coronavirus, Corinne Ellemeet",
1,2328614,"Niet meer naar school, niet meer naar de kroeg...",1,0,1,0,0,0,0,0,...,0,0,0,0,0,0.0,0.0,"NOS op 3, Binnenland","RIVM, corona",
2,2405920,Het Amerikaanse automerk Canoo dat elektrische...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0.0,0.0,"L1mburg, Regionaal nieuws, Economie","Born, Canoo",
3,2341446,"Viroloog Marion Koopmans, een van de belangrij...",1,1,1,0,0,0,0,0,...,0,0,1,0,0,0.0,1.0,"Nieuwsuur, Binnenland","Nieuwsuur, coronavirus",
4,2326455,Scholen en ouders in Noord-Brabant proberen ui...,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0.0,1.0,"Regionaal nieuws, Binnenland","Noord-Brabant, coronavirus",


In [12]:
# Are there duplicate articles? keep=False flags every row whose article_id occurs more
# than once, so an empty frame means all ids are unique.
duplicates = df[df.duplicated(subset=['article_id'], keep=False)]
duplicates

Unnamed: 0,article_id,Text,about_covid,topic_a,topic_b,topic_c,topic_d,topic_e,topic_f,topic_g,...,topic_k,topic_l,topic_m,topic_n,topic_o,other_country_binary,actors_present,Category,Keywords,country_name


In [13]:
# Number of annotated articles.
len(df)

659

In [14]:
# Inspect the raw text of the first article (after the line-break replacement).
df.Text.values[0]

'Minister Bruins voor Medische Zorg vindt het verschrikkelijk dat mensen met een Aziatisch uiterlijk worden gediscrimineerd vanwege het coronavirus. Hij deed in de Tweede Kamer een oproep aan iedereen om hiertegen op te staan.\n Bruins reageerde op vragen van onder anderen GroenLinks-Kamerlid Ellemeet. Zij zei dat mensen met Aziatisch uiterlijk op grote schaal worden gediscrimineerd. Ze hoorde bijvoorbeeld van een meisje dat mensen in de tram hun trui over hun mond trokken toen ze haar zagen.\n Ellemeet vroeg de minister of hij zich hierover duidelijk wil uitspreken. Bruins zei hierop dat hij dit de komende dagen nog verschillende keren wil doen, ook buiten de Tweede Kamer.\n De minister zei dat hij het er zeer mee eens is dat dit niet bij een fatsoenlijke samenleving hoort. "Mensen discrimineren gaat niet aan. We moeten ervoor zorgen dat het niet optreedt. Daar hebben wij allemaal een rol in."\n Bruins is niet van plan om evenementen en attracties te sluiten die veel Chinese toeristen

## Get Examples 

Randomly selected examples from the annotated dataset that are not part of the dev or test sets.

In [15]:
# Load the example articles that are about COVID (used as few-shot demonstrations).
df_covid = pd.read_csv('data/nos_analysis/examples/about_covid.csv', sep = ';', encoding = 'utf-8', quoting=csv.QUOTE_NONNUMERIC)

# Binary annotation columns (about_covid plus topics a–o).
topic_vars = ['about_covid',  'topic_a', 'topic_b', 'topic_c', 'topic_d', 'topic_e', 'topic_f', 'topic_g', 'topic_h',
              'topic_i', 'topic_j', 'topic_k', 'topic_l', 'topic_m', 'topic_n', 'topic_o']

# Store the article id and all annotation columns as integers in a single cast.
df_covid = df_covid.astype({column: int for column in ['article_id'] + topic_vars})

# Turn the literal "[LINE_BREAK]" marker into a real line break. regex=False makes the
# marker a plain string on every pandas version; read as a regular expression it would be
# a character class matching single letters.
df_covid['Text'] = df_covid['Text'].str.replace('[LINE_BREAK]', '\n ', regex=False)

print(df_covid.shape)


df_covid.head()

(2, 44)


Unnamed: 0,coder,article_id,title,owner,about_covid,topic_a,topic_b_old,topic_c,topic_d,topic_e,...,num_tokens,keyword_check,keyword_wordcount,wekdienst,Title_lower,uitzending,page_type,page_type2,page_type_final,text_length
0,Valentina Veltkamp,2327099,"Twee nieuwe corona-doden Nederland, 155 geregi...",NOS,1,1,0.0,0,0,0,...,365.0,"corona, virus, besmettingen, rivm",4.0,False,"twee nieuwe corona-doden nederland, 155 geregi...",False,artikel,2327099-twee-nieuwe-corona-doden-nederland-155...,artikel,1273.0
1,Valentina Veltkamp,2452901,Toch persoonsgegevens beschikbaar voor vervolg...,NOS,1,0,0.0,0,0,0,...,303.0,"oversterfte, corona, covid19",3.0,False,toch persoonsgegevens beschikbaar voor vervolg...,False,artikel,2452901-toch-persoonsgegevens-beschikbaar-voor...,artikel,1109.0


In [16]:
# Print the ids, full texts, keywords and categories of the COVID example articles;
# these values are what the hand-written few-shot examples below are built from.
print(df_covid.article_id.value_counts())
print(df_covid.Text.values)
print(df_covid.Keywords.values)
print(df_covid.Category.values)

article_id
2327099    1
2452901    1
Name: count, dtype: int64
['Sinds gisteren zijn 155 nieuwe mensen positief getest op het coronavirus. Dat brengt het totaal op 959 patiënten, meldt het RIVM.\n Er zijn twee nieuwe mensen aan de gevolgen van het virus overleden. Het gaat om ouderen met onderliggend lijden. In totaal staat het dodental in Nederland nu op twaalf.\n Gisteren werden 190 nieuwe patiënten gemeld. Volgens het RIVM is de daling van vandaag het gevolg van het veranderde testbeleid.\n "Sinds 12 maart worden mensen met milde klachten niet meer getest omdat er een landelijke maatregel is om thuis te blijven bij de eerste klachten. Daarnaast wordt er meer getest onder risicogroepen", zegt de instantie.\n Op 12 maart riep de overheid Nederlanders op om sociale contacten te vermijden. Het is volgens het RIVM nog te vroeg om het effect daarvan te zien.\n In het hele land was het vandaag op allerlei plekken veel rustiger dan anders op zaterdag:\n De cijfers zeggen niet alles over het

In [17]:
# Load the example articles that are NOT about COVID (used as few-shot demonstrations).
not_covid = pd.read_csv('data/nos_analysis/examples/not_covid.csv', sep = ';', encoding = 'utf-8', quoting=csv.QUOTE_NONNUMERIC)

# Store the article id and all annotation columns as integers in a single cast
# (topic_vars is defined in an earlier cell).
not_covid = not_covid.astype({column: int for column in ['article_id'] + topic_vars})

# Turn the literal "[LINE_BREAK]" marker into a real line break. regex=False makes the
# marker a plain string on every pandas version; read as a regular expression it would be
# a character class matching single letters.
not_covid['Text'] = not_covid['Text'].str.replace('[LINE_BREAK]', '\n ', regex=False)

print(not_covid.shape)
not_covid.head()

(2, 44)


Unnamed: 0,coder,article_id,title,owner,about_covid,topic_a,topic_b_old,topic_c,topic_d,topic_e,...,num_tokens,keyword_check,keyword_wordcount,wekdienst,Title_lower,uitzending,page_type,page_type2,page_type_final,text_length
0,Valentina Veltkamp,2401789,Vrijspraak voor man die vaccinatielocatie Den ...,NOS,0,0,0.0,0,0,0,...,397.0,vaccinatielocatie,1.0,False,vrijspraak voor man die vaccinatielocatie den ...,False,artikel,2401789-vrijspraak-voor-man-die-vaccinatieloca...,artikel,1378.0
1,Valentina Veltkamp,2446472,NZa: plotse toename wachtrijen in langdurige zorg,NOS,0,0,0.0,0,0,0,...,339.0,zorg,1.0,False,nza: plotse toename wachtrijen in langdurige zorg,False,artikel,2446472-nza-plotse-toename-wachtrijen-in-langd...,artikel,1053.0


In [18]:
# Check how often each non-COVID example id occurs.
not_covid.article_id.value_counts()

article_id
2401789    1
2446472    1
Name: count, dtype: int64

In [236]:
# Print the ids, full texts, keywords and categories of the non-COVID example articles.
# NOTE(review): the first line repeats the previous cell, and this cell's execution count
# is out of sequence with its neighbours.
print(not_covid.article_id.value_counts())
print(not_covid.Text.values)
print(not_covid.Keywords.values)
print(not_covid.Category.values)

article_id
2401789    1
2446472    1
Name: count, dtype: int64
['De rechtbank heeft een man uit Den Helder vrijgesproken van het voorbereiden van een terroristisch misdrijf bij een coronavaccinatielocatie in zijn woonplaats.\n Het Openbaar Ministerie (OM) had 30 maanden celstraf tegen de man geëist vanwege het voorbereiden van een terroristische aanslag. \n Volgens de rechter zijn er sterke aanwijzingen dat de 38-jarige man in maart dit jaar concrete plannen had om met een vuurwerkbom brand te stichten in het oude stadhuis van Den Helder, waar een vaccinatielocatie was ingericht. Dat het om een uit de hand gelopen grap ging, zoals de verdachte tijdens de zitting verklaarde, gelooft de rechtbank niet.\n De rechtbank spreekt de man vrij, omdat niet kan worden vastgesteld dat hij met zijn plannen een terroristisch oogmerk had, schrijft NH Nieuws. "Er is geen bewijs dat hij een tegenstander was van de vaccinatiecampagne", zegt de rechter, "en het is niet zeker dat hij eerder deelnam aan ge

## Prompt building

In [39]:
# System turn: states the task and defines what counts as a "main topic".
system_prompt = """
As a helpful AI assistant, your task is to determine the main topic of news articles. Articles may focus on either the "Coronavirus and/or the COVID-19 pandemic" or some other topic.
A main topic is the overarching theme discussed in the majority of the news article. For an article to have the main topic of the "Coronavirus and/or the COVID-19 pandemic", it should predominantly discuss these subjects.
"""

# Per-article part of the user turn. {article_id}, {text}, {category} and {keywords} are
# placeholders that are filled in for every article by the PromptTemplate.
input_prompt = """
Read the following article with the ID {article_id}: {text} \n
This article falls under the categories: {category} and contains the keywords: {keywords}.
"""

# Task instruction appended after the article. The doubled braces around the example JSON
# are escaped literal braces, so they are not mistaken for template placeholders.
main_prompt = """
Take a moment to understand the article. 
Remember, for a topic to be a main topic of the news article, it should be discussed in the majority of the article. 

Based on the information provided, determine if the main topic of this news article is the "Coronavirus and/or the COVID-19 pandemic" or another subject. 
Assign a value of 1 if the main topic is the "Coronavirus and/or the COVID-19 pandemic", and a value of 0 if it is another subject.

Output your results in JSON format with keys "article_id" and "about_covid", where the article ID and your answer are the values. 
Follow the example output format provided. Do not include any additional information or explanation. \n

Example Output (JSON format):
{{
    "article_id": "2351150",
    "about_covid": "1"
}}
"""

In [57]:
# One demonstration for the one-shot prompt: a (user turn, assistant answer) pair built by
# hand from the COVID example article 2327099.
# NOTE(review): here the article text, category and keywords are wrapped in single quotes,
# whereas input_prompt inserts {text}, {category} and {keywords} without quotes.
one_shot_example =[
("""
Read the following article with the ID 2327099: 'Sinds gisteren zijn 155 nieuwe mensen positief getest op het coronavirus. Dat brengt het totaal op 959 patiënten, meldt het RIVM.\n Er zijn twee nieuwe mensen aan de gevolgen van het virus overleden. Het gaat om ouderen met onderliggend lijden. In totaal staat het dodental in Nederland nu op twaalf.\n Gisteren werden 190 nieuwe patiënten gemeld. Volgens het RIVM is de daling van vandaag het gevolg van het veranderde testbeleid.\n "Sinds 12 maart worden mensen met milde klachten niet meer getest omdat er een landelijke maatregel is om thuis te blijven bij de eerste klachten. Daarnaast wordt er meer getest onder risicogroepen", zegt de instantie.\n Op 12 maart riep de overheid Nederlanders op om sociale contacten te vermijden. Het is volgens het RIVM nog te vroeg om het effect daarvan te zien.\n In het hele land was het vandaag op allerlei plekken veel rustiger dan anders op zaterdag:\n De cijfers zeggen niet alles over het totale aantal besmettingen in Nederland. Alleen ernstig zieke mensen worden nog getest, zodat er voldoende testmateriaal voor riskante gevallen beschikbaar blijft. Daarnaast zijn er volgens de GGD waarschijnlijk veel mensen met milde klachten die thuis uitzieken en zich niet melden.' \n
This article falls under the categories: 'Binnenland' and contains the keywords: 'virus, besmettingen, RIVM, corona'.

Take a moment to understand the article. 
Remember, for a topic to be a main topic of the news article, it should be discussed in the majority of the article. 

Based on the information provided, determine if the main topic of this news article is the "Coronavirus and/or the COVID-19 pandemic" or another subject. 
Assign a value of 1 if the main topic is the "Coronavirus and/or the COVID-19 pandemic", and a value of 0 if it is another subject.

Output your results in JSON format with keys "article_id" and "about_covid", where the article ID and your answer are the values. 
Follow the example output format provided. Do not include any additional information or explanation. \n

Example Output (JSON format):
{{
    "article_id": "2351150",
    "about_covid": "1"
}}
""", 
"""
{{
    "article_id": "2327099",
    "about_covid": "1"
}}
""")]

In [58]:
# Two demonstrations for the few-shot prompt, one per class: article 2327099 (about COVID,
# label 1) and article 2446472 (not about COVID, label 0). Each entry is a
# (user turn, assistant answer) pair.
# NOTE(review): the second example opens with "Read the article with ID ..." while the first
# example and input_prompt use "Read the following article with the ID ...".
few_shot_examples = [
("""
Read the following article with the ID 2327099: 'Sinds gisteren zijn 155 nieuwe mensen positief getest op het coronavirus. Dat brengt het totaal op 959 patiënten, meldt het RIVM.\n Er zijn twee nieuwe mensen aan de gevolgen van het virus overleden. Het gaat om ouderen met onderliggend lijden. In totaal staat het dodental in Nederland nu op twaalf.\n Gisteren werden 190 nieuwe patiënten gemeld. Volgens het RIVM is de daling van vandaag het gevolg van het veranderde testbeleid.\n "Sinds 12 maart worden mensen met milde klachten niet meer getest omdat er een landelijke maatregel is om thuis te blijven bij de eerste klachten. Daarnaast wordt er meer getest onder risicogroepen", zegt de instantie.\n Op 12 maart riep de overheid Nederlanders op om sociale contacten te vermijden. Het is volgens het RIVM nog te vroeg om het effect daarvan te zien.\n In het hele land was het vandaag op allerlei plekken veel rustiger dan anders op zaterdag:\n De cijfers zeggen niet alles over het totale aantal besmettingen in Nederland. Alleen ernstig zieke mensen worden nog getest, zodat er voldoende testmateriaal voor riskante gevallen beschikbaar blijft. Daarnaast zijn er volgens de GGD waarschijnlijk veel mensen met milde klachten die thuis uitzieken en zich niet melden.' \n
This article falls under the categories: 'Binnenland' and contains the keywords: 'virus, besmettingen, RIVM, corona'.

Take a moment to understand the article. 
Remember, for a topic to be a main topic of the news article, it should be discussed in the majority of the article. 

Based on the information provided, determine if the main topic of this news article is the "Coronavirus and/or the COVID-19 pandemic" or another subject. 
Assign a value of 1 if the main topic is the "Coronavirus and/or the COVID-19 pandemic", and a value of 0 if it is another subject.

Output your results in JSON format with keys "article_id" and "about_covid", where the article ID and your answer are the values. 
Follow the example output format provided. Do not include any additional information or explanation. \n

Example Output (JSON format):
{{
    "article_id": "2351150",
    "about_covid": "1"
}}
""", 
"""
{{
    "article_id": "2327099",
    "about_covid": "1"
}}
"""),
("""
Read the article with ID 2446472: 'De wachtrijen in de langdurige zorg zijn in juli fors gestegen. De toename was afgelopen jaar niet eerder zo groot, meldt de Nederlandse Zorgautoriteit (NZa) in haar maandelijkse overzicht.\n Het aantal mensen dat moet wachten op zorg vanuit de Wet langdurige zorg (Wlz) steeg van 21.653 op 1 juli naar 23.497 in augustus. De toename deed zich voor in alle drie de sectoren: verpleeg- en verzorgingstehuizen, gehandicaptenzorg en langdurige geestelijke gezondheidszorg. De NZa weet geen oorzaak voor de plotselinge stijging.\n De afgelopen tijd is het ook veel gegaan over het ziekteverzuim in de zorg. De NZa ziet dat die nog steeds in alle sectoren hoog is, maar er is nu wel een lichte daling zichtbaar in het kortdurend ziekteverzuim.\n Het langdurige ziekteverzuim is gelijk gebleven. Het ziekteverzuim in combinatie met personeelstekorten baart de zorgtoezichthouder zorgen, met name in de langdurige zorg. Er zijn vaak genoeg bedden beschikbaar, maar er is niet altijd genoeg personeel om de langdurige zorg te leveren.' \n
This article falls under the categories: 'Binnenland' and contains the keywords: 'verpleeg- en verzorgingshuizen, Wet Langdurige Zorg, langdurige zorg, ziekenhuiszorg'.

Take a moment to understand the article. 
Remember, for a topic to be a main topic of the news article, it should be discussed in the majority of the article. 

Based on the information provided, determine if the main topic of this news article is the "Coronavirus and/or the COVID-19 pandemic" or another subject. 
Assign a value of 1 if the main topic is the "Coronavirus and/or the COVID-19 pandemic", and a value of 0 if it is another subject.

Output your results in JSON format with keys "article_id" and "about_covid", where the article ID and your answer are the values. 
Follow the example output format provided. Do not include any additional information or explanation. \n

Example Output (JSON format):
{{
    "article_id": "2351150",
    "about_covid": "1"
}}
""",
"""
{{
    "article_id": "2446472",
    "about_covid": "0"
}}
""")
]

In [44]:
# Look up article 2399596 among the non-COVID examples; the result is empty because that
# id is not in not_covid.
# NOTE(review): leftover exploration cell with an out-of-sequence execution count.
not_covid[not_covid['article_id'] == 2399596].Text.values

array([], dtype=object)

In [45]:
# Show the non-COVID example rows again.
not_covid.head()

Unnamed: 0,coder,article_id,title,owner,about_covid,topic_a,topic_b_old,topic_c,topic_d,topic_e,...,num_tokens,keyword_check,keyword_wordcount,wekdienst,Title_lower,uitzending,page_type,page_type2,page_type_final,text_length
0,Valentina Veltkamp,2401789,Vrijspraak voor man die vaccinatielocatie Den ...,NOS,0,0,0.0,0,0,0,...,397.0,vaccinatielocatie,1.0,False,vrijspraak voor man die vaccinatielocatie den ...,False,artikel,2401789-vrijspraak-voor-man-die-vaccinatieloca...,artikel,1378.0
1,Valentina Veltkamp,2446472,NZa: plotse toename wachtrijen in langdurige zorg,NOS,0,0,0.0,0,0,0,...,339.0,zorg,1.0,False,nza: plotse toename wachtrijen in langdurige zorg,False,artikel,2446472-nza-plotse-toename-wachtrijen-in-langd...,artikel,1053.0


# About Covid

## Zero-Shot Classifier About_Covid

In [40]:
# Render the zero-shot conversation with the chat template. The {placeholders} are still
# unfilled at this point; the printed prompt is used as the template text in the next cell.
zero_shot_prompt = zero_shot_prompt_messages(system_prompt, input_prompt, main_prompt)
print(zero_shot_prompt)

<s>GPT4 Correct System: 
As a helpful AI assistant, your task is to determine the main topic of news articles. Articles may focus on either the "Coronavirus and/or the COVID-19 pandemic" or some other topic.
A main topic is the overarching theme discussed in the majority of the news article. For an article to have the main topic of the "Coronavirus and/or the COVID-19 pandemic", it should predominantly discuss these subjects.
<|end_of_turn|>GPT4 Correct User: 
Read the following article with the ID {article_id}: {text} 

This article falls under the categories: {category} and contains the keywords: {keywords}.


Take a moment to understand the article. 
Remember, for a topic to be a main topic of the news article, it should be discussed in the majority of the article. 

Based on the information provided, determine if the main topic of this news article is the "Coronavirus and/or the COVID-19 pandemic" or another subject. 
Assign a value of 1 if the main topic is the "Coronavirus and/or

In [41]:
# Template with the four per-article variables, based on the rendered zero-shot prompt.
# NOTE(review): the name prompt_template is re-bound later for the one-shot prompt.
prompt_template = PromptTemplate(
    input_variables=["article_id", "text", "category", "keywords"],
    template=zero_shot_prompt
)

In [42]:
# Chain that fills the zero-shot template and sends it to the local pipeline.
# NOTE(review): output_key is one string that contains a comma, so it is presumably a single
# key named "article_id, about_covid" rather than two separate outputs — confirm.
chain_one = LLMChain(llm = llm, prompt = prompt_template, output_key="article_id, about_covid")
chain_one

LLMChain(prompt=PromptTemplate(input_variables=['article_id', 'category', 'keywords', 'text'], template='<s>GPT4 Correct System: \nAs a helpful AI assistant, your task is to determine the main topic of news articles. Articles may focus on either the "Coronavirus and/or the COVID-19 pandemic" or some other topic.\nA main topic is the overarching theme discussed in the majority of the news article. For an article to have the main topic of the "Coronavirus and/or the COVID-19 pandemic", it should predominantly discuss these subjects.\n<|end_of_turn|>GPT4 Correct User: \nRead the following article with the ID {article_id}: {text} \n\nThis article falls under the categories: {category} and contains the keywords: {keywords}.\n\n\nTake a moment to understand the article. \nRemember, for a topic to be a main topic of the news article, it should be discussed in the majority of the article. \n\nBased on the information provided, determine if the main topic of this news article is the "Coronaviru

In [43]:
# Raw zero-shot generations, one string per article; filled by the loop in the next cell.
generated_text_zeroshot = []

In [44]:
%%time
torch.manual_seed(0)

for index, row in df.iterrows():  
    article_id = row['article_id']
    text = row['Text']
    category = row['Category']
    keywords = row['Keywords']


    input_variables = {
            "article_id": article_id,
            "text": text,
            "category": category,
            "keywords": keywords
        }
    # Generate text using the chain
    generated_text = chain_one.run(input_variables)
    print(generated_text)
    generated_text_zeroshot.append(generated_text)    

 {
    "article_id": "2321843",
    "about_covid": "0"
}
 {
    "article_id": "2328614",
    "about_covid": "1"
}
 {
    "article_id": "2405920",
    "about_covid": "0"
}
 {
    "article_id": "2341446",
    "about_covid": "1"
}
 {
    "article_id": "2326455",
    "about_covid": "1"
}
 {
    "article_id": "2339047",
    "about_covid": "1"
}
 {
    "article_id": "2324064",
    "about_covid": "1"
}
 {
    "article_id": "2335085",
    "about_covid": "0"
}
 {
    "article_id": "2330601",
    "about_covid": "1"
}
 {
    "article_id": "2425483",
    "about_covid": "0"
}
 {
    "article_id": "2324557",
    "about_covid": "1"
}
 {
    "article_id": "2379607",
    "about_covid": "1"
}
 {
    "article_id": "2445063",
    "about_covid": "0"
}
 {
    "article_id": "2360263",
    "about_covid": "0"
}
 {
    "article_id": "2354947",
    "about_covid": "1"
}
 {
    "article_id": "2432503",
    "about_covid": "0"
}
 {
    "article_id": "2387434",
    "about_covid": "0"
}
 {
    "article_id": "2345541",

In [45]:
# First five raw zero-shot generations.
generated_text_zeroshot[0:5]

[' {\n    "article_id": "2321843",\n    "about_covid": "0"\n}',
 ' {\n    "article_id": "2328614",\n    "about_covid": "1"\n}',
 ' {\n    "article_id": "2405920",\n    "about_covid": "0"\n}',
 ' {\n    "article_id": "2341446",\n    "about_covid": "1"\n}',
 ' {\n    "article_id": "2326455",\n    "about_covid": "1"\n}']

In [46]:
# Parse the raw generations into a frame with article_id / about_covid columns.
# NOTE(review): despite its name, json_list is already a DataFrame, so the pd.DataFrame(...)
# call below only wraps it again.
json_list = regex_extract_to_dataframe(generated_text_zeroshot)

print(len(json_list))
df_zeroshot = pd.DataFrame(json_list)
print(len(df_zeroshot))
df_zeroshot.head()

659
659


Unnamed: 0,article_id,about_covid
0,2321843,0
1,2328614,1
2,2405920,0
3,2341446,1
4,2326455,1


In [47]:
# Distribution of the predicted labels.
df_zeroshot['about_covid'].value_counts()

about_covid
1    379
0    280
Name: count, dtype: int64

In [48]:
# Mark the model output as a prediction (about_covid -> about_covid_pred) and store the
# id and the prediction as integers.
df_zeroshot = (
    df_zeroshot
    .rename(columns={'about_covid': 'about_covid_pred'})
    .astype({'article_id': int, 'about_covid_pred': int})
)

df_zeroshot.head()

Unnamed: 0,article_id,about_covid_pred
0,2321843,0
1,2328614,1
2,2405920,0
3,2341446,1
4,2326455,1


In [49]:
# Ground truth: the human-coded about_covid label for every article. An explicit copy is
# taken because a later cell assigns columns on df_coded; without it those assignments
# would target a column selection of df.
df_coded = df[['article_id', 'about_covid']].copy()
df_coded.head()

Unnamed: 0,article_id,about_covid
0,2321843,1
1,2328614,1
2,2405920,0
3,2341446,1
4,2326455,1


In [50]:
# Attach the zero-shot prediction to each human-coded article (left join on the id).
df_zeroshot_merged = df_coded.merge(df_zeroshot, on="article_id", how='left')
print(len(df_zeroshot_merged))

# Drop rows with missing values, then store the prediction as an integer.
df_zeroshot_merged = df_zeroshot_merged.dropna().astype({'about_covid_pred': int})

659


In [51]:
# Confusion table: human labels in the rows, zero-shot predictions in the columns.
pd.crosstab(df_zeroshot_merged['about_covid'], df_zeroshot_merged['about_covid_pred'])

about_covid_pred,0,1
about_covid,Unnamed: 1_level_1,Unnamed: 2_level_1
0,262,31
1,18,348


In [52]:
# Per-class precision/recall/F1 and Cohen's kappa, with the human labels as reference.
print(classification_report(df_zeroshot_merged['about_covid'], df_zeroshot_merged['about_covid_pred']))
print("Cohen's Kappa:", cohen_kappa_score(df_zeroshot_merged['about_covid'], df_zeroshot_merged['about_covid_pred']))

              precision    recall  f1-score   support

           0       0.94      0.89      0.91       293
           1       0.92      0.95      0.93       366

    accuracy                           0.93       659
   macro avg       0.93      0.92      0.92       659
weighted avg       0.93      0.93      0.93       659

Cohen's Kappa: 0.8487732230584423


In [53]:
# Write the merged human/model labels to the results folder.
# NOTE(review): index=False is not passed, so the frame's index is written as well.
df_zeroshot_merged.to_csv('data/nos_analysis/results/about_covid_starling_zeroshot_v4.csv', sep = ';', encoding = 'utf-8', quoting=csv.QUOTE_NONNUMERIC)

In [54]:
# Reshape to long format for Krippendorff's alpha: one row per (article, coder), with the
# human and the model treated as two coders of the same about_covid variable.
# .assign and .rename return new frames, so no column is assigned into a selection of
# df_zeroshot_merged and nothing is modified in place.
humancoded = df_zeroshot_merged[['article_id', 'about_covid']].assign(coder='human')
machinecoded = (
    df_zeroshot_merged[['article_id', 'about_covid_pred']]
    .rename(columns={'about_covid_pred': 'about_covid'})
    .assign(coder='machine')
)

df_zeroshot_merged_krip = pd.concat([humancoded, machinecoded])

In [55]:
# Krippendorff's alpha between the human and the zero-shot model: articles are the
# units (experiment_col), 'coder' identifies the annotator, about_covid is the rating.
zeroshot_krip = simpledorff.calculate_krippendorffs_alpha_for_df(df_zeroshot_merged_krip,experiment_col='article_id',
                                                 annotator_col='coder',
                                                 class_col='about_covid')

print("Krippendorff's Alpha:", zeroshot_krip)

Krippendorff's Alpha: 0.8488281387258864


## One-Shot Classifier About_Covid

In [60]:
# Render the one-shot conversation: system turn, the single demonstration, then the
# still-unfilled article template as the final user turn.
one_shot_prompt = few_shot_prompt_messages(system_prompt, input_prompt, main_prompt, one_shot_example)
print(one_shot_prompt)

<s>GPT4 Correct System: 
As a helpful AI assistant, your task is to determine the main topic of news articles. Articles may focus on either the "Coronavirus and/or the COVID-19 pandemic" or some other topic.
A main topic is the overarching theme discussed in the majority of the news article. For an article to have the main topic of the "Coronavirus and/or the COVID-19 pandemic", it should predominantly discuss these subjects.
<|end_of_turn|>GPT4 Correct User: 
Read the following article with the ID 2327099: 'Sinds gisteren zijn 155 nieuwe mensen positief getest op het coronavirus. Dat brengt het totaal op 959 patiënten, meldt het RIVM.
 Er zijn twee nieuwe mensen aan de gevolgen van het virus overleden. Het gaat om ouderen met onderliggend lijden. In totaal staat het dodental in Nederland nu op twaalf.
 Gisteren werden 190 nieuwe patiënten gemeld. Volgens het RIVM is de daling van vandaag het gevolg van het veranderde testbeleid.
 "Sinds 12 maart worden mensen met milde klachten niet m

In [61]:
# Template with the four per-article variables, based on the rendered one-shot prompt.
# NOTE(review): this re-binds prompt_template, which previously held the zero-shot template.
prompt_template = PromptTemplate(
    input_variables=["article_id", "text", "category", "keywords"],
    template=one_shot_prompt
)

In [62]:
# Chain that fills the one-shot template and sends it to the local pipeline.
# NOTE(review): output_key is one string that contains a comma, so it is presumably a single
# key named "article_id, about_covid" rather than two separate outputs — confirm.
chain_2 = LLMChain(llm = llm, prompt = prompt_template, output_key="article_id, about_covid")
chain_2         

LLMChain(prompt=PromptTemplate(input_variables=['article_id', 'category', 'keywords', 'text'], template='<s>GPT4 Correct System: \nAs a helpful AI assistant, your task is to determine the main topic of news articles. Articles may focus on either the "Coronavirus and/or the COVID-19 pandemic" or some other topic.\nA main topic is the overarching theme discussed in the majority of the news article. For an article to have the main topic of the "Coronavirus and/or the COVID-19 pandemic", it should predominantly discuss these subjects.\n<|end_of_turn|>GPT4 Correct User: \nRead the following article with the ID 2327099: \'Sinds gisteren zijn 155 nieuwe mensen positief getest op het coronavirus. Dat brengt het totaal op 959 patiënten, meldt het RIVM.\n Er zijn twee nieuwe mensen aan de gevolgen van het virus overleden. Het gaat om ouderen met onderliggend lijden. In totaal staat het dodental in Nederland nu op twaalf.\n Gisteren werden 190 nieuwe patiënten gemeld. Volgens het RIVM is de dalin

In [63]:
# Raw one-shot generations, one string per article; filled by the loop in the next cell.
generated_text_oneshot = []

In [64]:
%%time
torch.manual_seed(0)
for index, row in df.iterrows():  
    article_id = row['article_id']
    text = row['Text']
    category = row['Category']
    keywords = row['Keywords']


    input_variables = {
            "article_id": article_id,
            "text": text,
            "category": category,
            "keywords": keywords
        }
    # Generate text using the chain
    generated_text = chain_2.run(input_variables)
    generated_text_oneshot.append(generated_text)    


CPU times: user 19min 34s, sys: 4min 40s, total: 24min 15s
Wall time: 24min 13s


In [81]:
# First five raw one-shot generations.
generated_text_oneshot[0:5]

[' {\n    "article_id": "2321843",\n    "about_covid": "1"\n}',
 ' {\n    "article_id": "2328614",\n    "about_covid": "1"\n}',
 ' {\n    "article_id": "2405920",\n    "about_covid": "0"\n}',
 ' {\n    "article_id": "2341446",\n    "about_covid": "1"\n}',
 ' {\n    "article_id": "2326455",\n    "about_covid": "1"\n}']

In [82]:
# Parse the raw one-shot generations into a frame with article_id / about_covid columns.
# NOTE(review): json_list is re-used from the zero-shot section and is already a DataFrame,
# so the pd.DataFrame(...) call below only wraps it again.
json_list = regex_extract_to_dataframe(generated_text_oneshot)

print(len(json_list))
df_oneshot = pd.DataFrame(json_list)
print(len(df_oneshot))
df_oneshot.head()

659
659


Unnamed: 0,article_id,about_covid
0,2321843,1
1,2328614,1
2,2405920,0
3,2341446,1
4,2326455,1


In [83]:
# Distribution of the predicted labels.
df_oneshot['about_covid'].value_counts()

about_covid
1    353
0    306
Name: count, dtype: int64

In [84]:
# Mark the model output as a prediction: rename about_covid to about_covid_pred.
df_oneshot = df_oneshot.rename(columns={'about_covid': 'about_covid_pred'})

df_oneshot.head()

Unnamed: 0,article_id,about_covid_pred
0,2321843,1
1,2328614,1
2,2405920,0
3,2341446,1
4,2326455,1


In [85]:
# Cast the id and label columns of both frames to int, in place.
for _int_col in ('article_id', 'about_covid_pred'):
    df_oneshot[_int_col] = df_oneshot[_int_col].astype(int)

# NOTE(review): df_coded must already exist when this cell runs, and the next
# cell rebinds df_coded from df -- confirm whether these two casts are still
# needed (they only persist if df_coded shares data with df).
for _int_col in ('article_id', 'about_covid'):
    df_coded[_int_col] = df_coded[_int_col].astype(int)


In [86]:
# Reference labels: article id plus the about_covid code taken from df.
df_coded = df[['article_id', 'about_covid']]

# Left-join the one-shot predictions onto the reference labels by article_id,
# so every reference row is kept even if it has no matching prediction.
df_oneshot_merged = df_coded.merge(df_oneshot, how='left', on="article_id")
print(len(df_oneshot_merged))
df_oneshot_merged.head()

659


Unnamed: 0,article_id,about_covid,about_covid_pred
0,2321843,1,1
1,2328614,1,1
2,2405920,0,0
3,2341446,1,1
4,2326455,1,1


In [87]:
# Confusion table: reference labels as rows, one-shot predictions as columns.
pd.crosstab(
    index=df_oneshot_merged['about_covid'],
    columns=df_oneshot_merged['about_covid_pred'],
)

about_covid_pred,0,1
about_covid,Unnamed: 1_level_1,Unnamed: 2_level_1
0,272,21
1,34,332


In [88]:
# Per-class precision/recall/F1 and Cohen's kappa for the one-shot predictions.
oneshot_y_true = df_oneshot_merged['about_covid']
oneshot_y_pred = df_oneshot_merged['about_covid_pred']

print(classification_report(oneshot_y_true, oneshot_y_pred))
print("Cohen's Kappa:", cohen_kappa_score(oneshot_y_true, oneshot_y_pred))

              precision    recall  f1-score   support

           0       0.89      0.93      0.91       293
           1       0.94      0.91      0.92       366

    accuracy                           0.92       659
   macro avg       0.91      0.92      0.92       659
weighted avg       0.92      0.92      0.92       659

Cohen's Kappa: 0.8317511895091099


In [89]:
# Reshape to long format for Krippendorff's alpha: one row per
# (article_id, coder) pair with the assigned label in `about_covid`.
# Built with non-mutating .assign/.rename so no column is written into a
# frame that was sliced out of df_oneshot_merged.
humancoded = (
    df_oneshot_merged[['article_id', 'about_covid']]
    .assign(coder='human')
)
machinecoded = (
    df_oneshot_merged[['article_id', 'about_covid_pred']]
    .rename(columns={'about_covid_pred': 'about_covid'})
    .assign(coder='machine')
)

# Stack both coders' rows into a single frame.
df_oneshot_merged_krip = pd.concat([humancoded, machinecoded])

In [90]:
# Krippendorff's alpha between the 'human' and 'machine' coders:
# units are articles, annotators are coders, classes are the about_covid labels.
oneshot_krip = simpledorff.calculate_krippendorffs_alpha_for_df(
    df_oneshot_merged_krip,
    experiment_col='article_id',
    annotator_col='coder',
    class_col='about_covid',
)

print("Krippendorff's Alpha:", oneshot_krip)

Krippendorff's Alpha: 0.8318128731009726


In [91]:
# Save the merged reference labels and one-shot predictions as a
# semicolon-separated CSV; every non-numeric field is quoted.
df_oneshot_merged.to_csv(
    'data/nos_analysis/results/about_covid_starling_oneshot_v4.csv',
    sep=';',
    encoding='utf-8',
    quoting=csv.QUOTE_NONNUMERIC,
)

## Few-Shot Classifier About_Covid

In [92]:
# Assemble the few-shot prompt from its parts with the notebook's
# few_shot_prompt_messages helper, then print it for inspection.
few_shot_prompt = few_shot_prompt_messages(
    system_prompt,
    input_prompt,
    main_prompt,
    few_shot_examples,
)
print(few_shot_prompt)

<s>GPT4 Correct System: 
As a helpful AI assistant, your task is to determine the main topic of news articles. Articles may focus on either the "Coronavirus and/or the COVID-19 pandemic" or some other topic.
A main topic is the overarching theme discussed in the majority of the news article. For an article to have the main topic of the "Coronavirus and/or the COVID-19 pandemic", it should predominantly discuss these subjects.
<|end_of_turn|>GPT4 Correct User: 
Read the following article with the ID 2327099: 'Sinds gisteren zijn 155 nieuwe mensen positief getest op het coronavirus. Dat brengt het totaal op 959 patiënten, meldt het RIVM.
 Er zijn twee nieuwe mensen aan de gevolgen van het virus overleden. Het gaat om ouderen met onderliggend lijden. In totaal staat het dodental in Nederland nu op twaalf.
 Gisteren werden 190 nieuwe patiënten gemeld. Volgens het RIVM is de daling van vandaag het gevolg van het veranderde testbeleid.
 "Sinds 12 maart worden mensen met milde klachten niet m

In [93]:
# Wrap the few-shot prompt text in a template with four per-article variables.
prompt_template = PromptTemplate(
    template=few_shot_prompt,
    input_variables=["article_id", "text", "category", "keywords"],
)

In [94]:
# Chain that feeds the few-shot prompt template to the llm.
# NOTE(review): output_key is one string that contains a comma, not two
# separate keys -- confirm this is intended.
chain_3 = LLMChain(
    llm=llm,
    prompt=prompt_template,
    output_key="article_id, about_covid",
)
chain_3

LLMChain(prompt=PromptTemplate(input_variables=['article_id', 'category', 'keywords', 'text'], template='<s>GPT4 Correct System: \nAs a helpful AI assistant, your task is to determine the main topic of news articles. Articles may focus on either the "Coronavirus and/or the COVID-19 pandemic" or some other topic.\nA main topic is the overarching theme discussed in the majority of the news article. For an article to have the main topic of the "Coronavirus and/or the COVID-19 pandemic", it should predominantly discuss these subjects.\n<|end_of_turn|>GPT4 Correct User: \nRead the following article with the ID 2327099: \'Sinds gisteren zijn 155 nieuwe mensen positief getest op het coronavirus. Dat brengt het totaal op 959 patiënten, meldt het RIVM.\n Er zijn twee nieuwe mensen aan de gevolgen van het virus overleden. Het gaat om ouderen met onderliggend lijden. In totaal staat het dodental in Nederland nu op twaalf.\n Gisteren werden 190 nieuwe patiënten gemeld. Volgens het RIVM is de dalin

In [95]:
# Collects the raw text returned for each article by the few-shot run.
generated_text_fewshot = list()

In [96]:
%%time
torch.manual_seed(0)
for index, row in df.iterrows():  
    article_id = row['article_id']
    text = row['Text']
    category = row['Category']
    keywords = row['Keywords']


    input_variables = {
            "article_id": article_id,
            "text": text,
            "category": category,
            "keywords": keywords
        }
    # Generate text using the chain
    generated_text = chain_3.run(input_variables)
    generated_text_fewshot.append(generated_text)    


CPU times: user 20min 21s, sys: 6min 9s, total: 26min 30s
Wall time: 26min 29s


In [97]:
# Preview the first five raw generations to eyeball their format.
generated_text_fewshot[0:5]

[' {\n    "article_id": "2321843",\n    "about_covid": "1"\n}',
 ' {\n    "article_id": "2328614",\n    "about_covid": "1"\n}',
 ' {\n    "article_id": "2405920",\n    "about_covid": "0"\n}',
 ' {\n    "article_id": "2341446",\n    "about_covid": "1"\n}',
 ' {\n    "article_id": "2326455",\n    "about_covid": "1"\n}']

In [98]:
# Turn the raw generations into records with regex_extract_to_dataframe
# (helper defined elsewhere in this notebook) and load them into a frame.
json_list = regex_extract_to_dataframe(generated_text_fewshot)
df_fewshot = pd.DataFrame(json_list)

# Print both sizes, one per line, to compare extracted records with frame rows.
print(len(json_list), len(df_fewshot), sep='\n')
df_fewshot.head()

659
659


Unnamed: 0,article_id,about_covid
0,2321843,1
1,2328614,1
2,2405920,0
3,2341446,1
4,2326455,1


In [99]:
# Distribution of the predicted about_covid labels.
df_fewshot['about_covid'].value_counts()

about_covid
1    384
0    275
Name: count, dtype: int64

In [100]:
# Mark the model's label as a prediction so it cannot collide with the
# reference column of the same name when the frames are merged later.
df_fewshot = df_fewshot.rename({'about_covid': 'about_covid_pred'}, axis='columns')

df_fewshot.head()

Unnamed: 0,article_id,about_covid_pred
0,2321843,1
1,2328614,1
2,2405920,0
3,2341446,1
4,2326455,1


In [101]:
# Reference labels: article id plus the about_covid code taken from df.
df_coded = df[['article_id', 'about_covid']]

# Left-join the few-shot predictions onto the reference labels by article_id,
# so every reference row is kept even if it has no matching prediction.
df_fewshot_merged = df_coded.merge(df_fewshot, how='left', on="article_id")
print(len(df_fewshot_merged))
df_fewshot_merged.head()

659


Unnamed: 0,article_id,about_covid,about_covid_pred
0,2321843,1,1
1,2328614,1,1
2,2405920,0,0
3,2341446,1,1
4,2326455,1,1


In [102]:
# After the left merge, articles without a matching prediction have NaN in
# about_covid_pred. Report how many there are instead of discarding the
# count, because the metrics below are computed without these rows.
n_missing_fewshot_pred = df_fewshot_merged['about_covid_pred'].isnull().sum()
print(f"Dropping {n_missing_fewshot_pred} article(s) without a prediction")

# Drop those rows and cast the remaining predictions to int, building a new
# frame rather than assigning into a filtered slice.
df_fewshot_merged = (
    df_fewshot_merged
    .dropna(subset=['about_covid_pred'])
    .astype({'about_covid_pred': int})
)

In [103]:
# Confusion table: reference labels as rows, few-shot predictions as columns.
pd.crosstab(
    index=df_fewshot_merged['about_covid'],
    columns=df_fewshot_merged['about_covid_pred'],
)

about_covid_pred,0,1
about_covid,Unnamed: 1_level_1,Unnamed: 2_level_1
0,255,38
1,20,346


In [104]:
# Per-class precision/recall/F1 and Cohen's kappa for the few-shot predictions.
fewshot_y_true = df_fewshot_merged['about_covid']
fewshot_y_pred = df_fewshot_merged['about_covid_pred']

print(classification_report(fewshot_y_true, fewshot_y_pred))
print("Cohen's Kappa:", cohen_kappa_score(fewshot_y_true, fewshot_y_pred))

              precision    recall  f1-score   support

           0       0.93      0.87      0.90       293
           1       0.90      0.95      0.92       366

    accuracy                           0.91       659
   macro avg       0.91      0.91      0.91       659
weighted avg       0.91      0.91      0.91       659

Cohen's Kappa: 0.8206903669509575


In [105]:
# Reshape to long format for Krippendorff's alpha: one row per
# (article_id, coder) pair with the assigned label in `about_covid`.
# Built with non-mutating .assign/.rename so no column is written into a
# frame that was sliced out of df_fewshot_merged.
humancoded = (
    df_fewshot_merged[['article_id', 'about_covid']]
    .assign(coder='human')
)
machinecoded = (
    df_fewshot_merged[['article_id', 'about_covid_pred']]
    .rename(columns={'about_covid_pred': 'about_covid'})
    .assign(coder='machine')
)

# Stack both coders' rows into a single frame.
df_fewshot_merged_krip = pd.concat([humancoded, machinecoded])

In [106]:
# Krippendorff's alpha between the 'human' and 'machine' coders:
# units are articles, annotators are coders, classes are the about_covid labels.
fewshot_krip = simpledorff.calculate_krippendorffs_alpha_for_df(
    df_fewshot_merged_krip,
    experiment_col='article_id',
    annotator_col='coder',
    class_col='about_covid',
)

print("Krippendorff's Alpha:", fewshot_krip)

Krippendorff's Alpha: 0.8206901408450704


In [107]:
# Save the merged reference labels and few-shot predictions as a
# semicolon-separated CSV; every non-numeric field is quoted.
df_fewshot_merged.to_csv(
    'data/nos_analysis/results/about_covid_starling_fewshot_v4.csv',
    sep=';',
    encoding='utf-8',
    quoting=csv.QUOTE_NONNUMERIC,
)