In [1]:
import json
import torch
import random
import numpy as np
import pandas as pd

from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Seed every RNG this notebook can touch. Seeding only `random` leaves numpy
# and torch unseeded, so any model whose generation config samples would give
# different outputs on each run.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Quantization settings handed to `from_pretrained`: 4-bit weights using the
# NF4 quantization type, with double quantization enabled.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
)

# Prefer the GPU when one is visible, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
# Hugging Face Hub ids of the chat models to run, in evaluation order.
models = [
    "mistralai/Mistral-7B-Instruct-v0.2",
    "CohereForAI/c4ai-command-r-v01-4bit",
    "meta-llama/Llama-2-7b-chat-hf",
    "databricks/dbrx-instruct",
]

In [5]:
# Codebook of the 15 generic frames, written as "frame_id:frame_name - description"
# entries inside braces. This is a plain string (not a Python dict) that gets
# interpolated verbatim into the prompt.
# NOTE: the trailing backslashes continue the string literal across lines, so the
# leading indentation of every continuation line is part of the string's content.
# Do not re-indent or reflow these lines unless a prompt change is intended.
frames = "{1:Economic - costs, benefits, or other financial implications,\
    2:Capacity and resources - availability of physical, human, or financial resources, and capacity of current systems, \
    3:Morality - religious or ethical implications,\
    4:Fairness and equality - balance or distribution of rights, responsibilities, and resources,\
    5:Legality, constitutionality and jurisprudence - rights, freedoms, and authority of individuals, corporations, and government,\
    6:Policy prescription and evaluation - discussion of specific policies aimed at addressing problems,\
    7:Crime and punishment - effectiveness and implications of laws and their enforcement,\
    8:Security and defense - threats to welfare of the individual, community, or nation,\
    9:Health and safety - health care, sanitation, public safety,\
    10:Quality of life - threats and opportunities for the individual’s wealth, happiness, and well-being,\
    11:Cultural identity - traditions, customs, or values of a social group in relation to a policy issue,\
    12:Public opinion - attitudes and opinions of the general public, including polling and demographics,\
    13:Political - considerations related to politics and politicians, including lobbying, elections, and attempts to sway voters,\
    14:External regulation and reputation - international reputation or foreign policy of the U.S,\
    15:Other - any coherent group of frames not covered by the above categories}"

In [6]:
# System prompt: sets the annotator persona, gives the working definition of
# framing, embeds the `frames` codebook, and asks for a JSON answer with the
# fields 'frame_id', 'frame_name' and 'reasoning'.
# NOTE: this is a single f-string continued with trailing backslashes, so the
# indentation of each continuation line is included in the prompt text.
sys_prompt = f"You are a journalism scholar doing framing analysis of news articles.\
    Framing is defined as selecting and highlighting some facets of events or issues, and making connections among them so as to promote a particular interpretation, evaluation, and/or solution.\
    A dictionary of generic frames with a frame_id, frame_name and its description is: {frames}.\
    Your task is to code articles for one of the listed frames and provide reasoning for it. Format your output in a json format with fields 'frame_id', 'frame_name', and 'reasoning'. "
    
# Instruction placed in the user turn, ahead of the article.
user_prompt = "Output the generic frame and the reasoning for the article below.\n"

# Chat skeleton in the role/content format used by tokenizer chat templates:
# the system prompt first, then the user instruction.
messages = [
    dict(role="system", content=sys_prompt),
    dict(role="user", content=user_prompt),
]

In [None]:
def _extract_json_object(generated: str) -> dict:
    """Parse the JSON object embedded in a model reply.

    Chat models frequently surround the requested JSON with extra prose or code
    fences, so only the span from the first '{' to the last '}' is parsed.

    Args:
        generated: Decoded model completion (without the prompt).

    Returns:
        The parsed JSON object.

    Raises:
        ValueError: If no brace-delimited span exists or the span is not valid
            JSON (``json.JSONDecodeError`` is a subclass of ``ValueError``).
    """
    start, end = generated.find("{"), generated.rfind("}")
    if start == -1 or end <= start:
        raise ValueError("no JSON object found in model output")
    return json.loads(generated[start:end + 1])


def _to_builtin(value):
    """``json.dump`` fallback: unwrap numpy scalars coming out of the DataFrame."""
    if isinstance(value, np.generic):
        return value.item()
    raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")


# The dataset is the same for every model, so read it once instead of once per model.
data_df = pd.read_csv("../data/processed/mfc_consolidated.csv")

for model_name in models:

    print("----------------------")
    print(model_name)

    tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir="/projects/copenlu/data/models/")
    tokenizer.pad_token = tokenizer.eos_token

    # Quantization is fully described by `bnb_config`; also passing
    # `load_in_4bit=True` is redundant and is rejected by transformers releases
    # that validate the two arguments against each other.
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        torch_dtype=torch.bfloat16,
        device_map="auto",
        trust_remote_code=True,
        cache_dir="/projects/copenlu/data/models/",
    )

    for index, row in data_df.iterrows():
        try:
            text = row['clean_text']

            # Build a fresh conversation per article instead of mutating the
            # shared `messages` list. The Mistral branch folds everything into a
            # single user turn; the other models get a separate system turn.
            # Both variants carry the user instruction followed by the article.
            if 'mistral' in model_name:
                chat = [{"role": "user", "content": sys_prompt + user_prompt + text}]
            else:
                chat = [
                    {"role": "system", "content": sys_prompt},
                    {"role": "user", "content": user_prompt + text},
                ]

            inputs = tokenizer.apply_chat_template(
                chat, tokenize=True, add_generation_prompt=True, return_tensors="pt"
            ).to(model.device)  # token ids must live on the same device as the model

            outputs = model.generate(inputs, max_new_tokens=5000, pad_token_id=tokenizer.pad_token_id)

            # `generate` returns prompt + completion; keep only the newly
            # generated tokens so the prompt (which itself contains braces from
            # the frame codebook) cannot pollute JSON extraction.
            generated_text = tokenizer.decode(outputs[0][inputs.shape[-1]:], skip_special_tokens=True)

            article_json = _extract_json_object(generated_text)
            article_json["model"] = model_name  # all models append to the same file
            article_json["text"] = text
            article_json["label"] = row['label']
            article_json["topic"] = row['topic']
            article_json["id"] = index

            # NOTE(review): the input is read from "../data/..." but results go to
            # "./data/..." — confirm the output directory is the intended one.
            with open("./data/annotated/mfc_annotated.json", "a") as f:
                json.dump(article_json, f, default=_to_builtin)
                f.write("\n")

        except Exception as e:
            # Best effort: report the failing article and move on to the next one.
            print("Skipped- ", index, e)
            continue

    # Release this model before loading the next one so their weights do not
    # accumulate in GPU memory.
    del model, tokenizer
    if torch.cuda.is_available():
        torch.cuda.empty_cache()