# Imports and Downloads

In [1]:
# %pip (rather than !pip) installs into the environment of the running kernel; -q keeps the log short.
# NOTE(review): consider pinning the version (regex==X.Y.Z) for reproducibility.
%pip install -q regex



In [2]:
# Mount Google Drive at /content/drive; the project folder used below lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [70]:
import os
import sys
from pathlib import Path
import json
import pandas as pd
import numpy as np
from transformers import pipeline
import torch
import regex
import re

# Project root on the mounted Drive. It is added to sys.path so that the project's own
# packages (data_tools, settings, evaluation) can be imported by the cells below.
ROOT_FOLDER = Path('/content/drive/MyDrive/habrok')
# Guarded so that re-running this cell does not keep appending duplicate entries.
if str(ROOT_FOLDER) not in sys.path:
    sys.path.append(str(ROOT_FOLDER))

In [4]:
from data_tools.dataset_loaders import load_dataset_classification, load_span_annotations_from_json, load_texts_and_ids_from_json
from settings import TRAIN_DATASET_EN, TEST_DATASET_EN
from data_tools.span_data_definitions import SPAN_LABELS_OFFICIAL, NONE_LABEL
from data_tools.spacy_utils import SpacyURLTokenizer, create_spacy_model

In [18]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import json
from transformers import LlamaForCausalLM, LlamaTokenizer
from evaluation.oppositional_evaluator import evaluate_task1

In [6]:
# Fetch the value stored under 'hf_token' through google.colab.userdata instead of hard-coding it;
# it is passed as the auth token when the tokenizer and model are loaded below.
from google.colab import userdata
hf_token = userdata.get('hf_token')

In [28]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
# get_reponse moves the tokenized prompt to this device before generating.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [24]:
# Checkpoint used for all prompting experiments in this notebook.
model_name = "meta-llama/Llama-2-7b-chat-hf"
# `load_in_8bit` is a model-quantisation option and has no meaning for a tokenizer, so it is
# not passed here.
# NOTE(review): if 8-bit weights were intended, the flag belongs on the model call below
# (and would replace the float16 setting) — confirm before changing, it affects results.
tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token=hf_token)
# device_map="auto" lets the library place the weights; they are loaded in half precision.
# NOTE(review): newer transformers releases prefer `token=` over `use_auth_token=` — confirm
# against the installed version.
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", torch_dtype=torch.float16, use_auth_token=hf_token)



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



# Prompting
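Run the cells in this section sequentially. Note that the helper functions (`get_reponse`, `truncate_text`, `get_pred`) are defined under "Few Shot No Feature Hints" and must be executed before any cell that calls them.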

## Test

In [61]:
'''T: The Taliban Has No Plan to Stop Coronavirus – and No One Is Asking It for One , Cat: CRITICAL
T: WATCH : Jen Psaki Mocks Free At - Home COVID Testing , Already the Norm in UK , Cat: CRITICAL
T: Jeff Bezos Takes Private Jet to COP26 for Climate Change Chat with Prince Charles , Cat: CRITICAL
T: San Jose Mayor : W.H. Is Wrong That Pandemic Is Root Cause of Smash and Grab Crime , Cat: CRITICAL
T: A man in New Zealand was SHOT - yet his death was written down as COVID - 19 ! ! ! , Cat: CRITICAL
T: Rand Paul ' Stars ' in Chinese Rap Video Accusing U.S. of Starting Coronavirus Pandemic , Cat: CONSPIRACY
T: Why have so many African leaders died since the beginning of the pseudo - pandemic " ? ? ? , Cat: CONSPIRACY
T: The Deepstate Alliances . This is just one , but it is one that led to [ many ] others . [ Mooooooon ] , Cat: CONSPIRACY
T: Notice what the eagle has in it 's talons . The tentacles of an Octopus . Symbolism for the Deepstate in everything . , Cat: CONSPIRACY
T: Exclusive — Jim Breuer on Refusing Vaccine Mandate Comedy Clubs : ' I Will Never ' Be ' Controlled ' by ' Evil Agenda ' , Cat: CONSPIRACY'''

# The 5 shortest CRITICAL and the 5 shortest CONSPIRACY texts; some of them are reused as few-shot examples in the prompts below.

'T: The Taliban Has No Plan to Stop Coronavirus – and No One Is Asking It for One , Cat: CRITICAL\nT: WATCH : Jen Psaki Mocks Free At - Home COVID Testing , Already the Norm in UK , Cat: CRITICAL\nT: Jeff Bezos Takes Private Jet to COP26 for Climate Change Chat with Prince Charles , Cat: CRITICAL\nT: San Jose Mayor : W.H. Is Wrong That Pandemic Is Root Cause of Smash and Grab Crime , Cat: CRITICAL\nT: A man in New Zealand was SHOT - yet his death was written down as COVID - 19 ! ! ! , Cat: CRITICAL\nT: Rand Paul \' Stars \' in Chinese Rap Video Accusing U.S. of Starting Coronavirus Pandemic , Cat: CONSPIRACY\nT: Why have so many African leaders died since the beginning of the pseudo - pandemic " ? ? ? , Cat: CONSPIRACY\nT: The Deepstate Alliances . This is just one , but it is one that led to [ many ] others . [ Mooooooon ] , Cat: CONSPIRACY\nT: Notice what the eagle has in it \'s talons . The tentacles of an Octopus . Symbolism for the Deepstate in everything . , Cat: CONSPIRACY\nT: E

In [66]:
# Only for testing: a hand-written prompt to check the few-shot layout before the evaluation runs.
# It contains a system instruction, three labelled examples and one final unlabelled text
# for the model to classify.
prompt = '''
[INST] <<SYS>>
Classify the text into conspiracy or critical
<</SYS>>

The Taliban Has No Plan to Stop Coronavirus – and No One Is Asking It for One [/INST]
critical
[INST] Rand Paul  Stars  in Chinese Rap Video Accusing U.S. of Starting Coronavirus Pandemic [/INST]
conspiracy
[INST] WATCH : Jen Psaki Mocks Free At - Home COVID Testing , Already the Norm in UK [/INST]
critical
[INST] Exclusive — Jim Breuer on Refusing Vaccine Mandate Comedy Clubs : ' I Will Never ' Be ' Controlled ' by ' Evil Agenda  [/INST]
'''

In [67]:
# NOTE(review): get_reponse is defined in a later cell ("Few Shot No Feature Hints");
# on a fresh kernel that cell has to be run before this one.
print(get_reponse(prompt, max_new_tokens=200))


[INST] <<SYS>>
Classify the text into conspiracy or critical
<</SYS>>

The Taliban Has No Plan to Stop Coronavirus – and No One Is Asking It for One [/INST]
critical
[INST] Rand Paul ' Stars ' in Chinese Rap Video Accusing U.S. of Starting Coronavirus Pandemic [/INST]
conspiracy
[INST] WATCH : Jen Psaki Mocks Free At - Home COVID Testing , Already the Norm in UK [/INST]
critical
[INST] Exclusive — Jim Breuer on Refusing Vaccine Mandate Comedy Clubs : ' I Will Never ' Be ' Controlled ' by ' Evil Agenda  [/INST]
conspiracy


## Few Shot No Feature Hints

In [93]:
def get_prompt(text, max_model_tokens=4096, reserved_prompt_tokens=1000):
  """Build the baseline few-shot classification prompt (no feature hint) for one text.

  The text is first shortened with ``truncate_text`` (using the notebook-level
  ``tokenizer``) and then inserted as the final, unlabelled ``[INST]`` turn after a
  system instruction and three labelled examples.

  Args:
    text: The text to classify.
    max_model_tokens: Token limit handed to ``truncate_text``. Defaults to 4096, the
      value that was previously hard-coded.
    reserved_prompt_tokens: Number of tokens held back from that limit for the rest of
      the prompt. Defaults to 1000, the value that was previously hard-coded.

  Returns:
    The complete prompt string.
  """
  truncated_text = truncate_text(text, tokenizer, max_model_tokens, reserved_prompt_tokens)

  # NOTE(review): every line of this literal carries the two-space code indent, so the
  # indent is part of the prompt sent to the model. It is kept as-is because the recorded
  # results were produced with it — confirm whether it is intended.
  prompt = f'''
  [INST] <<SYS>>
  Classify the text into conspiracy or critical. Only label.
  <</SYS>>

  The Taliban Has No Plan to Stop Coronavirus – and No One Is Asking It for One [/INST]
  critical
  [INST] Rand Paul  Stars  in Chinese Rap Video Accusing U.S. of Starting Coronavirus Pandemic [/INST]
  conspiracy
  [INST] WATCH : Jen Psaki Mocks Free At - Home COVID Testing , Already the Norm in UK [/INST]
  critical
  [INST] {truncated_text} [/INST]
  '''
  return prompt

In [130]:
def get_reponse(prompt, max_new_tokens=50, temperature=0.07):
    """Generate a completion for ``prompt`` with the notebook-level model and tokenizer.

    Args:
        prompt: Full prompt text to tokenize and feed to the model.
        max_new_tokens: Upper bound on the number of newly generated tokens.
        temperature: Temperature forwarded to ``model.generate``. Defaults to 0.07, the
            value that was previously hard-coded.

    Returns:
        The first generated sequence decoded to text with special tokens removed.
        NOTE(review): judging by the printed test output in this notebook, the decoded
        text includes the prompt followed by the model's continuation — confirm.
    """
    # NOTE(review): temperature only influences the output when sampling is enabled in the
    # model's generation settings; in that case runs are not reproducible without a seed.
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens, temperature=temperature)
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response

In [89]:
def truncate_text(text, tokenizer, max_tokens, reserved_tokens=100):
    """Shorten ``text`` so that it fits into ``max_tokens - reserved_tokens`` tokens.

    Args:
        text: The text to shorten.
        tokenizer: Object providing ``encode(text, add_special_tokens=False)`` and
            ``decode(tokens, skip_special_tokens=True)``.
        max_tokens: Total token limit.
        reserved_tokens: Tokens to hold back from ``max_tokens`` for other content.

    Returns:
        ``text`` unchanged when it already fits; otherwise the decoded first
        ``max_tokens - reserved_tokens`` tokens. An empty string when the budget is
        zero or negative and the text is not empty.
    """
    # Clamp at zero: a negative budget would otherwise slice from the END of the token
    # list (tokens[:-k]) and could return far more than the budget allows.
    budget = max(max_tokens - reserved_tokens, 0)
    tokens = tokenizer.encode(text, add_special_tokens=False)
    if len(tokens) <= budget:
        # Nothing to cut: return the original string rather than an encode/decode
        # round trip, which is not guaranteed to reproduce the text exactly.
        return text
    return tokenizer.decode(tokens[:budget], skip_special_tokens=True)

In [124]:
def get_pred(prompt):
  """Query the model with ``prompt`` and map its answer to a numeric label.

  The label is read from the last whitespace-separated word of the text returned by
  ``get_reponse``; surrounding punctuation and quotes are ignored and the comparison
  is case-insensitive.

  Args:
    prompt: Full prompt to send to the model.

  Returns:
    1 for "critical", 0 for "conspiracy".

  Raises:
    ValueError: If the response is empty or its last word is neither label.
  """
  response = get_reponse(prompt, max_new_tokens=200)
  words = response.split()
  if not words:
    # Previously an empty response surfaced as an IndexError from [-1].
    raise ValueError("couldn't get label (empty response)")
  # Tolerate answers such as 'critical.' or '"Conspiracy"'.
  last_word = words[-1].strip(".,;:!?\"'`*()[]{}<>").lower()
  if last_word == "critical":
    return 1
  if last_word == "conspiracy":
    return 0
  raise ValueError(f"couldn't get label (last word: {words[-1]!r})")

In [122]:
# Fixed evaluation subset: 50 row labels of `df`, hard-coded so that every prompt variant
# is scored on the same rows. They were originally drawn with
#   df.sample(n=50, random_state=42).index.tolist()
df_sampled_ids = [521, 737, 740, 660, 411, 678, 626, 513, 859, 136, 811, 76, 636, 973, 938, 899, 280, 883, 761, 319, 549, 174, 371, 527, 210, 235, 101, 986, 902, 947, 346, 139, 621, 499, 370, 198, 687, 584, 901, 59, 328, 96, 312, 974, 299, 277, 924, 601, 439, 837]
# Select the rows and expose the index as an explicit 'id' column in one expression.
df_sample = df.loc[df_sampled_ids].assign(id=lambda frame: frame.index)

In [107]:
# Small 5-row subset for quick checks; the labels were originally drawn with
#   df.sample(n=5, random_state=42).index.tolist()
test_sampled_ids = [521, 737, 740, 660, 411]
# Select the rows and expose the index as an explicit 'id' column in one expression.
test_sample_small = df.loc[test_sampled_ids].assign(id=lambda frame: frame.index)

In [131]:
# Classify every sampled text with the baseline few-shot prompt and collect the
# predictions as {"id", "category"} records.
predictions = []

# The loop variable is named `row_idx` rather than `id` so the builtin id() is not
# shadowed for the rest of the kernel session. The unused gold-label lookup was dropped;
# gold labels are written out separately.
for row_idx, row in df_sample.iterrows():
    text = row['preprocessed_text']

    prompt = get_prompt(text)
    predicted_label = get_pred(prompt)  # 1 = critical, 0 = conspiracy

    predicted_category = "CRITICAL" if predicted_label == 1 else "CONSPIRACY"

    predictions.append({"id": str(row['id']), "category": predicted_category})

In [132]:
# Serialise the collected predictions to a JSON file for the evaluator.
predictions_path = "predictions.json"
with open(predictions_path, "w", encoding="utf-8") as out_file:
    out_file.write(json.dumps(predictions, indent=4))

In [133]:
# Write the gold labels of the sampled rows in the same {"id", "category"} record format
# as the predictions (category upper-cased).
gold_path = "gold.json"
gold_labels = [
    {"id": str(sample['id']), "category": sample['category'].upper()}
    for _idx, sample in df_sample.iterrows()
]
with open(gold_path, "w", encoding="utf-8") as out_file:
    out_file.write(json.dumps(gold_labels, indent=4))

In [134]:
# Score the saved predictions against the saved gold labels with the project's task-1 evaluator
# and show the returned metrics.
# NOTE(review): verbose=True is presumably what prints the one-line metric summary — confirm
# in evaluation.oppositional_evaluator.
results = evaluate_task1(predictions_path, gold_path, verbose=True)
print("Evaluation Results:", results)

MCC: 0.277, F1 (macro): 0.448, F1 (conspi): 0.581, F1 (critical): 0.316
Evaluation Results: {'MCC': 0.27695585470349865, 'F1-macro': 0.44821731748726656, 'F1-conspiracy': 0.5806451612903226, 'F1-critical': 0.3157894736842105}


## Few Shot Emotion Hint

In [138]:
def get_prompt_emotion(text, max_model_tokens=4096, reserved_prompt_tokens=1000):
  """Build the few-shot classification prompt with the emotion hint for one text.

  Same layout as the baseline prompt, but the system instruction additionally tells the
  model that conspiracists tend to be more emotional. The text is first shortened with
  ``truncate_text`` (using the notebook-level ``tokenizer``) and inserted as the final,
  unlabelled ``[INST]`` turn.

  Args:
    text: The text to classify.
    max_model_tokens: Token limit handed to ``truncate_text``. Defaults to 4096, the
      value that was previously hard-coded.
    reserved_prompt_tokens: Number of tokens held back from that limit for the rest of
      the prompt. Defaults to 1000, the value that was previously hard-coded.

  Returns:
    The complete prompt string.
  """
  truncated_text = truncate_text(text, tokenizer, max_model_tokens, reserved_prompt_tokens)

  # NOTE(review): every line of this literal carries the two-space code indent, so the
  # indent is part of the prompt sent to the model. It is kept as-is because the recorded
  # results were produced with it — confirm whether it is intended.
  prompt = f'''
  [INST] <<SYS>>
  Classify the text into conspiracy or critical. Note that conspiracists tend to be more emotional, especially angry and hateful. Respond only with label.
  <</SYS>>

  The Taliban Has No Plan to Stop Coronavirus – and No One Is Asking It for One [/INST]
  critical
  [INST] Rand Paul  Stars  in Chinese Rap Video Accusing U.S. of Starting Coronavirus Pandemic [/INST]
  conspiracy
  [INST] WATCH : Jen Psaki Mocks Free At - Home COVID Testing , Already the Norm in UK [/INST]
  critical
  [INST] {truncated_text} [/INST]
  '''
  return prompt

In [139]:
# Classify every sampled text with the emotion-hint prompt and collect the predictions
# as {"id", "category"} records.
predictions = []

# The loop variable is named `row_idx` rather than `id` so the builtin id() is not
# shadowed for the rest of the kernel session. The unused gold-label lookup was dropped;
# gold labels are written out separately.
for row_idx, row in df_sample.iterrows():
    text = row['preprocessed_text']

    prompt = get_prompt_emotion(text)
    predicted_label = get_pred(prompt)  # 1 = critical, 0 = conspiracy

    predicted_category = "CRITICAL" if predicted_label == 1 else "CONSPIRACY"

    predictions.append({"id": str(row['id']), "category": predicted_category})

In [140]:
# Save the emotion-hint predictions, regenerate the gold-label file for the same sample,
# then score one against the other.
predictions_path = "predictions.json"
with open(predictions_path, "w", encoding="utf-8") as out_file:
    out_file.write(json.dumps(predictions, indent=4))

gold_path = "gold.json"
gold_labels = [
    {"id": str(sample['id']), "category": sample['category'].upper()}
    for _idx, sample in df_sample.iterrows()
]
with open(gold_path, "w", encoding="utf-8") as out_file:
    out_file.write(json.dumps(gold_labels, indent=4))

results = evaluate_task1(predictions_path, gold_path, verbose=True)
print("Evaluation Results:", results)

MCC: 0.221, F1 (macro): 0.392, F1 (conspi): 0.562, F1 (critical): 0.222
Evaluation Results: {'MCC': 0.2211629342323457, 'F1-macro': 0.3923611111111111, 'F1-conspiracy': 0.5625, 'F1-critical': 0.2222222222222222}


## Few Shot Personality Hint

In [141]:
def get_prompt_personality(text, max_model_tokens=4096, reserved_prompt_tokens=1000):
  """Build the few-shot classification prompt with the personality hint for one text.

  Same layout as the baseline prompt, but the system instruction additionally tells the
  model that conspiracists tend to be more neurotic, conscientious, and open. The text is
  first shortened with ``truncate_text`` (using the notebook-level ``tokenizer``) and
  inserted as the final, unlabelled ``[INST]`` turn.

  Args:
    text: The text to classify.
    max_model_tokens: Token limit handed to ``truncate_text``. Defaults to 4096, the
      value that was previously hard-coded.
    reserved_prompt_tokens: Number of tokens held back from that limit for the rest of
      the prompt. Defaults to 1000, the value that was previously hard-coded.

  Returns:
    The complete prompt string.
  """
  truncated_text = truncate_text(text, tokenizer, max_model_tokens, reserved_prompt_tokens)

  # NOTE(review): every line of this literal carries the two-space code indent, so the
  # indent is part of the prompt sent to the model. It is kept as-is because the recorded
  # results were produced with it — confirm whether it is intended.
  prompt = f'''
  [INST] <<SYS>>
  Classify the text into conspiracy or critical. Conspiracists tend to be more neurotic, conscientious, and open. Respond only with label.
  <</SYS>>

  The Taliban Has No Plan to Stop Coronavirus – and No One Is Asking It for One [/INST]
  critical
  [INST] Rand Paul  Stars  in Chinese Rap Video Accusing U.S. of Starting Coronavirus Pandemic [/INST]
  conspiracy
  [INST] WATCH : Jen Psaki Mocks Free At - Home COVID Testing , Already the Norm in UK [/INST]
  critical
  [INST] {truncated_text} [/INST]
  '''
  return prompt

In [142]:
# Classify every sampled text with the personality-hint prompt and collect the
# predictions as {"id", "category"} records.
predictions = []

# The loop variable is named `row_idx` rather than `id` so the builtin id() is not
# shadowed for the rest of the kernel session. The unused gold-label lookup was dropped;
# gold labels are written out separately.
for row_idx, row in df_sample.iterrows():
    text = row['preprocessed_text']

    prompt = get_prompt_personality(text)
    predicted_label = get_pred(prompt)  # 1 = critical, 0 = conspiracy

    predicted_category = "CRITICAL" if predicted_label == 1 else "CONSPIRACY"

    predictions.append({"id": str(row['id']), "category": predicted_category})

In [143]:
# Save the personality-hint predictions, regenerate the gold-label file for the same
# sample, then score one against the other.
predictions_path = "predictions.json"
with open(predictions_path, "w", encoding="utf-8") as out_file:
    out_file.write(json.dumps(predictions, indent=4))

gold_path = "gold.json"
gold_labels = [
    {"id": str(sample['id']), "category": sample['category'].upper()}
    for _idx, sample in df_sample.iterrows()
]
with open(gold_path, "w", encoding="utf-8") as out_file:
    out_file.write(json.dumps(gold_labels, indent=4))

results = evaluate_task1(predictions_path, gold_path, verbose=True)
print("Evaluation Results:", results)

MCC: 0.221, F1 (macro): 0.392, F1 (conspi): 0.562, F1 (critical): 0.222
Evaluation Results: {'MCC': 0.2211629342323457, 'F1-macro': 0.3923611111111111, 'F1-conspiracy': 0.5625, 'F1-critical': 0.2222222222222222}


## Few Shot Causal Relations Hint

In [144]:
def get_prompt_causal(text, max_model_tokens=4096, reserved_prompt_tokens=1000):
  """Build the few-shot classification prompt with the causal-relations hint for one text.

  Same layout as the baseline prompt, but the system instruction additionally tells the
  model that conspiracists express more causal relations in their language. The text is
  first shortened with ``truncate_text`` (using the notebook-level ``tokenizer``) and
  inserted as the final, unlabelled ``[INST]`` turn.

  Args:
    text: The text to classify.
    max_model_tokens: Token limit handed to ``truncate_text``. Defaults to 4096, the
      value that was previously hard-coded.
    reserved_prompt_tokens: Number of tokens held back from that limit for the rest of
      the prompt. Defaults to 1000, the value that was previously hard-coded.

  Returns:
    The complete prompt string.
  """
  truncated_text = truncate_text(text, tokenizer, max_model_tokens, reserved_prompt_tokens)

  # NOTE(review): every line of this literal carries the two-space code indent, so the
  # indent is part of the prompt sent to the model. It is kept as-is because the recorded
  # results were produced with it — confirm whether it is intended.
  prompt = f'''
  [INST] <<SYS>>
  Classify the text into conspiracy or critical. Conspiracists express more causal relations in their language. Respond only with label.
  <</SYS>>

  The Taliban Has No Plan to Stop Coronavirus – and No One Is Asking It for One [/INST]
  critical
  [INST] Rand Paul  Stars  in Chinese Rap Video Accusing U.S. of Starting Coronavirus Pandemic [/INST]
  conspiracy
  [INST] WATCH : Jen Psaki Mocks Free At - Home COVID Testing , Already the Norm in UK [/INST]
  critical
  [INST] {truncated_text} [/INST]
  '''
  return prompt

In [145]:
# Classify every sampled text with the causal-relations-hint prompt and collect the
# predictions as {"id", "category"} records.
predictions = []

# The loop variable is named `row_idx` rather than `id` so the builtin id() is not
# shadowed for the rest of the kernel session. The unused gold-label lookup was dropped;
# gold labels are written out separately.
for row_idx, row in df_sample.iterrows():
    text = row['preprocessed_text']

    prompt = get_prompt_causal(text)
    predicted_label = get_pred(prompt)  # 1 = critical, 0 = conspiracy

    predicted_category = "CRITICAL" if predicted_label == 1 else "CONSPIRACY"

    predictions.append({"id": str(row['id']), "category": predicted_category})

In [146]:
# Save the causal-hint predictions, regenerate the gold-label file for the same sample,
# then score one against the other.
predictions_path = "predictions.json"
with open(predictions_path, "w", encoding="utf-8") as out_file:
    out_file.write(json.dumps(predictions, indent=4))

gold_path = "gold.json"
gold_labels = [
    {"id": str(sample['id']), "category": sample['category'].upper()}
    for _idx, sample in df_sample.iterrows()
]
with open(gold_path, "w", encoding="utf-8") as out_file:
    out_file.write(json.dumps(gold_labels, indent=4))

results = evaluate_task1(predictions_path, gold_path, verbose=True)
print("Evaluation Results:", results)

MCC: 0.107, F1 (macro): 0.299, F1 (conspi): 0.537, F1 (critical): 0.061
Evaluation Results: {'MCC': 0.10714285714285714, 'F1-macro': 0.2989597467209407, 'F1-conspiracy': 0.5373134328358209, 'F1-critical': 0.06060606060606061}
