In [1]:
!pip freeze | egrep -w "transformers|datasets|torch"

datasets==2.19.0
sentence-transformers==2.3.1
torch==2.2.1
torch-grammar==0.3.3
transformers==4.40.1


# Load model and tokenizer

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Location the model and tokenizer are loaded from
# (presumably a local checkpoint directory, relative to the working directory - confirm).
modelpath="models/llama3-8b"

# Load the causal LM with bfloat16 weights; device_map="auto" leaves device
# placement to the library, and attention uses the "flash_attention_2" implementation.
model = AutoModelForCausalLM.from_pretrained(
    modelpath,    
    torch_dtype=torch.bfloat16,
    device_map="auto",
    attn_implementation="flash_attention_2",
)

# Tokenizer from the same path; use_fast=False explicitly requests the non-fast implementation.
tokenizer = AutoTokenizer.from_pretrained(modelpath, use_fast=False) 

def generate(prompt, max_new_tokens = 100):
    """Generate a completion for ``prompt`` and return only the new text.

    Uses the module-level ``model`` and ``tokenizer``.

    Args:
        prompt: Text that is tokenized and fed to the model.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded continuation as a string, without the prompt and without
        special tokens (e.g. the end-of-sequence marker).
    """
    # Move the inputs to the device the model actually lives on instead of
    # hard-coding "cuda"; the model is placed via device_map="auto".
    prompt_tokenized = tokenizer(prompt, return_tensors="pt").to(model.device)

    output_tokenized = model.generate(
        **prompt_tokenized,
        max_new_tokens=max_new_tokens,
        # The EOS token id doubles as the padding token id.
        pad_token_id=tokenizer.eos_token_id,
    )[0]

    # The returned sequence starts with the prompt tokens; keep only what follows.
    prompt_length = prompt_tokenized["input_ids"].shape[1]
    output_tokenized = output_tokenized[prompt_length:]

    # Drop special tokens so an emitted EOS marker does not end up in the text
    # that callers later parse as JSON.
    return tokenizer.decode(output_tokenized, skip_special_tokens=True)

# Dataset
https://huggingface.co/datasets/m-newhauser/senator-tweets

In [None]:
from datasets import load_dataset
import json

# Load the senator-tweets dataset by its Hugging Face Hub identifier.
ds = load_dataset("m-newhauser/senator-tweets")
# Bare expression as the last line of the cell: the notebook displays the loaded object.
ds

In [None]:
# Display the "train" split on its own.
ds["train"]

In [None]:
# Drop the "embeddings" column before printing entries as JSON.
# The guard keeps this cell re-runnable: once the column has been removed,
# calling remove_columns("embeddings") again would raise because it no longer exists.
if "embeddings" in ds["train"].column_names:
    ds = ds.remove_columns("embeddings")

# Pretty-print the first training entry to inspect the remaining fields.
print(json.dumps(ds["train"][0], indent=2))

# Define answer scheme, few-shot prompts and prompt for a single entry

In [None]:
def few_shot_prompt(dataset):
    """Render labelled entries as a few-shot prompt.

    Each entry contributes an ``Input:`` section holding its ``text`` as
    indented JSON, followed by an ``Analysis:`` section holding its ``party``
    under the key ``political_party``. Consecutive examples are separated by a
    blank line, and the result carries no leading or trailing whitespace.

    Args:
        dataset: Iterable of mappings with ``"text"`` and ``"party"`` keys.

    Returns:
        The prompt as a single string (empty when ``dataset`` is empty).
    """
    chunks = []

    for entry in dataset:
        tweet_json = json.dumps({"text": entry["text"]}, indent=2)
        label_json = json.dumps({"political_party": entry["party"]}, indent=2)
        # The leading newline becomes the blank line between examples after joining.
        chunks.append(f"\nInput:\n{tweet_json}\nAnalysis:\n{label_json}")

    return "\n".join(chunks).strip()

# Preview the few-shot prompt built from the first three training entries.
print(few_shot_prompt(ds["train"].select(range(3))))

In [None]:
def entry_to_prompt(entry):
    """Render one entry as the open-ended tail of a prompt.

    The entry's ``text`` is written as indented JSON under ``Input:``, followed
    by a bare ``Analysis:`` line with nothing after it.

    Args:
        entry: Mapping with a ``"text"`` key.

    Returns:
        The prompt fragment as a string.
    """
    tweet_json = json.dumps({"text": entry["text"]}, indent=2)
    return f"Input:\n{tweet_json}\nAnalysis:".strip()

## 1. Predict political party of tweet author

In [None]:
# 5 shot prompting: the first five training entries serve as labelled examples
examples = few_shot_prompt(ds["train"].select(range(5)))

# test llama on 100 tweets (indices 11..110, disjoint from the example entries)
dataset = ds["train"].select(range(11, 111))

total, correct, unknown = 0, 0, 0

for i, entry in enumerate(dataset):
    prompt = examples + "\n\n" + entry_to_prompt(entry) + "\n"
    if i == 0:
        # Show the complete prompt once so its format can be inspected
        print("#" * 30, f"\n{prompt}\n", "#" * 30)

    answer = generate(prompt)
    print(f"\nTweet {i+1} (id {entry['id']}): '{entry['text']}'")

    # Cut answer at the point where llama hallucinates the next tweet
    # (split() returns the whole string when "Input:" does not occur)
    answer = answer.split("Input:")[0]
    print(answer)

    # Parse the first JSON value in the answer. raw_decode tolerates trailing
    # text after the object; anything unparseable or lacking the expected key
    # is counted as "unknown" instead of aborting the whole evaluation run.
    try:
        answer_parsed, _ = json.JSONDecoder().raw_decode(answer.strip())
        answer_party = str(answer_parsed["political_party"]).lower()
    except (json.JSONDecodeError, KeyError, TypeError):
        answer_party = "unknown"

    # count and output num. of correct and incorrect answers
    is_correct = answer_party == entry["party"].lower()
    correct += 1 if is_correct else 0
    unknown += 1 if answer_party == "unknown" else 0
    total += 1
    print(" *",f"Prediction ({answer_party}) is","correct" if is_correct else f"WRONG ({entry['party']} is the correct answer)")
    print(" *","Stats: ", f"{round(correct/total*100,2)}% correct, ({total} total, {correct} correct, {unknown} unknown)")

### 5 shot prompting
**with analysis scheme** in prompt:
* Stats:  90.0% correct, (100 total, 90 correct, 0 unknown)

**without analysis scheme** in prompt:
* Stats:  91.0% correct, (100 total, 91 correct, 0 unknown)

### 10 shot prompting
**without analysis scheme** in prompt:
* Stats:  86.0% correct, (100 total, 86 correct, 0 unknown)
* Stats:  89.0% correct, (100 total, 89 correct, 0 unknown)

## 2. Extract topic, entities, keywords from tweets

### Semi-manual curated 5-shot prompt

In [None]:
# Hand-curated five-shot prompt for the extraction task. Each example pairs a
# tweet (as JSON under "Input:") with an "Analysis:" JSON object using the keys
# party, topic, actors, location, strong_words and hashtags.
# Note: this rebinds `examples`, replacing the prompt built by few_shot_prompt() earlier.
examples="""
Input:
{
  "text": "The crisis and chaos on the southern border are inexcusable. This was avoidable, but the Biden administration's open border policies and disastrous refusal to secure the border are responsible for this failure. https://t.co/7IwKDYdmTo"
}
Analysis:
{
  "party": "Republican",
  "topic": "Border Crisis",
  "actors": ["Biden"],
  "location": ["southern border"],
  "strong_words": ["crisis", "chaos", "inexcusable", "disastrous refusal", "failure"],
  "hashtags": ["#BorderChaos", "#BidenBorderDisaster", "#FailedImmigrationPolicy"]
}

Input:
{
  "text": "All nations should condemn the Chinese Communist Party's human rights abuses against its ethnic minorities, including Uyghurs and Tibetan people. We must hold the Chinese Communist Party accountable for these atrocities. https://t.co/11ovy05Ilp"
}
Analysis:
{
  "party": "Democrat",
  "topic": "Human Rights Abuses",
  "actors": ["Chinese Communist Party", "Uyghurs and Tibetan people"],
  "location": [ ],
  "strong_words": ["abuses", "accountable", "atrocities"],
  "hashtags": ["#CCPHumanRightsAbuse", "#UyghurGenocide", "#TibetanOppression"]
}

Input:
{
  "text": "Congress enacted two bipartisan sanctions laws to stop Putin's Nord Stream pipeline, but @POTUS is giving up and  hiding behind ridiculous talking points. If Biden Admin actually imposed sanctions that it's waiving, that would halt #NS2's completion for foreseeable future. https://t.co/M5TISJBpEX"
}
Analysis:
{
  "party": "Republican",
  "topic": "Nord Stream Sanctions",
  "actors": ["Congress", "Putin", "POTUS", "Biden"],
  "location": [ ],
  "strong_words": ["giving up", "ridiculous"],
  "hashtags": ["#BidenCavesToPutin", "#NordStreamSanctions", "#FailedForeignPolicy"]
}

Input:
{
  "text": "I'm introducing a bill with @SenRubioPress and @SenatorMenendez to help end violence against women and children in El Salvador, Honduras and Guatemala and provide victims with the support and shelter they need. More: https://t.co/OBpqOXsioQ"
}
Analysis:
{
  "party": "Democrat",
  "topic": "Ending Violence",
  "actors": ["SenRubioPress", "SenatorMenendez", "women", "children"],
  "location": ["El Salvador", "Honduras", "Guatemala"],
  "strong_words": ["end violence", "support", "shelter"],
  "hashtags": ["#EndGenderViolence", "#CentralAmericaAid", "#SupportSurvivors"]
}

Input:
{
  "text": "Kelley and I are heartbroken at the news of the tragic death of the FBI agents today. Our prayers go out to their families during this time. https://t.co/YeDilcKgL8 https://t.co/We2FgfOP62"
}
Analysis:
{
  "party": "Republican",
  "topic": "FBI Agents Deaths",
  "actors": ["Kelley", "FBI agents"],
  "location": [ ],
  "strong_words": ["heartbroken", "tragic death", "prayers"],
  "hashtags": ["#FBITragedy", "#PrayersForFamilies", "#LawEnforcementSacrifice"]
}
"""

### Prompt llama again

In [None]:
# Same 100 tweets as in the party-prediction run (indices 11..110)
dataset = ds["train"].select(range(11, 111))

# Counters are reset here; this loop only prints answers and does not update them
total, correct, unknown = 0, 0, 0

for i, entry in enumerate(dataset):
    print(f"\nTweet {i+1} (id {entry['id']}): '{entry['text']}'")

    # Curated examples, a blank line, then the open-ended prompt for this tweet
    prompt = f"{examples}\n\n{entry_to_prompt(entry)}\n"
    answer = generate(prompt, max_new_tokens=250)

    # Keep only the text before the first "Input:" (the whole answer when it
    # does not occur) and trim surrounding whitespace
    answer = answer.partition("Input:")[0].strip()

    print(answer)