In [1]:
# from huggingface_hub import login

# login()

In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from sklearn.model_selection import train_test_split
from datasets import Dataset

from lib.hard_coded_constants import GENERATED_HEADLINES_FILE_NAME, DATA_FILE_NAME, BASE_LANGUAGE_MODEL, NEWS_SITES_BASE_URL, NEWS_CATEGORIES

import pandas as pd
import torch
import re
import math

# Every network is fine-tuned from this one checkpoint and shares its tokenizer.
base_model_checkpoint = BASE_LANGUAGE_MODEL
base_model_tokenizer = AutoTokenizer.from_pretrained(base_model_checkpoint, use_fast=True)

# Fall back to the EOS token for padding when the tokenizer defines no pad token.
if base_model_tokenizer.pad_token is None:
    base_model_tokenizer.pad_token = base_model_tokenizer.eos_token

# Number of tokens in each training block built by group_texts.
BLOCK_SIZE = 128
# Appended to every training headline to mark where it ends.
STOP_CHAR = base_model_tokenizer.eos_token

# One fine-tuned model is produced per key of NEWS_SITES_BASE_URL.
networks = list(NEWS_SITES_BASE_URL)



In [3]:
def fine_tuned_model_name(network, base_model=None):
    """Build the name used for a network's fine-tuned model.

    The name is ``"<model>-finetuned-<network>"`` where ``<model>`` is the last
    path component of the base checkpoint identifier (the part after the final
    ``/``).

    Args:
        network: Name of the news network the model is fine-tuned on.
        base_model: Checkpoint identifier to derive the name from. Defaults to
            ``BASE_LANGUAGE_MODEL`` when omitted, which is the original behavior.

    Returns:
        The fine-tuned model name as a string.
    """
    if base_model is None:
        base_model = BASE_LANGUAGE_MODEL
    # Drop trailing slashes first so a path such as "models/gpt2/" yields
    # "gpt2" instead of an empty model name.
    model_name = base_model.rstrip("/").split("/")[-1]
    return f"{model_name}-finetuned-{network}"

In [4]:
# Load the scraped headlines, keeping only politics stories and the two columns used below.
headlines_df = pd.read_csv(DATA_FILE_NAME).loc[
    lambda df: df["news_category"] == "politics",
    ["network", "headline"],
]

training_datasets = {}
testing_headlines = {}
for network in networks:
    # Rows without a headline would otherwise be trained on as the literal
    # text "nan" and produce a non-string prompt, so drop them before splitting.
    network_df = headlines_df.loc[headlines_df["network"] == network].dropna(subset=["headline"])

    # 80/20 training testing split
    training_df, testing_df = train_test_split(network_df, test_size=0.2, random_state=123)

    # add stop char so models have a sense for when headlines end
    # (built as a fresh Series instead of assigning into a slice of network_df)
    training_text = training_df["headline"].astype(str) + f" {STOP_CHAR}"
    training_data = Dataset.from_pandas(training_text.to_frame("headline"), preserve_index=False)

    # have models generate a complete headline based on the first 4 words in each testing headline
    first_4_words = testing_df["headline"].astype(str).str.split().str[:4].str.join(" ")
    # skip empty beginnings (whitespace-only headlines): there is nothing to prompt with
    headlines = [beginning for beginning in first_4_words.unique() if beginning]

    print(f"{network} split into {len(training_data)} headlines to fine-tune and {len(headlines)} headline beginnings to generate new headlines")

    training_datasets[network] = training_data
    testing_headlines[network] = headlines


Breitbart split into 3731 headlines to fine-tune and 924 headline beginnings to generate new headlines
Fox split into 5405 headlines to fine-tune and 1334 headline beginnings to generate new headlines
MSNBC split into 1274 headlines to fine-tune and 319 headline beginnings to generate new headlines
Newsmax split into 6840 headlines to fine-tune and 1667 headline beginnings to generate new headlines
Nypost split into 5380 headlines to fine-tune and 1343 headline beginnings to generate new headlines
NYT split into 5052 headlines to fine-tune and 1240 headline beginnings to generate new headlines
USAToday split into 3220 headlines to fine-tune and 797 headline beginnings to generate new headlines
Washpost split into 2715 headlines to fine-tune and 668 headline beginnings to generate new headlines
WSJ split into 1647 headlines to fine-tune and 412 headline beginnings to generate new headlines


In [5]:
def tokenize_function(headlines):
    """Tokenize a batch of headlines with the base model's tokenizer.

    Args:
        headlines: Mapping whose ``"headline"`` entry holds the text to
            tokenize (the batch passed in by ``Dataset.map``).

    Returns:
        Whatever ``base_model_tokenizer`` returns for that text.
    """
    headline_texts = headlines["headline"]
    encoded = base_model_tokenizer(headline_texts)
    return encoded

In [6]:
# Tokenize every network's training headlines.
# Kept as an explicit loop rather than a dict comprehension for readability.
tokenized_datasets = {}
for network in networks:
    raw_dataset = training_datasets[network]
    # the raw text column is dropped so only the tokenizer's outputs remain
    tokenized_datasets[network] = raw_dataset.map(
        tokenize_function,
        batched=True,
        num_proc=4,
        remove_columns=["headline"],
    )

Map (num_proc=4):   0%|          | 0/3731 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/5405 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/1274 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/6840 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/5380 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/5052 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/3220 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/2715 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/1647 [00:00<?, ? examples/s]

In [7]:
# Grouping tokenized headlines together for the trainer, padding the last block
# so every block has the same length.
def group_texts(examples, block_size=None, pad_token_id=None):
    """Concatenate a batch of tokenized headlines and cut it into fixed-size blocks.

    All sequences of each column are joined end to end, then split into
    consecutive blocks of ``block_size`` tokens. The final block is padded up
    to ``block_size``: ``input_ids`` with ``pad_token_id`` and every other
    column (e.g. ``attention_mask``) with ``0``, so padding is never attended to.

    A ``labels`` column is added that copies ``input_ids`` and sets the padded
    positions to ``-100`` (ignored by PyTorch loss functions). Masking is done
    by position, not by token id: when the pad token is the EOS token (the
    fallback applied when the tokenizer has no pad token), the stop tokens
    appended to each headline must stay in the loss or the model never learns
    where a headline ends.

    Args:
        examples: Mapping of column name to a list of token lists; must contain
            ``"input_ids"``, and all columns must have matching lengths.
        block_size: Tokens per block. Defaults to ``BLOCK_SIZE``.
        pad_token_id: Id used to pad ``input_ids``. Defaults to
            ``base_model_tokenizer.pad_token_id``.

    Returns:
        Dict with the same columns as ``examples`` plus ``"labels"``, each a
        list of blocks of exactly ``block_size`` items.
    """
    if block_size is None:
        block_size = BLOCK_SIZE
    if pad_token_id is None:
        pad_token_id = base_model_tokenizer.pad_token_id

    # Concatenate all texts (single linear pass per column).
    concatenated_examples = {
        key: [token for sequence in sequences for token in sequence]
        for key, sequences in examples.items()
    }

    result = {key: [] for key in concatenated_examples}
    result["labels"] = []
    if not concatenated_examples:
        return result

    total_length = len(next(iter(concatenated_examples.values())))

    # Split by chunks of block_size; the last chunk may be short.
    for start in range(0, total_length, block_size):
        real_len = min(block_size, total_length - start)
        pad_len = block_size - real_len

        for key, tokens in concatenated_examples.items():
            chunk = tokens[start:start + block_size]
            if pad_len:
                # only input_ids takes the pad token; masks and similar columns take 0
                filler = pad_token_id if key == "input_ids" else 0
                chunk = chunk + [filler] * pad_len
            result[key].append(chunk)

        # -100 on the padded tail only, so genuine EOS tokens are still learned
        block_ids = result["input_ids"][-1]
        result["labels"].append(block_ids[:real_len] + [-100] * pad_len)

    return result

In [8]:
# Pack each network's tokenized headlines into fixed-length training blocks.
lm_datasets = {
    network: tokenized_datasets[network].map(
        group_texts,
        batched=True,
        batch_size=1000,
        num_proc=4,
    )
    for network in networks
}

Map (num_proc=4):   0%|          | 0/3731 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/5405 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/1274 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/6840 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/5380 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/5052 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/3220 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/2715 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/1647 [00:00<?, ? examples/s]

In [9]:
# Fine-tune one copy of the base model per network.
for network in networks:
    # reload the pretrained weights so every network starts from the same model
    base_model = AutoModelForCausalLM.from_pretrained(base_model_checkpoint)

    # the first positional argument is the output directory for this network's run
    training_args = TrainingArguments(
        fine_tuned_model_name(network),
        learning_rate=2e-5,
        weight_decay=0.01,
        # push_to_hub=True,
    )
    trainer = Trainer(
        model=base_model,
        args=training_args,
        # `tokenizer=` is deprecated in Trainer; `processing_class` is its replacement
        processing_class=base_model_tokenizer,
        train_dataset=lm_datasets[network],
    )
    trainer.train()
    # Persist the final weights (and tokenizer) to the output directory. Without
    # this, nothing is written unless a step-based checkpoint happens to fire
    # during the run, and the hub push below is disabled.
    trainer.save_model()
    # trainer.push_to_hub()

  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 0}.


Step,Training Loss


  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 0}.


Step,Training Loss


  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 0}.


Step,Training Loss


  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 0}.


Step,Training Loss


  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 0}.


Step,Training Loss


  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 0}.


Step,Training Loss


  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 0}.


Step,Training Loss


  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 0}.


Step,Training Loss


  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 0}.


Step,Training Loss


In [10]:
# Generate one headline per held-out headline beginning with each network's
# fine-tuned model, then write all generated headlines to a single CSV.
generated_headline_dfs = []

# Sampling (do_sample=True) is stochastic; re-seeding before each network makes
# the generated CSV reproducible and independent of the order of `networks`.
GENERATION_SEED = 123
batch_size = 32
device = "cuda" if torch.cuda.is_available() else "cpu"

for network in networks:
    model_name = "franzhanz/" + fine_tuned_model_name(network)
    model = AutoModelForCausalLM.from_pretrained(model_name)
    # left padding keeps every prompt directly next to the tokens generated after it
    tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left")
    tokenizer.pad_token = tokenizer.eos_token

    model.to(device)
    print(f"running on device: {device}")
    model.eval()

    torch.manual_seed(GENERATION_SEED)

    headlines = testing_headlines[network]
    print(f"generating {len(headlines)} headlines for {network}")

    num_batches = (len(headlines) + batch_size - 1) // batch_size
    print(f"generating {len(headlines)} headlines in {num_batches} batches")

    generated_headlines = []
    for i in range(num_batches):
        print(f"starting batch {i + 1}/{num_batches}")

        # slicing past the end of the list is safe, so the last batch is simply shorter
        batch_headlines = headlines[i * batch_size : (i + 1) * batch_size]

        inputs = tokenizer(
            batch_headlines,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=64
        )
        inputs = {k: v.to(device) for k, v in inputs.items()}

        outputs = model.generate(
            **inputs,
            max_new_tokens=64,
            eos_token_id=tokenizer.eos_token_id,
            # pad finished sequences with the same token the prompts were padded with
            pad_token_id=tokenizer.pad_token_id,
            do_sample=True,
            top_p=0.9,
            temperature=0.9
        )

        # decoded text contains the prompt followed by its continuation
        results = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        # collapse whitespace runs and trim, so no further cleanup is needed afterwards
        generated_headlines += [re.sub(r'\s+', ' ', r).strip() for r in results]

    print(f"finished {network}")

    #format the generated headlines for a csv
    generated_df = pd.DataFrame({"headline": generated_headlines}).assign(
        year=2025,
        month=10,
        day=10,
        network=network,
        url="generated_headline",
        news_category="politics",
    )

    generated_headline_dfs.append(generated_df)

pd.concat(generated_headline_dfs).to_csv(GENERATED_HEADLINES_FILE_NAME, index=False)


running on device: cpu
generating 924 headlines for Breitbart
generating 924 headlines in 29 batches
starting batch 1/29
starting batch 2/29
starting batch 3/29
starting batch 4/29
starting batch 5/29
starting batch 6/29
starting batch 7/29
starting batch 8/29
starting batch 9/29
starting batch 10/29
starting batch 11/29
starting batch 12/29
starting batch 13/29
starting batch 14/29
starting batch 15/29
starting batch 16/29
starting batch 17/29
starting batch 18/29
starting batch 19/29
starting batch 20/29
starting batch 21/29
starting batch 22/29
starting batch 23/29
starting batch 24/29
starting batch 25/29
starting batch 26/29
starting batch 27/29
starting batch 28/29
starting batch 29/29
finished Breitbart
running on device: cpu
generating 1334 headlines for Fox
generating 1334 headlines in 42 batches
starting batch 1/42
starting batch 2/42
starting batch 3/42
starting batch 4/42
starting batch 5/42
starting batch 6/42
starting batch 7/42
starting batch 8/42
starting batch 9/42
sta

In [11]:
from lib.sentiment_analysis import find_pos_neg_neu_sentiment, find_emotion_sentiment

# Score each network's generated headlines with both sentiment analyses.
# The date matches the year/month/day stamped on the generated headlines.
generated_on = (2025, 10, 10)
for network in networks:
    find_pos_neg_neu_sentiment(network, *generated_on, use_generated_headlines=True)
    find_emotion_sentiment(network, *generated_on, use_generated_headlines=True)

finding pos_neg_neu sentiment for Breitbart headlines on 2025-10-10
added 924 pos_neg_neu sentiments for Breitbart headlines on 2025-10-10
finding emotion sentiment for Breitbart headlines on 2025-10-10
running on device: cpu
processing 924 headlines in 29 batches
starting batch 1/29
starting batch 2/29
starting batch 3/29
starting batch 4/29
starting batch 5/29
starting batch 6/29
starting batch 7/29
starting batch 8/29
starting batch 9/29
starting batch 10/29
starting batch 11/29
starting batch 12/29
starting batch 13/29
starting batch 14/29
starting batch 15/29
starting batch 16/29
starting batch 17/29
starting batch 18/29
starting batch 19/29
starting batch 20/29
starting batch 21/29
starting batch 22/29
starting batch 23/29
starting batch 24/29
starting batch 25/29
starting batch 26/29
starting batch 27/29
starting batch 28/29
starting batch 29/29
Finished processing 924 headlines
finding pos_neg_neu sentiment for Fox headlines on 2025-10-10
added 1334 pos_neg_neu sentiments for F