In [1]:
import pandas as pd
import ast
import random
from nltk.tokenize import word_tokenize
from rouge_score import rouge_scorer
import sacrebleu
import nltk
# from comet import download_model, load_from_checkpoint  # For COMET

# Ensure that NLTK's resources are downloaded (needed for METEOR)
# nltk.download('wordnet')
# nltk.download('omw-1.4')
# nltk.download('punkt')
# nltk.download('punkt_tab')

In [2]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_ollama.llms import OllamaLLM

template = """Question: {question}\nAnswer: """
prompt = ChatPromptTemplate.from_template(template)
model = OllamaLLM(model="llama3.2")
# model = OllamaLLM(model="deepseek-r1")
llm = prompt | model

In [3]:
def evaluate_headline_performance(df, index, generated_headline):
    # Extract ground truth headline from DataFrame
    ground_truth = df["Headline"][index]

    # Tokenize the ground truth and generated headlines
    ground_truth_tokens = word_tokenize(ground_truth)
    generated_headline_tokens = word_tokenize(generated_headline)

    # METEOR Score calculation with tokenized inputs
    meteor_score = nltk.translate.meteor_score.single_meteor_score(
        reference=ground_truth_tokens, 
        hypothesis=generated_headline_tokens
    )

    # ROUGE
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    rouge_scores = scorer.score(target=ground_truth, prediction=generated_headline)
    # Calculate the average F-measure
    f_measures = [
        rouge_scores['rouge1'].fmeasure,
        rouge_scores['rouge2'].fmeasure,
        rouge_scores['rougeL'].fmeasure
    ]
    average_f_measure = sum(f_measures) / len(f_measures)

    # BLEU
    bleu_score = sacrebleu.raw_corpus_bleu([generated_headline], [[ground_truth]], .01).score

    # Return all scores
    return bleu_score, meteor_score, rouge_scores, average_f_measure

In [4]:
# path = "/Users/chinonsoosuji/Downloads/archive/PENS/personalization/pers_preprocessed.csv"
path = "/Users/CYNTHIA/Desktop/Somto_Project/archive/personalization/pers_preprocessed.csv"
df = pd.read_csv(path, sep='\t', on_bad_lines='skip')
df['context'] = df['context'].apply(ast.literal_eval)
df.head()

Unnamed: 0.1,Unnamed: 0,userID,clicknewsID,posnewID,rewrite_titles,context,News body,Category,Topic,Headline,Title entity
0,0,NT1,"['N108480', 'N38238', 'N35068', 'N110487', 'N9...",N24110,Legal battle looms over Trump EPA's rule chang...,[Nike faces backlash after pulling 'Betsy Ross...,Democratic state attorney generals and environ...,news,newspolitics,High-stakes legal fight looms over Trump pollu...,{'Trump': 'Donald Trump'}
1,1,NT1,"['N108480', 'N38238', 'N35068', 'N110487', 'N9...",N62769,Wise choices for stylish updating of old homes,[Nike faces backlash after pulling 'Betsy Ross...,We love old houses. Their architectural styles...,lifestyle,lifestylehomeandgarden,The One Thing That Immediately Makes Your Hous...,{}
2,2,NT1,"['N108480', 'N38238', 'N35068', 'N110487', 'N9...",N36186,Verlander may be reconsidering his stance on M...,[Nike faces backlash after pulling 'Betsy Ross...,Justin Verlander made headlines earlier in the...,sports,baseball_mlb,Justin Verlander got 'chewed out' by MLB befor...,"{'Verlander': 'Justin Verlander', 'MLB': 'Nati..."
3,3,NT1,"['N108480', 'N38238', 'N35068', 'N110487', 'N9...",N101669,Infamous o.j. Simpson launching official Twitt...,[Nike faces backlash after pulling 'Betsy Ross...,LOS ANGELES O.J. Simpson launched a Twitter ...,tv,tvnews,OJ Simpson on Twitter: 'I got a little gettin'...,{}
4,4,NT1,"['N108480', 'N38238', 'N35068', 'N110487', 'N9...",N19241,15 year old cori gauff beats Venus Williams at...,[Nike faces backlash after pulling 'Betsy Ross...,"WIMBLEDON, England (AP) Coco Gauff grew up a...",sports,tennis,"Gauff, just 15, shocks 5-time champ Venus, 39,...",{'Venus': 'Venus Williams'}


In [5]:
len(df)

20600

In [6]:
# df["News body"][0]
# entity_dict = ast.literal_eval(df["Title entity"][0])
# entity_str = ', '.join([f"{key} -> {value}" for key, value in entity_dict.items()])
# df["Category"][0]
# df["Topic"][0]
# selected_context = random.sample(df['context'][0], 20)
# context_body = ', '.join(selected_context)
# context_body

In [7]:
prompt = """Role: News Headline Generator
Task: Generate a compelling news headline using the given details.
Please generate only one short, concise and accurate news headline that captures the essence of the news and engages the audience.

News Body: 
{newsbody}

You have these information below to help you.
Category: {category}
Topic: {topic}
Entities: {entity_str}
News Context: {context_body}

Headline: """

Task: Edit this prompt to ensure a better accuracy in our results

In [8]:
prompt = """Role: News Headline Generator
Task: Generate a compelling news headline using the given details.
Please generate only one short, concise and accurate news headline that captures the essence of the news and engages the audience.

News Body: 
{newsbody}

You have these information below to help you.
Category: {category}
Topic: {topic}
Entities: {entity_str}
News Context: {context_body}

Headline: """

In [9]:
def prepare_prompt(row, prompt=prompt):
    try:
        # Evaluating the string representation of the dictionary to an actual dictionary
        entity_dict = ast.literal_eval(row["Title entity"])
        entity_str = ', '.join([f"{key} -> {value}" for key, value in entity_dict.items()])
        
        # Randomly sample context if it's a list and has enough elements to sample from
        if isinstance(row['context'], list) and len(row['context']) >= 5:
            selected_context = random.sample(row['context'], 5)
        else:
            selected_context = row['context'][:5]  # Just take the first 20 if not enough to sample from
        
        context_body = ', '.join(selected_context)
        prompt = prompt.format(
            category=row['Category'],
            topic=row['Topic'],
            entity_str=entity_str, 
            context_body=context_body, 
            newsbody=row['News body']
            )


        return prompt
    except Exception as e:
        print(f"Error processing row: {e}")
        return None

# Applying the function to each row in the DataFrame and storing the results
# df['prompt'] = df.apply(prepare_prompt, axis=1)

In [10]:
start, end = 0, 5
data = []
headline_list = []
ground_truth = []
for index, row in df[start:end].iterrows():
    input = prepare_prompt(row)
    data.append(input)
    headline_list.append(llm.invoke({input}))
    ground_truth.append(row["Headline"])

In [11]:
# df["Headline"][start:end]

In [12]:
# headline = llm.invoke({data[0]}) #llama3.2
# # headline = llm.invoke({data[0]}).split('</think>')[-1].strip() #deepseek
# print(headline)

In [13]:
# headline_pred = llm.invoke({f"Generate one news headline for me. News Body: {df['News body'][99]}\nHeadline: "}) #llama3.2
# # headline_pred = llm.invoke({f"Generate one news headline for me. News Body: {df['News body'][99]}\nHeadline: "}).split('</think>')[-1].strip() #deepseek
# print(headline_pred)

In [14]:
headline_list #predicted by llama3.2

['"Trump Administration\'s Emissions Rule Sparks Precedent-Setting Legal Battle with Environmental Groups and State Attorneys General Over Clean Air Act Compliance"',
 '"Renovating the Heart of Your Home: How to Update Outdated Kitchen Countertops Without Breaking the Bank"',
 '"Verlander\'s Tough Talk Backfires as MLB Scolds Him for \'Juiced Ball\' Claims"',
 '"O.J. Simpson Launches Twitter Account, Reveals \'Little Gettin\' Even to Do\'"',
 '"Coco Gauff Shocks Wimbledon by Defeating Venus Williams at 15"']

In [15]:
ground_truth #Found in dataset

['High-stakes legal fight looms over Trump pollution rule',
 'The One Thing That Immediately Makes Your House Look Dated',
 "Justin Verlander got 'chewed out' by MLB before All-Star Game",
 "OJ Simpson on Twitter: 'I got a little gettin' even to do'",
 'Gauff, just 15, shocks 5-time champ Venus, 39, at Wimbledon']

In [16]:
bleu_score, meteor_score, average_f_measure = [], [], []
for index, output in enumerate(headline_list, start=start):
    bleu, meteor, rouge, rouge_f_measure = evaluate_headline_performance(df, index, output)
    bleu_score.append(bleu)
    meteor_score.append(meteor)
    average_f_measure.append(rouge_f_measure)

In [17]:
# Print all scores
print("BLEU Score:", sum(bleu_score)/len(data))
print("METEOR Score:", sum(meteor_score)/len(data))
# print("ROUGE Scores:", rouge_scores)
print("Average ROUGE F-measure:", sum(average_f_measure)/len(data))


BLEU Score: 0.2663912760890743
METEOR Score: 0.2267776447750809
Average ROUGE F-measure: 0.23405375986021143


In [18]:
bleu_score

[0.0,
 0.2187405715612322,
 0.30289764018096393,
 0.3639945549178426,
 0.4463236137853328]

In [19]:
meteor_score

[0.21052631578947367,
 0.09174311926605504,
 0.13043478260869565,
 0.48379270186335405,
 0.21739130434782605]

In [20]:
average_f_measure

[0.1290322580645161,
 0.10256410256410257,
 0.11111111111111112,
 0.51010101010101,
 0.31746031746031744]

In [21]:
# # COMET (model download and load)
# comet_model = download_model("wmt20-comet-da")  # Download a COMET model, this requires internet
# comet = load_from_checkpoint(comet_model)
# comet_score = comet.predict([{"src": "source text if applicable", "mt": generated_headline, "ref": ground_truth}])
# print("COMET Score:", comet_score['predicted_score'][0])  # Accessing the predicted score from the result