In [1]:
import pandas as pd
import ast
import random
from nltk.tokenize import word_tokenize
from rouge_score import rouge_scorer
import sacrebleu
import nltk
# from comet import download_model, load_from_checkpoint  # For COMET

# Ensure that NLTK's resources are downloaded (needed for METEOR)
# nltk.download('wordnet')
# nltk.download('omw-1.4')
# nltk.download('punkt')
# nltk.download('punkt_tab')

In [2]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_ollama.llms import OllamaLLM

template = """Question: {question}\nAnswer: """
prompt = ChatPromptTemplate.from_template(template)
model = OllamaLLM(model="llama3.2")
# model = OllamaLLM(model="deepseek-r1")
llm = prompt | model

In [None]:
def evaluate_headline_performance(df, index, generated_headline):
    # Extract ground truth headline from DataFrame
    ground_truth = df["Headline"][index]

    # Tokenize the ground truth and generated headlines
    ground_truth_tokens = word_tokenize(ground_truth)
    generated_headline_tokens = word_tokenize(generated_headline)

    # METEOR Score calculation with tokenized inputs
    meteor_score = nltk.translate.meteor_score.single_meteor_score(
        reference=ground_truth_tokens, 
        hypothesis=generated_headline_tokens
    )

    # ROUGE
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    rouge_scores = scorer.score(target=ground_truth, prediction=generated_headline)
    # Calculate the average F-measure
    f_measures = [
        rouge_scores['rouge1'].fmeasure,
        rouge_scores['rouge2'].fmeasure,
        rouge_scores['rougeL'].fmeasure
    ]
    average_f_measure = sum(f_measures) / len(f_measures)

    # BLEU
    bleu_score = sacrebleu.raw_corpus_bleu([generated_headline], [[ground_truth]], .01).score

    # Return all scores
    return bleu_score, meteor_score, rouge_scores, average_f_measure

In [3]:
# path = "/Users/chinonsoosuji/Downloads/archive/PENS/personalization/pers_preprocessed.csv"
path = "/Users/CYNTHIA/Desktop/Somto_Project/archive/personalization/pers_preprocessed.csv"
df = pd.read_csv(path, sep='\t', on_bad_lines='skip')
df['context'] = df['context'].apply(ast.literal_eval)
df.head()

Unnamed: 0.1,Unnamed: 0,userID,clicknewsID,posnewID,rewrite_titles,context,News body,Category,Topic,Headline,Title entity
0,0,NT1,"['N108480', 'N38238', 'N35068', 'N110487', 'N9...",N24110,Legal battle looms over Trump EPA's rule chang...,[Nike faces backlash after pulling 'Betsy Ross...,Democratic state attorney generals and environ...,news,newspolitics,High-stakes legal fight looms over Trump pollu...,{'Trump': 'Donald Trump'}
1,1,NT1,"['N108480', 'N38238', 'N35068', 'N110487', 'N9...",N62769,Wise choices for stylish updating of old homes,[Nike faces backlash after pulling 'Betsy Ross...,We love old houses. Their architectural styles...,lifestyle,lifestylehomeandgarden,The One Thing That Immediately Makes Your Hous...,{}
2,2,NT1,"['N108480', 'N38238', 'N35068', 'N110487', 'N9...",N36186,Verlander may be reconsidering his stance on M...,[Nike faces backlash after pulling 'Betsy Ross...,Justin Verlander made headlines earlier in the...,sports,baseball_mlb,Justin Verlander got 'chewed out' by MLB befor...,"{'Verlander': 'Justin Verlander', 'MLB': 'Nati..."
3,3,NT1,"['N108480', 'N38238', 'N35068', 'N110487', 'N9...",N101669,Infamous o.j. Simpson launching official Twitt...,[Nike faces backlash after pulling 'Betsy Ross...,LOS ANGELES O.J. Simpson launched a Twitter ...,tv,tvnews,OJ Simpson on Twitter: 'I got a little gettin'...,{}
4,4,NT1,"['N108480', 'N38238', 'N35068', 'N110487', 'N9...",N19241,15 year old cori gauff beats Venus Williams at...,[Nike faces backlash after pulling 'Betsy Ross...,"WIMBLEDON, England (AP) Coco Gauff grew up a...",sports,tennis,"Gauff, just 15, shocks 5-time champ Venus, 39,...",{'Venus': 'Venus Williams'}


In [4]:
len(df)

20600

In [5]:
# df["News body"][0]
# entity_dict = ast.literal_eval(df["Title entity"][0])
# entity_str = ', '.join([f"{key} -> {value}" for key, value in entity_dict.items()])
# df["Category"][0]
# df["Topic"][0]
# selected_context = random.sample(df['context'][0], 20)
# context_body = ', '.join(selected_context)
# context_body

In [None]:
prompt = """Role: News Headline Generator
Task: Generate a compelling news headline using the given details.
Please generate only one short, concise and accurate news headline that captures the essence of the news and engages the audience.

News Body: 
{newsbody}

You have these information below to help you.
Category: {category}
Topic: {topic}
Entities: {entity_str}
News Context: {context_body}

Headline: """

Task: Edit this prompt to ensure a better accuracy in our results

In [None]:
prompt = """Role: News Headline Generator
Task: Generate a compelling news headline using the given details.
Please generate only one short, concise and accurate news headline that captures the essence of the news and engages the audience.

News Body: 
{newsbody}

You have these information below to help you.
Category: {category}
Topic: {topic}
Entities: {entity_str}
News Context: {context_body}

Headline: """

In [48]:
def prepare_prompt(row, prompt=prompt):
    try:
        # Evaluating the string representation of the dictionary to an actual dictionary
        entity_dict = ast.literal_eval(row["Title entity"])
        entity_str = ', '.join([f"{key} -> {value}" for key, value in entity_dict.items()])
        
        # Randomly sample context if it's a list and has enough elements to sample from
        if isinstance(row['context'], list) and len(row['context']) >= 5:
            selected_context = random.sample(row['context'], 5)
        else:
            selected_context = row['context'][:5]  # Just take the first 20 if not enough to sample from
        
        context_body = ', '.join(selected_context)
        prompt = prompt.format(
            category=row['Category'],
            topic=row['Topic'],
            entity_str=entity_str, 
            context_body=context_body, 
            newsbody=row['News body']
            )


        return prompt
    except Exception as e:
        print(f"Error processing row: {e}")
        return None

# Applying the function to each row in the DataFrame and storing the results
# df['prompt'] = df.apply(prepare_prompt, axis=1)

In [49]:
start, end = 0, 5
data = []
headline_list = []
ground_truth = []
for index, row in df[start:end].iterrows():
    input = prepare_prompt(row)
    data.append(input)
    headline_list.append(llm.invoke({input}))
    ground_truth.append(row["Headline"])

Error processing row: 'categoryi'
Error processing row: 'categoryi'
Error processing row: 'categoryi'
Error processing row: 'categoryi'


KeyboardInterrupt: 

In [37]:
# df["Headline"][start:end]

In [None]:
# headline = llm.invoke({data[0]}) #llama3.2
# # headline = llm.invoke({data[0]}).split('</think>')[-1].strip() #deepseek
# print(headline)

"Mueller's Team Testimony Delayed as Democrats and Justice Dept Negotiate Access to Former Special Counsel's Deputies"


In [None]:
# headline_pred = llm.invoke({f"Generate one news headline for me. News Body: {df['News body'][99]}\nHeadline: "}) #llama3.2
# # headline_pred = llm.invoke({f"Generate one news headline for me. News Body: {df['News body'][99]}\nHeadline: "}).split('</think>')[-1].strip() #deepseek
# print(headline_pred)

"Mueller to Testify Before Congress Again Despite Barr's Opposition"


In [None]:
headline_list #predicted by llama3.2

['"Trump\'s New Power Plant Rule Sparks Legal Battle, Environmental Groups and Democratic AGs Prepare to Sue Over Climate Change Policies"',
 '"Reviving the Past: Experts Say Fresh Updates Can Modernize Historic Homes"',
 '"Verlander Told Off for Accusing MLB of \'Juicing\' Balls, Sources Say Pitcher\'s Passion Misinterpreted as Ignorance."',
 '"O.J. Simpson Reemerges on Twitter, Promises to Share Thoughts and Opinions Amid Controversy Surrounding His Past"',
 'Here\'s a potential news headline that captures the essence of the story:\n\n"15-Year-Old Coco Gauff Stuns Venus Williams at Wimbledon, Becoming Youngest Competitor to Qualify in Era"']

In [None]:
ground_truth #Found in dataset

['High-stakes legal fight looms over Trump pollution rule',
 'The One Thing That Immediately Makes Your House Look Dated',
 "Justin Verlander got 'chewed out' by MLB before All-Star Game",
 "OJ Simpson on Twitter: 'I got a little gettin' even to do'",
 'Gauff, just 15, shocks 5-time champ Venus, 39, at Wimbledon']

In [41]:
bleu_score, meteor_score, average_f_measure = [], [], []
for index, output in enumerate(headline_list, start=start):
    bleu, meteor, rouge, rouge_f_measure = evaluate_headline_performance(df, index, output)
    bleu_score.append(bleu)
    meteor_score.append(meteor)
    average_f_measure.append(rouge_f_measure)

In [42]:
# Print all scores
print("BLEU Score:", sum(bleu_score)/len(data))
print("METEOR Score:", sum(meteor_score)/len(data))
# print("ROUGE Scores:", rouge_scores)
print("Average ROUGE F-measure:", sum(average_f_measure)/len(data))


BLEU Score: 0.12614993664053448
METEOR Score: 0.16165490096527663
Average ROUGE F-measure: 0.13281967531476924


In [43]:
bleu_score

[0.0, 0.0, 0.2187405715612322, 0.28787878181011267, 0.12413032983132742]

In [44]:
meteor_score

[0.20833333333333334,
 0.04807692307692308,
 0.12396694214876032,
 0.19245426829268292,
 0.23544303797468355]

In [45]:
average_f_measure

[0.13333333333333333,
 0.0634920634920635,
 0.09523809523809523,
 0.20859940400170285,
 0.16343548050865123]

In [14]:
# # COMET (model download and load)
# comet_model = download_model("wmt20-comet-da")  # Download a COMET model, this requires internet
# comet = load_from_checkpoint(comet_model)
# comet_score = comet.predict([{"src": "source text if applicable", "mt": generated_headline, "ref": ground_truth}])
# print("COMET Score:", comet_score['predicted_score'][0])  # Accessing the predicted score from the result

In [15]:
INSPECTOR_PROMPT = f"""You are an objective inspector agent tasked with analyzing a worker's response to a step in a plan using various tools. Your role is to evaluate the response based on its correctness, completeness, and adherence to the task requirements, while understanding the functionality of the tools used.

**Evaluation Criteria**
1. If the response is correct or partially correct despite differences in format, style, or presentation, return '{INSPECTOR_CORRECT_OUTPUT}' only.
2. If the response is fundamentally incorrect, briefly highlight the errors, suggest improvements, and provide a rationale if applicable. Do not provide a solution or complete the task.
3. If the worker says 'This <result> was already tried more than once. Shouldn't you try something different?', return 'This task has already been attempted twice. Move to the next step' only.
4. If the worker's output is an error message, return the message exactly as it appears, without modifications.
5. Consider the limitations of the tools (e.g python_repl_ast) used by the worker, such as not providing step-by-step outputs when the tool does not support this feature.

**General Rules**
- Acknowledge that some tools may output only final results without intermediate steps. Do not penalize the worker if the response is correct according to the tool’s functionality and the task’s requirements.
- Evaluate based on whether the final result meets the task's criteria without penalizing for missing intermediate steps unless explicitly required by the task.
- Ignore variations in formatting, notation, or presentation as long as the response achieves the intended outcome and complies with tool capabilities.
- Do not solve the problem or complete the task; your role is strictly evaluation.

**Feedback Text**
- Be concise and avoid repetition.
- Do not include too general feedbacks, e.g. "The response is fundamentally incorrect."
- Do not suggest how to improve the response.
- Be creative to give feedbacks in detailed and different perspectives.

Use the following format:

FEEDBACK:
"""

NameError: name 'INSPECTOR_CORRECT_OUTPUT' is not defined

In [None]:
MENTALIST_PROMPT = """You are a planning agent tasked with creating a graph-based plan to achieve the specified objective. Use the following information to construct the plan:
- **Workers:** The individuals who can perform the tasks.
  {members}
- **Tools:** The tools available to each worker. These define their capabilities.
  {tools}

### Instructions for Creating the Plan:
- Structure the plan as a graph of tasks connected by branches.
- Tasks can be executed concurrently where possible.
- Assign each task to an appropriate worker based on their capabilities.
- Do **NOT** include unnecessary steps. Ensure every task contributes directly to the objective.
- Avoid selecting or specifying tools in the task description.

### Special Cases:
- If no worker can perform a task, mark it as **NOT_SOLVABLE**.
- If there are issues with the input, such as missing information or ethical concerns, assign the entire task as **NOT_SOLVABLE** and provide an explanation of the issue.

### Output Format:
Thought: Describe the reasoning and intermediate considerations for constructing the plan.
Plan:
```json
[
    {{"step_id": "1", "step": "Description of the step", "worker": "Assigned worker", "next_step_id": ["2", "3"]}}
    {{"step_id": "2", "step": "Description of the step", "worker": "Assigned worker", "next_step_id": ["4"]}}
    {{"step_id": "3", "step": "Description of the step", "worker": "Assigned worker", "next_step_id": ["5"]}}
    {{"step_id": "4", "step": "Description of the step", "worker": "Assigned worker"}}
    {{"step_id": "5", "step": "Description of the step", "worker": "Assigned worker"}}
]```
"""