In [146]:
import pandas as pd
from pathlib import Path

import seaborn as sns
import matplotlib.pyplot as plt

from typing import List

In [147]:
def read_prompts(target_experiment: str, tasks: List[str]) -> pd.DataFrame:
    """Load the per-run prompt logs of one experiment into a single frame.

    Every ``*.csv`` below ``../logs/<target_experiment>`` is considered. A file
    is skipped when its path *relative to the experiment directory* contains
    ``best_scores`` or contains none of the ``tasks`` names. Matching against
    the relative path (instead of the full path) keeps the experiment name
    itself from triggering a match, e.g. ``"cr"`` inside ``"..._task_descr"``.

    Run metadata is parsed from that relative path without the ``.csv``
    suffix, which must look like
    ``<task>_<optimizer>_<meta_llm>_<evaluation_llm>_<random_seed>``.

    Args:
        target_experiment: Name of the experiment directory under ``../logs``.
        tasks: Task names; a file is kept when any of them occurs as a
            substring of its relative path.

    Returns:
        All matching logs stacked row-wise (each file keeps its own row
        labels), extended by the string columns ``task``, ``optimizer``,
        ``meta_llm``, ``evaluation_llm`` and ``random_seed``. An empty
        ``DataFrame`` when no file matched.

    Raises:
        ValueError: If a run name does not split into exactly five
            ``_``-separated fields.
    """
    experiment_dir = Path(f"../logs/{target_experiment}")

    frames = []
    for csv_path in experiment_dir.rglob("*.csv"):
        # Separator-agnostic replacement for stripping a hard-coded
        # "..\\logs\\<experiment>\\" prefix, which only worked on Windows.
        run_name = str(csv_path.relative_to(experiment_dir).with_suffix(""))

        if "best_scores" in run_name or not any(task in run_name for task in tasks):
            continue

        fields = run_name.split("_")
        if len(fields) != 5:
            raise ValueError(
                f"Cannot parse run metadata from {str(csv_path)!r}: expected "
                "'<task>_<optimizer>_<meta_llm>_<evaluation_llm>_<random_seed>', "
                f"got {len(fields)} '_'-separated field(s)."
            )
        task_name, optimizer, meta_llm, evaluation_llm, random_seed = fields

        # Scalars broadcast to every row of the log; values stay strings.
        frames.append(
            pd.read_csv(csv_path).assign(
                task=task_name,
                optimizer=optimizer,
                meta_llm=meta_llm,
                evaluation_llm=evaluation_llm,
                random_seed=random_seed,
            )
        )

    if not frames:
        return pd.DataFrame()

    # Concatenate once instead of growing a frame inside the loop. The list is
    # reversed because each newly read file used to be placed in front of the
    # rows collected so far.
    return pd.concat(frames[::-1], axis=0)


def read_best_scores(target_experiment: str):
    """Read the ``best_scores.csv`` table of one experiment.

    Args:
        target_experiment: Name of the experiment directory under ``../logs``.

    Returns:
        The parsed contents of ``../logs/<target_experiment>/best_scores.csv``
        as a ``DataFrame``.
    """
    best_scores_csv = f"../logs/{target_experiment}/best_scores.csv"
    best_scores = pd.read_csv(best_scores_csv)
    return best_scores

In [148]:
# Load ../logs/experiment_eval/best_scores.csv.
df1 = read_best_scores("experiment_eval")

In [149]:
# Load ../logs/experiment-initial-prompts/best_scores.csv.
df2 = read_best_scores("experiment-initial-prompts")

In [150]:
# Stack both best-score tables row-wise. Each table keeps its own row labels,
# so the same label can appear twice in `df`.
df = pd.concat([df1, df2])

In [151]:
# Keep only rows evaluated with the 70B downstream model. The explicit copy
# makes `df` an independent frame, so later `.loc` assignments on it are
# unambiguous and do not raise SettingWithCopyWarning.
df = df[df["downstream_llm"] == r"meta-llama/Meta-Llama-3-70B-Instruct"].copy()

In [152]:
# Swap the long identifiers for short display labels, one (column, old, new)
# rule at a time; values without a rule are left as they are.
for column, raw_label, short_label in (
    ("meta_llm", r"meta-llama\Meta-Llama-3-70B-Instruct", "Llama70B"),
    ("meta_llm", r"meta-llama\Meta-Llama-3-8B-Instruct", "Llama8B"),
    ("optimizer", "evopromptde", "DE"),
    ("optimizer", "evopromptga", "GA"),
):
    df.loc[df[column] == raw_label, column] = short_label

In [153]:
# Do not truncate long prompt strings in the rendered table.
pd.set_option("display.max_colwidth", None)
# "trec" rows, worst test score first.
(
    df.loc[df["task"] == "trec"]
    .sort_values(by="test_score")
    .loc[:, ["meta_llm", "optimizer", "prompt", "test_score"]]
)

Unnamed: 0,meta_llm,optimizer,prompt,test_score
76,Llama8B,DE,"Let's follow the instructions step-by-step to generate a better prompt.\r\n\r\n1. Identify the different parts between Prompt 1 and Prompt 2:\r\n\r\nPrompt 1: Your task is to choose a type of the question, from Description, Entity, Expression, Human, Location and Number.\r\nPrompt 2: You are given a question. You need to detect which category better describes the question. Answer with ""Description"", ""Entity"", ""Expression"", ""Human"", ""Location"", and ""Number"".\r\n\r\nDifferent parts:\r\n\r\n* ""Your task is to choose"" vs ""You are given a question and need to detect""\r\n* ""a type of the question"" vs ""which category""\r\n* ""Description, Entity, Expression, Human, Location and Number"" vs not changed\r\n\r\n2. Randomly mutate the different parts:\r\n\r\n* ""Your task is to choose"" -> ""The goal is to determine""\r\n* ""a type of the question"" -> ""the nature of the inquiry""\r\n* ""Description, Entity, Expression, Human, Location and Number"" -> ""categories: Description, Entity, Expression, Human, Location, and Number Type""\r\n\r\n3. Crossover the different parts with Prompt 3 and generate a final prompt:\r\n\r\nPrompt 3: Identify the category that corresponds to this sentence:",0.27
41,,,"Please select the correct classification for this sentence: Description, Entity, Expression, Human, Location, or Number.",0.33
73,Llama70B,DE,"You are required to categorize the given statement into its correct category: Description, Entity, Expression, Human, Location, or Number.",0.43
77,Llama8B,DE,"Determine the relevant category for this text based on the categories: Description, Entity, Expression, Human, Location, or Number.",0.44
81,Llama8B,GA,"Assign the given sentence to one of the six categories (Description, Entity, Expression, Human, Location, or Number) and indicate the corresponding question type.",0.515
40,,,"Determine the type of the given question and choose from Description, Entity, Expression, Human, Location and Number.",0.595
80,Llama70B,GA,"Categorize the question into one of the six types - Description, Entity, Expression, Human, Location, or Number - and provide the relevant label.",0.62
78,Llama70B,GA,"Identify the most suitable category (Description, Entity, Expression, Human, Location, or Number) for the provided text or question.",0.63
83,Llama8B,GA,"Classify an English question into its corresponding category from the list ['Description', 'Entity', 'Expression', 'Human', 'Location', 'Number'] without providing additional information.",0.66
74,Llama70B,DE,"As you examine the question, your task is to choose a type, from Description, Entity, Expression, Human, Location, or Number, by analyzing the input.",0.665


Please select the correct classification for this sentence: Description, Entity, Expression, Human, Location, or Number.	=> 0.330

Please perform Question Classification task. Given the question, assign a label from ['Description', 'Entity', 'Expression', 'Human', 'Location', 'Number']. Return label only without any other text	0.840

CAUTION: I believe this prompt was not optimized by us but comes from the initial population.

TIMO, ESPECIALLY IMPORTANT:

Let's follow the instructions step-by-step to generate a better prompt.\r\n\r\n1. Identify the different parts between Prompt 1 and Prompt 2:\r\n\r\nPrompt 1: Your task is to choose a type of the question, from Description, Entity, Expression, Human, Location and Number.\r\nPrompt 2: You are given a question. You need to detect which category better describes the question. Answer with "Description", "Entity", "Expression", "Human", "Location", and "Number".\r\n\r\nDifferent parts:\r\n\r\n* "Your task is to choose" vs "You are given a question and need to detect"\r\n* "a type of the question" vs "which category"\r\n* "Description, Entity, Expression, Human, Location and Number" vs not changed\r\n\r\n2. Randomly mutate the different parts:\r\n\r\n* "Your task is to choose" -> "The goal is to determine"\r\n* "a type of the question" -> "the nature of the inquiry"\r\n* "Description, Entity, Expression, Human, Location and Number" -> "categories: Description, Entity, Expression, Human, Location, and Number Type"\r\n\r\n3. Crossover the different parts with Prompt 3 and generate a final prompt:\r\n\r\nPrompt 3: Identify the category that corresponds to this sentence:	0.270

In [154]:
# Do not truncate long prompt strings in the rendered table.
pd.set_option("display.max_colwidth", None)
# "subj" rows, worst test score first.
(
    df.loc[df["task"] == "subj"]
    .sort_values(by="test_score")
    .loc[:, ["meta_llm", "optimizer", "prompt", "test_score"]]
)

Unnamed: 0,meta_llm,optimizer,prompt,test_score
36,,,"classify each sentence as either ""objective"" or ""subjective"".",0.465
38,,,evaluate each sentence as either objective or subjective.,0.545
65,Llama8B,DE,"Determine whether the given text is expressing a subjective or objective sentiment and assign a label from ['subjective', 'objective'] using the provided instruction.",0.57
70,Llama8B,GA,"Examiner, categorize movie reviews as objective or subjective, pinpointing their level of neutrality, and provide a detailed breakdown of each category, highlighting its distinct characteristics.",0.59
37,,,"Your task is to classify the comment ""subjective"" or ""objective"".",0.61
68,Llama70B,GA,evaluate the given sentences and determine whether they are subjective or objective.,0.615
66,Llama70B,GA,"Determine whether the provided statement is objective, conveying factual information, or subjective, expressing a personal viewpoint or bias.",0.68
69,Llama8B,GA,"Determine the tone of the input text, classifying it as objective or subjective by identifying and explaining the linguistics features that contribute to its emotional or informative nature.",0.685
64,Llama8B,DE,identify whether the given sentence was expressing an objective or a subjective opinion.,0.695
62,Llama70B,DE,"Considering its content, identify the nature of a passage as expressing a subjective or objective opinion from its wording.",0.695


classify each sentence as either "objective" or "subjective".	0.465

Assess the given sentence and determine whether it is into subjective or objective opinion, evaluating the given sentences and their sentiment.	0.785

In [155]:
# Do not truncate long prompt strings in the rendered table.
pd.set_option("display.max_colwidth", None)
# "cr" rows, worst test score first.
(
    df.loc[df["task"] == "cr"]
    .sort_values(by="test_score")
    .loc[:, ["meta_llm", "optimizer", "prompt", "test_score"]]
)

Unnamed: 0,meta_llm,optimizer,prompt,test_score
22,Llama8B,GA,"Determine the sentiment of the input text, categorizing it as positive, negative, optimistic, pessimistic, or neutral.",0.805
14,Llama70B,DE,"As a sentiment analyzer, evaluate the input",0.825
12,Llama70B,DE,"As a sentiment classifier, examine the user review statement and identify the sentiment orientation as either positive or negative, while understanding the meaning and any relevant context.",0.855
23,Llama8B,GA,Evaluate the sentiment of the provided statement and categorize it as either a 'positive' or 'negative' sentiment.,0.87
17,Llama8B,DE,"Examine the statement and determine the emotional resonance of the text, evaluating whether it belongs to positive sentiment or a negative opinion.",0.885
19,Llama70B,GA,"Given a sentence, classify it as either positive or negative sentiment.",0.905
15,Llama8B,DE,"Given a sentence, classify it as either positive or negative sentiment.",0.91
21,Llama8B,GA,"Identify the sentiment of the input text and determine its emotional tone as either 'positive' or 'negative', taking into account the nuances of the text and its context.",0.915
26,,,"Given a tweet, classify it as having a positive or negative sentiment.",0.915
20,Llama70B,GA,"Classify the sentiment of the provided sentence as either ""positive"" or ""negative"".",0.925


Determine the sentiment of the input text, categorizing it as positive, negative, optimistic, pessimistic, or neutral.	0.585

Label the provided sentence with either "positive" or "negative" sentiment.	0.945

In [156]:
# Do not truncate long prompt strings in the rendered table.
pd.set_option("display.max_colwidth", None)
# "agnews" rows, worst test score first.
(
    df.loc[df["task"] == "agnews"]
    .sort_values(by="test_score")
    .loc[:, ["meta_llm", "optimizer", "prompt", "test_score"]]
)

Unnamed: 0,meta_llm,optimizer,prompt,test_score
7,Llama70B,GA,"Determine the primary theme of a given news article and classify it under one of the four categories: World, Sports, Tech, or Business.",0.845
8,Llama70B,GA,"Determine the most suitable category (World, Sports, Business, or Tech) for a news article based on its dominant subject matter.",0.845
23,,,"Give the main topic of the news article and then choose from World, Sports, Tech and Business.",0.845
3,Llama8B,DE,"Choose a word from World, Sports, Business and Tech to categorize the given text.",0.85
6,Llama70B,GA,"Determine the primary subject of the provided news article and classify it under one of the following categories: World, Sports, Business, or Tech.",0.855
2,Llama70B,DE,"The goal is to identify the journal article according to its primary theme and determine whether it belongs to the World, Sports, Business, or Tech category.</prompt",0.855
5,Llama8B,DE,"Give the main topic of the news article and then choose from World, Sports, Tech and Business.",0.855
4,Llama8B,DE,"Classify the media report into one of the below-listed sections: World, Sports, Business, or Tech, considering the main topic.",0.86
22,,,"Choose a word from World, Sports, Business and Tech to categorize the given text.",0.87
1,Llama70B,DE,"Your task is to identify the subject of the news piece and classify it into one of four categories: World, Sports, Business and Tech.",0.875


Give the main topic of the news article and then choose from World, Sports, Tech and Business.	0.845

Classify the topic of the following news as "World", "Sports", "Tech" or "Business".	0.890

In [157]:
# Do not truncate long prompt strings in the rendered table.
pd.set_option("display.max_colwidth", None)
# "mr" rows, worst test score first.
(
    df.loc[df["task"] == "mr"]
    .sort_values(by="test_score")
    .loc[:, ["meta_llm", "optimizer", "prompt", "test_score"]]
)

Unnamed: 0,meta_llm,optimizer,prompt,test_score
28,Llama8B,DE,Analyze the phrase and categorize its emotional tone into one of the following labels: positive or negative as a sentiment classifier.,0.77
34,Llama8B,GA,"Determine the sentiment of the input, classifying it as positive or negative emotional tone.",0.785
31,Llama70B,GA,"Classify the sentiment of the provided sentence or review as either ""positive"" or ""negative"", indicating the attitude towards the subject.",0.835
29,,,"Given a tweet, classify it as having a positive or negative sentiment.",0.845
33,Llama8B,GA,Assess the given text and categorize it as either a positive or negative sentiment.,0.85
29,Llama8B,DE,"Given a statement, classify it as expressing a positive or negative opinion.",0.885
24,Llama70B,DE,"Your task is to classify the comment ""positive"" or ""negative"".",0.895
25,Llama70B,DE,"Given an online message, your task is to classify it as expressing a ""positive"" or ""negative"" opinion, considering whether it is written with a favorable or unfavorable attitude.",0.915
32,Llama70B,GA,"Please perform Sentiment Classification task. Given the sentence, assign a sentiment label from ['negative', 'positive']. Return label only without any other text",0.915
35,Llama8B,GA,"Please perform Sentiment Classification task. Given the sentence, assign a sentiment label from ['negative', 'positive']. Return label only without any other text",0.92


Analyze the phrase and categorize its emotional tone into one of the following labels: positive or negative as a sentiment classifier.	0.770

Your task is to classify the comment "positive" or "negative".	0.925

In [158]:
# Do not truncate long prompt strings in the rendered table.
pd.set_option("display.max_colwidth", None)
# "sst-5" rows, worst test score first.
(
    df.loc[df["task"] == "sst-5"]
    .sort_values(by="test_score")
    .loc[:, ["meta_llm", "optimizer", "prompt", "test_score"]]
)

Unnamed: 0,meta_llm,optimizer,prompt,test_score
32,,,"Your objective is to analyze the movie review and allocate it to one of five categories, from terrible to great.",0.04
39,Llama8B,DE,"Analyze the movie criticism provided to you into one of five categories based on the sentiment: terrible, bad, okay, good, or great, while considering the context and tone of the movie.",0.35
40,Llama8B,DE,"Analyze the given text and assign it to one of the following categories: terrible, bad, okay, good, or great, considering the relevant context.",0.365
41,Llama8B,DE,"Evaluate the text provided and categorize movie reviews into one of the following categories: terrible, bad, okay, good, or great.",0.47
45,Llama8B,GA,"Evaluate the emotional tone of the text and categorize it into one of five sentiment categories (terrible, bad, okay, good, or great) based on the presence of positive and negative sentiments, providing a precise and nuanced classification.",0.5
37,Llama70B,DE,"Classify the movie review provided to you into one of five categories based on the sentiment: terrible, bad, okay, good, or great.",0.505
47,Llama8B,GA,"Identify the sentiment of the given text and categorize it as 'terrible', 'bad', 'okay', 'good', or 'great' based on its tone and language, outputting the corresponding sentiment label.",0.52
31,,,"In this task, you are given movie reviews. Based on it, classify it to one of the ﬁve classes: (1) terrible, (2) bad, (3) okay, (4) good, and (5) great.",0.535
36,Llama70B,DE,"Based on the given movie review, rate it into one of five ratings based on the sentiment: terrible, bad, okay, good, or great.",0.56
43,Llama70B,GA,"Assign a sentiment label ('terrible', 'bad', 'okay', 'good', or 'great') to the provided movie review, reflecting the overall emotional tone of the text.",0.56


Your objective is to analyze the movie review and allocate it to one of five categories, from terrible to great.	0.040

Classify the provided comment according to its sentiment intensity, assigning a label from ['terrible', 'bad', 'okay', 'good', 'great'] without providing additional context.	0.615

THIS IS WHY THE STANDARD DEVIATION IS SO HIGH!

In [159]:
# Do not truncate long prompt strings in the rendered table.
pd.set_option("display.max_colwidth", None)
# "sst2" rows, worst test score first.
(
    df.loc[df["task"] == "sst2"]
    .sort_values(by="test_score")
    .loc[:, ["meta_llm", "optimizer", "prompt", "test_score"]]
)

Unnamed: 0,meta_llm,optimizer,prompt,test_score
50,Llama70B,DE,You will be responsible for evaluating the emotional tone in the input message and classify it as expressing a positive or negative opinion.,0.835
59,Llama8B,GA,"Determine the emotional tone of the given text by deciphering its meaning and context, and categorize it as either positive or negative sentiment.",0.855
35,,,"Given a tweet, classify it as having a positive or negative sentiment.",0.89
49,Llama70B,DE,"Given a sentence, classify it as either positive or negative sentiment.",0.895
58,Llama8B,GA,"Evaluate the emotional tone and sentiment of the provided text, categorizing its emotional connotation as 'strongly positive', 'positive', 'neutral', 'negative', or 'strongly negative', and provide a nuanced intensity level if needed, or 'positive' or 'negative' if the sentiment is straightforward.",0.905
56,Llama70B,GA,"Identify the emotional tone of the text, categorizing it as either ""positive"" or ""negative"" sentiment.",0.91
53,Llama8B,DE,Analyze a review and classify it as expressing a positive or negative opinion.,0.91
57,Llama8B,GA,"Given a tweet, classify it as having a positive or negative sentiment.",0.915
48,Llama70B,DE,"Your task is to classify the comment ""positive"" or ""negative"".",0.945
34,,,"Your task is to classify the comment ""positive"" or ""negative"".",0.945


You will be responsible for evaluating the emotional tone in the input message and classify it as expressing a positive or negative opinion.	0.835

Examine the review and classify it as having a positive or negative sentiment, while considering the tone and context.	0.950

In [160]:
# Distinct task names present in `df`, in order of first appearance.
df.loc[:, "task"].unique()

array(['agnews', 'cr', 'mr', 'sst-5', 'sst2', 'subj', 'trec'],
      dtype=object)

In [163]:
# Load ../logs/experiment_eval_task_descr/best_scores.csv.
df1 = read_best_scores("experiment_eval_task_descr")

# Filter to the 70B downstream model and shorten the labels once, on an
# explicit copy: `df1` itself stays unmodified and the `.loc` assignments below
# never write into a slice of another frame (no SettingWithCopyWarning).
scores_70b = df1[df1["downstream_llm"] == r"meta-llama/Meta-Llama-3-70B-Instruct"].copy()
for column, raw_label, short_label in (
    ("meta_llm", r"meta-llama\Meta-Llama-3-70B-Instruct", "Llama70B"),
    ("meta_llm", r"meta-llama\Meta-Llama-3-8B-Instruct", "Llama8B"),
    ("optimizer", "evopromptde", "DE"),
    ("optimizer", "evopromptga", "GA"),
):
    scores_70b.loc[scores_70b[column] == raw_label, column] = short_label

# One printed table per task found in `df1`, worst test score first. A task
# without any 70B-downstream row prints an empty frame.
for task in df1["task"].unique():
    df = scores_70b[scores_70b["task"] == task]

    print(df.sort_values("test_score")[["meta_llm", "optimizer", "prompt", "test_score"]])

  meta_llm optimizer  \
3  Llama8B        GA   
5  Llama8B        GA   
4  Llama8B        GA   
2  Llama8B        DE   
0  Llama8B        DE   
1  Llama8B        DE   

                                                                                                                                                                                    prompt  \
3                                                                                                        Choose a word from World, Sports, Business and Tech to categorize the given text.   
5                                                    Classify the given news article into one of the four main categories: World, Sports, Business, or Tech, based on the article's topic.   
4  Classify the given news article into one of the four categories ['World', 'Sports', 'Business', or 'Tech'] based on its primary theme and main topic, ensuring accurate categorization.   
2                                                 The objective is to a

In [162]:
# Display the whole `df1` frame.
# NOTE(review): this cell's execution count (162) is lower than that of the
# cell before it (163) — re-run top to bottom to confirm which `df1` is shown.
df1

Unnamed: 0,task,optimizer,meta_llm,downstream_llm,evaluation_llm,random_seed,prompt,train_score,test_score
0,agnews,evopromptde,meta-llama\Meta-Llama-3-8B-Instruct,meta-llama/Meta-Llama-3-70B-Instruct,meta-llama\Meta-Llama-3-8B-Instruct,42,"You will be given a news article and asked to classify it as World, Sports, Business and Tech, depending on its main topic.",0.95,0.885
1,agnews,evopromptde,meta-llama\Meta-Llama-3-8B-Instruct,meta-llama/Meta-Llama-3-70B-Instruct,meta-llama\Meta-Llama-3-8B-Instruct,47,"Your task is to classify the news item as ""World"", ""Sports"", ""Tech"" or ""Business"".",0.9,0.89
2,agnews,evopromptde,meta-llama\Meta-Llama-3-8B-Instruct,meta-llama/Meta-Llama-3-70B-Instruct,meta-llama\Meta-Llama-3-8B-Instruct,69,"The objective is to assign a news article to one of the following categories: World, Sports, Business, or Tech, based on its main topic.",1.0,0.88
3,agnews,evopromptga,meta-llama\Meta-Llama-3-8B-Instruct,meta-llama/Meta-Llama-3-70B-Instruct,meta-llama\Meta-Llama-3-8B-Instruct,42,"Choose a word from World, Sports, Business and Tech to categorize the given text.",1.0,0.83
4,agnews,evopromptga,meta-llama\Meta-Llama-3-8B-Instruct,meta-llama/Meta-Llama-3-70B-Instruct,meta-llama\Meta-Llama-3-8B-Instruct,47,"Classify the given news article into one of the four categories ['World', 'Sports', 'Business', or 'Tech'] based on its primary theme and main topic, ensuring accurate categorization.",1.0,0.88
5,agnews,evopromptga,meta-llama\Meta-Llama-3-8B-Instruct,meta-llama/Meta-Llama-3-70B-Instruct,meta-llama\Meta-Llama-3-8B-Instruct,69,"Classify the given news article into one of the four main categories: World, Sports, Business, or Tech, based on the article's topic.",0.95,0.87
6,cr,evopromptde,meta-llama\Meta-Llama-3-8B-Instruct,meta-llama/Meta-Llama-3-70B-Instruct,meta-llama\Meta-Llama-3-8B-Instruct,42,"Consider customer reviews and analyze them to determine their emotional tone, classifying them as expressing either positive or negative sentiment.",0.95,0.785
7,cr,evopromptde,meta-llama\Meta-Llama-3-8B-Instruct,meta-llama/Meta-Llama-3-70B-Instruct,meta-llama\Meta-Llama-3-8B-Instruct,47,"As a sentiment classifier, examine the text passage for sentiment in customer reviews, by assessing the overall emotional direction, and classify the expression as either positive or negative.",0.95,0.855
8,cr,evopromptde,meta-llama\Meta-Llama-3-8B-Instruct,meta-llama/Meta-Llama-3-70B-Instruct,meta-llama\Meta-Llama-3-8B-Instruct,69,"You will be tasked with analyzing text to determine its emotional tone, identifying whether it expresses a positive or negative sentiment, while considering the broader context.",1.0,0.855
9,cr,evopromptga,meta-llama\Meta-Llama-3-8B-Instruct,meta-llama/Meta-Llama-3-70B-Instruct,meta-llama\Meta-Llama-3-8B-Instruct,42,"Classify this customer review as expressing either a ""positive"" or ""negative"" sentiment, analyzing its tone and content.",1.0,0.94
