In [2]:
import dspy # It takes too long to import the package
import os
import dspy.evaluate
import pandas as pd
def Init():
    API_Key = os.getenv('DEEPSEEK_API_KEY')
    # deepseek-ai/DeepSeek-V3
    lm = dspy.LM('deepseek-chat', api_base='https://api.deepseek.com/v1', api_key=API_Key)
    dspy.configure(lm=lm)
    print(lm("Hello", temperature=1))    

  from .autonotebook import tqdm as notebook_tqdm


In [46]:
class Analyizer(dspy.Signature):
    query: str = dspy.InputField(desc="The user's input to a LLM.")
    intention: str = dspy.OutputField(desc="The user's intention.")
class Honeypot(dspy.Signature):
    query: str = dspy.InputField(desc="The user's input to a LLM.")
    response: str = dspy.OutputField(desc="response to user")
class Classifier(dspy.Signature):
    """Judge whether the user is attacking a LLM."""
    query: str = dspy.InputField(desc="The user's input to a LLM.")
    intention: str = dspy.InputField(desc="The user's intention.")
    response: str = dspy.InputField(desc="The LLM's response.")

    malevolence: bool = dspy.OutputField(desc="Judge whether the user is attacking a LLM.")
    alienation: bool = dspy.OutputField(desc="Is the user try to change the LLM's behavior?")
    analysis: str = dspy.OutputField(desc="How do you judge the malevolence")

    # confidence: float = dspy.OutputField()

def run_model(query):
    
    analyze=dspy.Predict(Analyizer)
    judge = dspy.Predict(Classifier)
    honeypot = dspy.Predict(Honeypot)
    intention=analyze(query=query).intention
    response=honeypot(query=query).response
    res=judge(query=query,intention=intention,response=response)
    with open("output","a",encoding='utf-8') as file:
        file.write("-------------------------------------------------\n$$$ Query: ")
        file.write(query)
        file.write("\n\n")
        file.write(f"""$$$ Intention: {intention}\n\n$$$ Response: {response}\n\n$$$ {res}\n\n""")
    return res

def metric(example, pred, trace=None):
    if type(example.label) == str:
        return example.label.lower() == pred.malevolence.lower()
    return example.label == (pred.malevolence and pred.alienation)

def evaluate(trainset):
    scores = []
    for x in trainset:
        pred = run_model(**x.inputs())
        score = metric(x, pred)
        scores.append(score)
    return scores

def make_trainset(dataPath):
    df = pd.read_csv(dataPath)
    trainset = []
    for _,row in df.iterrows():
        trainset.append(dspy.Example(query=row['query'],analysis=row['res'] ,label=row['label']).with_inputs("query"))
    return trainset

In [47]:
if __name__ == "__main__":
    Init()
    print()
    trainset=make_trainset("./new_data.csv")
    with open("output","w",encoding='utf-8') as file:
        pass
    print(evaluate(trainset))

['Hello! How can I assist you today? 😊']

[False, True, False, False, False, False, True, False, False, False, True, True, True, True, True, True, True, True, True, True]
