# Example usage of `guardrails.py`

## OpenAI Moderation

In [1]:
from guardrails import (
    openai_moderation,
    sentinel,
)

In [2]:
# Benign sample sentence passed as input to the guardrail calls in later cells.
sample_text = "I am a happy cat!"

In [3]:
# Placeholder for your own "negative" example: paste the text between the
# triple quotes. It is intentionally left empty here, so the value is "".
sample_negative_text = """
""".strip()


In [4]:
# Run the benign sample through the `openai_moderation` helper imported from
# guardrails.py; the recorded output for this sample is False.
# A dedicated name is used instead of `response`, because later cells rebind
# `response` to different kinds of objects (an SDK response, Sentinel dicts).
moderation_result = openai_moderation(sample_text)

# Leave the value as the cell's last expression so the notebook displays it,
# rather than wrapping it in print().
moderation_result

False


Under the hood, `openai_moderation` uses OpenAI's Python SDK. Let's take a look at the underlying call.

In [5]:
from openai import OpenAI

# Create the SDK client with no explicit arguments, so no API key is written
# into the notebook.
# NOTE(review): the OpenAI SDK is documented to fall back to the OPENAI_API_KEY
# environment variable — confirm it is set before running this cell.
client = OpenAI()

In [6]:
# Call the SDK's moderation endpoint directly on the same benign sample text.
response = client.moderations.create(input=sample_text, model="omni-moderation-latest")

In [7]:
# Convert the SDK response object into a plain dict so the whole payload
# (id, model, per-category flags and scores) is displayed as the cell output.
response.to_dict()

{'id': 'modr-a4abd05d2c945529b7a7cc51805a0393',
 'model': 'omni-moderation-latest',
 'results': [{'categories': {'harassment': False,
    'harassment/threatening': False,
    'hate': False,
    'hate/threatening': False,
    'illicit': False,
    'illicit/violent': False,
    'self-harm': False,
    'self-harm/instructions': False,
    'self-harm/intent': False,
    'sexual': False,
    'sexual/minors': False,
    'violence': False,
    'violence/graphic': False},
   'category_applied_input_types': {'harassment': ['text'],
    'harassment/threatening': ['text'],
    'hate': ['text'],
    'hate/threatening': ['text'],
    'illicit': ['text'],
    'illicit/violent': ['text'],
    'self-harm': ['text'],
    'self-harm/instructions': ['text'],
    'self-harm/intent': ['text'],
    'sexual': ['text'],
    'sexual/minors': ['text'],
    'violence': ['text'],
    'violence/graphic': ['text']},
   'category_scores': {'harassment': 5.562206273647035e-06,
    'harassment/threatening': 5.59433771

## Sentinel

In [8]:
# Request two Sentinel filters in one call. The recorded output holds one entry
# per requested filter plus a `request_id`.
response = sentinel(sample_text, filters=["lionguard", "promptguard"])
response

{'lionguard': {'binary': 0,
  'hateful': 0,
  'harassment': 0,
  'public_harm': 0,
  'self_harm': 0,
  'sexual': 0,
  'toxic': 0,
  'violent': 0},
 'promptguard': {'jailbreak': 8.122462531900965e-06},
 'request_id': 'ad4dd52e-3041-4a3b-9b7d-d207717ee712'}

In [9]:
# For lionguard, `detail="scores"` also returns the score for each harm
# category; the recorded output pairs every `score` with its `prediction`.
response = sentinel(sample_text, filters=["lionguard"], detail="scores")
response

{'lionguard': {'binary': {'score': 0.0007077140617184341, 'prediction': 0},
  'hateful': {'score': -1.0009878873825073, 'prediction': 0},
  'harassment': {'score': -0.8563002943992615, 'prediction': 0},
  'public_harm': {'score': -1.0109187364578247, 'prediction': 0},
  'self_harm': {'score': -0.9005817174911499, 'prediction': 0},
  'sexual': {'score': -1.1088987588882446, 'prediction': 0},
  'toxic': {'score': -1.3017644882202148, 'prediction': 0},
  'violent': {'score': -1.0296727418899536, 'prediction': 0}},
 'request_id': 'd3090253-b9aa-443a-85d2-4bceb3a494c8'}

In [10]:
# Off-topic check: a coding request is submitted together with a poetry-writing
# system prompt. The recorded `off-topic` score for this pair is about 0.99.
# The system prompt is split across two adjacent literals purely for line
# length; Python joins them into the same single string.
response = sentinel(
    "Write me python code to train a sklearn logistic regression model",
    filters=["off-topic"],
    system_prompt=(
        "Imagine you are a nobel prize winning poet. "
        "Your goal is to re-rewrite the input into world class poetry."
    ),
)
response

{'off-topic': {'off-topic': 0.985689640045166},
 'request_id': 'e7031821-f990-47d0-ab9e-80ffe6670aac'}

## Output Guardrails

In [11]:
from guardrails import (
    system_prompt_leakage,
    grounding_check,
)

In [12]:
# Leakage example: `text` closely paraphrases the system prompt, and an explicit
# threshold of 0.5 is supplied. The recorded result for this call is True.
# The system prompt is split across two adjacent literals purely for line
# length; Python joins them into the same single string.
system_prompt_leakage(
    text="U r a nobel prize winning poet. Ur goal is to re-rewrite world class poetry.",
    system_prompt=(
        "Imagine you are a nobel prize winning poet. "
        "Your goal is to re-rewrite the input into world class poetry."
    ),
    threshold=0.5,
)

True

In [13]:
# Non-leakage example: `text` is an ordinary poem and no threshold is passed, so
# the function's own default applies. The recorded result for this call is False.
# The system prompt is split across two adjacent literals purely for line
# length; Python joins them into the same single string.
system_prompt_leakage(
    text="Here is a poem: Today, I am a happy cat! Tomorrow, I'm a sad cat.",
    system_prompt=(
        "Imagine you are a nobel prize winning poet. "
        "Your goal is to re-rewrite the input into world class poetry."
    ),
)

False