In [None]:
pip install -r requirements.txt

In [None]:
import os

import PyPDF2
import faiss
import numpy as np
import requests
import torch
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModel, pipeline

# Set up API for LLM query
API_URL = "https://api-inference.huggingface.co/models/mistralai/Mistral-7B-Instruct-v0.3"
# Prefer a token supplied through the HF_TOKEN environment variable so a real
# credential never has to be pasted into (and saved with) this notebook. The
# placeholder is only used when the variable is unset or empty.
HF_TOKEN = os.environ.get("HF_TOKEN") or "paste_your_hf_access_key"
headers = {"Authorization": f"Bearer {HF_TOKEN}"}

# Load embedding model
# Created once at module level; the functions below that call
# embed_model.encode() (index building and query retrieval) share this instance.
embed_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Step 1: Extract text from PDF
def extract_text_from_pdf(pdf_path):
    """Return the text of every page in a PDF, one page per line break.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A single string made of each page's extracted text followed by a
        newline, in page order.
    """
    with open(pdf_path, "rb") as file:
        reader = PyPDF2.PdfReader(file)
        # `or ""` guards against a falsy result (e.g. None) for a page with no
        # extractable text, so one such page cannot abort the whole document.
        # NOTE(review): whether None can occur depends on the installed PyPDF2.
        page_texts = [(page.extract_text() or "") + "\n" for page in reader.pages]
    # Join once instead of growing a string with += for every page.
    return "".join(page_texts)

# Step 2: Split text into chunks
def split_text(text, chunk_size=500):
    """Break text into consecutive chunks of at most `chunk_size` words.

    Words are whatever `str.split()` yields (whitespace-separated tokens);
    each chunk is those words re-joined with single spaces. The last chunk
    may be shorter than `chunk_size`.
    """
    tokens = text.split()
    pieces = []
    for start in range(0, len(tokens), chunk_size):
        window = tokens[start:start + chunk_size]
        pieces.append(" ".join(window))
    return pieces

# Step 3: Store embeddings in FAISS
def build_faiss_index(chunks):
    """Embed text chunks and store the vectors in a flat L2 FAISS index.

    Args:
        chunks: Non-empty sequence of text chunks to index.

    Returns:
        A tuple ``(index, embeddings, chunks)``: the populated
        ``faiss.IndexFlatL2``, the float32 embedding matrix that was added to
        it (one row per chunk), and the chunks that were passed in.

    Raises:
        ValueError: If `chunks` is empty, since there is nothing to index.
    """
    if not chunks:
        raise ValueError("Cannot build a FAISS index from an empty list of chunks.")

    # One batched encode call instead of one model call per chunk.
    embeddings = np.ascontiguousarray(embed_model.encode(list(chunks)), dtype="float32")

    # Size the index from the vectors the model actually produced rather than
    # a hard-coded width, so swapping the embedding model keeps working.
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(embeddings)
    return index, embeddings, chunks

# Step 4: Retrieve relevant text based on query
def retrieve_relevant_chunks(query, index, chunks, embeddings, top_k=3):
    """Return the chunks whose embeddings are nearest to the query.

    Args:
        query: Question text to embed and search with.
        index: FAISS index whose row order matches `chunks`.
        chunks: Text chunks that were added to `index`.
        embeddings: Unused here; kept so existing callers keep working.
        top_k: Maximum number of chunks to return.

    Returns:
        Up to `top_k` chunks, in the order FAISS returns them. The list is
        empty when there are no chunks or `top_k` is not positive.
    """
    # Never ask for more neighbours than there are chunks: FAISS pads missing
    # results with -1, and chunks[-1] would silently return the last chunk.
    k = min(top_k, len(chunks))
    if k <= 0:
        return []

    query_embedding = embed_model.encode(query).astype('float32')
    _, idxs = index.search(np.array([query_embedding]), k)
    # Belt and braces: drop any padding or out-of-range label.
    return [chunks[i] for i in idxs[0] if 0 <= i < len(chunks)]

# Step 5: Query LLM with retrieved text
def query_llm(context, query, timeout=120):
    """Send a context/question prompt to the hosted LLM and return its text.

    Args:
        context: Background text placed under the "Context:" header.
        query: Question placed under the "Question:" header.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The ``generated_text`` field of the first item in the JSON reply.

    Raises:
        requests.HTTPError: If the endpoint answers with an error status.
        requests.Timeout: If no response arrives within `timeout` seconds.
        RuntimeError: If the reply is not a list whose first item carries
            ``generated_text`` (for example an error object).
    """
    prompt = f"Context: {context}\n\nQuestion: {query}\n\nAnswer:"
    payload = {
        "inputs": prompt,
        "parameters": {"max_new_tokens": 300, "temperature": 0.7},
    }
    # Without a timeout a stalled connection would hang the notebook forever.
    response = requests.post(API_URL, headers=headers, json=payload, timeout=timeout)
    # Surface HTTP failures directly instead of failing later on the payload.
    response.raise_for_status()

    result = response.json()
    try:
        return result[0]['generated_text']
    except (KeyError, IndexError, TypeError) as err:
        raise RuntimeError(f"Unexpected response from LLM endpoint: {result!r}") from err

# Step 6: Extract rules from response and save to file
def extract_rules(response, filename="rules.txt"):
    """Collect the numbered rules that follow "Answer:" and save them to a file.

    Lines are stripped and scanned in order. Nothing is collected until a
    line containing "Answer:" has been seen; after that, every non-empty line
    whose first character is a digit counts as a rule. If a line starts with
    "Answer:", the text after the marker is examined too, so a reply such as
    "Answer: 1. ..." does not lose its first rule.

    Args:
        response: Full text returned by the LLM.
        filename: File the rules are written to, one per line (UTF-8).
            The file is only written when at least one rule is found.

    Returns:
        The list of rule lines that were found (empty when there are none).
    """
    marker = "Answer:"
    seen_answer = False
    rules = []

    for raw_line in response.split("\n"):
        line = raw_line.strip()

        if marker in line:
            seen_answer = True
            if not line.startswith(marker):
                continue  # Marker buried mid-line: skip the line as before
            # Keep whatever follows the marker on the same line.
            line = line[len(marker):].strip()

        if seen_answer and line and line[0].isdigit():
            rules.append(line)

    if rules:
        with open(filename, "w", encoding="utf-8") as file:
            file.write("\n".join(rules))
        print(f"✅ Rules saved to {filename}")
    else:
        print("❌ No rules found to save.")

    return rules

# Main function to process PDF and ask questions
def process_pdf_and_ask(pdf_path, query):
    """Run the full pipeline for one PDF and one question.

    Extracts and chunks the PDF text, indexes the chunks, retrieves the ones
    closest to `query`, asks the LLM with those chunks as context, saves any
    rules found in the reply via `extract_rules`, and returns the reply.
    """
    pieces = split_text(extract_text_from_pdf(pdf_path))
    faiss_index, vectors, indexed_pieces = build_faiss_index(pieces)
    hits = retrieve_relevant_chunks(query, faiss_index, indexed_pieces, vectors)
    context = "\n".join(hits)
    answer = query_llm(context, query)
    extract_rules(answer)
    return answer

# Example usage
# Runs the whole pipeline when this cell executes: reads the PDF, queries the
# LLM, and (through process_pdf_and_ask -> extract_rules) writes rules.txt.
pdf_path = "/content/FR_Y-14Q_Instructions_draft 2022.pdf"  # Replace with your PDF file path
question = "What are the rules that we can extract from the B.2—Securities 2 (“Investment Securities with Designated Accounting Hedges”)?"

response = process_pdf_and_ask(pdf_path, question)
# Print the raw LLM reply returned by the pipeline.
print(response)


Expected Output

/usr/local/lib/python3.11/dist-packages/huggingface_hub/utils/_auth.py:94: UserWarning:
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
  warnings.warn(
modules.json: 100%
 349/349 [00:00<00:00, 26.4kB/s]
config_sentence_transformers.json: 100%
 116/116 [00:00<00:00, 8.45kB/s]
README.md: 100%
 10.5k/10.5k [00:00<00:00, 642kB/s]
sentence_bert_config.json: 100%
 53.0/53.0 [00:00<00:00, 4.46kB/s]
config.json: 100%
 612/612 [00:00<00:00, 36.2kB/s]
model.safetensors: 100%
 90.9M/90.9M [00:00<00:00, 184MB/s]
tokenizer_config.json: 100%
 350/350 [00:00<00:00, 21.5kB/s]
vocab.txt: 100%
 232k/232k [00:00<00:00, 1.40MB/s]
tokenizer.json: 100%
 466k/466k [00:00<00:00, 2.80MB/s]
special_tokens_map.json: 100%
 112/112 [00:00<00:00, 7.65kB/s]
config.json: 100%
 190/190 [00:00<00:00, 14.8kB/s]
✅ Rules saved to rules.txt
Context: 54 B.1—SECURITIES 1 (“M AIN SCHEDULE ”) ................................ ................................ ................................ ................................ .............. 54 B.2—SECURITIES 2 (“INVESTMENT SECURITIES WITH DESIGNATED ACCOUNTING HEDGES ”) ................................ ................................ .. 60 Schedule C —Regulatory Capital Instruments ................................ ................................ ................................ ............. 64 C.1—REGULATORY CAPITAL AND SUBORDINATED DEBT INSTRUMENTS AS OF QUARTER END ................................ ................................ 64 C.2—REGULATORY CAPITAL AND SUBORDINATED DEBT INSTRUMENT REPURCHASES /REDEMPTIONS DURING QUARTER ....................... 66 C.3 – REGULATORY CAPITAL AND SUBORDINATED DEBT INSTRUMENTS ISSUANCES DURING QUARTER ................................ ................... 68 Schedule D —Regulatory Capital ................................ ................................ ................................ ................................ .... 73 Schedule E —Operational Risk ................................ ................................ ................................ ................................ ........ 78 E.1—OPERATIONAL LOSS HISTORY ................................ ................................ ................................ ................................ ....................... 78 E.2. INTERNAL BUSINESS LINE ................................ ................................ ................................ ................................ .............................. 84 E.3. UNIT-OF-MEASURE (UOM) ................................ ................................ ................................ ................................ ........................... 85 E.4. THRESHOLD INFORMATION ................................ ................................ ................................ ................................ 
............................ 86 E.5—LEGAL RESERVES FREQUENCY ................................ ................................ ................................ ................................ ....................... 87 Schedule F —Trading ................................ ................................ ................................ ................................ ........................ 90 GLOSSARY ................................ ................................ ................................ ................................ ................................ ............................. 93 REGIONAL GROUPINGS ................................ ................................ ................................ ................................ ................................ .......... 95 F.1—EQUITY BY GEOGRAPHY ................................ ................................ ................................ ................................ ............................... 97 F.2—EQUITY SPOT-VOL GRID ................................ ................................ ................................ ................................ .............................. 98 F.3—OTHER EQUITY ................................ ................................ ................................ ................................ ................................ ............ 99 F.4—FX SPOT SENSITIVITIES ................................ ................................ ................................ ................................ .............................. 100 F.5—FX VEGA ................................ ................................ ................................ ................................ ................................ .................... 101 F.6—RATES DV01 ................................ ................................ ................................ ................................ ................................ ............. 
102 F.7—RATES VEGA ................................ ................................ ................................ ................................ ................................ .............. 105 F.8—OTHER RATES ................................ ................................ ................................ ................................ ................................ ............ 106 F.9—ENERGY ................................ ................................ ................................ ................................ ................................ ..................... 107 F.10—METALS ................................ ................................ ................................ ................................ ................................ .................. 108 F.11—AGS & SOFTS ................................ ................................ ................................ ................................ ................................ ........... 109 F.12—COMMODITY INDICES ................................ ................................ ................................ ................................ ............................... 110 F.13—COMMODITY SPOT-VOL GRIDS ................................ ................................ ................................ ................................ ................. 111 F.15—AGENCIES ................................ ................................ ................................ ................................ ................................ ................ 114 F.16—MUNIS ................................ ................................ ................................ ................................ ................................ ..................... 115 F.17—AUCTION RATE SECURITIES (ARS) ................................ ................................ ................................ ................................ .......... 
116 F.18—CORPORATE CREDIT -ADVANCED ................................ ................................ ................................ ................................ .............. 117 F.19—CORPORATE CREDIT -EMERGING MARKETS ................................ ................................ ................................ ............................... 119 F.20—SOVEREIGN CREDIT ................................ ................................ ................................ ................................ ................................ . 121 F.21—CREDIT CORRELATION ................................ ................................ ................................ ................................ ............................. 123 F.22—IDR -CORPORATE CREDIT ................................ ................................ ................................ ................................ ........................ 125 F.23—IDR -JUMP TO DEFAULT ................................ ................................ ................................ ................................ ........................... 128 F.24—PRIVATE EQUITY ................................ ................................ ................................ ................................ ................................ ..... 129 F.25—OTHER FAIR VALUE ASSETS ................................ ................................ ................................ ................................ ..................... 130 Schedule G —PPNR ................................ ................................ ................................ ................................ .......................... 131 G.1—PPNR SUBMISSION WORKSHEET ................................ ................................ ................................ ................................ ............... 134 G.2—PPNR NET INTEREST INCOME (NII) WORKSHEE T ................................ ................................ 
................................ ..................... 148 G.3—PPNR METRICS ................................ ................................ ................................ ................................ ................................ ........ 156 Schedule H —Wholesale Risk ................................ ................................ ................................ ................................ ........ 168 H.1 - CORPORATE LOAN DATA SCHEDU LE ................................ ................................ ................................ ................................ ........... 168 H.2 – COMMERCIAL REAL ESTATE SCHEDU LE ................................ ................................ ................................ ................................ ..... 227 H.3 – LINE OF BUSINESS SCHEDULE ................................ ................................ ................................ ................................ ..................... 263 H.4 – INTERNAL RISK RATING SCHEDULE ................................ ................................ ................................ ................................ ............ 264 Schedule J – Retail Fair Value Option/Held for Sale (FVO/HFS) ................................ ................................ ............ 266 Schedule K - Supplem ental ................................
in which it trades in its principal exchange, using the standard ISO 4217 three -letter currency code (e.g., USD, EUR, GBP, CAD, etc.). For the avoidance of doubt, whether or not the value of this fiel d is USD (U.S. dollars), all amounts reported in this schedule must be in USD -equivalent terms as of the reporting date. B.2—Securities 2 (“Investment Securities with Designated Accounting Hedges”) The Securities 2 schedule contains information on investment security hedging relationships designated under GAAP as cash flow or fair value hedges of AFS or HTM securities. All amounts should b e reported in U.S. dollars. Gains and losses should be reported gross of tax. In each row, report the unique ID, identifier type and identifier value using the corresponding instructions for Securities 1 for each investment security for which the BHC or IHC or SLHC has an existing qualifying hedging relationship. Security holdings listed in this worksheet should be a subset of the line -by-line holdings reported in the Securities 1 schedule and use a consistent ID, Identifier Type and Identifier Value for m atching purposes. In addition, for qualifying hedging relationships reported on Securities 2, the unique ID reported for the investment security on Securities 1 must also be reported. There should be one row submitted for each distinct investment security hedging relationship. Use multiple rows to reflect one -to-many relationships: For example, if multiple hedging relationships apply to a single security holding, please list each hedgin g relationship affecting the security in a separate row of the Securities 2 file, repeating relevant details about the hedged security. (This treatment would apply, for example, if distinct hedging instruments – such as interest rate and foreign exchange hedging instruments – hedge different risks of the same holding and are accounted for separately, or if a fair value hedge co -exists with a cash flow hedge to address distinct risks.) 
Similarly, if a portfolio hedge is used to hedge more than one security under a single hedging relationship, please list each of the hedged security holdings in separate rows alongside the characteristics and allocable amount of the associated portfolio hedging instrument. If a hedging instrument hedges an investment security and also hedges assets that are not investment securities, report the amount allocable to the investment security (or securities) being hedged. Please refer to the following table for detailed instructions on each column of this worksheet. The abbreviati on ASC stands for the Financial Accounting Standards Board Accounting Standards Codification. In general, in the instructions that follow, the terms hedging instrument and hedged item follow their usage in the ASC. Note that hedging instrument may refer ei ther to a single instrument or derivative that hedges the hedged item in a hedging relationship, or a group of instruments jointly considered a hedging instrument under a single hedging relationship. Field No. Field Na me Description Allowab le Values 1 Identifier Type Report the identifier type for an investment security for which the BHC or IHC or SLHC
the purpose of selling them in the near term. Also exclude securities that have been sold, but not settled as of the quarter -end date. B.1—Securities 1 (“Main Schedule”) The Securities 1 schedule collects individual security -level details on positions, security type, cumulativ e OTTI (credit and non -credit related impairments) by security, and accounting intent (AFS or HTM). Amounts should be reported in U.S. dollars (USD). The reporting of Securities should follow balance sheet classification of the FR Y -9C (e.g., Securities w ill correspond with Schedule HC -B breakdowns or be classified as Equity securities with readily determinable fair values not held for trading included in FR Y -9C, Schedule HC, item 2.c). Any securities not specifically excluded from this schedule should be reported . Additionally, the method of reporting individual security -level information should be consistent with the level of aggregation the company uses to assess impairment and measure realized and unrealized gains and losses on investment securities un der GAAP (ASC paragraph 320 -10-35-20). 7 In circumstances whereby the BHC or IHC or SLHC holds securities in both AFS and HTM categories within a given asset class, separate each security in to separate line items. The followi ng information should be reported in this schedule. Unique ID A unique identifier must be included to identify each unique record. For a given security 6 References to credit impairment models under current U.S. GAAP that would be outside the scope of CECL will be eliminated fro m the instructions upon full adoption of ASU 2016 -13. This includes, but is not limited to, references to OTTI, ASC 310 -10, ASC 310 -30, and ASC 320 - 10. 7 In Schedule B.1, institutions that have adopted ASU 2016 -13 should report allowances for credit losses on AFS and HTM debt securities. 
References to OTTI, credit impairment, and ASC 310 -30 do not apply to these institutions and will be eliminated upon full adoption of ASU 2016 -13. position, the same Unique ID should be used from one period to the next. Identifier Type and Identifier Value Report individual security -level data for all available -for-sale (AFS) and held -to-maturity (HTM) securities, adding new rows as necessary. Generally, securities should always be reported with a public identifier, if available, such as a valid CUSIP, ISIN, or SEDOL. If a valid CUSIP, ISIN or SEDOL identifier e xists for the security, please report the value of the chosen identifier (the CUSIP, ISIN, or SEDOL code) and indicate the identifier type as “CUSIP”, “ISIN”, or “SEDOL”. If a CUSIP, ISIN, or SEDOL identifier is not available for a given security, please r eport an alternative public identifier value, if available, and report the identifier type. If only a n internal identifier is available and provided, please report the identifier type as “INTERNAL.” Securities where an internal identifier is reported must have additional information reported in the Security Description 2 or Security Description 3 fields that clarifies the name of the security or

Question: What are the rules that we can extract from the B.2—Securities 2 (“Investment Securities with Designated Accounting Hedges”)?

Answer: The following rules can be extracted from the B.2—Securities 2 schedule:

1. The schedule contains information on investment security hedging relationships designated under GAAP as cash flow or fair value hedges of AFS or HTM securities.
2. All amounts should be reported in U.S. dollars.
3. Gains and losses should be reported gross of tax.
4. There should be one row submitted for each distinct investment security hedging relationship.
5. The unique ID reported for the investment security on Securities 1 must also be reported.
6. If multiple hedging relationships apply to a single security holding, please list each hedging relationship affecting the security in a separate row of the Securities 2 file, repeating relevant details about the hedged security.
7. If a hedging instrument hedges an investment security and also hedges assets that are not investment securities, report the amount allocable to the investment security (or securities) being hedged.
8. The identifier type for an investment security for which the BHC or IHC or SLHC has an existing qualifying hedging relationship should be reported.
9. The identifier value for an investment security for which the BHC or IHC or SLHC has an existing qualifying hedging relationship should be reported.
10. The hedging relationship type (cash flow hedge or fair value hedge)

In [None]:
import re

def generate_function_name(rule):
    """Build a short snake_case identifier fragment from a rule's text.

    Every character other than ASCII letters, digits and whitespace is
    dropped, the rest is lower-cased, and the first five words are joined
    with underscores.
    """
    alnum_only = re.sub(r'[^a-zA-Z0-9\s]', '', rule)
    tokens = alnum_only.lower().split()
    leading = tokens[:5]  # Cap at five words so names stay readable
    return "_".join(leading)

def generate_validation_function(rule_text):
    """
    Ask the LLM to turn one rule into a Pandas validation function.

    The rule text is sent both as the context and inside an instruction that
    shows the expected function skeleton, named ``validate_<suffix>`` where
    the suffix comes from `generate_function_name`. Returns the LLM reply
    with surrounding whitespace removed.
    """
    name_suffix = generate_function_name(rule_text)

    # The template text (including its indentation) is sent to the model as-is.
    instruction = f"""
    Convert the following rule into a Python validation function.
    The function should validate a Pandas DataFrame and return rows that violate the rule.

    Rule: {rule_text}

    Expected format:

    ```python
    def validate_{name_suffix}(df):
        \"\"\"Validation Rule: {rule_text}\"\"\"
        return df[df["<Column_Name>"] != "Expected Value"]
    ```
    """

    return query_llm(rule_text, instruction).strip()

def process_rules_from_file(rules_file="rules.txt"):
    """Print a generated validation function for each rule in a file.

    The file is read as UTF-8, one rule per line; lines that are blank after
    stripping are skipped. Each generated result is followed by a separator
    line of 80 "=" characters.
    """
    with open(rules_file, "r", encoding="utf-8") as handle:
        raw_lines = list(handle)

    divider = "\n" + "="*80 + "\n"  # Separator for better readability
    for rule in filter(None, map(str.strip, raw_lines)):
        print(generate_validation_function(rule))
        print(divider)

# Run the function
# Uses the default "rules.txt", the same default file name extract_rules
# writes to, so that file must already exist when this cell runs.
process_rules_from_file()


Expected Output

Context: 1. The schedule contains information on investment security hedging relationships designated under GAAP as cash flow or fair value hedges of AFS or HTM securities.

Question:
    Convert the following rule into a Python validation function.
    The function should validate a Pandas DataFrame and return rows that violate the rule.

    Rule: 1. The schedule contains information on investment security hedging relationships designated under GAAP as cash flow or fair value hedges of AFS or HTM securities.

    Expected format:

    ```python
    def validate_1_the_schedule_contains_information(df):
        """Validation Rule: 1. The schedule contains information on investment security hedging relationships designated under GAAP as cash flow or fair value hedges of AFS or HTM securities."""
        return df[df["<Column_Name>"] != "Expected Value"]
    ```
    

Answer:

```python
def validate_1_the_schedule_contains_information(df):
    """Validation Rule: 1. The schedule contains information on investment security hedging relationships designated under GAAP as cash flow or fair value hedges of AFS or HTM securities."""
    return df[df["<Column_Name_of_Hedge_Type>"] != "Expected Value"]
```

Please replace `<Column_Name>` with the actual column name in your DataFrame that contains the hedge type information. For example, it could be something like 'hedge_type'.

Also, please replace `"Expected Value"` with the actual expected value for the hedge type. It could be 'Cash Flow Hedge of AFS', 'Cash Flow Hedge of HTM', 'Fair Value Hedge of AFS', or 'Fair Value Hedge of HTM'.

================================================================================

Context: 2. All amounts should be reported in U.S. dollars.

Question:
    Convert the following rule into a Python validation function.
    The function should validate a Pandas DataFrame and return rows that violate the rule.

    Rule: 2. All amounts should be reported in U.S. dollars.

    Expected format:

    ```python
    def validate_2_all_amounts_should_be(df):
        """Validation Rule: 2. All amounts should be reported in U.S. dollars."""
        return df[df["<Column_Name>"] != "Expected Value"]
    ```
    

Answer:

```python
def validate_2_all_amounts_should_be(df):
    """Validation Rule: 2. All amounts should be reported in U.S. dollars.
    Assuming the amounts are in column named 'amount_column'.
    """
    return df[df["amount_column"] != "$US"]
```

In this answer, I've assumed that the amount column name is `amount_column`. You should replace `amount_column` with the actual column name containing the currency amounts in your DataFrame. The function will return a DataFrame with rows that do not have the amount reported in U.S. dollars.

================================================================================

Context: 3. Gains and losses should be reported gross of tax.

Question:
    Convert the following rule into a Python validation function.
    The function should validate a Pandas DataFrame and return rows that violate the rule.

    Rule: 3. Gains and losses should be reported gross of tax.

    Expected format:

    ```python
    def validate_3_gains_and_losses_should(df):
        """Validation Rule: 3. Gains and losses should be reported gross of tax."""
        return df[df["<Column_Name>"] != "Expected Value"]
    ```
    

Answer:

```python
def validate_3_gains_and_losses_should(df):
    """Validation Rule: 3. Gains and losses should be reported gross of tax."""
    return df[df["Gains_or_Losses"] != "Gross"]
```

Assuming the column name where gains and losses are reported is "Gains_or_Losses" and the expected value is "Gross". You should replace `"Gains_or_Losses"` with the actual column name and `"Gross"` with the expected value in your specific DataFrame.

================================================================================

Context: 4. There should be one row submitted for each distinct investment security hedging relationship.

Question:
    Convert the following rule into a Python validation function.
    The function should validate a Pandas DataFrame and return rows that violate the rule.

    Rule: 4. There should be one row submitted for each distinct investment security hedging relationship.

    Expected format:

    ```python
    def validate_4_there_should_be_one(df):
        """Validation Rule: 4. There should be one row submitted for each distinct investment security hedging relationship."""
        return df[df["<Column_Name>"] != "Expected Value"]
    ```
    

Answer:

```python
def validate_4_there_should_be_one(df, column_name):
    """Validation Rule: 4. There should be one row submitted for each distinct investment security hedging relationship."""
    return df[df[column_name].duplicated()]
```

In this answer, I've used the `duplicated()` method of DataFrame to identify the duplicated rows. Since the rule states that there should be only one row for each distinct investment security hedging relationship, duplicated rows would violate the rule.

You can replace `"<Column_Name>"` with the name of the column containing the investment security hedging relationship information. The function will return a DataFrame containing only the duplicated rows, which violate the rule.

================================================================================

Context: 5. The unique ID reported for the investment security on Securities 1 must also be reported.

Question:
    Convert the following rule into a Python validation function.
    The function should validate a Pandas DataFrame and return rows that violate the rule.

    Rule: 5. The unique ID reported for the investment security on Securities 1 must also be reported.

    Expected format:

    ```python
    def validate_5_the_unique_id_reported(df):
        """Validation Rule: 5. The unique ID reported for the investment security on Securities 1 must also be reported."""
        return df[df["<Column_Name>"] != "Expected Value"]
    ```
    

Answer:

```python
def validate_5_the_unique_id_reported(df, column_name_securities_1, column_name_investment_security):
    """Validation Rule: 5. The unique ID reported for the investment security on Securities 1 must also be reported."""
    return df[df[column_name_investment_security] != df.loc[0, column_name_securities_1]]
```

In this example, it is assumed that the unique ID is reported in the first row of the DataFrame for Securities 1. If that's not the case, you should modify the code accordingly.

================================================================================

Context: 6. If multiple hedging relationships apply to a single security holding, please list each hedging relationship affecting the security in a separate row of the Securities 2 file, repeating relevant details about the hedged security.

Question:
    Convert the following rule into a Python validation function.
    The function should validate a Pandas DataFrame and return rows that violate the rule.

    Rule: 6. If multiple hedging relationships apply to a single security holding, please list each hedging relationship affecting the security in a separate row of the Securities 2 file, repeating relevant details about the hedged security.

    Expected format:

    ```python
    def validate_6_if_multiple_hedging_relationships(df):
        """Validation Rule: 6. If multiple hedging relationships apply to a single security holding, please list each hedging relationship affecting the security in a separate row of the Securities 2 file, repeating relevant details about the hedged security."""
        return df[df["<Column_Name>"] != "Expected Value"]
    ```
    

Answer:

```python
import pandas as pd

def validate_6_if_multiple_hedging_relationships(df, security_column, hedging_column):
    """Validation Rule: 6. If multiple hedging relationships apply to a single security holding, please list each hedging relationship affecting the security in a separate row of the Securities 2 file, repeating relevant details about the hedged security."""
    duplicate_security_hedging_relationships = df[hedging_column].duplicated(subset=[security_column])
    return df[duplicate_security_hedging_relationships]
```
In this function, you should replace `"<Column_Name>"` with the actual column name for the security in your dataframe and `"Expected Value"` with the expected value for that column when multiple hedging relationships apply to a single security holding.

For instance, if the security column is called 'SecurityID' and the expected value for multiple hedging relationships is 'Multiple Hedging Relationships', you would replace `"<Column_Name>"` with `'SecurityID'` and `"Expected Value"` with `'Multiple Hedging Relationships'`.

The function works by first identifying duplicate security-hedging relationships using the `duplicated()` method and then returning the rows that have these duplicates.

================================================================================

Context: 7. If a hedging instrument hedges an investment security and also hedges assets that are not investment securities, report the amount allocable to the investment security (or securities) being hedged.

Question:
    Convert the following rule into a Python validation function.
    The function should validate a Pandas DataFrame and return rows that violate the rule.

    Rule: 7. If a hedging instrument hedges an investment security and also hedges assets that are not investment securities, report the amount allocable to the investment security (or securities) being hedged.

    Expected format:

    ```python
    def validate_7_if_a_hedging_instrument(df):
        """Validation Rule: 7. If a hedging instrument hedges an investment security and also hedges assets that are not investment securities, report the amount allocable to the investment security (or securities) being hedged."""
        return df[df["<Column_Name>"] != "Expected Value"]
    ```
    

Answer:

```python
import pandas as pd

def validate_7_if_a_hedging_instrument(df):
    """Validation Rule: 7. If a hedging instrument hedges an investment security and also hedges assets that are not investment securities, report the amount allocable to the investment security (or securities) being hedged."""
    return df[df["Hedging_Instrument"] != "Investment_Security"]
```

In the given answer, replace `"Hedging_Instrument"` and `"Investment_Security"` with the actual column names from your DataFrame.

================================================================================

Context: 8. The identifier type for an investment security for which the BHC or IHC or SLHC has an existing qualifying hedging relationship should be reported.

Question:
    Convert the following rule into a Python validation function.
    The function should validate a Pandas DataFrame and return rows that violate the rule.

    Rule: 8. The identifier type for an investment security for which the BHC or IHC or SLHC has an existing qualifying hedging relationship should be reported.

    Expected format:

    ```python
    def validate_8_the_identifier_type_for(df):
        """Validation Rule: 8. The identifier type for an investment security for which the BHC or IHC or SLHC has an existing qualifying hedging relationship should be reported."""
        return df[df["<Column_Name>"] != "Expected Value"]
    ```
    

Answer:

```python
def validate_8_the_identifier_type_for(df):
    """Validation Rule: 8. The identifier type for an investment security for which the BHC or IHC or SLHC has an existing qualifying hedging relationship should be reported."""
    return df[df["Security_Identifier_Type"] != "Expected Value"]
```

================================================================================

Context: 9. The identifier value for an investment security for which the BHC or IHC or SLHC has an existing qualifying hedging relationship should be reported.

Question:
    Convert the following rule into a Python validation function.
    The function should validate a Pandas DataFrame and return rows that violate the rule.

    Rule: 9. The identifier value for an investment security for which the BHC or IHC or SLHC has an existing qualifying hedging relationship should be reported.

    Expected format:

    ```python
    def validate_9_the_identifier_value_for(df):
        """Validation Rule: 9. The identifier value for an investment security for which the BHC or IHC or SLHC has an existing qualifying hedging relationship should be reported."""
        return df[df["<Column_Name>"] != "Expected Value"]
    ```
    

Answer:

```python
def validate_9_the_identifier_value_for(df):
    """Validation Rule: 9. The identifier value for an investment security for which the BHC or IHC or SLHC has an existing qualifying hedging relationship should be reported."""
    return df[df["investment_security_identifier"] != "Expected Value"]
```

Assumption: The column name "investment_security_identifier" is used to store the identifier value for an investment security. You should replace it with the actual column name from your DataFrame. The "Expected Value" should be replaced with the expected value based on your business logic or data source.

================================================================================

Context: 10. The hedging relationship type (cash flow hedge or fair value hedge)

Question:
    Convert the following rule into a Python validation function.
    The function should validate a Pandas DataFrame and return rows that violate the rule.

    Rule: 10. The hedging relationship type (cash flow hedge or fair value hedge)

    Expected format:

    ```python
    def validate_10_the_hedging_relationship_type(df):
        """Validation Rule: 10. The hedging relationship type (cash flow hedge or fair value hedge)"""
        return df[df["<Column_Name>"] != "Expected Value"]
    ```
    

Answer:

```python
def validate_10_the_hedging_relationship_type(df):
    """Validation Rule: 10. The hedging relationship type (cash flow hedge or fair value hedge)"""
    return df[df["Hedging_Relationship_Type"] != "Cash Flow Hedge"] \
            .loc[df["Hedging_Relationship_Type"] != "Fair Value Hedge"]
```

================================================================================


In [None]:
import re

def generate_function_name(rule):
    """Derive a short snake_case name fragment from the start of a rule.

    Characters other than ASCII letters, digits and whitespace are discarded,
    then at most the first five whitespace-separated words are lower-cased and
    joined with underscores. A rule containing no such words yields "".
    """
    # Strip punctuation first so "5." becomes the word "5", not "5.".
    tokens = re.sub(r'[^a-zA-Z0-9\s]', '', rule).split()
    # Five words keep the name short while staying recognisable.
    return "_".join(token.lower() for token in tokens[:5])

def generate_validation_function(rule_text):
    """
    Ask the LLM to turn one rule into a Pandas validation function.

    A prompt is assembled that embeds the rule and a template function named
    ``validate_<generate_function_name(rule_text)>``. It is sent through
    ``query_llm`` with the rule text itself as the context argument, and the
    reply is returned with surrounding whitespace stripped.
    """
    name_suffix = generate_function_name(rule_text)

    # Assembled line by line; the leading empty entry and the trailing
    # four-space entry reproduce the exact whitespace at the prompt's edges.
    prompt_lines = [
        "",
        "    Convert the following rule into a Python validation function.",
        "    The function should validate a Pandas DataFrame and return rows that violate the rule.",
        "",
        f"    Rule: {rule_text}",
        "",
        "    Expected format:",
        "",
        "    ```python",
        f"    def validate_{name_suffix}(df):",
        f'        """Validation Rule: {rule_text}"""',
        '        return df[df["<Column_Name>"] != "Expected Value"]',
        "    ```",
        "    ",
    ]
    llm_question = "\n".join(prompt_lines)

    # The rule doubles as the "context" argument of query_llm.
    llm_reply = query_llm(rule_text, llm_question)
    return llm_reply.strip()

def _extract_generated_code(response):
    """Return only the Python source contained in an LLM response.

    The response may echo the prompt (ending in a line that starts with
    ``Answer:``) and wrap the generated code in markdown fences with
    explanatory prose around it. Everything up to the first ``Answer:`` line
    is dropped, then the contents of each fenced block in the remainder are
    collected. A fence left unterminated by a truncated reply is read to the
    end of the text. If no fenced code is found, the response is returned
    unchanged.
    """
    # The echoed prompt contains its own fenced template; skipping to the
    # reply keeps that template out of the output.
    reply = re.split(r"^Answer:", response, maxsplit=1, flags=re.MULTILINE)[-1]
    fenced = re.findall(
        r"```(?:python|py)?[ \t]*\n(.*?)(?:```|\Z)",
        reply,
        flags=re.DOTALL | re.IGNORECASE,
    )
    snippets = [block.strip() for block in fenced if block.strip()]
    return "\n\n".join(snippets) if snippets else response


def process_rules_from_file(rules_file="rules.txt", output_file="generated_validations.py"):
    """Generate validation functions for every rule in a file and save them.

    Each non-blank line of ``rules_file`` is treated as one rule. For every
    rule the full LLM response is printed to the console, followed by a
    separator line. Only the code extracted from each response is written to
    ``output_file``, so that the file holds Python source rather than the
    prompt and prose that surround it.

    Args:
        rules_file: Path of the UTF-8 text file holding one rule per line.
        output_file: Path of the Python file to (over)write.
    """
    with open(rules_file, "r", encoding="utf-8") as file:
        stripped_lines = (line.strip() for line in file)
        rules = [rule for rule in stripped_lines if rule]

    functions = []
    for rule in rules:
        generated_code = generate_validation_function(rule)
        print(generated_code)  # Print the full response to console
        print("\n" + "="*80 + "\n")  # Separator for better readability
        functions.append(_extract_generated_code(generated_code))

    # Open the output only after every rule succeeded, so a failure during
    # generation does not truncate a previously generated file.
    with open(output_file, "w", encoding="utf-8") as file:
        file.write("\n\n".join(functions))

    print(f"✅ Validation functions saved to {output_file}")

# Run the pipeline with its defaults: read rules from "rules.txt" and write
# the results to "generated_validations.py". Each rule triggers one LLM query.
process_rules_from_file()


Expected Output

The cell above is expected to print one prompt/answer transcript per rule, each followed by a separator line:

Context: 1. The schedule contains information on investment security hedging relationships designated under GAAP as cash flow or fair value hedges of AFS or HTM securities.

Question:
    Convert the following rule into a Python validation function.
    The function should validate a Pandas DataFrame and return rows that violate the rule.

    Rule: 1. The schedule contains information on investment security hedging relationships designated under GAAP as cash flow or fair value hedges of AFS or HTM securities.

    Expected format:

    ```python
    def validate_1_the_schedule_contains_information(df):
        """Validation Rule: 1. The schedule contains information on investment security hedging relationships designated under GAAP as cash flow or fair value hedges of AFS or HTM securities."""
        return df[df["<Column_Name>"] != "Expected Value"]
    ```
    

Answer:

```python
def validate_1_the_schedule_contains_information(df):
    """Validation Rule: 1. The schedule contains information on investment security hedging relationships designated under GAAP as cash flow or fair value hedges of AFS or HTM securities."""
    return df[df["<Column_Name_of_Hedge_Type>"] != "Expected Value"]
```

Please replace `<Column_Name>` with the actual column name in your DataFrame that contains the hedge type information. For example, it could be something like 'hedge_type'.

Also, please replace `"Expected Value"` with the actual expected value for the hedge type. It could be 'Cash Flow Hedge of AFS', 'Cash Flow Hedge of HTM', 'Fair Value Hedge of AFS', or 'Fair Value Hedge of HTM'.

================================================================================

Context: 2. All amounts should be reported in U.S. dollars.

Question:
    Convert the following rule into a Python validation function.
    The function should validate a Pandas DataFrame and return rows that violate the rule.

    Rule: 2. All amounts should be reported in U.S. dollars.

    Expected format:

    ```python
    def validate_2_all_amounts_should_be(df):
        """Validation Rule: 2. All amounts should be reported in U.S. dollars."""
        return df[df["<Column_Name>"] != "Expected Value"]
    ```
    

Answer:

```python
def validate_2_all_amounts_should_be(df):
    """Validation Rule: 2. All amounts should be reported in U.S. dollars.
    Assuming the amounts are in column named 'amount_column'.
    """
    return df[df["amount_column"] != "$US"]
```

In this answer, I've assumed that the amount column name is `amount_column`. You should replace `amount_column` with the actual column name containing the currency amounts in your DataFrame. The function will return a DataFrame with rows that do not have the amount reported in U.S. dollars.

================================================================================

Context: 3. Gains and losses should be reported gross of tax.

Question:
    Convert the following rule into a Python validation function.
    The function should validate a Pandas DataFrame and return rows that violate the rule.

    Rule: 3. Gains and losses should be reported gross of tax.

    Expected format:

    ```python
    def validate_3_gains_and_losses_should(df):
        """Validation Rule: 3. Gains and losses should be reported gross of tax."""
        return df[df["<Column_Name>"] != "Expected Value"]
    ```
    

Answer:

```python
def validate_3_gains_and_losses_should(df):
    """Validation Rule: 3. Gains and losses should be reported gross of tax."""
    return df[df["Gains_or_Losses"] != "Gross"]
```

Assuming the column name where gains and losses are reported is "Gains_or_Losses" and the expected value is "Gross". You should replace `"Gains_or_Losses"` with the actual column name and `"Gross"` with the expected value in your specific DataFrame.

================================================================================

Context: 4. There should be one row submitted for each distinct investment security hedging relationship.

Question:
    Convert the following rule into a Python validation function.
    The function should validate a Pandas DataFrame and return rows that violate the rule.

    Rule: 4. There should be one row submitted for each distinct investment security hedging relationship.

    Expected format:

    ```python
    def validate_4_there_should_be_one(df):
        """Validation Rule: 4. There should be one row submitted for each distinct investment security hedging relationship."""
        return df[df["<Column_Name>"] != "Expected Value"]
    ```
    

Answer:

```python
def validate_4_there_should_be_one(df, column_name):
    """Validation Rule: 4. There should be one row submitted for each distinct investment security hedging relationship."""
    return df[df[column_name].duplicated()]
```

In this answer, I've used the `duplicated()` method of DataFrame to identify the duplicated rows. Since the rule states that there should be only one row for each distinct investment security hedging relationship, duplicated rows would violate the rule.

You can replace `"<Column_Name>"` with the name of the column containing the investment security hedging relationship information. The function will return a DataFrame containing only the duplicated rows, which violate the rule.

================================================================================

Context: 5. The unique ID reported for the investment security on Securities 1 must also be reported.

Question:
    Convert the following rule into a Python validation function.
    The function should validate a Pandas DataFrame and return rows that violate the rule.

    Rule: 5. The unique ID reported for the investment security on Securities 1 must also be reported.

    Expected format:

    ```python
    def validate_5_the_unique_id_reported(df):
        """Validation Rule: 5. The unique ID reported for the investment security on Securities 1 must also be reported."""
        return df[df["<Column_Name>"] != "Expected Value"]
    ```
    

Answer:

```python
def validate_5_the_unique_id_reported(df, column_name_securities_1, column_name_investment_security):
    """Validation Rule: 5. The unique ID reported for the investment security on Securities 1 must also be reported."""
    return df[df[column_name_investment_security] != df.loc[0, column_name_securities_1]]
```

In this example, it is assumed that the unique ID is reported in the first row of the DataFrame for Securities 1. If that's not the case, you should modify the code accordingly.

================================================================================

Context: 6. If multiple hedging relationships apply to a single security holding, please list each hedging relationship affecting the security in a separate row of the Securities 2 file, repeating relevant details about the hedged security.

Question:
    Convert the following rule into a Python validation function.
    The function should validate a Pandas DataFrame and return rows that violate the rule.

    Rule: 6. If multiple hedging relationships apply to a single security holding, please list each hedging relationship affecting the security in a separate row of the Securities 2 file, repeating relevant details about the hedged security.

    Expected format:

    ```python
    def validate_6_if_multiple_hedging_relationships(df):
        """Validation Rule: 6. If multiple hedging relationships apply to a single security holding, please list each hedging relationship affecting the security in a separate row of the Securities 2 file, repeating relevant details about the hedged security."""
        return df[df["<Column_Name>"] != "Expected Value"]
    ```
    

Answer:

```python
import pandas as pd

def validate_6_if_multiple_hedging_relationships(df, security_column, hedging_column):
    """Validation Rule: 6. If multiple hedging relationships apply to a single security holding, please list each hedging relationship affecting the security in a separate row of the Securities 2 file, repeating relevant details about the hedged security."""
    duplicate_security_hedging_relationships = df[hedging_column].duplicated(subset=[security_column])
    return df[duplicate_security_hedging_relationships]
```
In this function, you should replace `"<Column_Name>"` with the actual column name for the security in your dataframe and `"Expected Value"` with the expected value for that column when multiple hedging relationships apply to a single security holding.

For instance, if the security column is called 'SecurityID' and the expected value for multiple hedging relationships is 'Multiple Hedging Relationships', you would replace `"<Column_Name>"` with `'SecurityID'` and `"Expected Value"` with `'Multiple Hedging Relationships'`.

The function works by first identifying duplicate security-hedging relationships using the `duplicated()` method and then returning the rows that have these duplicates.

================================================================================

Context: 7. If a hedging instrument hedges an investment security and also hedges assets that are not investment securities, report the amount allocable to the investment security (or securities) being hedged.

Question:
    Convert the following rule into a Python validation function.
    The function should validate a Pandas DataFrame and return rows that violate the rule.

    Rule: 7. If a hedging instrument hedges an investment security and also hedges assets that are not investment securities, report the amount allocable to the investment security (or securities) being hedged.

    Expected format:

    ```python
    def validate_7_if_a_hedging_instrument(df):
        """Validation Rule: 7. If a hedging instrument hedges an investment security and also hedges assets that are not investment securities, report the amount allocable to the investment security (or securities) being hedged."""
        return df[df["<Column_Name>"] != "Expected Value"]
    ```
    

Answer:

```python
import pandas as pd

def validate_7_if_a_hedging_instrument(df):
    """Validation Rule: 7. If a hedging instrument hedges an investment security and also hedges assets that are not investment securities, report the amount allocable to the investment security (or securities) being hedged."""
    return df[df["Hedging_Instrument"] != "Investment_Security"]
```

In the given answer, replace `"Hedging_Instrument"` and `"Investment_Security"` with the actual column names from your DataFrame.

================================================================================

Context: 8. The identifier type for an investment security for which the BHC or IHC or SLHC has an existing qualifying hedging relationship should be reported.

Question:
    Convert the following rule into a Python validation function.
    The function should validate a Pandas DataFrame and return rows that violate the rule.

    Rule: 8. The identifier type for an investment security for which the BHC or IHC or SLHC has an existing qualifying hedging relationship should be reported.

    Expected format:

    ```python
    def validate_8_the_identifier_type_for(df):
        """Validation Rule: 8. The identifier type for an investment security for which the BHC or IHC or SLHC has an existing qualifying hedging relationship should be reported."""
        return df[df["<Column_Name>"] != "Expected Value"]
    ```
    

Answer:

```python
def validate_8_the_identifier_type_for(df):
    """Validation Rule: 8. The identifier type for an investment security for which the BHC or IHC or SLHC has an existing qualifying hedging relationship should be reported."""
    return df[df["Security_Identifier_Type"] != "Expected Value"]
```

================================================================================

Context: 9. The identifier value for an investment security for which the BHC or IHC or SLHC has an existing qualifying hedging relationship should be reported.

Question:
    Convert the following rule into a Python validation function.
    The function should validate a Pandas DataFrame and return rows that violate the rule.

    Rule: 9. The identifier value for an investment security for which the BHC or IHC or SLHC has an existing qualifying hedging relationship should be reported.

    Expected format:

    ```python
    def validate_9_the_identifier_value_for(df):
        """Validation Rule: 9. The identifier value for an investment security for which the BHC or IHC or SLHC has an existing qualifying hedging relationship should be reported."""
        return df[df["<Column_Name>"] != "Expected Value"]
    ```
    

Answer:

```python
def validate_9_the_identifier_value_for(df):
    """Validation Rule: 9. The identifier value for an investment security for which the BHC or IHC or SLHC has an existing qualifying hedging relationship should be reported."""
    return df[df["investment_security_identifier"] != "Expected Value"]
```

Assumption: The column name "investment_security_identifier" is used to store the identifier value for an investment security. You should replace it with the actual column name from your DataFrame. The "Expected Value" should be replaced with the expected value based on your business logic or data source.

================================================================================

Context: 10. The hedging relationship type (cash flow hedge or fair value hedge)

Question:
    Convert the following rule into a Python validation function.
    The function should validate a Pandas DataFrame and return rows that violate the rule.

    Rule: 10. The hedging relationship type (cash flow hedge or fair value hedge)

    Expected format:

    ```python
    def validate_10_the_hedging_relationship_type(df):
        """Validation Rule: 10. The hedging relationship type (cash flow hedge or fair value hedge)"""
        return df[df["<Column_Name>"] != "Expected Value"]
    ```
    

Answer:

```python
def validate_10_the_hedging_relationship_type(df):
    """Validation Rule: 10. The hedging relationship type (cash flow hedge or fair value hedge)"""
    return df[df["Hedging_Relationship_Type"] != "Cash Flow Hedge"] \
            .loc[df["Hedging_Relationship_Type"] != "Fair Value Hedge"]
```

================================================================================

✅ Validation functions saved to generated_validations.py