In [1]:
import pandas as pd
import random
import string

In [2]:
# Function to inject anomalies into the dataset
def inject_anomalies(csv_path, output_path, anomaly_fraction=0.1, seed=None):
    """Inject labelled synthetic anomalies into a CSV dataset and save the result.

    ``int(len(df) * anomaly_fraction) // 3`` rows are sampled independently
    for each of three fields (so one row may be corrupted in several fields):

    * ``fldInvoiceAmount`` - a random amount in [10000, 100000] is added.
    * ``fldVendorName`` - one character is replaced by a *different* ASCII
      letter, or a random 5-letter word is appended.
    * ``fldTransactions`` - the value is replaced by a different currency
      code drawn from the codes in the dataset plus a fixed extra list.

    Sampled rows whose target field is null are left untouched and stay
    labelled ``'normal'``, so a label of ``'anomaly'`` always means the value
    really changed.

    Four label columns are added, each ``'normal'`` or ``'anomaly'``:
    ``A_InvoiceAmount``, ``A_VendorName``, ``A_Currency`` and the combined
    ``Anomaly``.

    Args:
        csv_path: Path of the input CSV file.
        output_path: Path the modified CSV is written to (without the index).
        anomaly_fraction: Non-negative fraction of rows used to size the
            injection; a third of the resulting count goes to each field.
        seed: Optional seed for reproducible injection. When ``None`` the
            global ``random`` module state is used.

    Raises:
        ValueError: If a required column is missing, ``anomaly_fraction`` is
            negative, or more anomalies per field are requested than the
            dataset has rows.
    """
    if not anomaly_fraction >= 0:
        raise ValueError(
            f"anomaly_fraction must be non-negative, got {anomaly_fraction}")

    # A private generator keeps seeded runs reproducible without disturbing
    # the global random state; unseeded calls behave as before.
    rng = random if seed is None else random.Random(seed)

    # Load the dataset
    df = pd.read_csv(csv_path)

    # Ensure the dataset contains the required fields
    required_fields = ['fldInvoiceAmount', 'fldVendorName', 'fldTransactions']
    for field in required_fields:
        if field not in df.columns:
            raise ValueError(f"Missing required field: {field}")

    # Add the label columns
    df['A_InvoiceAmount'] = 'normal'
    df['A_VendorName'] = 'normal'
    df['A_Currency'] = 'normal'
    df['Anomaly'] = 'normal'

    # Number of rows to corrupt per field
    per_field = int(len(df) * anomaly_fraction) // 3
    if per_field > len(df):
        raise ValueError(
            f"anomaly_fraction={anomaly_fraction} requests {per_field} anomalies "
            f"per field but the dataset has only {len(df)} rows")

    # Adding a float to an integer column is an incompatible-dtype assignment
    # (deprecated / rejected by recent pandas), so widen the column up front.
    if per_field > 0 and pd.api.types.is_integer_dtype(df['fldInvoiceAmount']):
        df['fldInvoiceAmount'] = df['fldInvoiceAmount'].astype(float)

    # Inject anomalies in fldInvoiceAmount (add large random amounts)
    for idx in rng.sample(range(len(df)), per_field):
        # NaN + amount is still NaN, so a null amount cannot become an anomaly
        if pd.notnull(df.at[idx, 'fldInvoiceAmount']):
            df.at[idx, 'fldInvoiceAmount'] += rng.uniform(10000, 100000)
            df.at[idx, 'Anomaly'] = 'anomaly'
            df.at[idx, 'A_InvoiceAmount'] = 'anomaly'

    # Inject anomalies in fldVendorName (add typos or new words)
    for idx in rng.sample(range(len(df)), per_field):
        original_name = df.at[idx, 'fldVendorName']
        if pd.notnull(original_name):  # Only process non-null values
            if rng.random() > 0.5:
                # Typo: replace one character with a letter that differs from
                # it, otherwise the name could stay unchanged yet be labelled.
                typo_index = rng.randint(0, len(original_name) - 1)
                replacements = [c for c in string.ascii_letters
                                if c != original_name[typo_index]]
                typo_char = rng.choice(replacements)
                df.at[idx, 'fldVendorName'] = (original_name[:typo_index] + typo_char +
                                               original_name[typo_index + 1:])
            else:
                # Add new word to fldVendorName
                new_word = ''.join(rng.choices(string.ascii_lowercase, k=5))
                df.at[idx, 'fldVendorName'] = original_name + " " + new_word
            df.at[idx, 'Anomaly'] = 'anomaly'
            df.at[idx, 'A_VendorName'] = 'anomaly'

    # Inject anomalies in fldTransactions (swap in a different currency)
    anomaly_indices = rng.sample(range(len(df)), per_field)
    unique_currencies = df['fldTransactions'].dropna().unique().tolist()
    additional_currencies = ['YEN', 'INR', 'JPY', 'AUD', 'CAD', 'IDR', 'MYR', 'VND']
    unique_currencies.extend(additional_currencies)

    for idx in anomaly_indices:
        original_currency = df.at[idx, 'fldTransactions']
        if pd.notnull(original_currency):  # Only process non-null values
            possible_currencies = [c for c in unique_currencies if c != original_currency]
            if possible_currencies:
                df.at[idx, 'fldTransactions'] = rng.choice(possible_currencies)
                df.at[idx, 'Anomaly'] = 'anomaly'
                df.at[idx, 'A_Currency'] = 'anomaly'

    # Save the modified dataset
    df.to_csv(output_path, index=False)
    print(f"Anomalies injected and saved to {output_path}")
    

In [3]:
# Example usage: corrupt 'dataset.csv' and write the labelled copy next to it
csv_path = 'dataset.csv'  # input dataset, read with pandas.read_csv
output_path = 'output_with_anomalies.csv'  # destination for the dataset with anomalies
inject_anomalies(
    csv_path=csv_path,
    output_path=output_path,
    anomaly_fraction=0.1,
)

Anomalies injected and saved to output_with_anomalies.csv
