In [7]:
import pandas as pd
import numpy as np
import os
import random

In [13]:
def inject_errors_heart_data(df, n):
    num_rows, num_cols = df.shape

    # Ensure n is not greater than the total number of rows in the dataset
    n = min(n, num_rows)

    # 1. Introduce unrealistic values in n rows
    for _ in range(n):
        unrealistic_col = random.choice(['Age', 'RestingBP', 'Cholesterol', 'MaxHR', 'Oldpeak'])
        unrealistic_row = random.randint(0, num_rows - 1)

        unrealistic_values = {
            'Age': (150, 300),
            'RestingBP': (400, 800),
            'Cholesterol': (1000, 1500),
            'MaxHR': (500, 800),
            'Oldpeak': (50, 60)
        }

        unrealistic_value = random.randint(*unrealistic_values[unrealistic_col])
        df.at[unrealistic_row, unrealistic_col] = unrealistic_value

    # 2. Introduce inconsistent types (strings in numeric columns) for n rows
    for _ in range(n):
        inconsistent_col = random.choice(['Age', 'RestingBP', 'FastingBS', 'Cholesterol', 'MaxHR', 'Oldpeak'])
        inconsistent_row = random.randint(0, num_rows - 1)
        df.at[inconsistent_row, inconsistent_col] = "Data Not Found"

    # 3. Introduce missing values (NaN) for n rows
    for _ in range(n):
        missing_value_col = random.choice(df.columns)
        missing_value_row = random.randint(0, num_rows - 1)
        df.at[missing_value_row, missing_value_col] = np.nan

    # 4. Replace existing rows with duplicated ones (without adding new rows)
    for _ in range(n):
        duplicate_row_index = random.randint(0, num_rows - 1)  # Pick a random row index to duplicate
        duplicated_row = df.iloc[duplicate_row_index]  # Select the row
        replace_row_index = random.randint(0, num_rows - 1)  # Pick a random row to replace
        df.iloc[replace_row_index] = duplicated_row.values  # Replace the row with the duplicated row

    # 5. Data Misalignment errors
    for _ in range(n):
        invalid_cat_col = random.choice(['Sex', 'ChestPainType', 'ExerciseAnginal', 'ST_Slope'])
        invalid_cat_row = random.randint(0, num_rows - 1)
        invalid_value = random.choice(['Either', 'ATA', 'N', 'Flat'])
        df.at[invalid_cat_row, invalid_cat_col] = invalid_value

    # 6. Introduce negative values in numeric columns for n rows
    for _ in range(n):
        negative_value_col = random.choice(['RestingBP', 'Cholesterol', 'MaxHR', 'Oldpeak'])
        negative_value_row = random.randint(0, num_rows - 1)
        df.at[negative_value_row, negative_value_col] = -1000

    # 7. Introduce out-of-range values (e.g., MaxHR too high) for n rows
    for _ in range(n):
        out_of_range_col = random.choice(['MaxHR', 'Oldpeak'])
        out_of_range_row = random.randint(0, num_rows - 1)

        out_of_range_values = {
            'MaxHR': (201, 500),  # MaxHR should not go beyond 200 for most people
            'Oldpeak':(10, 20)  # Oldpeak is usually between 0 and 10
        }

        out_of_range_value = random.randint(*out_of_range_values[out_of_range_col])
        df.at[out_of_range_row, out_of_range_col] = out_of_range_value

    # 8. Introduce outdated or biologically unrealistic values for n rows
    for _ in range(n):
        outdated_col = random.choice(['Cholesterol'])
        outdated_row = random.randint(0, num_rows - 1)
        bio_unrealistic_values = random.randint(0, 20)  # Cholesterol level shouldn't be 0
        df.at[outdated_row, outdated_col] = bio_unrealistic_values

    # 9. Mix data types in the same column for n rows
    for _ in range(n):
        mixed_type_col = random.choice(['Age', 'RestingBP', 'MaxHR'])
        mixed_type_row = random.randint(0, num_rows - 1)

        if df[mixed_type_col].dtype in [np.int64, np.float64]:
            df.at[mixed_type_row, mixed_type_col] = np.float64(df.at[mixed_type_row, mixed_type_col])

    # 10. Introduce mismatched values for n rows (e.g., age 15 and asymptomatic chest pain)
    for _ in range(n):
        mismatched_row = random.randint(0, num_rows - 1)
        
        # Set age to an unrealistically low value, but chest pain to a type typically for older people
        df.at[mismatched_row, 'Age'] = 15
        df.at[mismatched_row, 'ChestPainType'] = 'Asymptomatic'  # Typically not for young people

    return df

  df.at[inconsistent_row, inconsistent_col] = "Data Not Found"
  df.at[inconsistent_row, inconsistent_col] = "Data Not Found"
  df.at[inconsistent_row, inconsistent_col] = "Data Not Found"
  df.at[inconsistent_row, inconsistent_col] = "Data Not Found"
  df.at[inconsistent_row, inconsistent_col] = "Data Not Found"
  df.at[inconsistent_row, inconsistent_col] = "Data Not Found"


In [None]:
# Load dataset
file_path = r"D:\epita class notes\dsp production\project script\raw_dataset\heart.csv"
df = pd.read_csv(file_path)
rows = df.shape[0]

# Inject errors into 25% of the rows
num_error_rows = int(0.25 * rows)  # Calculate 25% of the total rows
df_with_errors = inject_errors_heart_data(df, n=num_error_rows)

# Save the modified dataset to a new file (uncomment to save)
# df_with_errors.to_csv("C:\\Users\\edwin victor\\git repositories\\dsp-heart-failure-prediction\\data\\heart_with_errors_25_percent.csv", index=False)


In [8]:
os.getcwd()

'C:\\Users\\edwin victor\\git repositories\\dsp-heart-failure-prediction\\python_files'

In [9]:
os.listdir()

['.ipynb_checkpoints',
 'data_corruption_notebook .ipynb',
 'Heart-Failure-Prediction-modeling (2).ipynb',
 'heart_failure_prediction.ipynb']

In [12]:
path = os.path.dirname(os.getcwd())
os.listdir(path)

['.git',
 '.gitattributes',
 'airflow',
 'api',
 'data',
 'models',
 'python_files',
 'README.md',
 'webapp']