#Import Libraries

In [20]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from scipy import stats


#Variables

In [21]:
# Input CSV: read by auto_generate_schema and by data_cleaning_pipeline.
raw_file_path = 'raw_healthcare_dataset.csv'
# Output CSV: passed to data_cleaning_pipeline as the export destination.
clean_file_path = 'cleaned_healthcare_dataset.csv'
# Add more variables as needed

#Features Mapping

In [22]:
def _multiply_pair(df, col1, col2):
    # Element-wise product of two columns.
    return df[col1] * df[col2]


def _join_as_label(df, col1, col2):
    # "<col1 as text>_<col2>" label built from two columns.
    return df[col1].astype(str) + "_" + df[col2]


def _sum_triple(df, col1, col2, col3):
    # Element-wise sum of three columns.
    return df[col1] + df[col2] + df[col3]


# Maps a tuple of source column names to the callable that derives a new
# feature from them. Each callable is invoked as operation(df, *columns).
feature_combinations = {
    ("Age", "Billing Amount"): _multiply_pair,
    ("Room Number", "Admission Type"): _join_as_label,
    ("Age", "Room Number", "Billing Amount"): _sum_triple,
    # Add more features as needed
}


#Data Ingestion

In [23]:
def ingest_data(file_path):
    """Read a CSV file into a DataFrame.

    Prints the loaded shape on success. Any exception raised while loading
    is printed and converted into a ``None`` return value.
    """
    frame = None
    try:
        frame = pd.read_csv(file_path)
        print(f"Data loaded successfully with {frame.shape[0]} rows and {frame.shape[1]} columns.")
    except Exception as err:
        print(f"Error loading data: {err}")
        return None
    return frame

#Initial Data Exploration

In [24]:
def initial_exploration(data):
    """Print a quick overview of ``data`` and return per-column null counts.

    Prints the first five rows, the ``DataFrame.info()`` summary and the
    ``describe()`` statistics.

    Returns:
        The result of ``data.isnull().sum()`` (missing-value count per column).
    """
    import io  # local import: only needed to capture the info() text

    print("First 5 Rows:\n", data.head())

    # DataFrame.info() writes to a stream and returns None, so printing its
    # return value would emit the summary *before* the label and then print
    # "None". Capture the text in a buffer and print it under the label.
    info_buffer = io.StringIO()
    data.info(buf=info_buffer)
    print("Data Info:\n", info_buffer.getvalue())

    print("Statistical Summary:\n", data.describe())
    return data.isnull().sum()


#Data Validation

In [25]:
def validate_data(data, schema):
    """Check the columns of ``data`` against an expected ``schema``.

    Args:
        data: DataFrame to check.
        schema: Mapping of column name to expected dtype (e.g. ``"int64"``).

    Prints one line for every schema column that is absent from ``data`` and
    one line for every column whose dtype differs from the expected one, then
    a completion message. Nothing is returned and nothing is raised for a
    mismatch; the function only reports.
    """
    for col, dtype in schema.items():
        if col not in data.columns:
            # A column the schema expects but the data lacks is a validation
            # failure too; report it instead of skipping it silently.
            print(f"Missing column {col}. Expected dtype {dtype}.")
            continue

        actual_dtype = data[col].dtype
        if not actual_dtype == dtype:
            print(f"Invalid data type in column {col}. Expected {dtype}, got {actual_dtype}.")
    print("Data validation completed.")


#Handling Missing Values

In [26]:
def handle_missing_values(data, strategy="mean"):
    """Impute missing values in the numeric columns of ``data``.

    Args:
        data: Input DataFrame, or ``None``.
        strategy: Passed to ``SimpleImputer(strategy=...)``.

    Returns:
        ``None`` if ``data`` is ``None``; otherwise a new DataFrame with the
        imputed numeric columns first, followed by the untouched non-numeric
        columns. Row labels of the input are preserved.
    """
    if data is None:
        return None

    numeric_data = data.select_dtypes(include=[np.number])
    non_numeric_data = data.select_dtypes(exclude=[np.number])

    # Nothing to impute: avoid fitting the imputer on zero columns.
    if numeric_data.shape[1] == 0:
        return data.copy()

    imputer = SimpleImputer(strategy=strategy)
    # fit_transform returns a bare array. Re-attach the original index so the
    # column-wise concat below lines rows up by label; a default 0..n-1 index
    # would misalign (and introduce NaN rows) for any input that is not
    # already indexed 0..n-1, e.g. a filtered frame.
    numeric_data_imputed = pd.DataFrame(
        imputer.fit_transform(numeric_data),
        columns=numeric_data.columns,
        index=numeric_data.index,
    )

    return pd.concat([numeric_data_imputed, non_numeric_data], axis=1)

#Data Manipulation (Feature Transformation, Deriving New Features)

In [27]:
def manipulate_data(data):
    """Add the derived columns described by ``feature_combinations``.

    For every entry whose source columns are all present, a new column named
    ``<lower-cased source names joined by "_">_interaction`` is added to
    ``data`` in place. Entries with missing source columns are skipped with a
    printed notice. Returns ``data`` (or ``None`` when given ``None``).
    """
    if data is None:
        return None

    for columns, operation in feature_combinations.items():
        absent = [name for name in columns if name not in data.columns]
        if absent:
            print(f"Skipping feature {columns} due to missing columns: {absent}")
            continue

        feature_label = "_".join(name.lower() for name in columns) + "_interaction"
        # The mapped callable receives the frame plus the column names.
        data[feature_label] = operation(data, *columns)
        print(f"Created new feature: {feature_label}")

    return data


#Outlier Detection and Treatment

In [28]:
def treat_outliers(data, threshold=3):
    """Drop rows containing a numeric value whose |z-score| >= ``threshold``.

    Args:
        data: Input DataFrame, or ``None``.
        threshold: Rows are kept only if every defined z-score is strictly
            below this value.

    Returns:
        ``None`` if ``data`` is ``None``; otherwise the rows of ``data`` that
        are not flagged as outliers (original row labels are kept).

    A z-score that is undefined (missing value, or a constant column whose
    standard deviation is zero) is treated as "not an outlier". Without this,
    a single constant or NaN-containing column makes every comparison False
    and removes all rows.
    """
    if data is None:
        return None

    numeric_data = data.select_dtypes(include=[np.number])
    if numeric_data.empty:
        # No numeric values to score: keep every row.
        keep_rows = np.ones(len(data), dtype=bool)
    else:
        values = numeric_data.to_numpy(dtype=float, na_value=np.nan)
        # Constant columns divide 0 by 0; silence the warning, the resulting
        # NaN is handled explicitly below.
        with np.errstate(invalid="ignore", divide="ignore"):
            z_scores = np.abs(stats.zscore(values, axis=0, nan_policy="omit"))
            keep_rows = (np.isnan(z_scores) | (z_scores < threshold)).all(axis=1)

    data_no_outliers = data[keep_rows]
    print(f"Outliers removed. New data shape: {data_no_outliers.shape}")
    return data_no_outliers


#Data Encoding

In [29]:
def encode_data(data):
    """Encode the object-dtype (categorical) columns of ``data``.

    Columns with at most 50 distinct values are one-hot encoded (first level
    dropped). Columns with more than 50 distinct values are replaced by the
    relative frequency of each value.

    Args:
        data: Input DataFrame, or ``None``.

    Returns:
        ``None`` if ``data`` is ``None``; otherwise a new DataFrame with a
        fresh 0..n-1 index holding the non-categorical columns, the
        frequency-encoded high-cardinality columns, and the one-hot columns.
        The input frame is not modified.

    NOTE(review): missing values in a high-cardinality column stay NaN after
    frequency encoding; confirm whether later steps need them filled.
    """
    if data is None:
        return None

    # Separate high-cardinality columns for alternative encoding strategies if needed
    categorical_columns = data.select_dtypes(include=["object"]).columns
    cardinality = {col: data[col].nunique() for col in categorical_columns}
    high_cardinality_cols = [col for col in categorical_columns if cardinality[col] > 50]  # Customize this threshold
    low_cardinality_cols = [col for col in categorical_columns if cardinality[col] <= 50]

    # Drop only the columns that the one-hot block replaces. drop() returns a
    # new frame, so the assignments below never touch the caller's data.
    # (Dropping *all* categorical columns here would throw away the
    # frequency-encoded columns computed in the loop.)
    encoded = data.drop(columns=low_cardinality_cols).reset_index(drop=True)

    # Frequency encoding for high-cardinality columns
    for col in high_cardinality_cols:
        encoded[col] = encoded[col].map(encoded[col].value_counts(normalize=True))

    # The encoder cannot be fitted on zero columns.
    if not low_cardinality_cols:
        return encoded

    # Use OneHotEncoder for low-cardinality columns only; request a dense
    # result directly instead of building a sparse matrix and densifying it.
    encoder = OneHotEncoder(sparse_output=False, drop='first')
    low_cardinality_data = pd.DataFrame(
        encoder.fit_transform(data[low_cardinality_cols]),
        columns=encoder.get_feature_names_out(),
    )

    # Both frames carry a default 0..n-1 index, so rows line up positionally.
    return pd.concat([encoded, low_cardinality_data], axis=1)


#Data Transformation and Normalization

In [30]:
def transform_data(data, scaler_type="standard"):
    """Scale the numeric columns of ``data``.

    Args:
        data: Input DataFrame, or ``None``.
        scaler_type: ``"standard"`` selects ``StandardScaler``; any other
            value selects ``MinMaxScaler``.

    Returns:
        ``None`` if ``data`` is ``None``; otherwise a new DataFrame with the
        scaled numeric columns first, followed by the untouched non-numeric
        columns. Row labels of the input are preserved.
    """
    if data is None:
        return None

    scaler = StandardScaler() if scaler_type == "standard" else MinMaxScaler()
    numeric_data = data.select_dtypes(include=[np.number])
    other_data = data.select_dtypes(exclude=[np.number])

    # Nothing to scale: avoid fitting the scaler on zero columns.
    if numeric_data.shape[1] == 0:
        return data.copy()

    # Keep the original index on the scaled frame; with a default 0..n-1
    # index the concat below would align by label and misplace rows whenever
    # the input index has gaps or a different order.
    scaled_data = pd.DataFrame(
        scaler.fit_transform(numeric_data),
        columns=numeric_data.columns,
        index=numeric_data.index,
    )
    return pd.concat([scaled_data, other_data], axis=1)


#Feature Engineering

In [31]:
def feature_engineering(data):
    """Append principal-component columns computed from the numeric columns.

    Up to five components are computed and added as ``PCA_1`` .. ``PCA_k``.

    Args:
        data: Input DataFrame, or ``None``.

    Returns:
        ``None`` if ``data`` is ``None``; otherwise a new DataFrame holding
        the original columns followed by the PCA columns, with the original
        row labels. When there is nothing to decompose (no numeric columns or
        no rows) a copy of ``data`` is returned unchanged.
    """
    if data is None:
        return None

    numeric_data = data.select_dtypes(include=[np.number])
    n_rows, n_features = numeric_data.shape
    # PCA cannot produce more components than rows or features.
    n_components = min(5, n_features, n_rows)
    if n_components == 0:
        return data.copy()

    pca = PCA(n_components=n_components)
    principal_components = pca.fit_transform(numeric_data)
    # Re-use the input index so the concat aligns rows by label instead of
    # against a fresh 0..n-1 index.
    data_pca = pd.DataFrame(
        principal_components,
        columns=[f"PCA_{i}" for i in range(1, principal_components.shape[1] + 1)],
        index=data.index,
    )
    return pd.concat([data, data_pca], axis=1)

#Data Deduplication

In [32]:
def deduplicate_data(data):
    """Return ``data`` without duplicate rows.

    Prints the shape of the result. Returns ``None`` when ``data`` is
    ``None`` or when removing duplicates raises (the error is printed).
    """
    if data is None:
        return None

    try:
        unique_rows = data.drop_duplicates()
        print(f"Duplicates removed. New data shape: {unique_rows.shape}")
    except Exception as err:
        print(f"Duplicate record not found or Error in removing duplicate: {err}")
        return None
    return unique_rows

#Data Export

In [33]:
def export_data(data, file_path):
    """Write ``data`` to ``file_path`` as CSV without the index column.

    Prints a notice and does nothing when ``data`` is ``None``. A failure
    while writing is printed rather than raised. Always returns ``None``.
    """
    if data is not None:
        try:
            data.to_csv(file_path, index=False)
            print(f"Data exported successfully to {file_path}.")
        except Exception as err:
            print(f"Error exporting data: {err}")
    else:
        print("No data to export.")


#Error Handling and Logging

In [34]:
import functools
import logging

# Route log records at INFO and above to a file next to the notebook.
logging.basicConfig(filename='data_cleaning.log', level=logging.INFO)

def log_and_handle_errors(func):
    """Wrap ``func`` so exceptions are logged and printed instead of raised.

    The returned wrapper forwards all arguments to ``func`` and returns its
    result. If ``func`` raises, the error (with traceback) is written to the
    log, a one-line message is printed, and ``None`` is returned.
    """
    # wraps() keeps the wrapped function's __name__/__doc__ so wrapped steps
    # remain identifiable in messages and tooling.
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except Exception as e:
            # logging.exception records the traceback along with the message.
            logging.exception("Error in %s: %s", func.__name__, e)
            print(f"Error in {func.__name__}: {e}")
            return None
    return wrapper


In [35]:
# Rebind every pipeline step to its log_and_handle_errors wrapper: from here
# on, a step that raises logs and prints the error and returns None instead
# of propagating the exception.
ingest_data = log_and_handle_errors(ingest_data)
initial_exploration = log_and_handle_errors(initial_exploration)
validate_data = log_and_handle_errors(validate_data)
handle_missing_values = log_and_handle_errors(handle_missing_values)
manipulate_data = log_and_handle_errors(manipulate_data)
treat_outliers = log_and_handle_errors(treat_outliers)
encode_data = log_and_handle_errors(encode_data)
transform_data = log_and_handle_errors(transform_data)
feature_engineering = log_and_handle_errors(feature_engineering)
deduplicate_data = log_and_handle_errors(deduplicate_data)
export_data = log_and_handle_errors(export_data)


#Main Pipeline Function

In [36]:
def data_cleaning_pipeline(file_path, schema, export_path):
    """Run the full cleaning pipeline on a CSV file and export the result.

    Args:
        file_path: CSV file to load.
        schema: Column-name -> expected-dtype mapping used for validation.
        export_path: Destination CSV for the cleaned data.

    Steps, in order: ingest, explore, validate, impute missing values, derive
    features, remove outliers, encode categoricals, scale, add PCA columns,
    deduplicate, export. Returns ``None`` in every case; progress and
    failures are reported via printed messages.
    """
    data = ingest_data(file_path)
    if data is None:
        return

    initial_exploration(data)
    validate_data(data, schema)

    # Each step signals failure by returning None. Stop at the first failure
    # and say which step it was, instead of carrying None through the rest of
    # the chain and then announcing success.
    stages = (
        ("handle_missing_values", handle_missing_values),
        ("manipulate_data", manipulate_data),
        ("treat_outliers", treat_outliers),
        ("encode_data", encode_data),
        ("transform_data", lambda frame: transform_data(frame, scaler_type="standard")),
        ("feature_engineering", feature_engineering),
        ("deduplicate_data", deduplicate_data),
    )
    for stage_name, stage in stages:
        data = stage(data)
        if data is None:
            print(f"Data cleaning pipeline aborted: {stage_name} returned no data.")
            return

    # export_data reports its own errors and returns nothing, so a failed
    # write cannot be detected here.
    export_data(data, export_path)
    print("Data cleaning pipeline completed successfully.")

In [37]:
def auto_generate_schema(file_path):
    """Infer a ``{column name: dtype name}`` schema from a CSV file.

    The file is loaded with ``pd.read_csv`` and each column's dtype is
    rendered as a string. The schema is printed and returned.
    """
    frame = pd.read_csv(file_path)

    # One entry per column, in file order, with the dtype as its string name.
    schema = {name: str(dtype) for name, dtype in frame.dtypes.items()}

    print("Generated Schema:")
    print(schema)
    return schema

# Generate schema
# The expected dtypes are inferred from the raw file itself.
# NOTE(review): data_cleaning_pipeline is later run on this same file, so
# validation against this schema is unlikely to flag a dtype mismatch;
# confirm whether a hand-written schema is intended.
schema = auto_generate_schema(raw_file_path)

Generated Schema:
{'Name': 'object', 'Age': 'int64', 'Gender': 'object', 'Blood Type': 'object', 'Medical Condition': 'object', 'Date of Admission': 'object', 'Doctor': 'object', 'Hospital': 'object', 'Insurance Provider': 'object', 'Billing Amount': 'float64', 'Room Number': 'int64', 'Admission Type': 'object', 'Discharge Date': 'object', 'Medication': 'object', 'Test Results': 'object'}


#Run data_cleaning_pipeline

In [38]:
# Clean raw_file_path, validating against the generated schema, and write the
# result to clean_file_path.
data_cleaning_pipeline(raw_file_path, schema, clean_file_path)

Data loaded successfully with 55500 rows and 15 columns.
First 5 Rows:
             Name  Age  Gender Blood Type Medical Condition Date of Admission  \
0  Bobby JacksOn   30    Male         B-            Cancer        2024-01-31   
1   LesLie TErRy   62    Male         A+           Obesity        2019-08-20   
2    DaNnY sMitH   76  Female         A-           Obesity        2022-09-22   
3   andrEw waTtS   28  Female         O+          Diabetes        2020-11-18   
4  adrIENNE bEll   43  Female        AB+            Cancer        2022-09-19   

             Doctor                    Hospital Insurance Provider  \
0     Matthew Smith             Sons and Miller         Blue Cross   
1   Samantha Davies                     Kim Inc           Medicare   
2  Tiffany Mitchell                    Cook PLC              Aetna   
3       Kevin Wells  Hernandez Rogers and Vang,           Medicare   
4    Kathleen Hanna                 White-White              Aetna   

   Billing Amount  Room Nu