In [1]:
import numpy as np
from scipy.stats import zscore
import pandas as pd 
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

In [16]:
# Load the three raw datasets from CSV files located in the working directory.
stroke_data, hypertension_data, diabetes_data = (
    pd.read_csv(csv_path)
    for csv_path in ('stroke_data.csv', 'hypertension_data.csv', 'diabetes_data.csv')
)

In [26]:
def impute_missing_values(data, strategy='mean'):
    """Return a copy of ``data`` whose missing values have been imputed.

    The input object is left untouched; earlier versions wrote the imputed
    values back into ``data`` itself, which silently overwrote the caller's
    raw frame and made the calling cell non-idempotent on re-run.

    Parameters
    ----------
    data
        Object supporting ``.copy()`` and full-slice assignment
        (``obj[:] = values``). NOTE(review): presumably a numeric
        ``pandas.DataFrame`` -- confirm against callers.
    strategy : str, default 'mean'
        Forwarded unchanged to ``SimpleImputer(strategy=...)``
        (for example ``'mean'`` or ``'median'``).

    Returns
    -------
    A copy of ``data`` (same container type, same labels) holding the
    values produced by ``SimpleImputer.fit_transform``.

    Notes
    -----
    The slice assignment requires the imputer output to have the same shape
    as ``data``. NOTE(review): check the SimpleImputer documentation for how
    columns that are entirely missing are handled.
    """
    imputer = SimpleImputer(strategy=strategy)
    # Work on a copy so the caller's object is never modified.
    imputed = data.copy()
    # Slice assignment keeps the container and its labels, replacing only values.
    imputed[:] = imputer.fit_transform(data)
    return imputed

def remove_outliers_zscore(data, threshold=4, columns=None):
    """Drop rows that contain an outlier according to the Z-score method.

    A row is kept only if, for every scored column, its absolute z-score is
    strictly below ``threshold`` or the z-score is NaN (NaN z-scores are
    treated as "not an outlier").

    Parameters
    ----------
    data
        Two-dimensional data; indexed with a boolean row mask on return.
        NOTE(review): presumably a numeric ``pandas.DataFrame`` -- confirm.
    threshold : float, default 4
        Rows with ``|z| >= threshold`` in any scored column are removed.
    columns : list, optional
        Columns to compute z-scores on. Column labels for a DataFrame,
        integer positions otherwise. ``None`` (default) scores every column,
        which is the historical behaviour.

    Returns
    -------
    The rows of ``data`` that passed the filter (all columns are returned,
    not only the scored ones). ``data`` itself is not modified.

    Notes
    -----
    Z-scores are only meaningful for roughly continuous columns. On a
    strongly imbalanced binary indicator the rare level can exceed the
    threshold, in which case every row carrying that level is removed and
    the column becomes constant. Pass ``columns`` to restrict the check to
    continuous features when that is not intended.
    """
    if columns is None:
        scored = data
    elif isinstance(data, pd.DataFrame):
        scored = data[columns]
    else:
        scored = np.asarray(data)[:, columns]

    z_scores = np.abs(zscore(scored))
    # NaN z-scores must not cause a row to be dropped, hence the explicit isnan.
    keep_rows = ((z_scores < threshold) | np.isnan(z_scores)).all(axis=1)
    return data[keep_rows]

def standardize_data(data):
    """Fit a ``StandardScaler`` on ``data`` and return the scaled values.

    The result is whatever ``StandardScaler.fit_transform`` returns for the
    given input; the fitted scaler itself is discarded.
    """
    return StandardScaler().fit_transform(data)

def cleaning(data, columns, standarize = False):
    """Run the full cleaning pipeline and return the result as a DataFrame.

    Steps, in order: median imputation of missing values, z-score outlier
    removal across all columns (using that helper's defaults), and
    optionally standardization.

    Parameters
    ----------
    data
        Raw input. It is copied before imputation so the caller's object is
        never overwritten, which keeps the calling cell safe to re-run.
        NOTE(review): presumably a numeric ``pandas.DataFrame`` -- confirm.
    columns
        Column labels passed to the ``pandas.DataFrame`` constructor for the
        returned frame.
    standarize : bool, default False
        When true, the filtered data is passed through ``standardize_data``
        before being wrapped. (The spelling is kept as-is because callers
        may pass it by keyword.)

    Returns
    -------
    pandas.DataFrame
        The cleaned (and optionally standardized) data.
    """
    # Give the imputer a private copy: it may write into the object it receives.
    data_no_missing = impute_missing_values(data.copy(), 'median')
    cleaned = remove_outliers_zscore(data_no_missing)

    if standarize:
        cleaned = standardize_data(cleaned)

    return pd.DataFrame(cleaned, columns = columns)

In [27]:
# Apply the same cleaning pipeline to each raw dataset, keeping its own column labels.
diabetes_clean, stroke_clean, hypertension_clean = (
    cleaning(raw_frame, raw_frame.columns)
    for raw_frame in (diabetes_data, stroke_data, hypertension_data)
)

In [28]:
# Persist each cleaned dataset to CSV without writing the row index.
for cleaned_frame, out_path in (
    (diabetes_clean, 'diabetes_clean.csv'),
    (stroke_clean, 'stroke_clean.csv'),
    (hypertension_clean, 'hypertension_clean.csv'),
):
    cleaned_frame.to_csv(out_path, index = False)

In [29]:
# Re-load the cleaned datasets from the CSV files written above,
# replacing the in-memory frames with what is stored on disk.
stroke_clean, hypertension_clean, diabetes_clean = (
    pd.read_csv(csv_path)
    for csv_path in ('stroke_clean.csv', 'hypertension_clean.csv', 'diabetes_clean.csv')
)

In [33]:
# Show which distinct CholCheck values remain after cleaning.
# NOTE(review): a single remaining value means the column is constant in the
# cleaned data -- possibly because rows holding the rarer value were dropped
# by the z-score outlier filter; confirm against the raw file.
diabetes_clean.loc[:, "CholCheck"].unique()

array([1.])