## 1 - Imports and reading data

In [6]:
import sys
sys.path.append('../../../scripts/utilities')
from helper_functions import *
sys.path.append('../../../scripts/data_preprocessing')
from data_cleaning import *

In [7]:
# Relative path of the directory that the data file below is loaded from.
base_path = '../../../data/processed_data/'
# read_files is not defined in this notebook (presumably it comes from the star
# imports above); only element [0] of its result is kept — presumably the
# DataFrame loaded from the named CSV. TODO confirm against the helper.
df_filling_missing_values_with_median = read_files('df_filling_missing_values_with_median.csv', base_path=base_path)[0]

In [11]:
import pandas as pd
import numpy as np
from scipy.stats import zscore

# Toy two-column frame: ten rows per column, each column holding one value
# (500 and 1000 respectively) that lies far away from the other nine.
data = dict(
    Feature1=[1, 2, 500, 4, 5, 6, 7, 8, 9, 10],
    Feature2=[10, 20, 30, 40, 50, 60, 70, 80, 90, 1000],
)

df = pd.DataFrame(data)

# Define numerical features
# extract_numerical_columns is a project helper not defined in this notebook;
# presumably it returns a list of the numeric column names — TODO confirm.
numerical_columns = extract_numerical_columns(df_filling_missing_values_with_median)
# Exclude 'SEQN' from the features used for outlier detection (it looks like a
# per-row identifier in the printed frame — verify). Assuming a plain list,
# .remove() mutates it in place and raises ValueError if 'SEQN' is absent.
numerical_columns.remove('SEQN')

# Function to extract outliers using Z-score
def extract_outliers_zscore(df, features, threshold=3):
    """Return the rows of ``df`` that contain at least one z-score outlier.

    Args:
        df: DataFrame to inspect.
        features: Column labels (a list) whose values are standardised with
            ``scipy.stats.zscore``.
        threshold: A value is an outlier when its absolute z-score is
            strictly greater than this number.

    Returns:
        The subset of ``df`` (all columns) whose rows have an outlier in
        any of ``features``.
    """
    selected = df[features]
    exceeds_threshold = np.abs(zscore(selected)) > threshold
    # A single offending feature is enough to flag the whole row.
    flagged_rows = exceeds_threshold.any(axis=1)
    return df[flagged_rows]

# Function to remove outliers using Z-score
def remove_outliers_zscore(df, features, threshold=3):
    """Return ``df`` without the rows that contain a z-score outlier.

    A row is removed when, for at least one column in ``features``, the
    absolute z-score of its value is strictly greater than ``threshold``.
    The result is the exact complement of ``extract_outliers_zscore``.

    Undefined z-scores (NaN) are never treated as outliers. They arise for
    constant (zero-variance) columns and for columns containing NaN. The
    previous ``(z <= threshold).all(axis=1)`` test evaluated to False for
    every NaN, so a single constant feature emptied the whole frame.

    Args:
        df: DataFrame to filter.
        features: Column labels (a list) whose values are standardised with
            ``scipy.stats.zscore``.
        threshold: Largest absolute z-score that is still kept.

    Returns:
        The subset of ``df`` (all columns) whose rows have no outlier in
        any of ``features``.
    """
    z_scores = np.abs(zscore(df[features]))
    # NaN > threshold is False, so undefined scores do not flag a row.
    outliers_mask = (z_scores > threshold).any(axis=1)
    # Keep the complement of the outlier mask rather than re-testing with
    # "<=", which would silently reject every NaN score.
    df_no_outliers = df[~outliers_mask]
    return df_no_outliers

# Function to extract outliers using IQR
def extract_outliers_iqr(df, features, multiplier=1.5):
    """Return the rows of ``df`` that contain at least one IQR outlier.

    For every column in ``features`` the lower fence is
    ``Q1 - multiplier * IQR`` and the upper fence is
    ``Q3 + multiplier * IQR``. A value strictly outside its column's
    fences is an outlier.

    Args:
        df: DataFrame to inspect.
        features: Column labels (a list) to evaluate.
        multiplier: Scale applied to the inter-quartile range when
            computing the fences.

    Returns:
        The subset of ``df`` (all columns) whose rows have an outlier in
        any of ``features``.
    """
    selected = df[features]
    first_quartile = selected.quantile(0.25)
    third_quartile = selected.quantile(0.75)
    fence_offset = multiplier * (third_quartile - first_quartile)

    # Comparisons broadcast the per-column fences across all rows.
    below_fence = selected < (first_quartile - fence_offset)
    above_fence = selected > (third_quartile + fence_offset)
    return df[(below_fence | above_fence).any(axis=1)]

# Function to remove outliers using IQR
def remove_outliers_iqr(df, features, multiplier=1.5):
    """Return ``df`` without the rows that contain an IQR outlier.

    For every column in ``features`` the lower fence is
    ``Q1 - multiplier * IQR`` and the upper fence is
    ``Q3 + multiplier * IQR``. A row is removed when any of its feature
    values lies strictly outside its column's fences. The result is the
    exact complement of ``extract_outliers_iqr``.

    Missing values (NaN) are not outliers. The previous
    ``(>= lower) & (<= upper)`` test was False for NaN, so rows with a
    missing feature were silently dropped as if they were outliers.

    Args:
        df: DataFrame to filter.
        features: Column labels (a list) to evaluate.
        multiplier: Scale applied to the inter-quartile range when
            computing the fences.

    Returns:
        The subset of ``df`` (all columns) whose rows have no outlier in
        any of ``features``.
    """
    values = df[features]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - multiplier * iqr
    upper_bound = q3 + multiplier * iqr
    # NaN compares False on both sides, so missing values never flag a row.
    outliers_mask = ((values < lower_bound) | (values > upper_bound)).any(axis=1)
    df_no_outliers = df[~outliers_mask]
    return df_no_outliers

# Example usage
# numerical_columns was derived from df_filling_missing_values_with_median, so
# the helpers must run on that same frame. Passing the toy `df` (which only
# has Feature1/Feature2) raises KeyError when the columns are selected.

# Extract outliers using Z-score
outliers_zscore = extract_outliers_zscore(df_filling_missing_values_with_median, numerical_columns)

# Remove outliers using Z-score
df_no_outliers_zscore = remove_outliers_zscore(df_filling_missing_values_with_median, numerical_columns)

# Extract outliers using IQR
outliers_iqr = extract_outliers_iqr(df_filling_missing_values_with_median, numerical_columns)

# Remove outliers using IQR
df_no_outliers_iqr = remove_outliers_iqr(df_filling_missing_values_with_median, numerical_columns)

# Display the results
print("Original DataFrame:")
print(df_filling_missing_values_with_median)

print("\nOutliers (Z-score):")
print(outliers_zscore)

print("\nDataFrame without Outliers (Z-score):")
print(df_no_outliers_zscore)

print("\nOutliers (IQR):")
print(outliers_iqr)

print("\nDataFrame without Outliers (IQR):")
print(df_no_outliers_iqr)

Original DataFrame:
        SEQN  SDDSRVYR  RIDSTATR  RIAGENDR  RIDAGEYR  RIDRETH1  RIDRETH3  \
0      73557       8.0       2.0       1.0      69.0       4.0       4.0   
1      73558       8.0       2.0       1.0      54.0       3.0       3.0   
2      73559       8.0       2.0       1.0      72.0       3.0       3.0   
3      73560       8.0       2.0       1.0      26.0       3.0       3.0   
4      73561       8.0       2.0       2.0      73.0       3.0       3.0   
...      ...       ...       ...       ...       ...       ...       ...   
10170  83727       8.0       2.0       1.0      26.0       2.0       2.0   
10171  83728       8.0       2.0       2.0       2.0       1.0       1.0   
10172  83729       8.0       2.0       2.0      42.0       4.0       4.0   
10173  83730       8.0       2.0       1.0      26.0       2.0       2.0   
10174  83731       8.0       2.0       1.0      11.0       5.0       6.0   

       RIDEXMON  DMQMILIZ  DMDBORN4  ...  LBDBSELC  LBXBMN  LBDBMNS