# Missing Data

In [1]:
# Setup and mock data
import pandas as pd
import numpy as np

# Mock employee records: every column except ID contains gaps (np.nan),
# covering both numerical columns and one categorical column.
data = {
    'ID': list(range(1, 11)),
    'Age': [25, np.nan, 22, 35, np.nan, 30, 28, np.nan, 32, 40],
    'Salary': [50000, 54000, np.nan, 62000, np.nan, 58000, 52000, 60000, np.nan, 67000],
    'Bonus': [100000, np.nan, np.nan, np.nan, np.nan, np.nan, 52000, np.nan, np.nan, 67000],
    'Department': ['HR', 'IT', 'IT', np.nan, 'Sales', np.nan, 'HR', 'IT', 'Sales', 'HR'],
}

df = pd.DataFrame(data)

# Caption first, then the frame's plain-text rendering on the following lines
print(
    "Sample Data with Missing Values for both a numerical and a Categorical variable:",
    df,
    sep="\n",
)

Sample Data with Missing Values for both a numerical and a Categorical variable:
   ID   Age   Salary     Bonus Department
0   1  25.0  50000.0  100000.0         HR
1   2   NaN  54000.0       NaN         IT
2   3  22.0      NaN       NaN         IT
3   4  35.0  62000.0       NaN        NaN
4   5   NaN      NaN       NaN      Sales
5   6  30.0  58000.0       NaN        NaN
6   7  28.0  52000.0   52000.0         HR
7   8   NaN  60000.0       NaN         IT
8   9  32.0      NaN       NaN      Sales
9  10  40.0  67000.0   67000.0         HR


In [2]:
class MissingDataHandler:
    """
    Stateless helpers for detecting, dropping and imputing missing values
    in a pandas DataFrame.

    Every method receives the DataFrame as an argument. Apart from
    ``detect_missing`` (which only prints), each method returns a new
    DataFrame and leaves the input untouched.
    """

    @staticmethod
    def _categorical_columns(df):
        """Return the labels of the object, string and category dtype columns."""
        return [
            col
            for col, dtype in df.dtypes.items()
            if isinstance(dtype, pd.CategoricalDtype)
            or pd.api.types.is_object_dtype(dtype)
            or pd.api.types.is_string_dtype(dtype)
        ]

    def detect_missing(self, df):
        """
        Prints the number of missing values found in each column.

        Parameters:
            df : pd.DataFrame
                The input DataFrame
        """
        print(df.isnull().sum())

    def drop_missing_columns(self, df, threshold = 0.5):
        """
        Drops columns from the input DataFrame that have more than a given
        share (default = 50%) of their values missing.

        Parameters:
            df : pd.DataFrame
                The input DataFrame
            threshold: float
                Value between 0 and 1. Represents the fraction of missing
                values above which columns are dropped. Default is 0.5

        Returns
            pd.DataFrame
                New DataFrame without the columns whose missing fraction
                exceeds the threshold
        """
        # Fraction of missing values for each column
        missing_fraction = df.isnull().mean(axis=0)
        # Strict comparison: a column sitting exactly at the threshold is kept
        columns_to_drop = missing_fraction[missing_fraction > threshold].index
        return df.drop(columns=columns_to_drop)

    def drop_missing_rows(self, df, threshold = 0.5):
        """
        Drops rows from the input DataFrame that have more than a given
        share (default = 50%) of their values missing.

        Parameters:
            df : pd.DataFrame
                The input DataFrame
            threshold: float
                Value between 0 and 1. Represents the fraction of missing
                values above which rows are dropped. Default is 0.5

        Returns
            pd.DataFrame
                New DataFrame without the rows whose missing fraction
                exceeds the threshold
        """
        # Fraction of missing values for each row
        missing_fraction = df.isnull().mean(axis=1)
        # Keep rows that do NOT exceed the threshold (exactly at it is kept)
        rows_to_keep = missing_fraction <= threshold
        return df[rows_to_keep]

    def impute_missing_numerical(self, df, method="mean"):
        """
        Imputes missing values in numerical columns using either the mean or
        the median of each column.

        Parameters:
            df : pd.DataFrame
                The input DataFrame

            method: str
                The method used for imputation: 'mean' or 'median'. Default
                is 'mean'

        Returns
            pd.DataFrame
                New DataFrame with missing numerical values filled.

        Raises
            ValueError
                If method is neither 'mean' nor 'median'.
        """
        if method not in ("mean", "median"):
            raise ValueError("Method must be one of mean or median")

        # "number" covers every numeric width (float32, int32, ...), not just
        # float64/int64. Timedeltas are excluded explicitly because numpy
        # classes them under its integer hierarchy.
        numeric = df.select_dtypes(include="number", exclude="timedelta")

        if method == "mean":
            fill_values = numeric.mean(axis=0)
        else:
            fill_values = numeric.median(axis=0)

        # fill_values is indexed by column label, so only those columns change
        return df.fillna(fill_values)

    def impute_missing_categorical(self, df, method = "mode"):
        """
        Imputes missing values in categorical columns (object, string or
        category dtype) using either the mode of each column or a new
        "missing" category.

        Parameters:
            df : pd.DataFrame
                The input DataFrame.

            method: str
                The method used for imputation: 'mode' or 'new_category'.
                Default is 'mode'.

        Returns
            pd.DataFrame
                New DataFrame with missing categorical values filled.
                With 'mode', a column that has no observed value at all is
                left unchanged because there is nothing to impute from.

        Raises
            ValueError
                If method is neither 'mode' nor 'new_category'.
        """
        if method not in ("mode", "new_category"):
            raise ValueError("Method must be one of mode or new_category")

        df_filled = df.copy()

        for col in MissingDataHandler._categorical_columns(df):
            column = df[col]
            if method == "mode":
                modes = column.mode(dropna=True)
                if modes.empty:
                    # Entirely missing column: no mode exists, skip it
                    continue
                # On ties, mode() is sorted, so the first entry is deterministic
                fill_value = modes.iloc[0]
            else:
                fill_value = "missing"
                # A category dtype rejects unknown labels, so register it first
                if (
                    isinstance(column.dtype, pd.CategoricalDtype)
                    and fill_value not in column.cat.categories
                ):
                    column = column.cat.add_categories([fill_value])
            df_filled[col] = column.fillna(fill_value)

        return df_filled


               

In [3]:
# Demo: fill gaps in categorical columns with each column's most frequent value.
# Note that new_df holds the handler object, not a DataFrame.
new_df = MissingDataHandler()
new_df.impute_missing_categorical(df, method="mode")

Unnamed: 0,ID,Age,Salary,Bonus,Department
0,1,25.0,50000.0,100000.0,HR
1,2,,54000.0,,IT
2,3,22.0,,,IT
3,4,35.0,62000.0,,HR
4,5,,,,Sales
5,6,30.0,58000.0,,HR
6,7,28.0,52000.0,52000.0,HR
7,8,,60000.0,,IT
8,9,32.0,,,Sales
9,10,40.0,67000.0,67000.0,HR


In [4]:
# Demo: label gaps in categorical columns with the placeholder "missing".
# Note that new_df holds the handler object, not a DataFrame.
new_df = MissingDataHandler()
new_df.impute_missing_categorical(df, "new_category")

Unnamed: 0,ID,Age,Salary,Bonus,Department
0,1,25.0,50000.0,100000.0,HR
1,2,,54000.0,,IT
2,3,22.0,,,IT
3,4,35.0,62000.0,,missing
4,5,,,,Sales
5,6,30.0,58000.0,,missing
6,7,28.0,52000.0,52000.0,HR
7,8,,60000.0,,IT
8,9,32.0,,,Sales
9,10,40.0,67000.0,67000.0,HR
