# Missing Data Handeling
Author: Eri G Osta

Date: April 5, 2023

## MissingDataHandler Class
The `MissingDataHandler` class is a data analysis tool that can handle missing data in a given dataset. It has methods to perform mean substitution, simple regression, and multiple imputation to impute the missing values. It also has methods to calculate statistics, compare statistics between the original and imputed data, and produce a correlation matrix with missing data information.

The class can be initialized with a filename of a CSV file that contains the dataset. The CSV file should have columns of numerical data, and missing values should be represented as NaNs. Once initialized, the user can call various methods of the class to perform the desired analysis.

In [118]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer
from sklearn.impute import IterativeImputer
import pandas as pd
from scipy.stats import pearsonr

class MissingDataHandler:
    """
    Attributes
    ----------
    df : pandas.DataFrame
        The original dataset.
    mean_imputed_df : pandas.DataFrame
        The dataset after mean substitution.
    simple_imputed_df : pandas.DataFrame
        The dataset after simple regression imputation.
    multi_imputed_df : pandas.DataFrame
        The dataset after multiple imputation.

    Methods
    -------
    calculate_statistics()
        Calculate mean and standard deviation of each column in the dataset.
    mean_substitution()
        Perform mean substitution on the dataset.
    simple_regression()
        Perform simple regression imputation on the dataset.
    multiple_imputation()
        Perform multiple imputation on the dataset.
    compare_statistics(imputed_df, original_df)
        Compare mean and standard deviation of each column in the original dataset and the imputed dataset.
    descriptive_statistics(df)
        Print descriptive statistics for the dataset.
    correlation_matrix(df)
        Calculate the correlation matrix with Pearson correlation coefficients with their p-values and missing data.
    """
    
    def __init__(self, filename):
        """
        Parameters
        ----------
        filename : str
            The name of the CSV file containing the dataset.
        """
        self.df = pd.read_csv(filename)
        self.mean_imputed_df = None
        self.simple_imputed_df = None
        self.multi_imputed_df = None
    
    def calculate_statistics(self):
        """Calculate mean and standard deviation of each column in the dataset."""
        stats = pd.DataFrame({'mean': self.df.mean(), 'std': self.df.std()})
        print(stats)
    
    def mean_substitution(self):
        """Perform mean substitution on the dataset."""
        imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
        mean_imputed_df = pd.DataFrame(imputer.fit_transform(self.df), columns=self.df.columns)
        self.mean_imputed_df = mean_imputed_df
        return mean_imputed_df
        
    def simple_regression(self):
        """Perform simple regression imputation on the dataset."""
        reg = LinearRegression()
        df = self.df.copy()
        df.dropna(subset=['v3_miss'], inplace=True)  # drop rows with missing values in v3_miss
        y = df['v3_miss']
        X = df[['v1_miss', 'v2_miss', 'v4_miss', 'v5_miss']]
        imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
        X_imputed = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)
        reg.fit(X_imputed, y)
        X_test = df[['v1_miss', 'v2_miss', 'v4_miss', 'v5_miss']]
        X_test_imputed = pd.DataFrame(imputer.transform(X_test), columns=X_test.columns)
        y_pred = pd.Series(reg.predict(X_test_imputed), index=df.index)
        simple_imputed_df = df.copy()
        simple_imputed_df['v3_miss'] = np.where(df['v3_miss'].isna(), y_pred, df['v3_miss'])
        self.simple_imputed_df = simple_imputed_df
        return simple_imputed_df
    
    def multiple_imputation(self):
        """Perform multiple imputation on the dataset."""
        imputer = IterativeImputer()
        multi_imputed_df = pd.DataFrame(imputer.fit_transform(self.df), columns=self.df.columns)
        self.multi_imputed_df = multi_imputed_df
        return multi_imputed_df
    
    def compare_statistics(self, imputed_df, original_df):
        orig_stats = pd.DataFrame({'mean': original_df.mean(), 'std': original_df.std()}, index=original_df.columns)
        new_stats = pd.DataFrame({'mean': imputed_df.mean(), 'std': imputed_df.std()}, index=imputed_df.columns)
        print('Original:\n', orig_stats)
        print('New:\n', new_stats)
    
    def descriptive_statistics(self, df):
        print('Descriptive Statistics:')
        print(df.describe())
        
    def correlation_matrix(self, df):
        """Prints a correlation matrix with Pearson correlation coefficients and corresponding p-values and missing data.

        Args:
            df (pandas.DataFrame): The DataFrame to calculate the correlation matrix from.

        Returns:
            pandas.DataFrame: The correlation matrix with Pearson correlation coefficients and corresponding p-values and missing data.
        """
        corr_matrix = pd.DataFrame(index=df.columns, columns=df.columns, dtype=np.float64)
        p_values = pd.DataFrame(index=df.columns, columns=df.columns, dtype=np.float64)
        missing_values = pd.DataFrame(index=df.columns, columns=df.columns, dtype=np.int64)
        for i, col_i in enumerate(df.columns):
            for j, col_j in enumerate(df.columns):
                data_i = df[col_i].dropna()
                data_j = df[col_j].dropna()
                intersection = data_i.index.intersection(data_j.index)
                n_missing = len(df) - df[[col_i, col_j]].notna().all(axis=1).sum()
                if len(intersection) > 1:
                    corr, p = pearsonr(data_i[intersection], data_j[intersection])
                else:
                    corr, p = np.nan, np.nan
                corr_matrix.iloc[i, j] = corr
                p_values.iloc[i, j] = p
                missing_values.iloc[i, j] = n_missing
        corr_matrix = corr_matrix.round(2)
        p_values = p_values.round(3)
        missing_values = missing_values.astype(str).replace('\.0$', '', regex=True)
        result = corr_matrix.astype(str) + ' (p-value: ' + p_values.astype(str) + ' | missing: ' + 'missing: ' + missing_values.astype(str) + ')'
        return result


### Initialize `MissingDataHandler` and add path to data source.

In [119]:
mdh = MissingDataHandler('data.csv')

### Calculate mean and standard deviantion for all columns

In [120]:
mdh.calculate_statistics()

             mean       std
v1_miss  3.128788  1.213193
v2_miss  3.366412  1.144929
v3_miss  1.976562  1.090148
v4_miss  2.201550  1.134542
v5_miss  2.178862  1.293315


###. Replace missing data with the mean value for its corresponding column


In [121]:
mean_sub_df = mdh.mean_substitution()
# print all rows
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(mean_sub_df)

     v1_miss  v2_miss   v3_miss  v4_miss   v5_miss
0        4.0      3.0  1.000000  3.00000  1.000000
1        2.0      4.0  1.000000  2.20155  3.000000
2        3.0      3.0  3.000000  3.00000  2.178862
3        2.0      4.0  1.000000  2.00000  2.000000
4        2.0      4.0  2.000000  5.00000  5.000000
..       ...      ...       ...      ...       ...
144      4.0      4.0  4.000000  3.00000  2.000000
145      2.0      4.0  2.000000  1.00000  2.178862
146      4.0      2.0  3.000000  4.00000  1.000000
147      3.0      3.0  3.000000  3.00000  3.000000
148      3.0      3.0  1.976562  3.00000  4.000000

[149 rows x 5 columns]
      v1_miss   v2_miss   v3_miss  v4_miss   v5_miss
0    4.000000  3.000000  1.000000  3.00000  1.000000
1    2.000000  4.000000  1.000000  2.20155  3.000000
2    3.000000  3.000000  3.000000  3.00000  2.178862
3    2.000000  4.000000  1.000000  2.00000  2.000000
4    2.000000  4.000000  2.000000  5.00000  5.000000
5    4.000000  3.366412  1.000000  2.00000  1.

### Compare results to original values

In [122]:
original_df = pd.read_csv('data.csv')
mdh.compare_statistics(imputed_df=mean_sub_df, original_df=original_df)

Original:
              mean       std
v1_miss  3.128788  1.213193
v2_miss  3.366412  1.144929
v3_miss  1.976562  1.090148
v4_miss  2.201550  1.134542
v5_miss  2.178862  1.293315
New:
              mean       std
v1_miss  3.128788  1.141391
v2_miss  3.366412  1.073049
v3_miss  1.976562  1.009849
v4_miss  2.201550  1.055102
v5_miss  2.178862  1.174231


### Perform missing data imputation for column `v3_miss` using single regression

In [123]:
simple_imput_df = mdh.simple_regression()
# print all rows for only v3_miss
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(simple_imput_df['v3_miss'])

     v1_miss  v2_miss  v3_miss  v4_miss  v5_miss
0        4.0      3.0      1.0      3.0      1.0
1        2.0      4.0      1.0      NaN      3.0
2        3.0      3.0      3.0      3.0      NaN
3        2.0      4.0      1.0      2.0      2.0
4        2.0      4.0      2.0      5.0      5.0
..       ...      ...      ...      ...      ...
143      1.0      1.0      1.0      1.0      1.0
144      4.0      4.0      4.0      3.0      2.0
145      2.0      4.0      2.0      1.0      NaN
146      4.0      2.0      3.0      4.0      1.0
147      3.0      3.0      3.0      3.0      3.0

[128 rows x 5 columns]
0      1.0
1      1.0
2      3.0
3      1.0
4      2.0
5      1.0
6      3.0
7      3.0
8      2.0
9      1.0
10     1.0
11     1.0
12     3.0
13     1.0
14     1.0
16     4.0
17     2.0
18     2.0
19     1.0
20     1.0
21     2.0
22     1.0
23     1.0
24     5.0
25     1.0
26     3.0
27     1.0
28     4.0
29     3.0
30     1.0
33     1.0
34     1.0
35     3.0
36     1.0
37     1.0
39 

### Compare results to the original values

In [124]:
mdh.compare_statistics(imputed_df=simple_imput_df, original_df=original_df)

Original:
              mean       std
v1_miss  3.128788  1.213193
v2_miss  3.366412  1.144929
v3_miss  1.976562  1.090148
v4_miss  2.201550  1.134542
v5_miss  2.178862  1.293315
New:
              mean       std
v1_miss  3.077586  1.209749
v2_miss  3.379310  1.124086
v3_miss  1.976562  1.090148
v4_miss  2.210526  1.163407
v5_miss  2.109091  1.258698


### Perform missing data imputationwith multivariate regression 

In [125]:
multi_imput_df = mdh.multiple_imputation()
# print all rows
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(multi_imput_df)

     v1_miss  v2_miss   v3_miss   v4_miss   v5_miss
0        4.0      3.0  1.000000  3.000000  1.000000
1        2.0      4.0  1.000000  1.947759  3.000000
2        3.0      3.0  3.000000  3.000000  2.703470
3        2.0      4.0  1.000000  2.000000  2.000000
4        2.0      4.0  2.000000  5.000000  5.000000
..       ...      ...       ...       ...       ...
144      4.0      4.0  4.000000  3.000000  2.000000
145      2.0      4.0  2.000000  1.000000  1.964089
146      4.0      2.0  3.000000  4.000000  1.000000
147      3.0      3.0  3.000000  3.000000  3.000000
148      3.0      3.0  2.728263  3.000000  4.000000

[149 rows x 5 columns]
      v1_miss   v2_miss   v3_miss   v4_miss   v5_miss
0    4.000000  3.000000  1.000000  3.000000  1.000000
1    2.000000  4.000000  1.000000  1.947759  3.000000
2    3.000000  3.000000  3.000000  3.000000  2.703470
3    2.000000  4.000000  1.000000  2.000000  2.000000
4    2.000000  4.000000  2.000000  5.000000  5.000000
5    4.000000  3.355189  1.0

### Compare results to the original values

In [126]:
### compare stats
mdh.compare_statistics(imputed_df=multi_imput_df, original_df=original_df)

Original:
              mean       std
v1_miss  3.128788  1.213193
v2_miss  3.366412  1.144929
v3_miss  1.976562  1.090148
v4_miss  2.201550  1.134542
v5_miss  2.178862  1.293315
New:
              mean       std
v1_miss  3.127124  1.142051
v2_miss  3.365838  1.073265
v3_miss  1.993718  1.019845
v4_miss  2.200919  1.062462
v5_miss  2.195495  1.190645


### Correlation matrices

#### Original data

In [127]:
mdh.correlation_matrix(mdh.df)

Unnamed: 0,v1_miss,v2_miss,v3_miss,v4_miss,v5_miss
v1_miss,1.0 (p-value: 0.0 | missing: missing: 17),0.13 (p-value: 0.169 | missing: missing: 29),0.07 (p-value: 0.441 | missing: missing: 33),0.16 (p-value: 0.074 | missing: missing: 29),0.14 (p-value: 0.15 | missing: missing: 37)
v2_miss,0.13 (p-value: 0.169 | missing: missing: 29),1.0 (p-value: 0.0 | missing: missing: 18),0.14 (p-value: 0.141 | missing: missing: 33),0.18 (p-value: 0.058 | missing: missing: 32),0.02 (p-value: 0.838 | missing: missing: 36)
v3_miss,0.07 (p-value: 0.441 | missing: missing: 33),0.14 (p-value: 0.141 | missing: missing: 33),1.0 (p-value: 0.0 | missing: missing: 21),0.39 (p-value: 0.0 | missing: missing: 35),0.43 (p-value: 0.0 | missing: missing: 39)
v4_miss,0.16 (p-value: 0.074 | missing: missing: 29),0.18 (p-value: 0.058 | missing: missing: 32),0.39 (p-value: 0.0 | missing: missing: 35),1.0 (p-value: 0.0 | missing: missing: 20),0.22 (p-value: 0.023 | missing: missing: 38)
v5_miss,0.14 (p-value: 0.15 | missing: missing: 37),0.02 (p-value: 0.838 | missing: missing: 36),0.43 (p-value: 0.0 | missing: missing: 39),0.22 (p-value: 0.023 | missing: missing: 38),1.0 (p-value: 0.0 | missing: missing: 26)


#### Mean imputed

In [128]:
mdh.correlation_matrix(mdh.mean_imputed_df)

Unnamed: 0,v1_miss,v2_miss,v3_miss,v4_miss,v5_miss
v1_miss,1.0 (p-value: 0.0 | missing: missing: 0),0.11 (p-value: 0.166 | missing: missing: 0),0.06 (p-value: 0.44 | missing: missing: 0),0.15 (p-value: 0.063 | missing: missing: 0),0.13 (p-value: 0.127 | missing: missing: 0)
v2_miss,0.11 (p-value: 0.166 | missing: missing: 0),1.0 (p-value: 0.0 | missing: missing: 0),0.12 (p-value: 0.142 | missing: missing: 0),0.16 (p-value: 0.052 | missing: missing: 0),0.02 (p-value: 0.832 | missing: missing: 0)
v3_miss,0.06 (p-value: 0.44 | missing: missing: 0),0.12 (p-value: 0.142 | missing: missing: 0),1.0 (p-value: 0.0 | missing: missing: 0),0.36 (p-value: 0.0 | missing: missing: 0),0.36 (p-value: 0.0 | missing: missing: 0)
v4_miss,0.15 (p-value: 0.063 | missing: missing: 0),0.16 (p-value: 0.052 | missing: missing: 0),0.36 (p-value: 0.0 | missing: missing: 0),1.0 (p-value: 0.0 | missing: missing: 0),0.19 (p-value: 0.022 | missing: missing: 0)
v5_miss,0.13 (p-value: 0.127 | missing: missing: 0),0.02 (p-value: 0.832 | missing: missing: 0),0.36 (p-value: 0.0 | missing: missing: 0),0.19 (p-value: 0.022 | missing: missing: 0),1.0 (p-value: 0.0 | missing: missing: 0)


#### Multivariate regression imputed

In [129]:
mdh.correlation_matrix(mdh.multi_imputed_df)

Unnamed: 0,v1_miss,v2_miss,v3_miss,v4_miss,v5_miss
v1_miss,1.0 (p-value: 0.0 | missing: missing: 0),0.12 (p-value: 0.137 | missing: missing: 0),0.1 (p-value: 0.235 | missing: missing: 0),0.17 (p-value: 0.035 | missing: missing: 0),0.14 (p-value: 0.094 | missing: missing: 0)
v2_miss,0.12 (p-value: 0.137 | missing: missing: 0),1.0 (p-value: 0.0 | missing: missing: 0),0.13 (p-value: 0.107 | missing: missing: 0),0.19 (p-value: 0.024 | missing: missing: 0),0.03 (p-value: 0.72 | missing: missing: 0)
v3_miss,0.1 (p-value: 0.235 | missing: missing: 0),0.13 (p-value: 0.107 | missing: missing: 0),1.0 (p-value: 0.0 | missing: missing: 0),0.39 (p-value: 0.0 | missing: missing: 0),0.46 (p-value: 0.0 | missing: missing: 0)
v4_miss,0.17 (p-value: 0.035 | missing: missing: 0),0.19 (p-value: 0.024 | missing: missing: 0),0.39 (p-value: 0.0 | missing: missing: 0),1.0 (p-value: 0.0 | missing: missing: 0),0.24 (p-value: 0.003 | missing: missing: 0)
v5_miss,0.14 (p-value: 0.094 | missing: missing: 0),0.03 (p-value: 0.72 | missing: missing: 0),0.46 (p-value: 0.0 | missing: missing: 0),0.24 (p-value: 0.003 | missing: missing: 0),1.0 (p-value: 0.0 | missing: missing: 0)
