In [77]:
import numpy as np
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
def get_type(string):
    parts = string.split("/")[1].split("_")

    # Get the information from the middle
    information = "_".join(parts[:-1])

    return information

def run_norm(path_train_data, type = "data/mean_"):
    # Read data from file
    print(path_train_data)
    
    data = np.genfromtxt(path_train_data, delimiter=',', skip_header=False)
    print(data)
    impute_type = get_type(type)


    if impute_type == "mean":
        # Impute missing values with column means
        col_means = np.nanmean(data, axis=0)  # Compute column means while ignoring NaNs
        nan_indices = np.isnan(data)  # Find indices of missing values

        data[nan_indices] = np.take(col_means, np.where(nan_indices)[1])  



    elif impute_type == "mice":
        # Initialize the MICE imputer
        mice_imputer = IterativeImputer()
        # Impute missing values
        data = mice_imputer.fit_transform(data)

    elif impute_type == "zero":

        # Impute missing values with zeros
        data[np.isnan(data)] = 0

    
    means = np.mean(data, axis=0)
    # Calculate covariance matrix
    covariance_matrix = np.cov(data, rowvar=False)
    # Write parameters to files
    np.savetxt(type + "cov.txt", covariance_matrix, delimiter=',', fmt='%f')
    np.savetxt(type + "mu.txt", means, delimiter=',', fmt='%f')

In [83]:
data_list = ["banknote","climate_model_crashes","connectionist_bench_sonar","qsar_biodegradation","yeast"]
data_list = ['concrete_compression','wine_quality_red','wine_quality_white','yacht_hydrodynamics']
type_list =  ["diffuse","logistic","mar","mcar"]

In [84]:
for name in data_list:
    for type in type_list:
        dataname = name+"_"+type
        for imptype in ["mean","mice","zero"]:
            print(dataname)
            run_norm("{}/train_data.txt".format(dataname),"{}/{}_".format(dataname,imptype))
            # run_norm("data/train_data.txt","data/mice_")
            # run_norm("data/train_data.txt","data/zero_")

concrete_compression_diffuse
concrete_compression_diffuse/train_data.txt
[[ 190.68     nan  125.4  ... 1090.    804.01     nan]
 [ 139.9   132.6   103.3  ...  916.    753.4    28.  ]
 [ 155.       nan  143.   ...  877.    868.     28.  ]
 ...
 [ 272.8   181.9      nan ... 1012.4   714.3    28.  ]
 [ 145.       nan  134.   ...  979.    812.     28.  ]
 [ 167.     75.4   167.   ... 1007.3   770.1    28.  ]]
concrete_compression_diffuse
concrete_compression_diffuse/train_data.txt
[[ 190.68     nan  125.4  ... 1090.    804.01     nan]
 [ 139.9   132.6   103.3  ...  916.    753.4    28.  ]
 [ 155.       nan  143.   ...  877.    868.     28.  ]
 ...
 [ 272.8   181.9      nan ... 1012.4   714.3    28.  ]
 [ 145.       nan  134.   ...  979.    812.     28.  ]
 [ 167.     75.4   167.   ... 1007.3   770.1    28.  ]]
concrete_compression_diffuse
concrete_compression_diffuse/train_data.txt
[[ 190.68     nan  125.4  ... 1090.    804.01     nan]
 [ 139.9   132.6   103.3  ...  916.    753.4    28.  ]

  col_means = np.nanmean(data, axis=0)  # Compute column means while ignoring NaNs


concrete_compression_mcar
concrete_compression_mcar/train_data.txt
[[   nan    0.   125.4 ...    nan    nan   14. ]
 [   nan    nan  103.3 ...    nan  753.4    nan]
 [ 155.     0.   143.  ...  877.   868.    28. ]
 ...
 [   nan  181.9    nan ... 1012.4  714.3   28. ]
 [   nan    0.   134.  ...    nan  812.    28. ]
 [ 167.    75.4    nan ...    nan    nan    nan]]
wine_quality_red_diffuse
wine_quality_red_diffuse/train_data.txt
[[7.20e+00 7.25e-01 5.00e-02 ... 3.41e+00 3.90e-01 1.09e+01]
 [8.70e+00 7.00e-01 2.40e-01 ... 3.32e+00 6.00e-01      nan]
 [1.02e+01 2.40e-01 4.90e-01 ... 3.14e+00 6.10e-01 1.04e+01]
 ...
 [1.11e+01 3.10e-01 4.90e-01 ... 3.12e+00 1.02e+00 1.06e+01]
 [1.19e+01 3.70e-01 6.90e-01 ... 3.00e+00 6.50e-01 1.28e+01]
 [7.30e+00 5.50e-01 1.00e-02 ... 3.35e+00 5.80e-01 1.10e+01]]
wine_quality_red_diffuse
wine_quality_red_diffuse/train_data.txt
[[7.20e+00 7.25e-01 5.00e-02 ... 3.41e+00 3.90e-01 1.09e+01]
 [8.70e+00 7.00e-01 2.40e-01 ... 3.32e+00 6.00e-01      nan]
 [1.02e+0

  col_means = np.nanmean(data, axis=0)  # Compute column means while ignoring NaNs


wine_quality_red_logistic
wine_quality_red_logistic/train_data.txt
[[7.20e+00 7.25e-01 5.00e-02 ...      nan 3.90e-01 1.09e+01]
 [     nan 7.00e-01 2.40e-01 ... 3.32e+00 6.00e-01 9.00e+00]
 [1.02e+01 2.40e-01      nan ...      nan      nan 1.04e+01]
 ...
 [1.11e+01      nan 4.90e-01 ... 3.12e+00 1.02e+00 1.06e+01]
 [1.19e+01 3.70e-01 6.90e-01 ... 3.00e+00 6.50e-01      nan]
 [7.30e+00 5.50e-01 1.00e-02 ... 3.35e+00 5.80e-01      nan]]
wine_quality_red_mar
wine_quality_red_mar/train_data.txt
[[7.20e+00 7.25e-01 5.00e-02 ... 3.41e+00 3.90e-01      nan]
 [8.70e+00 7.00e-01 2.40e-01 ... 3.32e+00 6.00e-01      nan]
 [1.02e+01 2.40e-01 4.90e-01 ... 3.14e+00 6.10e-01      nan]
 ...
 [1.11e+01 3.10e-01 4.90e-01 ... 3.12e+00 1.02e+00      nan]
 [1.19e+01 3.70e-01 6.90e-01 ... 3.00e+00 6.50e-01      nan]
 [7.30e+00 5.50e-01 1.00e-02 ... 3.35e+00 5.80e-01      nan]]
wine_quality_red_mar
wine_quality_red_mar/train_data.txt
[[7.20e+00 7.25e-01 5.00e-02 ... 3.41e+00 3.90e-01      nan]
 [8.70e+00 7.0

  col_means = np.nanmean(data, axis=0)  # Compute column means while ignoring NaNs


wine_quality_white_logistic
wine_quality_white_logistic/train_data.txt
[[ 7.3   0.18   nan ...  3.2   0.48 11.3 ]
 [ 7.7    nan  0.35 ...  3.11  0.42   nan]
 [  nan  0.24  0.46 ...   nan  0.62 11.9 ]
 ...
 [  nan   nan  0.42 ...  3.16  0.5   9.4 ]
 [ 6.9   0.34  0.49 ...  3.09  0.4    nan]
 [  nan  0.32  0.26 ...  3.07   nan  9.5 ]]
wine_quality_white_mar
wine_quality_white_mar/train_data.txt
[[ 7.3   0.18   nan ...   nan  0.48 11.3 ]
 [ 7.7   0.27   nan ...   nan  0.42 12.2 ]
 [ 6.7   0.24   nan ...   nan  0.62 11.9 ]
 ...
 [ 6.7   0.2    nan ...   nan  0.5   9.4 ]
 [ 6.9   0.34   nan ...   nan  0.4   9.  ]
 [ 7.5   0.32   nan ...   nan  0.38  9.5 ]]
wine_quality_white_mar
wine_quality_white_mar/train_data.txt
[[ 7.3   0.18   nan ...   nan  0.48 11.3 ]
 [ 7.7   0.27   nan ...   nan  0.42 12.2 ]
 [ 6.7   0.24   nan ...   nan  0.62 11.9 ]
 ...
 [ 6.7   0.2    nan ...   nan  0.5   9.4 ]
 [ 6.9   0.34   nan ...   nan  0.4   9.  ]
 [ 7.5   0.32   nan ...   nan  0.38  9.5 ]]
wine_quality_wh

  col_means = np.nanmean(data, axis=0)  # Compute column means while ignoring NaNs


In [68]:
import pandas as pd

In [69]:
df_train_data = pd.read_csv('banknote_diffuse/train_data.txt')

# Display the first few rows of the dataframe
print(df_train_data.head())

  -5.441399999999999793e+00 7.236299999999999955e+00 1.093800000000000050e-01 nan
0  1.040000000000000036e+00 -6.932100000000000151...                             
1  1.787500000000000089e+00 4.780000000000000249e...                             
2  5.045200000000000351e+00 3.896399999999999864e...                             
3  -1.788599999999999968e+00 -6.34860000000000024...                             
4  -6.508399999999999963e+00 8.769600000000000506...                             


In [72]:

np.genfromtxt('banknote_diffuse/train_data.txt', delimiter=' ', skip_header=False)


array([[-5.4414  ,  7.2363  ,  0.10938 ,       nan],
       [ 1.04    , -6.9321  ,  8.2888  , -1.2991  ],
       [ 1.7875  ,  4.78    ,       nan, -3.2362  ],
       ...,
       [ 1.6426  ,  3.0149  ,  0.22849 , -0.147   ],
       [-2.0149  ,  3.6874  ,       nan,       nan],
       [-5.1661  ,  8.0433  ,  0.044265,       nan]])