In [11]:
import numpy as np
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
import os

In [14]:
def get_type(string):
    parts = string.split("/")[1].split("_")

    # Get the information from the middle
    information = "_".join(parts[:-1])

    return information

def run_norm(path_train_data, type = "data/mean_"):
    # Read data from file
    print(path_train_data)
    
    data = np.genfromtxt(path_train_data, delimiter=',', skip_header=False)
    print(data)
    impute_type = get_type(type)


    if impute_type == "mean":
        # Impute missing values with column means
        col_means = np.nanmean(data, axis=0)  # Compute column means while ignoring NaNs
        nan_indices = np.isnan(data)  # Find indices of missing values

        data[nan_indices] = np.take(col_means, np.where(nan_indices)[1])  



    elif impute_type == "mice":
        # Initialize the MICE imputer
        mice_imputer = IterativeImputer()
        # Impute missing values
        data = mice_imputer.fit_transform(data)

    elif impute_type == "zero":

        # Impute missing values with zeros
        data[np.isnan(data)] = 0

    
    means = np.mean(data, axis=0)
    # Calculate covariance matrix
    covariance_matrix = np.cov(data, rowvar=False)
    # Write parameters to files
    np.savetxt(type + "cov.txt", covariance_matrix, delimiter=',', fmt='%f')
    np.savetxt(type + "mu.txt", means, delimiter=',', fmt='%f')

In [13]:
clf_list = ["banknote","climate_model_crashes","connectionist_bench_sonar","qsar_biodegradation","yeast"]
reg_list = ['concrete_compression','wine_quality_red','wine_quality_white','yacht_hydrodynamics']
type_list =  ["diffuse","logistic","mar","mcar"]

In [10]:
# clf data
for name in clf_list:
    for type in type_list:
        dataname = name+"_"+type
        for imptype in ["mean","mice","zero"]:
            #print(name,type,imptype)
            path = "prefilled_data/{}/{}/{}".format(imptype,name,type)
            print(path)
            os.makedirs(path)

            run_norm("{}/train_data.txt".format(dataname),"{}/{}_".format(dataname,imptype))
            # # run_norm("data/train_data.txt","data/mice_")
            # # run_norm("data/train_data.txt","data/zero_")

prefilled_data/mean/banknote/diffuse
prefilled_data/mice/banknote/diffuse
prefilled_data/zero/banknote/diffuse
prefilled_data/mean/banknote/logistic
prefilled_data/mice/banknote/logistic
prefilled_data/zero/banknote/logistic
prefilled_data/mean/banknote/mar
prefilled_data/mice/banknote/mar
prefilled_data/zero/banknote/mar
prefilled_data/mean/banknote/mcar
prefilled_data/mice/banknote/mcar
prefilled_data/zero/banknote/mcar
prefilled_data/mean/climate_model_crashes/diffuse
prefilled_data/mice/climate_model_crashes/diffuse
prefilled_data/zero/climate_model_crashes/diffuse
prefilled_data/mean/climate_model_crashes/logistic
prefilled_data/mice/climate_model_crashes/logistic
prefilled_data/zero/climate_model_crashes/logistic
prefilled_data/mean/climate_model_crashes/mar
prefilled_data/mice/climate_model_crashes/mar
prefilled_data/zero/climate_model_crashes/mar
prefilled_data/mean/climate_model_crashes/mcar
prefilled_data/mice/climate_model_crashes/mcar
prefilled_data/zero/climate_model_crash

In [None]:
for name in data_list:
    for type in type_list:
        dataname = name+"_"+type
        for imptype in ["mean","mice","zero"]:
            print(dataname)
            run_norm("{}/train_data.txt".format(dataname),"{}/{}_".format(dataname,imptype))
            # run_norm("data/train_data.txt","data/mice_")
            # run_norm("data/train_data.txt","data/zero_")

In [68]:
import pandas as pd

In [69]:
df_train_data = pd.read_csv('banknote_diffuse/train_data.txt')

# Display the first few rows of the dataframe
print(df_train_data.head())

  -5.441399999999999793e+00 7.236299999999999955e+00 1.093800000000000050e-01 nan
0  1.040000000000000036e+00 -6.932100000000000151...                             
1  1.787500000000000089e+00 4.780000000000000249e...                             
2  5.045200000000000351e+00 3.896399999999999864e...                             
3  -1.788599999999999968e+00 -6.34860000000000024...                             
4  -6.508399999999999963e+00 8.769600000000000506...                             


In [72]:

np.genfromtxt('banknote_diffuse/train_data.txt', delimiter=' ', skip_header=False)


array([[-5.4414  ,  7.2363  ,  0.10938 ,       nan],
       [ 1.04    , -6.9321  ,  8.2888  , -1.2991  ],
       [ 1.7875  ,  4.78    ,       nan, -3.2362  ],
       ...,
       [ 1.6426  ,  3.0149  ,  0.22849 , -0.147   ],
       [-2.0149  ,  3.6874  ,       nan,       nan],
       [-5.1661  ,  8.0433  ,  0.044265,       nan]])