In [2]:
import numpy as np
import pandas as pd
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, SimpleImputer
from sklearn.metrics import mean_squared_error, accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

# Step 1: Generate synthetic data
np.random.seed(42)
n_samples = 100
data = {
    'Age': np.random.randint(20, 60, n_samples),
    'Salary': np.random.normal(50000, 12000, n_samples),
    'Gender': np.random.choice(['Male', 'Female'], n_samples)
}
df = pd.DataFrame(data)

# Step 2: Introduce missing data
missing_rate = 0.1
for col in df.columns:
    n_missing = int(missing_rate * n_samples)
    missing_indices = np.random.choice(df.index, n_missing, replace=False)
    df.loc[missing_indices, col] = np.nan

# Step 3: Impute missing data
# Encode categorical data
encoder = LabelEncoder()
encoded_genders = encoder.fit_transform(df['Gender'].astype(str))
df['Gender'] = encoded_genders

# Using Iterative Imputer for a more robust imputation
imputer = IterativeImputer(estimator=RandomForestRegressor(), random_state=42, max_iter=10)
imputed_numerical = imputer.fit_transform(df[['Age', 'Salary']])

# Using a simple imputer for the categorical 'Gender' as iterative imputer might not directly apply
simple_imputer = SimpleImputer(strategy='most_frequent')
imputed_genders = simple_imputer.fit_transform(df[['Gender']])

# Combine imputed data
df_imputed = pd.DataFrame(np.column_stack([imputed_numerical, imputed_genders]), columns=df.columns)
df_imputed['Gender'] = encoder.inverse_transform(df_imputed['Gender'].astype(int))

# Step 4: Evaluate imputation
# For simplicity, assume we somehow know original data (this part is hypothetical and just for practice)
original_data = pd.DataFrame(data)




# Calculate MSE for numerical data
mse_age = mean_squared_error(original_data['Age'], df_imputed['Age'])
mse_salary = mean_squared_error(original_data['Salary'], df_imputed['Salary'])

# Calculate accuracy for categorical data
accuracy_gender = accuracy_score(original_data['Gender'], df_imputed['Gender'])

# Display results
print(f"Mean Squared Error for Age: {mse_age}")
print(f"Mean Squared Error for Salary: {mse_salary}")
print(f"Accuracy for Gender: {accuracy_gender:.2f}")


Mean Squared Error for Age: 18.620408
Mean Squared Error for Salary: 14038580.899435747
Accuracy for Gender: 0.90




In [None]:
Traceback (most recent call last):
  File "experiment.py", line 1046, in <module>
    data_preparation_function_object = CURRENT_SUPPORTED_DATALOADERS[dataset]
  File "experiment.py", line 997, in run
    
  File "experiment.py", line 766, in run_custom_experiments
    imputed_X_train_df = pd.DataFrame(experiment.imputed_X_train, columns=data_copy_X.columns).reset_index(drop=True)
  File "/Users/dylandominguez/StudioProjects/M-DEW/.venv/lib/python3.8/site-packages/pandas/core/frame.py", line 758, in __init__
    mgr = ndarray_to_mgr(
  File "/Users/dylandominguez/StudioProjects/M-DEW/.venv/lib/python3.8/site-packages/pandas/core/internals/construction.py", line 337, in ndarray_to_mgr
    _check_values_indices_shape_match(values, index, columns)
  File "/Users/dylandominguez/StudioProjects/M-DEW/.venv/lib/python3.8/site-packages/pandas/core/internals/construction.py", line 408, in _check_values_indices_shape_match
    raise ValueError(f"Shape of passed values is {passed}, indices imply {implied}")
ValueError: Shape of passed values is (119, 13), indices imply (119, 14)

In [None]:
#below is the code that was used before. not working

with tqdm(total=len(miss_param_grid)) as pbar:
        for i, params in enumerate(miss_param_grid):
            print(f"\nStarting experiment with params: {params}")  # Print current experiment parameters
            data_copy = deepcopy(data)
            params = {
                k: p
                for k, p in zip(list(miss_param_dict.keys()), params)
            }
            param_lookup_dict[i] = params
            
            # imputation
            dataset = MissDataset(
                data=data_copy,
                target_col=target_col,
                n_folds=5,
                **params,
            )
            if dataset_name == 'parkinsons':
                dataset.split_dataset_hook(
                    splitting_function=split_parkinsons_data, df=dataset.data, n_folds=5
                )
            if task_type == 'classification':
                experiment = CustomClassificationExperiment(
                    dataset=dataset, dataset_name=dataset_name,
                    exp_type=params['missing_mechanism'],
                    name=name
                )
            else:
                experiment = CustomRegressionExperiment(
                    dataset=dataset, dataset_name=dataset_name,
                    exp_type=params['missing_mechanism'],
                    name=name
                )
            metrics_df, errors_df, weights_dfs, preds_df, distances_df = experiment.run()
            ##############################
            


            # # Evaluate imputation quality
            # for col in data_copy.columns:
            #     if col != target_col:  # Skip the target column for imputation evaluation
            #         if data_copy[col].dtype.kind in 'iufc':  # integer, unsigned integer, float, complex considered as continuous
            #             original_data = data_copy[col].dropna()
            #             imputed_data = dataset.data.loc[original_data.index, col]
            #             rmse = np.sqrt(mean_squared_error(original_data, imputed_data))
            #             imputation_eval_results.append({"column": col, "metric": "RMSE", "value": rmse})
            #             print(f"RMSE for {col} with params {params}: {rmse}")
            #         else:  # Assume non-numeric data types are categorical
            #             original_data = data_copy[col].dropna()
            #             imputed_data = dataset.data.loc[original_data.index, col]
            #             if len(set(original_data)) > 1:  # ROC AUC requires at least two classes
            #                 auroc = roc_auc_score(original_data.astype('category').cat.codes, imputed_data.astype('category').cat.codes)
            #                 imputation_eval_results.append({"column": col, "metric": "AUROC", "value": auroc})
            #                 print(f"AUROC for {col} with params {params}: {auroc}")


    #         # Collect imputed data and evaluate
    #         data_copy_X_before = data_copy.drop(columns=[target_col]).reset_index(drop=True)
    #         X = data_copy.drop(columns=[target_col])
    #         scaler = MinMaxScaler(feature_range=(0, 1))
    #         data_copy_X = scaler.fit_transform(X)
    #         data_copy_X= pd.DataFrame(data_copy_X, columns=data_copy_X_before.columns).reset_index(drop=True)

    #         # imputed_X_train_df = pd.DataFrame(experiment.imputed_X_train, columns=data_copy_X.columns).reset_index(drop=True)
    #         # imputed_X_test_df = pd.DataFrame(pipeline.last_imputed_X_test, columns=data_copy_X.columns).reset_index(drop=True)
    #         # # imputed_X_val_df = pd.DataFrame(experiment.imputed_X_val, columns=data_copy_X.columns).reset_index(drop=True)
    #         # data_copy_X = experiment.dataset.data.drop(columns=[target_col]).reset_index(drop=True)
            
            
    #         for pipeline_name, pipeline in experiment.pipelines.items():
    #             print("Original Data:")
    #             print(data_copy_X)


    #             # print(type(experiment.imputed_X_val))
    #             # print(experiment.imputed_X_val.shape)

    #                 # Impute the data and directly convert it to a DataFrame
    #             imputed_array = pipeline.imputer.fit_transform(data_copy_X)
    #             imputed_data_df = pd.DataFrame(imputed_array, columns=data_copy_X.columns)
                    
                
                

                
                



                

    #             # imputed_data_df = pd.DataFrame(pipeline.last_imputed_X_, columns=data_copy_X.columns).reset_index(drop=True)
    #             print("Imputed Data:")
    #             print(imputed_data_df)
    #             print("+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
    #             print(data_copy_X.isnull().sum())
    #             print("This is the imputed data missing values:")
    #             print(imputed_data_df.isnull().sum())

    #             print(f"\nRunning pipeline: {pipeline_name}")
    #             for col in data_copy_X.columns:
    #                 if col != target_col:  # Skip target column evaluation
    #                     original_col_data = data_copy_X[col].dropna()
    #                     imputed_col_data = imputed_data_df[col].loc[original_col_data.index]
    #                     if np.issubdtype(data_copy_X[col].dtype, np.number):
    #                         rmse = np.sqrt(mean_squared_error(original_col_data, imputed_col_data))
    #                         result = {'pipeline': pipeline_name, 'feature': col, 'metric': 'RMSE', 'value': rmse}
    #                     else:
    #                         le = LabelEncoder()
    #                         le.fit(np.concatenate([original_col_data, imputed_col_data]))
    #                         encoded_original = le.transform(original_col_data)
    #                         encoded_imputed = le.transform(imputed_col_data)
    #                         auroc = roc_auc_score(encoded_original, encoded_imputed)
    #                         result = {'pipeline': pipeline_name, 'feature': col, 'metric': 'AUROC', 'value': auroc}
    #                     imputation_eval_results.append(result)



    #         # Save imputation evaluation results per pipeline to CSV dylan
    #         imputation_eval_df = pd.DataFrame(imputation_eval_results)
    #         imputation_eval_filename = os.path.join(experiment.results_dir, f'imputation_eval_results_{i}.csv')
    #         imputation_eval_df.to_csv(imputation_eval_filename, index=False)
            
           






    #         filename = str(i) + '.csv'
    #         errors_filename = os.path.join(experiment.results_dir, 'errors_' + filename)
    #         errors_df.to_csv(errors_filename)

    #         metrics_dfs.append(metrics_df)
    #         metrics_filename = os.path.join(experiment.results_dir, 'metrics_' + filename)
    #         metrics_df.to_csv(metrics_filename)

    #         preds_filename = os.path.join(experiment.results_dir, 'predictions_' + filename)
    #         preds_df.to_csv(preds_filename)

    #         for top_n in weights_dfs.keys():
    #             weights_filename = os.path.join(experiment.results_dir, 'weights_top_' + str(top_n) + '_' + filename)
    #             weights_dfs[top_n].to_csv(weights_filename)

    #         distances_filename = os.path.join(experiment.results_dir, 'distances_' + filename)
    #         distances_df.to_csv(distances_filename)
    #         print('updating progress bar after index ' + str(i))
    #         pbar.update(1)

    # # Save final imputation evaluation results for all pipelines dylan
    # final_imputation_eval_df = pd.DataFrame(imputation_eval_results)  # NEW: Aggregated results
    # final_imputation_eval_filename = os.path.join(experiment.results_dir, 'final_imputation_eval_results.csv')  # NEW: Filename for aggregated results
    # final_imputation_eval_df.to_csv(final_imputation_eval_filename, index=False)  # NEW: Save aggregated results


    final_results = pd.concat(metrics_dfs)
    # print(f"Metrics {metrics_dfs}")
    final_results.to_csv(os.path.join(experiment.base_dir, 'final_results.csv'))

    param_lookup_dict_json = json.dumps(param_lookup_dict)
    with open(os.path.join(experiment.base_dir, 'params_lookup.json'), 'w') as f:
        f.write(param_lookup_dict_json)

In [None]:
def _run_one_pipeline(self, pipeline, pipeline_name, X_train, y_train, X_test, y_test):
        pipeline.fit(X_train, y_train)
        proba_predictions = pipeline.predict_proba(X_test)[:, 1]
        if isinstance(proba_predictions, list):
            proba_predictions = proba_predictions[0]

        y_test_2d = np.array(self.label_enc.transform(y_test.reshape(-1, 1)).todense())
        errors = np.abs(y_test - proba_predictions)
        predictions = np.round(proba_predictions)
        single_label_y_test = np.argmax(y_test_2d, axis=1)
        roc_auc = roc_auc_score(y_test, proba_predictions)

        metrics = {}
        metrics['roc_auc'] = round(roc_auc, 4)
        accuracy = 1 - (np.sum(np.logical_xor(predictions, single_label_y_test)) / len(predictions))
        metrics['accuracy'] = round(accuracy, 4)
        metrics['f1_score'] = f1_score(single_label_y_test, predictions)
        self.metrics[pipeline_name].append(list(metrics.values()))

        return proba_predictions, errors

In [None]:

    def _run_one_pipeline(self, pipeline, pipeline_name, X_train, y_train, X_test, y_test):
        
        print(pipeline_name)
        if pipeline_name !="<class 'sklearn.ensemble._stacking.StackingClassifier'>":
            ####This is the problem 


            # print("X_train shape")
            # print(X_train.shape)
            # print("X_test shape")
            # print(X_test.shape)


            pipeline.fit(X_train, y_train)
            X_train_imputed=pipeline.X_train_imputed
            # print("X_imputed")
            # print(X_imputed)
            
            proba_predictions = pipeline.predict_proba(X_test)[:, 1]
            X_test_imputed=pipeline.X_test_imputed

            # print("Train not missing")
            # print(self.train_not_missing.shape)
            # print("X_train_imputed")
            # print(X_train_imputed.shape)



            # # print("Test not missing")
            # # print(self.test_not_missing.shape)
            # print("X_test_imputed")
            # print(X_test_imputed.shape)


            X_train_imputed=pd.DataFrame(X_train_imputed, columns=self.train_not_missing.columns, index=self.train_not_missing.index)
            X_test_imputed=pd.DataFrame(X_test_imputed, columns=self.test_not_missing.columns, index=self.test_not_missing.index)



            

            if isinstance(proba_predictions, list):
                proba_predictions = proba_predictions[0]


            ###################
            # Dylan added Combine imputed training and testing data for evaluation
            X_combined_imputed = np.vstack((X_train_imputed, X_test_imputed))
            X_combined_original = pd.concat([self.train_not_missing, self.test_not_missing]).reset_index(drop=True)

            # Create masks for original missing data locations in train and test
            missing_mask_train = X_train.isnull()
            missing_mask_test = X_test.isnull()
            missing_mask_combined = pd.concat([missing_mask_train, missing_mask_test]).reset_index(drop=True)

            imputed_RMSE={}
            imputed_roc_auc={}

            # Evaluate imputation on numeric and categorical data separately
            for col in X_combined_original.columns:
                if X_combined_original[col].dtype.kind in 'iuf':
                    # Numeric evaluation: RMSE
                    # print("Numeric evaluation: RMSE")
                    original_data = X_combined_original[col][missing_mask_combined[col]]
                    imputed_data = X_combined_imputed[:, X_combined_original.columns.get_loc(col)][missing_mask_combined[col]]
                    if not original_data.empty:
                        
                        rmse = np.sqrt(mean_squared_error(original_data, imputed_data))
                        
                        imputed_RMSE[col] = rmse
                else:
                    # Categorical evaluation: ROC AUC
                    # print("Categorical evaluation: ROC AUC")
                    original_data = X_combined_original[col][missing_mask_combined[col]].dropna()
                    imputed_data = X_combined_imputed[:, X_combined_original.columns.get_loc(col)][missing_mask_combined[col]]
                    imputed_data = pd.Series(imputed_data).dropna()
                    if not original_data.empty and len(original_data.unique()) > 1:
                        roc_auc = roc_auc_score(original_data.astype('category').cat.codes, imputed_data.astype('category').cat.codes)
                        imputed_roc_auc[col] = roc_auc

            ###################
            # Dylan added imputed evaluations
            # imputed_evals = {}
            # imputed_evals['RMSE'] = round(imputed_RMSE, 4)
            # imputed_evals['AUC_ROC'] = round(imputed_roc_auc, 4)
            # self.imputed_evals[pipeline_name].append(list(imputed_evals.values()))

            # Compile imputed data evaluations
            imputed_evals = {
                'RMSE': imputed_RMSE,
                'AUC_ROC': imputed_roc_auc
            }
            if not hasattr(self, 'imputed_evals'):
                self.imputed_evals = {}
            if pipeline_name not in self.imputed_evals:
                self.imputed_evals[pipeline_name] = []
            self.imputed_evals[pipeline_name].append(imputed_evals)
            # print(self.imputed_evals)

            ###################


            ####################
        else:
            pipeline.fit(X_train, y_train)
            proba_predictions = pipeline.predict_proba(X_test)[:, 1]
            if isinstance(proba_predictions, list):
                proba_predictions = proba_predictions[0]
                

        y_test_2d = np.array(self.label_enc.transform(y_test.reshape(-1, 1)).todense())
        errors = np.abs(y_test - proba_predictions)
        predictions = np.round(proba_predictions)
        single_label_y_test = np.argmax(y_test_2d, axis=1)
        roc_auc = roc_auc_score(y_test, proba_predictions)

        

        metrics = {}
        metrics['roc_auc'] = round(roc_auc, 4)
        accuracy = 1 - (np.sum(np.logical_xor(predictions, single_label_y_test)) / len(predictions))
        metrics['accuracy'] = round(accuracy, 4)
        metrics['f1_score'] = f1_score(single_label_y_test, predictions)
        self.metrics[pipeline_name].append(list(metrics.values()))


        



        return proba_predictions, errors
