In [4]:
import pandas as pd
import numpy as np
from tqdm import tqdm

In [12]:
# Top-level column labels that are discarded by default before feature extraction.
DEFAULT_DROP_COLUMNS = (
    'albumin ascites',
    'albumin urine',
    'creatinine ascites',
    'creatinine body fluid',
    'creatinine pleural',
    'lymphocytes atypical csl',
    'lymphocytes percent',
    'lymphocytes pleural',
)


def preproc_input_data(input_csv, drop_columns=None):
    """Collapse every measurement block of a CSV into mean / change / count features.

    The CSV is read with three header rows (a three-level column index) and its
    first column as the row index. For each distinct top-level column label the
    ``'mask'`` and ``'mean'`` sub-blocks are combined as follows:

    1. ``'mean'`` entries whose ``'mask'`` entry is 0 are treated as missing
       (the mask is multiplied in, so this presumes a 0/1 mask -- TODO confirm).
    2. Missing entries are linearly interpolated along each row between
       surrounding known entries, then forward-filled and back-filled.
    3. Rows that are still entirely missing fall back to the raw ``'mean'``
       entries.

    Three features are then derived per row and label:

    * ``'<label>;mean'``   -- row-wise mean of the filled block,
    * ``'<label>;change'`` -- ``(last - first) / first`` of the filled block,
    * ``'<label>;count'``  -- row-wise sum of the ``'mask'`` block.

    Parameters
    ----------
    input_csv : str, path object or file-like
        Passed straight to ``pandas.read_csv``.
    drop_columns : iterable of str, optional
        Top-level column labels to remove before processing. ``None`` (the
        default) uses ``DEFAULT_DROP_COLUMNS``; pass an empty iterable to keep
        every column.

    Returns
    -------
    pandas.DataFrame
        Same row index as the input. Columns are ordered as all ``;mean``
        features, then all ``;change`` features, then all ``;count`` features.
        If no columns remain after dropping, an empty frame carrying only the
        row index is returned.

    Raises
    ------
    KeyError
        If a label in ``drop_columns`` is not present in the file.
    """
    data = pd.read_csv(input_csv, header=[0, 1, 2], index_col=0)

    labels_to_drop = list(DEFAULT_DROP_COLUMNS if drop_columns is None else drop_columns)
    if labels_to_drop:
        data = data.drop(columns=labels_to_drop)

    means, changes, counts = [], [], []

    for col in tqdm(data.columns.get_level_values(0).unique()):
        mask = data.loc[:, (col, 'mask')]
        raw_mean = data.loc[:, (col, 'mean')]

        # Blank out masked-off cells, interpolate between known cells, then
        # pad the leading/trailing gaps. ``.ffill()`` / ``.bfill()`` replace the
        # deprecated ``fillna(method=...)`` spelling.
        filled = (
            mask.replace({0: np.nan})
            .multiply(raw_mean)
            .interpolate(axis=1, limit_area='inside')
            .ffill(axis=1)
            .bfill(axis=1)
            .fillna(raw_mean)  # rows with no unmasked cell keep the raw values
        )

        first, last = filled.iloc[:, 0], filled.iloc[:, -1]

        means.append(filled.mean(axis=1).rename(col + ';mean'))
        # NOTE: a first value of 0 produces inf (or NaN for 0/0) here; this is
        # left as-is so existing outputs do not change.
        changes.append((last - first).divide(first).rename(col + ';change'))
        counts.append(mask.sum(axis=1).rename(col + ';count'))

    if not means:
        # Nothing left to summarise; pd.concat would raise on an empty list.
        return pd.DataFrame(index=data.index)

    # Each Series is already named, so concat yields the final column labels.
    return pd.concat(means + changes + counts, axis=1)

In [13]:
# Derive the per-measurement summary features from X_train.csv and save them.
clean_train_data = preproc_input_data(input_csv='X_train.csv')
clean_train_data.to_csv(path_or_buf='X_train_cc.csv')

100%|███████████████████████████████████████████████████████████████████████████████████| 96/96 [01:08<00:00,  1.40it/s]


In [14]:
# Derive the per-measurement summary features from X_valid.csv and save them.
clean_valid_data = preproc_input_data(input_csv='X_valid.csv')
clean_valid_data.to_csv(path_or_buf='X_valid_cc.csv')

100%|███████████████████████████████████████████████████████████████████████████████████| 96/96 [00:14<00:00,  6.47it/s]


In [15]:
# Derive the per-measurement summary features from X_test.csv and save them.
clean_test_data = preproc_input_data(input_csv='X_test.csv')
clean_test_data.to_csv(path_or_buf='X_test_cc.csv')

100%|███████████████████████████████████████████████████████████████████████████████████| 96/96 [00:16<00:00,  5.67it/s]
