In [3]:
import numpy as np
from numba import jit
import time

### Update the proximity matrix with the similarity of the samples' leaf indices

In [4]:
@jit(parallel=True)
def Comparison(arr1, arr2):
    '''Count the positions at which arr1 and arr2 hold equal values.

       arr1, arr2: arrays compared element by element with ==
       returns: the number of positions where the comparison is True'''
    return np.sum(arr1 == arr2)
    
    
def Proximity_Matrix(pm, leaf_idx):
    '''Accumulate pairwise leaf co-occurrence counts into pm, in place.

       For every pair of distinct samples (i, j) the number of positions at
       which their leaf indices are equal is added to both pm[i, j] and
       pm[j, i]. The diagonal of pm is never touched.

       pm: NumPy proximity matrix of shape (n_samples, n_samples); it is
           updated in place and nothing is returned
       leaf_idx: NumPy array whose first axis indexes the samples and whose
           remaining entries are the leaf indices of that sample in the
           Random Forest (presumably one per tree -- confirm with the caller)

       Progress and a remaining-time estimate are printed every 200 rows.'''
    n_samples = leaf_idx.shape[0]
    if n_samples < 2:
        # No pairs to compare.
        return

    # View the leaf indices as (n_samples, k) so that a 1-D input is handled
    # exactly like the 2-D case.
    leaves = np.asarray(leaf_idx).reshape(n_samples, -1)
    n_total = n_samples*(n_samples-1)/2

    #Update the matrix
    start = time.time()
    for i in range(n_samples-1):
        # Compare sample i against every later sample at once: one count of
        # matching leaf indices per later sample.
        adds = (leaves[i+1:] == leaves[i]).sum(axis=1)

        pm[i, i+1:] += adds
        pm[i+1:, i] += adds

        #Demonstrate the progress
        if i % 200 == 0:
            elapsed_secs = time.time() - start

            # Pairs finished after rows 0..i: sum of (n_samples-1-k) for k<=i
            n_elapsed = (i+1)*(2*(n_samples-1)-i)/2

            # Extrapolate the per-pair cost (in seconds) to the pairs that
            # are still left, then convert to minutes only for display.
            rest_secs = elapsed_secs/n_elapsed*(n_total-n_elapsed)
            print("PM progress... {}-th 200-sample is done! Elapsed time {} mins\n"
                  "            Rest time: {} mins".format(
                      (i+1)//200, elapsed_secs//60, rest_secs//60+1))

### Update the initial guess of the missing values

In [None]:
#Get the value frequency in DF
@jit
def Get_Value_Frequency(value, arr):
    '''Return the fraction of the entries of arr that are equal to value.

       value: the value to look for
       arr: NumPy array to search (any number of dimensions)
       returns: (number of entries equal to value) / arr.size'''
    # Counting the boolean mask directly is correct for any dimensionality;
    # indexing arr with the output of np.argwhere only counts correctly
    # for 1-D input.
    return np.sum(arr == value)/arr.size


#Calculate the weighted frequency
@jit(parallel=True)
def Weight_Frequency(pm, row_id, P_value, F_value):
    '''Scale a value's frequency by its share of the row's total proximity.

       pm: proximity matrix
       row_id: row of pm belonging to the sample being imputed
       P_value: summed proximity between that sample and the samples that
           carry the candidate value
       F_value: frequency of the candidate value in its column
       returns: F_value * (P_value / sum of pm[row_id]); 0.0 when the row
           sums to zero'''
    #Proximity of all the values
    P_values = pm[row_id].sum()
    if P_values == 0:
        # The sample has no proximity to anything, so no candidate can be
        # weighted; avoid dividing by zero.
        return 0.0

    #Weight Frequency of the value
    return F_value*(P_value/P_values)


#4. Calculate the final guess
def Refine_Guess(row_id, column_id, df, pm, nominal):
    '''Re-estimate one cell of df from the proximity matrix.

       row_id: the sample whose value in the column is being guessed
       column_id: the column needed to evaluate
       df: 2-D NumPy array holding the dataset (np.isnan is applied to the
           column's unique values, so the column must be numeric)
       pm: the proximity matrix; pm[row_id] holds the proximities between
           this sample and every sample
       nominal: truthy -> return the candidate value with the highest
           weighted frequency; falsy -> return the proximity-weighted
           average of the candidate values

       If the column has no non-NaN value, or pm[row_id] sums to zero, there
       is nothing to weight with and the current df[row_id, column_id] is
       returned unchanged.'''
    column = df[:, column_id]
    proximities = pm[row_id]

    #All the unique, non-missing values in the column
    arr_unique = np.unique(column)
    values = arr_unique[~np.isnan(arr_unique)]

    #Proximity of all the values (the same for every candidate, so hoisted)
    P_values = proximities.sum()
    if values.size == 0 or P_values == 0:
        return df[row_id, column_id]

    #For nominal feature, use weight frequency
    #For numeric feature, use weight average of the values
    WF_values = []
    VA_values = 0
    for value in values:
        holders = column == value

        #Frequency of the value in the column
        F_value = np.count_nonzero(holders)/column.size

        #Share of this sample's proximity that goes to samples holding
        #the value
        # NOTE(review): the denominator is the whole row of pm, including
        # samples whose entry in this column is NaN -- confirm that the
        # column is fully filled in when this is called.
        W_value = proximities[holders].sum()/P_values

        WF_values.append(F_value*W_value)

        #Weight Average of the value
        VA_values += value*W_value

    #Return the final guessed value
    if nominal:
        #Highest weighted frequency wins for a nominal feature
        return values[np.argmax(WF_values)]
    return VA_values

In [6]:
#5. Repeat 1~ 4 steps
#Until convergence between current guesses and the last guesses
#or difference below some tolerance
def Reat_Until_Converge(df, pm, max_iters, toler, nominal=True, nan_index=None):
    '''Refine the guesses for the missing cells until they stop changing.

       On every pass each cell listed in the missing-value map is re-estimated
       with Refine_Guess and written back, so later cells and later passes
       see the refined values. The loop stops once the mean absolute change
       between consecutive guesses is <= toler, or after max_iters passes.

       df: object exposing .copy().to_numpy() (e.g. a pandas DataFrame) that
           holds the dataset with the initial guesses; it is not modified
       pm: the proximity matrix; only passed on to Refine_Guess, never
           reassigned here
       max_iters: maximum number of passes
       toler: convergence threshold on the mean absolute change
       nominal: True/False to treat every column as nominal/numeric, or a
           container of the column ids that are nominal (default True)
       nan_index: mapping row id -> iterable of column ids to impute; when
           omitted the notebook-level `idex_nans` is used

       returns: (df_rf, pm, error) -- the refined NumPy array, the proximity
           matrix, and the last error (None if no pass ran, 0.0 if there was
           nothing to impute)'''
    if nan_index is None:
        # Fall back to the notebook global.
        nan_index = idex_nans

    # Copy once, before iterating: re-copying inside the loop would throw
    # away the guesses refined by the previous pass.
    df_rf = df.copy().to_numpy()
    error = None

    for itera in range(max_iters):
        values_last_guessed = []
        values_current_guess = []

        # NOTE(review): tqdm is assumed to be imported elsewhere in the notebook.
        for row_id in tqdm(list(nan_index.keys())):
            row = int(row_id)
            for column_id in nan_index[row]:
                if isinstance(nominal, (bool, np.bool_)):
                    is_nominal = bool(nominal)
                else:
                    is_nominal = column_id in nominal

                #Get the last guess
                values_last_guessed.append(df_rf[row, column_id])

                #Get the next guess
                value_current_guess = Refine_Guess(row, column_id, df_rf, pm, nominal=is_nominal)
                values_current_guess.append(value_current_guess)

                #Change the dataset according to current guess
                df_rf[row, column_id] = value_current_guess

        if not values_last_guessed:
            # Nothing to impute: trivially converged.
            return df_rf, pm, 0.0

        changes = np.abs(np.array(values_last_guessed)-np.array(values_current_guess))
        error = np.sum(changes)/changes.shape[0]
        print("Iteration: ", itera, "  error: ", error)

        if error <= toler:
            break

    # Same 3-tuple whether or not the tolerance was reached
    return df_rf, pm, error