In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from multiprocessing import  cpu_count, Pool

In [2]:
class EM:
    """Expectation-maximisation for the scalar linear model ``Y = W * Z + noise``.

    The E-step formulas are consistent with a standard-normal prior on the
    latent ``Z`` and Gaussian noise with standard deviation ``sigma_e``.

    Inputs are expected to support NumPy element-wise arithmetic
    (e.g. 1-D ``np.ndarray``).

    Attributes:
        Y_M: observations whose latent value is treated as missing.
        Y_N, Z_N: optional observations paired with known latent values.
        Z_M: placeholder, never written by this class.
        M, N, S: ``len(Y_M)``, ``len(Y_N)`` (0 when absent) and their sum.
        sigma_e, W: current parameter estimates.
        EZ, EZZ: per-sample E[Z] and E[Z^2] for ``Y_M``; ``None`` until
            ``e_step`` has been called.
    """

    def __init__(self,Y_M, Y_N = None, Z_N = None, sigma_e_init = 0, W_init = 0 ):
        """Store the data and the initial parameter guesses.

        Args:
            Y_M: observations with missing latent values.
            Y_N: optional observations with known latent values.
            Z_N: latent values matching ``Y_N`` (needed whenever ``Y_N`` is given).
            sigma_e_init: initial noise scale. Must be non-zero before
                ``e_step`` is called, because the E-step divides by its
                square; the default of 0 is therefore only a placeholder.
            W_init: initial weight.
        """
        self.Y_N = Y_N
        self.Y_M = Y_M
        self.Z_N = Z_N
        self.Z_M = None

        self.M = len(Y_M)
        self.N = len(Y_N) if self.Y_N is not None else 0
        self.S = self.M + self.N

        self.sigma_e = sigma_e_init
        self.W = W_init

        self.EZ = None
        self.EZZ = None

    def e_step(self):
        """Update ``EZ`` and ``EZZ`` for ``Y_M`` from the current ``W`` and ``sigma_e``."""
        A = (self.W**2) * (self.sigma_e**-2) + 1
        B = self.W*self.Y_M*(self.sigma_e**-2)

        self.EZ = B/A
        # Second moment = variance (1/A) + squared mean.
        self.EZZ = (1/A)  + (B/A)**2

    def m_step(self):
        """Re-estimate ``W`` and then ``sigma_e`` from the current expectations."""
        # Must update W first, as it is required for sigma
        self.W = self.new_W()
        self.sigma_e = self.new_sigma_e()

    def new_sigma_e(self):
        """Return the updated noise scale, using the current ``self.W``."""
        # Residual of the fully observed pairs. This previously referenced a
        # bare `W` (NameError, or a stray global) instead of `self.W`.
        if self.Y_N is not None:
            _1 = np.sum((self.Y_N - self.W*self.Z_N)**2)
        else:
            _1 = 0
        _2 = np.sum(self.Y_M**2)
        _3 = -2*self.W*np.sum(self.Y_M*self.EZ)
        _4 = self.W**2 * np.sum(self.EZZ)

        return np.sqrt((_1 + _2 + _3 + _4)/self.S)

    def new_W(self):
        """Return the updated weight from observed pairs and expected moments."""
        has_observed = self.Y_N is not None
        _1 = np.sum(self.Y_N*self.Z_N) if has_observed else 0
        _2 = np.sum(self.Y_M*self.EZ)
        _3 = np.sum(self.Z_N**2) if has_observed else 0
        _4 = np.sum(self.EZZ)
        return (_1 + _2)/(_3 + _4)

        

In [3]:
def peak_selector(arr, bins):
    """Return the centre of the most populated histogram bin of ``arr``.

    The histogram is computed with ``np.histogram`` directly, so no
    matplotlib figure is created and closed just to bin the values.

    Args:
        arr: sequence of numeric values.
        bins: bin specification accepted by ``np.histogram`` (count or edges).

    Returns:
        Midpoint of the bin with the highest count (the first such bin on ties).
    """
    values = np.asarray(arr, dtype=float)
    # NaNs cannot be binned; drop them so the bin range is taken from the
    # remaining values.
    values = values[~np.isnan(values)]
    counts, edges = np.histogram(values, bins=bins)
    idx = np.argmax(counts)
    return np.mean([edges[idx], edges[idx+1]])

def param_selection(sigma_e_arr, W_arr, bins = 50):
    """Pick one (sigma_e, W) pair from the per-restart estimates.

    Each parameter is chosen independently as the histogram peak returned by
    ``peak_selector`` with the same ``bins`` setting.

    Returns:
        Tuple ``(selected_sigma_e, selected_W)``.
    """
    sigma_e_peak = peak_selector(sigma_e_arr, bins)
    W_peak = peak_selector(W_arr, bins)
    return sigma_e_peak, W_peak

In [4]:
def run_EM(data, reps = 1000, max_iter_ = 10000, tol = 0.0001, seed = None):
    """Run EM from many random starts and return the fitted signal ``W * E[Z]``.

    All observations are treated as having a missing latent value (only
    ``Y_M`` is passed to ``EM``).

    Args:
        data: 1-D array of observations for one feature.
        reps: number of random restarts.
        max_iter_: maximum number of EM iterations per restart.
        tol: a restart stops once both ``sigma_e`` and ``W`` change by less
            than this between consecutive iterations.
        seed: optional seed for the random initialisations. ``None`` (the
            default) draws from NumPy's global random state.

    Returns:
        ``selected_W * E[Z]`` evaluated with the selected parameters, one
        value per element of ``data``.
    """
    rng = np.random if seed is None else np.random.RandomState(seed)

    sigma_e_arr = []
    w_arr = []

    for _ in range(reps):
        # Since data has been standardised, noise values should be expected
        # to be small. Hence sigma_e starts in [0, 0.1).
        sigma_init = rng.uniform(0,0.1)
        W_init = rng.uniform(-10,10)

        EM1 = EM(data, sigma_e_init = sigma_init, W_init = W_init)

        prev_sigma_e = EM1.sigma_e
        prev_W = EM1.W

        # The iteration counter has its own name so it does not shadow the
        # restart loop variable.
        for _iteration in range(max_iter_):
            EM1.e_step()
            EM1.m_step()

            # If a parameter value becomes nan, fall back to the previous
            # known real values and stop this restart.
            if np.isnan(EM1.sigma_e) or np.isnan(EM1.W):
                EM1.sigma_e = prev_sigma_e
                EM1.W = prev_W
                break

            # To avoid divide by 0 in the next E-step
            if EM1.sigma_e == 0:
                EM1.sigma_e += 0.0000000001

            # Check for convergence
            if abs(prev_sigma_e - EM1.sigma_e) < tol and abs(prev_W - EM1.W) < tol:
                break

            prev_sigma_e = EM1.sigma_e
            prev_W = EM1.W

        sigma_e_arr.append(EM1.sigma_e)
        w_arr.append(EM1.W)

    # Selection method selects most common param value however, the assumption that this is equivalent to the value of highest ll is untrue.
    selected_sigma_e, selected_w = param_selection(sigma_e_arr, w_arr)

    # One more E-step with the selected parameters gives E[Z] for every sample.
    EM2 = EM(data, sigma_e_init = selected_sigma_e, W_init = selected_w)
    EM2.e_step()

    return selected_w * EM2.EZ


In [5]:
# Get datasets
data = pd.read_csv("../../Data/X_train_0_WELL.csv", sep=',')
X_test = pd.read_csv("../../Data/X_test_0_WELL.csv", sep=',')

# Independent snapshots of the frames as loaded, kept for comparison after the
# originals are modified. A deep copy avoids parsing each CSV a second time.
data_copy = data.copy()
X_test_copy = X_test.copy()

In [6]:
# Log names; not referenced by the other visible cells.
logs = [
    'SP', 'DTS', 'DTC', 'NPHI', 'PEF',
    'GR', 'RHOB', 'CALI', 'DCAL', 'SGR',
]

# Columns passed through run_EM by replace_data / replace_test; the order here
# is the order of the lists those functions return.
Numerical = [
    'CALI', 'RSHA', 'RMED', 'RDEP', 'RHOB',
    'GR', 'SGR', 'NPHI', 'PEF', 'DTC',
    'SP', 'BS', 'ROP', 'DTS', 'DCAL',
    'DRHO', 'MUDWEIGHT', 'RMIC', 'ROPA', 'RXO',
]

In [7]:
# Get expected values after removing noise for one well of the training set
def replace_data(well):
    """Run ``run_EM`` on every ``Numerical`` column of one training well.

    Args:
        well: value of the ``WELL`` column selecting the rows to process.

    Returns:
        List with one ``run_EM`` result per entry of ``Numerical``, in the
        same order. The global ``data`` frame is only read, not modified.
    """
    # Filter the well's rows once instead of once per feature.
    well_rows = data[data['WELL'] == well]
    return [run_EM(well_rows[feature].to_numpy()) for feature in Numerical]
        
# Get expected values after removing noise for one well of the testing set
def replace_test(well):
    """Run ``run_EM`` on every ``Numerical`` column of one test well.

    Args:
        well: value of the ``WELL`` column selecting the rows to process.

    Returns:
        List with one ``run_EM`` result per entry of ``Numerical``, in the
        same order. The global ``X_test`` frame is only read, not modified.
    """
    # Filter the well's rows once instead of once per feature.
    well_rows = X_test[X_test['WELL'] == well]
    return [run_EM(well_rows[feature].to_numpy()) for feature in Numerical]

In [None]:
# Denoise training wells 1..98 in parallel. The worker count keeps the
# previous value of 48 as an upper bound but never exceeds the CPUs available.
with Pool(processes=min(48, cpu_count())) as pool:
    rl = pool.map(replace_data, range(1,99))

# pool.map returns results in input order, so pairing wells with results via
# zip replaces the rl[well-1] index arithmetic. Each per-well result is in
# Numerical order.
for well, well_features in zip(range(1,99), rl):
    # WELL is not one of the overwritten columns, so the mask is computed once.
    well_mask = data['WELL'] == well
    for feature, values in zip(Numerical, well_features):
        data.loc[well_mask, feature] = values

In [None]:
# Element-wise comparison of the processed training frame with data_copy; the
# cell displays a boolean DataFrame of the same shape. NaN never compares equal
# to NaN, so cells that are NaN in both frames show False.
data == data_copy

In [None]:
# Write the processed training frame to disk, without the row index.
data.to_csv(
    path_or_buf="../../Data/Noise_free/X_train_noise_free_linear_3.csv",
    index=False,
)

In [None]:
# Denoise test wells 99..108 in parallel. The worker count keeps the previous
# value of 48 as an upper bound but never exceeds the CPUs available.
with Pool(processes=min(48, cpu_count())) as pool:
    rl2 = pool.map(replace_test, range(99,109))

# pool.map returns results in input order, so pairing wells with results via
# zip replaces the rl2[well-99] index arithmetic. Each per-well result is in
# Numerical order.
for well, well_features in zip(range(99,109), rl2):
    # WELL is not one of the overwritten columns, so the mask is computed once.
    well_mask = X_test['WELL'] == well
    for feature, values in zip(Numerical, well_features):
        X_test.loc[well_mask, feature] = values

In [None]:
# Element-wise comparison of the processed test frame with X_test_copy; the
# cell displays a boolean DataFrame of the same shape. NaN never compares equal
# to NaN, so cells that are NaN in both frames show False.
X_test == X_test_copy

In [None]:
# Write the processed test frame to disk, without the row index.
X_test.to_csv(
    path_or_buf="../../Data/Noise_free/X_test_noise_free_linear_3.csv",
    index=False,
)

In [None]:
# Display the CPU count reported by multiprocessing, for comparison with the
# Pool sizes used in the cells above.
cpu_count()

* **TODO:** Still need to optimise the parameter selection method

* create new dataframe with f(Z) - done
* Retrain NN with original dataset
* Train NN with new dataset