In [1]:
import pandas as pd
import numpy as np
import utils2

In [2]:
# Load the original train features, test features and test labels.
# All three files are semicolon-delimited.
data, X_test, y_test = (
    pd.read_csv(csv_path, sep=';')
    for csv_path in (
        "../../Data/Original/train.csv",
        "../../Data/Original/test.csv",
        "../../Data/Original/test_labels.csv",
    )
)

In [3]:
# Encode categorical data as 1-based integer codes.
# Codes follow the order of first appearance in the training data; categories that
# occur only in the test data receive the next unused codes, so a category shared by
# both sets always maps to the same code.
for feature in ["GROUP", "FORMATION"]:
    if feature not in data.columns:
        continue

    # Training categories -> 1, 2, 3, ... (enumerate avoids a quadratic list.index scan)
    feature_enc = {
        category: code
        for code, category in enumerate(data[feature].unique().tolist(), start=1)
    }
    data[feature] = data[feature].map(feature_enc)

    for df in [X_test]:
        # Extend a *copy* of the training mapping so the training encoding itself
        # is never mutated by categories that only exist in this dataset.
        enc = dict(feature_enc)
        for category in df[feature].unique().tolist():
            if category not in enc:
                enc[category] = len(enc) + 1

        df[feature] = df[feature].map(enc)

In [4]:
# Standardise each numerical feature using the mean and standard deviation of the
# training data, so that the train and test sets are scaled by identical values.
for column in utils2.Numerical:
    train_mean = data[column].mean()
    train_std = data[column].std()

    data[column] = (data[column] - train_mean) / train_std
    X_test[column] = (X_test[column] - train_mean) / train_std
        

In [5]:
# Keep only the model inputs: the numerical features followed by the two
# categorical columns, in the same order for both datasets.
features = [*utils2.Numerical, "GROUP", "FORMATION"]

data, X_test = data[features], X_test[features]

In [6]:
from sklearn.impute import KNNImputer

In [7]:
def KNN_impute(data, X_test, n):
    """Impute missing values with a k-nearest-neighbours imputer and save the results.

    The imputer is fitted on the training data only. The test data is then imputed
    with that same fitted imputer, so both datasets are filled in from the same
    reference rows and the test output always keeps the training column layout.

    Parameters
    ----------
    data : pandas.DataFrame
        Training features; used to fit the imputer.
    X_test : pandas.DataFrame
        Test features; must have the same columns as ``data``.
    n : int
        Number of neighbours, passed to ``KNNImputer(n_neighbors=n)``.

    Returns
    -------
    None
        The imputed frames are written to
        ``../../Data/Missing_KNN/X_train_{n}.csv`` and
        ``../../Data/Missing_KNN/X_test_{n}.csv``.
    """
    imputer = KNNImputer(n_neighbors=n)
    print('Imputing Training Data')

    data = pd.DataFrame(imputer.fit_transform(data), columns=data.columns)
    data.to_csv(f'../../Data/Missing_KNN/X_train_{n}.csv', index=False)

    print('Training Data Imputed and Saved')

    print('Imputing Testing Data')
    # transform (not fit_transform): reuse the imputer fitted on the training data
    # instead of refitting on the test set. Refitting would ignore the training rows
    # and would drop any column that is entirely missing in the test set.
    X_test = pd.DataFrame(imputer.transform(X_test), columns=X_test.columns)
    X_test.to_csv(f'../../Data/Missing_KNN/X_test_{n}.csv', index=False)

    print('Test Data Imputed and Saved')

In [8]:
# Impute both datasets using 100 nearest neighbours
KNN_impute(data, X_test, n=100)

Imputing Training Data
Training Data Imputed and Saved
Imputing Testing Data


KeyboardInterrupt: 

In [10]:
# Reload the imputed training data from the CSV that KNN_impute writes for n=100,
# replacing the in-memory (un-imputed) `data` frame, then display it.
data = pd.read_csv("../../Data/Missing_KNN/X_train_100.csv")
data

Unnamed: 0,DEPTH_MD,X_LOC,Y_LOC,Z_LOC,CALI,RSHA,RMED,RDEP,RHOB,GR,...,ROP,DTS,DCAL,DRHO,MUDWEIGHT,RMIC,ROPA,RXO,GROUP,FORMATION
0,-1.694333,-1.388717,-1.641038,1.718974,1.657097,-0.094747,-0.061741,-0.078039,-1.582420,0.271304,...,-0.066736,1.071730,-0.014812,-0.078516,-0.022056,-0.079028,0.332910,-0.239531,1.0,1.0
1,-1.694181,-1.388717,-1.641038,1.718818,1.653929,-0.094783,-0.061620,-0.078065,-1.560280,0.243904,...,-0.066736,1.071730,-0.016872,-0.077882,-0.022056,-0.079045,0.333517,-0.182187,1.0,1.0
2,-1.694029,-1.388717,-1.641038,1.718661,1.653929,-0.094440,-0.061466,-0.078021,-1.533712,0.114173,...,-0.066643,1.069200,-0.029923,-0.078424,-0.020222,-0.079110,0.332117,-0.383235,1.0,1.0
3,-1.693876,-1.388717,-1.641038,1.718505,1.651424,-0.094510,-0.061555,-0.078014,-1.551913,0.057410,...,-0.063274,1.061473,-0.036321,-0.080038,-0.016555,-0.079117,0.331948,-0.956743,1.0,1.0
4,-1.693724,-1.388717,-1.641038,1.718348,1.649796,-0.094259,-0.061901,-0.078068,-1.598812,0.023821,...,-0.051894,1.060954,-0.044475,-0.081590,-0.017472,-0.079117,0.335928,-0.928125,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1170506,0.988010,-1.156206,-2.014439,-0.946521,-1.253652,-0.081365,-0.045408,-0.072452,0.959387,0.196930,...,-0.071258,-0.919119,-0.021210,-0.001867,0.019023,-0.072100,0.140002,0.280409,7.0,30.0
1170507,0.988162,-1.155112,-2.013838,-0.945703,-1.265215,-0.081305,-0.045314,-0.072417,0.997405,0.130004,...,-0.071031,-0.920976,-0.021213,-0.002647,0.019023,-0.072180,0.147386,0.280252,7.0,30.0
1170508,0.988314,-1.155112,-2.013838,-0.946317,-1.272847,-0.084155,-0.050596,-0.074713,0.816765,-0.130314,...,-0.070987,-0.928431,-0.021226,-0.004078,0.018882,-0.072448,0.154770,0.278052,7.0,30.0
1170509,0.988467,-1.169339,-2.021657,-0.963902,-1.282447,-0.090104,-0.061191,-0.079284,0.641779,-0.441958,...,-0.071034,-0.955654,-0.021364,-0.003161,0.018811,-0.073297,0.162159,0.277552,7.0,30.0


In [11]:


# Concat train and test data, for better inference of missing values.
# Remember how many rows belong to the training set so the test rows can be
# sliced back out afterwards without relying on a hard-coded row count.
n_train = len(data)
combined_data = pd.concat([data, X_test])

imputer = KNNImputer(n_neighbors=100)
combined_data = pd.DataFrame(imputer.fit_transform(combined_data), columns=combined_data.columns)

# The rebuilt frame has a fresh 0..N-1 index; rows from n_train onwards are the test rows.
combined_data.iloc[n_train:].to_csv("../../Data/Missing_KNN/X_test_100_C.csv", index=False)