In [1]:
#general imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

#imputation tools
from sklearn.impute import KNNImputer

In [2]:
# Establish directory locations relative to the notebook's working directory.
# The data and submissions folders are looked up in the parent of the
# directory the notebook is run from.
current_directory = os.getcwd()
parent_directory = os.path.dirname(current_directory)
# Joining with a trailing empty component makes each folder path end with the
# platform's separator (later cells append bare file names to these strings)
# and avoids a doubled separator when the parent directory is the root.
data_folder = os.path.join(parent_directory, 'data', '')
submissions_folder = os.path.join(parent_directory, 'submissions', '')

In [3]:
# Read the raw dataset from the data folder into a DataFrame.
df = pd.read_csv(os.path.join(data_folder, 'data.csv'))

In [4]:
# Preview the first rows to sanity-check the parsed columns and spot missing values.
df.head()

Unnamed: 0,id,author,geometry,pressure [MPa],mass_flux [kg/m2-s],x_e_out [-],D_e [mm],D_h [mm],length [mm],chf_exp [MW/m2]
0,0,Thompson,tube,7.0,3770.0,0.1754,,10.8,432.0,3.6
1,1,Thompson,tube,,6049.0,-0.0416,10.3,10.3,762.0,6.2
2,2,Thompson,,13.79,2034.0,0.0335,7.7,7.7,457.0,2.5
3,3,Beus,annulus,13.79,3679.0,-0.0279,5.6,15.2,2134.0,3.0
4,4,,tube,13.79,686.0,,11.1,11.1,457.0,2.8


In [5]:
# Summarise dtypes and non-null counts to see which columns contain missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31644 entries, 0 to 31643
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   id                   31644 non-null  int64  
 1   author               26620 non-null  object 
 2   geometry             26144 non-null  object 
 3   pressure [MPa]       27192 non-null  float64
 4   mass_flux [kg/m2-s]  26853 non-null  float64
 5   x_e_out [-]          21229 non-null  float64
 6   D_e [mm]             26156 non-null  float64
 7   D_h [mm]             27055 non-null  float64
 8   length [mm]          26885 non-null  float64
 9   chf_exp [MW/m2]      31644 non-null  float64
dtypes: float64(7), int64(1), object(2)
memory usage: 2.4+ MB


### EDA

In [6]:
# Pairwise correlation between the numeric columns only.
# Selecting numeric dtypes explicitly keeps this cell working on pandas >= 2.0,
# where DataFrame.corr() no longer silently drops object columns such as
# 'author' and 'geometry' and raises on them instead.
df.select_dtypes(include='number').corr()

Unnamed: 0,id,pressure [MPa],mass_flux [kg/m2-s],x_e_out [-],D_e [mm],D_h [mm],length [mm],chf_exp [MW/m2]
id,1.0,0.005451,0.008136,-0.000436,0.003581,0.001305,0.007198,0.002726
pressure [MPa],0.005451,1.0,-0.195332,-0.193125,-0.468037,-0.498645,-0.090388,-0.259936
mass_flux [kg/m2-s],0.008136,-0.195332,1.0,-0.168136,0.004676,-0.180331,-0.055095,0.308971
x_e_out [-],-0.000436,-0.193125,-0.168136,1.0,0.124835,0.063367,0.33684,-0.37058
D_e [mm],0.003581,-0.468037,0.004676,0.124835,1.0,0.494538,0.314969,0.019495
D_h [mm],0.001305,-0.498645,-0.180331,0.063367,0.494538,1.0,0.113241,0.055734
length [mm],0.007198,-0.090388,-0.055095,0.33684,0.314969,0.113241,1.0,-0.276146
chf_exp [MW/m2],0.002726,-0.259936,0.308971,-0.37058,0.019495,0.055734,-0.276146,1.0


### Data Preparation

In [7]:
# Build the matrix handed to the imputers: every column except the record id
# and the two text (object-dtype) columns.
inference_df = df.drop(['id', 'author', 'geometry'], axis='columns')
column_names = inference_df.columns

### scikit-learn's KNNImputer

In [8]:
# Impute every missing value using the single nearest neighbour (k=1).
# NOTE(review): the features are passed in unscaled, so with the imputer's
# default distance the large-magnitude columns probably dominate neighbour
# selection -- consider standardising first; left as-is to keep results comparable.
kNN_1 = KNNImputer(n_neighbors=1)

kNN_1_np = kNN_1.fit_transform(inference_df)
# fit_transform returns a bare array; restore the column labels and the source
# row index so later index-aligned assignments map back to the right records.
kNN_1_df = pd.DataFrame(kNN_1_np, columns=column_names, index=inference_df.index)

In [9]:
# Impute every missing value using the 3 nearest neighbours (k=3).
kNN_3 = KNNImputer(n_neighbors=3)

kNN_3_np = kNN_3.fit_transform(inference_df)
# fit_transform returns a bare array; restore the column labels and the source
# row index so later index-aligned assignments map back to the right records.
kNN_3_df = pd.DataFrame(kNN_3_np, columns=column_names, index=inference_df.index)

In [10]:
# Impute every missing value using the 5 nearest neighbours (k=5).
kNN_5 = KNNImputer(n_neighbors=5)

kNN_5_np = kNN_5.fit_transform(inference_df)
# fit_transform returns a bare array; restore the column labels and the source
# row index so later index-aligned assignments map back to the right records.
kNN_5_df = pd.DataFrame(kNN_5_np, columns=column_names, index=inference_df.index)

In [11]:
# Impute every missing value using the 7 nearest neighbours (k=7).
kNN_7 = KNNImputer(n_neighbors=7)

kNN_7_np = kNN_7.fit_transform(inference_df)
# fit_transform returns a bare array; restore the column labels and the source
# row index so later index-aligned assignments map back to the right records.
kNN_7_df = pd.DataFrame(kNN_7_np, columns=column_names, index=inference_df.index)

In [12]:
# Impute every missing value using the 9 nearest neighbours (k=9).
kNN_9 = KNNImputer(n_neighbors=9)

kNN_9_np = kNN_9.fit_transform(inference_df)
# fit_transform returns a bare array; restore the column labels and the source
# row index so later index-aligned assignments map back to the right records.
kNN_9_df = pd.DataFrame(kNN_9_np, columns=column_names, index=inference_df.index)

### Output Submission Files

In [13]:
# Flag the records whose target value is missing (these are the ones to
# predict), then attach each imputer's estimate of the target as its own
# column; the assignment aligns on the row index.
imputed_frames = {
    'knn_1': kNN_1_df,
    'knn_3': kNN_3_df,
    'knn_5': kNN_5_df,
    'knn_7': kNN_7_df,
    'knn_9': kNN_9_df,
}
df['inf'] = df['x_e_out [-]'].isnull()
for prediction_column, imputed_frame in imputed_frames.items():
    df[prediction_column] = imputed_frame['x_e_out [-]']

In [14]:
# Keep only the records flagged as having a missing target value.
inference_df = df.loc[df['inf']]

In [15]:
# (rows, columns) of the filtered frame: how many records need a prediction.
inference_df.shape

(10415, 16)

In [16]:
# Write the k=1 submission: record id plus the imputed target, with the
# prediction column saved under the target's original name.
inference_df.to_csv(
    submissions_folder + 'V1_KNNImputer_1.csv',
    columns=['id', 'knn_1'],
    header=['id', 'x_e_out [-]'],
    index=False,
)

In [17]:
# Write the k=3 submission: record id plus the imputed target, with the
# prediction column saved under the target's original name.
inference_df.to_csv(
    submissions_folder + 'V1_KNNImputer_3.csv',
    columns=['id', 'knn_3'],
    header=['id', 'x_e_out [-]'],
    index=False,
)

In [18]:
# Write the k=5 submission: record id plus the imputed target, with the
# prediction column saved under the target's original name.
inference_df.to_csv(
    submissions_folder + 'V1_KNNImputer_5.csv',
    columns=['id', 'knn_5'],
    header=['id', 'x_e_out [-]'],
    index=False,
)

In [19]:
# Write the k=7 submission: record id plus the imputed target, with the
# prediction column saved under the target's original name.
inference_df.to_csv(
    submissions_folder + 'V1_KNNImputer_7.csv',
    columns=['id', 'knn_7'],
    header=['id', 'x_e_out [-]'],
    index=False,
)

In [20]:
# Write the k=9 submission: record id plus the imputed target, with the
# prediction column saved under the target's original name.
inference_df.to_csv(
    submissions_folder + 'V1_KNNImputer_9.csv',
    columns=['id', 'knn_9'],
    header=['id', 'x_e_out [-]'],
    index=False,
)

### Performance Review

Kaggle reports the following RMSEs:
 - 0.106434 for the file V1_KNNImputer_1.csv
 - 0.089169 for the file V1_KNNImputer_3.csv
 - 0.08483 for the file V1_KNNImputer_5.csv
 - 0.082447 for the file V1_KNNImputer_7.csv
 - TBS for the file V1_KNNImputer_9.csv
