In [1]:
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer

In [2]:
# Year of the report to process; it selects which CSV under data/ is read.
YEAR = 2021

# Load that year's table and preview its first three rows.
source_csv = f"data/{YEAR}.csv"
df = pd.read_csv(source_csv)
df.head(n=3)

Unnamed: 0,Country,Happiness Score,Economy,Social support,Healthy life expectancy,Freedom,Generosity,Perceptions of corruption,Dystopia Residual
0,Finland,7.842,10.775,0.954,72.0,0.949,-0.098,0.186,3.253
1,Denmark,7.62,10.933,0.954,72.7,0.946,0.03,0.179,2.868
2,Switzerland,7.571,11.117,0.942,74.4,0.919,0.025,0.292,2.839


문자열 열(국가명)은 kNN 거리 계산에 사용할 수 없으므로 별도 변수에 저장해 두었다가, 결측치를 채운 뒤 다시 합친다.  
0으로 표기된 값은 결측치로 간주하고, kNN으로 예측한 값으로 채운다.  

In [3]:
# Zeros in this dataset are treated as missing values and imputed with kNN.
# The text column (Country) cannot take part in the distance computation, so
# it is set aside here and re-attached after imputation.
COLUMNS = df.columns
# to_numpy() yields a plain ndarray regardless of the column's string dtype.
COUNTRY = df["Country"].to_numpy().reshape(-1, 1)

# Build the numeric frame without mutating `df`, so this cell can be re-run
# safely. `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
features = df.drop(columns="Country").replace(0, np.nan)

# Each missing entry is estimated from the 2 nearest rows, with closer rows
# weighted more heavily (inverse-distance weighting).
imputer = KNNImputer(n_neighbors=2, weights="distance")
filled = imputer.fit_transform(features)

# KNNImputer discards columns that are entirely NaN; fail here with a clear
# message rather than later when the column labels no longer line up.
assert filled.shape == features.shape, "imputation changed the table shape"
filled

array([[ 7.842, 10.775,  0.954, ..., -0.098,  0.186,  3.253],
       [ 7.62 , 10.933,  0.954, ...,  0.03 ,  0.179,  2.868],
       [ 7.571, 11.117,  0.942, ...,  0.025,  0.292,  2.839],
       ...,
       [ 3.415,  7.676,  0.552, ...,  0.061,  0.167,  1.095],
       [ 3.145,  7.943,  0.75 , ..., -0.047,  0.821,  1.205],
       [ 2.523,  7.695,  0.463, ..., -0.102,  0.924,  1.895]])

In [4]:
# Rebuild the table from the imputed numeric matrix. Creating the frame from
# the float array (instead of concatenating it with the string column first)
# keeps the numeric columns as floats rather than upcasting them to object.
numeric_columns = COLUMNS.drop("Country")
new_df = pd.DataFrame(filled, columns=numeric_columns)

# Put Country back at the position it had in the original column order, so the
# result does not depend on Country being the first column.
new_df.insert(COLUMNS.get_loc("Country"), "Country", COUNTRY.ravel())

# NOTE: this overwrites the file the data was loaded from; a later run of the
# notebook will therefore start from the already-imputed values.
new_df.to_csv(f"data/{YEAR}.csv", index=False)

In [5]:
# Read the saved CSV back in and preview its first five rows to confirm the
# file was written as expected.
saved_csv = f"data/{YEAR}.csv"
saved_df = pd.read_csv(saved_csv)
saved_df.head(n=5)

Unnamed: 0,Country,Happiness Score,Economy,Social support,Healthy life expectancy,Freedom,Generosity,Perceptions of corruption,Dystopia Residual
0,Finland,7.842,10.775,0.954,72.0,0.949,-0.098,0.186,3.253
1,Denmark,7.62,10.933,0.954,72.7,0.946,0.03,0.179,2.868
2,Switzerland,7.571,11.117,0.942,74.4,0.919,0.025,0.292,2.839
3,Iceland,7.554,10.878,0.983,73.0,0.955,0.16,0.673,2.967
4,Netherlands,7.464,10.932,0.942,72.4,0.913,0.175,0.338,2.798
