In [1]:
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer

In [2]:
# Year of the survey file to process; it is interpolated into the CSV path below.
YEAR = 2021
df = pd.read_csv(filepath_or_buffer=f"preprocessed/{YEAR}.csv")
# Preview the first three rows.
df.head(n=3)

Unnamed: 0,Country,Happiness Score,Economy,Social support,Healthy life expectancy,Freedom,Generosity,Perceptions of corruption,Dystopia Residual
0,Finland,7.842,10.775,0.954,72.0,0.949,-0.098,0.186,3.253
1,Denmark,7.62,10.933,0.954,72.7,0.946,0.03,0.179,2.868
2,Switzerland,7.571,11.117,0.942,74.4,0.919,0.025,0.292,2.839


문자를 포함한 항목(국가명)은 계산이 되지 않기 때문에 임시로 변수에 저장해둔다.  
0으로 표기된 값은 결측치로 판단하고 kNN을 활용해 결측치를 예측해 채운다.  

In [3]:
# Remember the full header and the country labels: the imputer only accepts
# numeric input, so the text column is set aside and re-attached later.
COLUMNS = df.columns
COUNTRY = df["Country"].values.reshape(-1, 1)

# Build the numeric feature frame WITHOUT mutating `df`. An in-place drop makes
# this cell fail on a second run (KeyError: "Country") and silently changes the
# frame that was displayed by the previous cell.
happiness_features = df.drop(columns="Country")

# A value of exactly 0 is treated as "missing" and is estimated from the two
# nearest rows, weighted by inverse distance.
imputer = KNNImputer(missing_values=0, n_neighbors=2, weights="distance")
filled = imputer.fit_transform(happiness_features)
filled

array([[ 7.842, 10.775,  0.954, ..., -0.098,  0.186,  3.253],
       [ 7.62 , 10.933,  0.954, ...,  0.03 ,  0.179,  2.868],
       [ 7.571, 11.117,  0.942, ...,  0.025,  0.292,  2.839],
       ...,
       [ 3.415,  7.676,  0.552, ...,  0.061,  0.167,  1.095],
       [ 3.145,  7.943,  0.75 , ..., -0.047,  0.821,  1.205],
       [ 2.523,  7.695,  0.463, ..., -0.102,  0.924,  1.895]])

In [4]:
# Rebuild the table from the imputed matrix. Creating the frame from `filled`
# directly keeps the numeric columns as float64; concatenating the labels and the
# numbers into one NumPy array would force every column to `object` dtype.
imputed_columns = [name for name in COLUMNS if name != "Country"]
new_df = pd.DataFrame(filled, columns=imputed_columns)

# Put the country labels back at the position they had in the original header
# instead of assuming "Country" is the first column.
new_df.insert(COLUMNS.get_loc("Country"), "Country", COUNTRY.ravel())

# NOTE: this overwrites the file the data was loaded from.
new_df.to_csv(f"preprocessed/{YEAR}.csv", index=False)

In [5]:
# Read the CSV back from disk and preview it to check what was saved.
saved_df = pd.read_csv(filepath_or_buffer=f"preprocessed/{YEAR}.csv")
saved_df.head(n=5)

Unnamed: 0,Country,Happiness Score,Economy,Social support,Healthy life expectancy,Freedom,Generosity,Perceptions of corruption,Dystopia Residual
0,Finland,7.842,10.775,0.954,72.0,0.949,-0.098,0.186,3.253
1,Denmark,7.62,10.933,0.954,72.7,0.946,0.03,0.179,2.868
2,Switzerland,7.571,11.117,0.942,74.4,0.919,0.025,0.292,2.839
3,Iceland,7.554,10.878,0.983,73.0,0.955,0.16,0.673,2.967
4,Netherlands,7.464,10.932,0.942,72.4,0.913,0.175,0.338,2.798


---

In [6]:
# Load the city-level lifestyle table; `df` is rebound to this new dataset.
df = pd.read_csv(filepath_or_buffer="healthy_lifestyle_city_2021.csv")
# Preview the first three rows.
df.head(n=3)

Unnamed: 0,City,Rank,Sunshine hours(City),Cost of a bottle of water(City),Obesity levels(Country),Life expectancy(years) (Country),Pollution(Index score) (City),Annual avg. hours worked,Happiness levels(Country),Outdoor activities(City),Number of take out places(City),Cost of a monthly gym membership(City)
0,Amsterdam,1,1858,£1.92,20.40%,81.2,30.93,1434,7.44,422,1048,£34.90
1,Sydney,2,2636,£1.48,29.00%,82.1,26.86,1712,7.22,406,1103,£41.66
2,Vienna,3,1884,£1.94,20.10%,81.0,17.33,1501,7.29,132,1008,£25.74


In [7]:
import re

# Columns stored as text because of a currency sign or a percent suffix.
columns = [
    "Cost of a bottle of water(City)",
    "Cost of a monthly gym membership(City)",
    "Obesity levels(Country)",
]

for col in columns:
    # Keep only digits, the decimal point and the minus sign.
    cleaned = df[col].map(lambda x: re.sub(r"[^0-9.-]", "", str(x)))
    # A missing value ("nan" -> "") or a bare "-" placeholder has no digits left
    # and cannot be parsed as a number; mark it as NaN instead of crashing.
    cleaned = cleaned.mask(cleaned.isin(["", "-"]))
    # np.float_ was removed in NumPy 2.0; np.float64 is the same type and works
    # on every NumPy version.
    df[col] = cleaned.astype(np.float64)

In [8]:
# Placeholder string that marks a missing entry in this dataset.
MISSING = "-"

# Keep the full header and the city labels as an (n, 1) column so they can be
# re-attached after imputation.
COLUMNS = df.columns
CITY = df["City"].values[:, np.newaxis]

# The imputer only works on numeric data, so remove the label column and turn
# every placeholder into NaN (both steps modify `df` in place).
df.drop(columns="City", inplace=True)
df.replace(to_replace=MISSING, value=np.nan, inplace=True)

# Estimate each NaN from the two nearest rows, weighted by inverse distance.
imputer = KNNImputer(weights="distance", n_neighbors=2)
filled = imputer.fit_transform(df)
filled

array([[1.00000000e+00, 1.85800000e+03, 1.92000000e+00, 2.04000000e+01,
        8.12000000e+01, 3.09300000e+01, 1.43400000e+03, 7.44000000e+00,
        4.22000000e+02, 1.04800000e+03, 3.49000000e+01],
       [2.00000000e+00, 2.63600000e+03, 1.48000000e+00, 2.90000000e+01,
        8.21000000e+01, 2.68600000e+01, 1.71200000e+03, 7.22000000e+00,
        4.06000000e+02, 1.10300000e+03, 4.16600000e+01],
       [3.00000000e+00, 1.88400000e+03, 1.94000000e+00, 2.01000000e+01,
        8.10000000e+01, 1.73300000e+01, 1.50100000e+03, 7.29000000e+00,
        1.32000000e+02, 1.00800000e+03, 2.57400000e+01],
       [4.00000000e+00, 1.82100000e+03, 1.72000000e+00, 2.06000000e+01,
        8.18000000e+01, 1.96300000e+01, 1.45200000e+03, 7.35000000e+00,
        1.29000000e+02, 5.98000000e+02, 3.73100000e+01],
       [5.00000000e+00, 1.63000000e+03, 2.19000000e+00, 1.97000000e+01,
        7.98000000e+01, 2.12400000e+01, 1.38000000e+03, 7.64000000e+00,
        1.54000000e+02, 5.23000000e+02, 3.25300000e+

In [9]:
# Rebuild the table from the imputed matrix. Creating the frame from `filled`
# directly keeps the numeric columns as float64; concatenating the labels and the
# numbers into one NumPy array would force every column to `object` dtype.
imputed_columns = [name for name in COLUMNS if name != "City"]
new_df = pd.DataFrame(filled, columns=imputed_columns)

# Put the city labels back at the position they had in the original header
# instead of assuming "City" is the first column.
new_df.insert(COLUMNS.get_loc("City"), "City", CITY.ravel())

# Plain string: the path has no placeholders, so no f-prefix is needed.
new_df.to_csv("preprocessed/healthy_lifestyle_city_2021.csv", index=False)

In [10]:
# Read the CSV back from disk and preview it to check what was saved.
# The path has no placeholders, so it is a plain string rather than an f-string.
saved_df = pd.read_csv("preprocessed/healthy_lifestyle_city_2021.csv")
saved_df.head()

Unnamed: 0,City,Rank,Sunshine hours(City),Cost of a bottle of water(City),Obesity levels(Country),Life expectancy(years) (Country),Pollution(Index score) (City),Annual avg. hours worked,Happiness levels(Country),Outdoor activities(City),Number of take out places(City),Cost of a monthly gym membership(City)
0,Amsterdam,1.0,1858.0,1.92,20.4,81.2,30.93,1434.0,7.44,422.0,1048.0,34.9
1,Sydney,2.0,2636.0,1.48,29.0,82.1,26.86,1712.0,7.22,406.0,1103.0,41.66
2,Vienna,3.0,1884.0,1.94,20.1,81.0,17.33,1501.0,7.29,132.0,1008.0,25.74
3,Stockholm,4.0,1821.0,1.72,20.6,81.8,19.63,1452.0,7.35,129.0,598.0,37.31
4,Copenhagen,5.0,1630.0,2.19,19.7,79.8,21.24,1380.0,7.64,154.0,523.0,32.53
