In [1]:
import numpy as np
import pandas as pd
import copy

In [2]:
# Read the CSV file into a DataFrame.
data = pd.read_csv('PRSA_data.csv')

# Extract the "pm2.5" column as an (n, 1) array:
# scikit-learn requires the feature matrix to be two-dimensional.
pm25 = data["pm2.5"].to_numpy().reshape(-1, 1)

from sklearn.impute import SimpleImputer

In [None]:
# Ordinal (label) encoding of the categorical `cbwd` column.
# NOTE: despite the `*_one_hot` variable names (kept for compatibility), each
# label is replaced by a single integer code; no indicator columns are created.

import copy

# Sort the distinct labels so that the label -> code mapping is the same on
# every run. Iterating a set of strings gives a different order in different
# interpreter sessions (string hash randomisation), which made the codes
# irreproducible. Deriving the codes with enumerate() also removes the
# hard-coded assumption that there are exactly four categories.
cbwd_labels = sorted(data['cbwd'].dropna().unique())
cbwd_one_hot = {label: code for code, label in enumerate(cbwd_labels)}
cbwd_one_hot_inverse = {code: label for label, code in cbwd_one_hot.items()}

# Independent snapshot of the original labels. It is no longer aliased with
# X_cbwd_new, so it is not overwritten by the encoding step.
X_cbwd = copy.deepcopy(data['cbwd'].values)

# Vectorised lookup; yields a numeric array (missing labels, if any, map to NaN)
# instead of an object-dtype array filled element by element.
X_cbwd_new = data['cbwd'].map(cbwd_one_hot).to_numpy()

data['cbwd'] = X_cbwd_new

In [None]:
# Row positions where "pm2.5" is missing (nan_index) and where it is observed
# (non_nan_index).

pm25_data = data["pm2.5"].to_numpy().reshape(-1, 1)
missing_mask = np.isnan(pm25_data)

# np.where on the (n, 1) mask returns (row_positions, column_positions);
# only the row positions are needed.
nan_index = np.where(missing_mask)[0]
non_nan_index = np.where(~missing_mask)[0]

# Rows with an observed "pm2.5": columns from position 6 onward become X and
# the column at position 5 becomes y.
# NOTE(review): presumably position 5 is the "pm2.5" column - confirm against the CSV layout.
X = data.iloc[non_nan_index, 6:]
y = data.iloc[non_nan_index, 5]

## 使用均值填补缺失数据(数值型数据)

In [None]:

# Mean imputation: SimpleImputer with its default strategy ("mean") replaces
# the NaNs in `pm25`; the result is an (n, 1) array.
pm25_mean = SimpleImputer().fit_transform(pm25)

# Write the imputed values into an independent copy so `data` stays untouched.
data_mean = data.copy(deep=True)
data_mean.loc[:, "pm2.5"] = pm25_mean

## 使用中位数进行填补

In [None]:

# Median imputation: replace the NaNs in `pm25` with the column median;
# the result is an (n, 1) array.
pm25_median = SimpleImputer(strategy="median").fit_transform(pm25)

# Write the imputed values into an independent copy so `data` stays untouched.
data_median = data.copy(deep=True)
data_median.loc[:, "pm2.5"] = pm25_median

## 使用0进行填补

In [None]:

# Constant imputation: replace the NaNs in `pm25` with the fixed value 0;
# the result is an (n, 1) array.
pm25_zero = SimpleImputer(strategy="constant", fill_value=0).fit_transform(pm25)

# Write the imputed values into an independent copy so `data` stays untouched.
data_zero = data.copy(deep=True)
data_zero.loc[:, "pm2.5"] = pm25_zero

## 使用众数进行填补

In [None]:

# Mode imputation: replace the NaNs in `pm25` with the most frequent value;
# the result is an (n, 1) array.
pm25_mode = SimpleImputer(strategy="most_frequent").fit_transform(pm25)

# Write the imputed values into an independent copy so `data` stays untouched.
data_mode = data.copy(deep=True)
data_mode.loc[:, "pm2.5"] = pm25_mode


## 使用KNN进行填补

In [None]:
from sklearn.neighbors import KNeighborsRegressor

# KNN imputation: fit a 5-nearest-neighbour regressor on the rows where
# "pm2.5" is observed, then predict it for the rows where it is missing.
features = data.iloc[:, 6:].values   # predictor columns (position 6 onward)
target = data.iloc[:, 5].values      # column at position 5, used as the regression target

X, y = features[non_nan_index], target[non_nan_index]

KNR = KNeighborsRegressor(n_neighbors=5)
KNR.fit(X, y)

# Fill the missing entries in an independent copy of `data`.
# NOTE(review): nan_index holds row positions but is used with label-based
# .loc; this assumes `data` has the default 0..n-1 index - confirm.
data_knn = data.copy(deep=True)
data_knn.loc[nan_index, 'pm2.5'] = KNR.predict(features[nan_index])

In [None]:
# Inspect the values of the column at position 6 for the rows whose "pm2.5" is missing.
data.iloc[nan_index, 6].values

## 使用插值方法进行填补

In [None]:
import pandas as pd
import copy
from scipy import interpolate

# Reload the CSV from disk, replacing the current `data` frame.
# Missing entries are filled with np.nan.
data = pd.read_csv('PRSA_data.csv').fillna(value=np.nan)

In [None]:
# Row positions where "pm2.5" is missing (nan_index) and where it is observed
# (non_nan_index), computed on the current `data` frame.
pm25_data = data["pm2.5"].to_numpy().reshape(-1, 1)
missing_mask = np.isnan(pm25_data)

# np.where on the (n, 1) mask returns (row_positions, column_positions);
# only the row positions are needed.
nan_index = np.where(missing_mask)[0]
non_nan_index = np.where(~missing_mask)[0]

#### 用变量DEWP进行填补

In [None]:
# Impute missing "pm2.5" by 1-D linear interpolation of pm2.5 against DEWP.
x = data.DEWP.values[non_nan_index]
y = data['pm2.5'].values[non_nan_index]

# The same DEWP value occurs in many rows. Interpolating through repeated x
# values is ill-defined (the result depends on which duplicate happens to be
# picked), so collapse the observations to one mean pm2.5 per distinct DEWP.
# groupby sorts the keys, so x_unique is strictly increasing.
pm25_by_dewp = pd.Series(y).groupby(x).mean()
x_unique = pm25_by_dewp.index.values
y_mean = pm25_by_dewp.values

# By default interp1d raises ValueError for query points outside the observed
# range. Instead, clamp to the mean pm2.5 at the smallest / largest observed
# DEWP so that every missing row receives a value.
f = interpolate.interp1d(
    x_unique,
    y_mean,
    bounds_error=False,
    fill_value=(y_mean[0], y_mean[-1]),
)
y_new = f(data.DEWP.values[nan_index])

# Write the interpolated values into an independent copy of `data`.
# NOTE(review): nan_index holds row positions but is used with label-based
# .loc; this assumes `data` has the default 0..n-1 index - confirm.
data_interpolation_DEWP = copy.deepcopy(data)
data_interpolation_DEWP.loc[nan_index, 'pm2.5'] = y_new

#### 用变量TEMP进行填补

In [None]:
# Impute missing "pm2.5" by 1-D linear interpolation of pm2.5 against TEMP.
x = data.TEMP.values[non_nan_index]
y = data['pm2.5'].values[non_nan_index]

# The same TEMP value occurs in many rows. Interpolating through repeated x
# values is ill-defined (the result depends on which duplicate happens to be
# picked), so collapse the observations to one mean pm2.5 per distinct TEMP.
# groupby sorts the keys, so x_unique is strictly increasing.
pm25_by_temp = pd.Series(y).groupby(x).mean()
x_unique = pm25_by_temp.index.values
y_mean = pm25_by_temp.values

# By default interp1d raises ValueError for query points outside the observed
# range. Instead, clamp to the mean pm2.5 at the smallest / largest observed
# TEMP so that every missing row receives a value.
f = interpolate.interp1d(
    x_unique,
    y_mean,
    bounds_error=False,
    fill_value=(y_mean[0], y_mean[-1]),
)
y_new = f(data.TEMP.values[nan_index])

# Write the interpolated values into an independent copy of `data`.
# NOTE(review): nan_index holds row positions but is used with label-based
# .loc; this assumes `data` has the default 0..n-1 index - confirm.
data_interpolation_TEMP = copy.deepcopy(data)
data_interpolation_TEMP.loc[nan_index, 'pm2.5'] = y_new

#### 用变量PRES进行填补

In [None]:
# Impute missing "pm2.5" by 1-D linear interpolation of pm2.5 against PRES.
x = data.PRES.values[non_nan_index]
y = data['pm2.5'].values[non_nan_index]

# The same PRES value occurs in many rows. Interpolating through repeated x
# values is ill-defined (the result depends on which duplicate happens to be
# picked), so collapse the observations to one mean pm2.5 per distinct PRES.
# groupby sorts the keys, so x_unique is strictly increasing.
pm25_by_pres = pd.Series(y).groupby(x).mean()
x_unique = pm25_by_pres.index.values
y_mean = pm25_by_pres.values

# By default interp1d raises ValueError for query points outside the observed
# range. Instead, clamp to the mean pm2.5 at the smallest / largest observed
# PRES so that every missing row receives a value.
f = interpolate.interp1d(
    x_unique,
    y_mean,
    bounds_error=False,
    fill_value=(y_mean[0], y_mean[-1]),
)
y_new = f(data.PRES.values[nan_index])

# Write the interpolated values into an independent copy of `data`.
# NOTE(review): nan_index holds row positions but is used with label-based
# .loc; this assumes `data` has the default 0..n-1 index - confirm.
data_interpolation_PRES = copy.deepcopy(data)
data_interpolation_PRES.loc[nan_index, 'pm2.5'] = y_new