In [1]:
import numpy as np
import pandas as pd
import copy

In [2]:
# Load the PRSA dataset from CSV.
data = pd.read_csv('PRSA_data.csv')
# scikit-learn expects a 2-D feature matrix, so the pm2.5 column is reshaped
# into a single-column array.
pm25 = data["pm2.5"].to_numpy().reshape(-1, 1)

from sklearn.impute import SimpleImputer

## 使用均值填补缺失数据(数值型数据)

In [64]:

# Fill on an independent deep copy so the raw frame keeps its missing values.
data_mean = data.copy(deep=True)

# SimpleImputer() with no arguments uses the "mean" strategy: each NaN in
# pm2.5 is replaced by the mean of the observed values.
pm25_mean = SimpleImputer().fit_transform(pm25)

# Write the imputed column back into the copy.
data_mean.loc[:, "pm2.5"] = pm25_mean

## 使用中位数进行填补

In [65]:

# Fill on an independent deep copy so the raw frame keeps its missing values.
data_median = data.copy(deep=True)

# Replace each NaN in pm2.5 with the median of the observed values.
pm25_median = SimpleImputer(strategy="median").fit_transform(pm25)

# Write the imputed column back into the copy.
data_median.loc[:, "pm2.5"] = pm25_median

## 使用0进行填补

In [66]:

# Fill on an independent deep copy so the raw frame keeps its missing values.
data_zero = data.copy(deep=True)

# Replace each NaN in pm2.5 with the constant 0.
pm25_zero = SimpleImputer(strategy="constant", fill_value=0).fit_transform(pm25)

# Write the imputed column back into the copy.
data_zero.loc[:, "pm2.5"] = pm25_zero

## 使用众数进行填补

In [67]:

# Fill on an independent deep copy so the raw frame keeps its missing values.
data_mode = data.copy(deep=True)

# Replace each NaN in pm2.5 with the most frequent observed value (the mode).
pm25_mode = SimpleImputer(strategy="most_frequent").fit_transform(pm25)

# Write the imputed column back into the copy.
data_mode.loc[:, "pm2.5"] = pm25_mode


## 使用KNN进行填补

In [68]:

from sklearn.impute import KNNImputer

# Fill on an independent deep copy so the raw frame keeps its missing values.
data_knn = data.copy(deep=True)

pm25_pd = data["pm2.5"]

# reset_index() turns the Series into a two-column frame (row index, pm2.5),
# so the imputer sees the row index as a second feature.
# NOTE(review): presumably this makes neighbours the rows closest in position
# to the gap -- confirm that is the intent.
pm25_knn = KNNImputer(n_neighbors=5).fit_transform(pm25_pd.reset_index())

# Column 0 of the result is the index feature; column 1 is the imputed pm2.5.
data_knn.loc[:, "pm2.5"] = pm25_knn[:, 1]


## 使用插值方法进行填补

In [4]:
import pandas as pd
import copy
from scipy import interpolate

# Re-read the CSV so the interpolation section starts from the unimputed
# table, then fill any missing entries with np.nan.
data = pd.read_csv('PRSA_data.csv').fillna(value=np.nan)

In [6]:
# pm2.5 as a single-column 2-D array.
pm25_data = data["pm2.5"].to_numpy().reshape(-1, 1)

# Boolean mask of the missing entries, computed once and reused.
pm25_is_nan = np.isnan(pm25_data)

# np.nonzero on a 2-D mask returns (row_positions, col_positions); only the
# row positions are needed because there is a single column.
nan_index = np.nonzero(pm25_is_nan)[0]
non_nan_index = np.nonzero(~pm25_is_nan)[0]

#### 用变量DEWP进行填补

In [14]:
# Fill missing pm2.5 by linear interpolation over DEWP.
#
# Rows where pm2.5 is observed provide the (DEWP, pm2.5) sample points.
observed_rows = data.iloc[non_nan_index]

# DEWP values may repeat across rows. Feeding repeated x values to interp1d
# makes the fitted curve depend on row order, so pm2.5 is averaged per distinct
# DEWP first. groupby sorts its keys, which gives unique, increasing x values.
dewp_curve = observed_rows.groupby("DEWP")["pm2.5"].mean()
x = dewp_curve.index.to_numpy()
y = dewp_curve.to_numpy()

# By default interp1d raises ValueError for queries outside [x.min(), x.max()].
# A missing row may carry a DEWP that was never seen alongside an observed
# pm2.5, so such queries are clamped to the nearest edge value instead.
f = interpolate.interp1d(x, y, bounds_error=False, fill_value=(y[0], y[-1]))
y_new = f(data["DEWP"].to_numpy()[nan_index])

# Write the estimates into an independent copy; the raw frame is untouched.
data_interpolation_DEWP = data.copy(deep=True)
data_interpolation_DEWP.loc[nan_index, 'pm2.5'] = y_new

#### 用变量TEMP进行填补

In [None]:
# Fill missing pm2.5 by linear interpolation over TEMP.
#
# Rows where pm2.5 is observed provide the (TEMP, pm2.5) sample points.
observed_rows = data.iloc[non_nan_index]

# TEMP values may repeat across rows. Feeding repeated x values to interp1d
# makes the fitted curve depend on row order, so pm2.5 is averaged per distinct
# TEMP first. groupby sorts its keys, which gives unique, increasing x values.
temp_curve = observed_rows.groupby("TEMP")["pm2.5"].mean()
x = temp_curve.index.to_numpy()
y = temp_curve.to_numpy()

# By default interp1d raises ValueError for queries outside [x.min(), x.max()].
# A missing row may carry a TEMP that was never seen alongside an observed
# pm2.5, so such queries are clamped to the nearest edge value instead.
f = interpolate.interp1d(x, y, bounds_error=False, fill_value=(y[0], y[-1]))
y_new = f(data["TEMP"].to_numpy()[nan_index])

# Write the estimates into an independent copy; the raw frame is untouched.
data_interpolation_TEMP = data.copy(deep=True)
data_interpolation_TEMP.loc[nan_index, 'pm2.5'] = y_new

#### 用变量PRES进行填补

In [None]:
# Fill missing pm2.5 by linear interpolation over PRES.
#
# Rows where pm2.5 is observed provide the (PRES, pm2.5) sample points.
observed_rows = data.iloc[non_nan_index]

# PRES values may repeat across rows. Feeding repeated x values to interp1d
# makes the fitted curve depend on row order, so pm2.5 is averaged per distinct
# PRES first. groupby sorts its keys, which gives unique, increasing x values.
pres_curve = observed_rows.groupby("PRES")["pm2.5"].mean()
x = pres_curve.index.to_numpy()
y = pres_curve.to_numpy()

# By default interp1d raises ValueError for queries outside [x.min(), x.max()].
# A missing row may carry a PRES that was never seen alongside an observed
# pm2.5, so such queries are clamped to the nearest edge value instead.
f = interpolate.interp1d(x, y, bounds_error=False, fill_value=(y[0], y[-1]))
y_new = f(data["PRES"].to_numpy()[nan_index])

# Write the estimates into an independent copy; the raw frame is untouched.
data_interpolation_PRES = data.copy(deep=True)
data_interpolation_PRES.loc[nan_index, 'pm2.5'] = y_new

#### 用变量Iws进行填补

In [None]:
# Fill missing pm2.5 by linear interpolation over Iws.
#
# Rows where pm2.5 is observed provide the (Iws, pm2.5) sample points.
observed_rows = data.iloc[non_nan_index]

# Iws values may repeat across rows. Feeding repeated x values to interp1d
# makes the fitted curve depend on row order, so pm2.5 is averaged per distinct
# Iws first. groupby sorts its keys, which gives unique, increasing x values.
iws_curve = observed_rows.groupby("Iws")["pm2.5"].mean()
x = iws_curve.index.to_numpy()
y = iws_curve.to_numpy()

# By default interp1d raises ValueError for queries outside [x.min(), x.max()].
# A missing row may carry an Iws that was never seen alongside an observed
# pm2.5, so such queries are clamped to the nearest edge value instead.
f = interpolate.interp1d(x, y, bounds_error=False, fill_value=(y[0], y[-1]))
y_new = f(data["Iws"].to_numpy()[nan_index])

# Write the estimates into an independent copy; the raw frame is untouched.
data_interpolation_Iws = data.copy(deep=True)
data_interpolation_Iws.loc[nan_index, 'pm2.5'] = y_new

> 因为 Is, Ir 变量存在很多0，故插值效果很差，这里就没有用这两个变量进行插值