导入所需包，载入数据。

In [1]:
import pandas as pd
import numpy as np
import pickle
import preprocessing
from datetime import datetime

In [2]:
def _load_pickle(path):
    """Return the object stored in the pickle file at ``path``."""
    with open(path, 'rb') as pickle_file:
        return pickle.load(pickle_file)


# Raw inputs for the whole notebook. pickle.load can execute arbitrary code,
# so these files must come from a trusted source.
train_data_raw = _load_pickle('./data/train_data.pkl')
test_data_raw = _load_pickle('./data/test_data.pkl')

划分训练集和验证集。

In [3]:
# Chronological hold-out: rows whose date falls in the earliest ~90% of the
# distinct dates stay in training, the remaining dates form the validation set.
#
# The cut-off has to be an actual value of the 'date' column. Using the raw
# count round(n_dates * 0.9) as the threshold is only correct when the dates
# happen to be the consecutive integers 0..n-1; for that layout the value
# computed below is the same.
unique_dates = train_data_raw['date'].drop_duplicates().sort_values()
n_dates = unique_dates.shape[0]
# Clamp so that at least the last date always lands in the validation set
# (round(n * 0.9) equals n for very small n).
split_idx = min(round(n_dates * 0.9), n_dates - 1)
split_date = unique_dates.iloc[split_idx]

valid_data_raw = train_data_raw.loc[train_data_raw['date'] >= split_date]
# NOTE: this rebinds train_data_raw to the training part only, so re-running
# this cell without re-running the loading cell would split the already
# reduced frame again.
train_data_raw = train_data_raw.loc[train_data_raw['date'] < split_date]

In [4]:
def _split_features_target(frame):
    """Return ``(features, target)`` from ``frame``, both with a fresh 0..n-1 index.

    The target is the 'y' column; the features are every other column.
    """
    features = frame.drop(columns=['y']).reset_index(drop=True)
    target = frame['y'].reset_index(drop=True)
    return features, target


X_train_raw, y_train = _split_features_target(train_data_raw)
X_valid_raw, y_valid = _split_features_target(valid_data_raw)
# No 'y' column is dropped from the test data here; only its index is reset.
X_test_raw = test_data_raw.reset_index(drop=True)

预处理1：f_6是类别变量，因此先对f_6做target encoding：在训练集上拟合编码器，再用同一个编码器对验证集和测试集进行编码。

In [7]:
# Wall-clock timing of the encoding step.
start_time = datetime.now()
print("开始时间为:", start_time)

# The call on the training split receives the target and returns the encoder
# alongside the encoded features.
X_train_encoded, target_encoder = preprocessing.target_encode(X_train_raw, 'f_6', y_train)
# Validation and test splits are passed that same encoder instead of a target.
X_valid_encoded, X_test_encoded = (
    preprocessing.target_encode(raw_features, 'f_6', encoder=target_encoder)
    for raw_features in (X_valid_raw, X_test_raw)
)

end_time = datetime.now()
print("结束时间为:", end_time)
print("处理时间为：", end_time - start_time)

开始时间为: 2024-04-10 11:15:32.144371
结束时间为: 2024-04-10 11:15:35.684821
处理时间为： 0:00:03.540450


预处理2-1：对训练集、验证集和测试集编码后，先在截面上用KNN填补缺失值，再按训练集得到的上下界进行缩尾，最后在截面上做Z-Score标准化；所得数据用于不能处理缺失值的模型。

In [8]:
# Wall-clock timing of the "filled" preprocessing branch.
start_time = datetime.now()
print("开始时间为:", start_time)

# Stage 1: KNN imputation, called once per split (train, valid, test in order).
X_train_filled, X_valid_filled, X_test_filled = (
    preprocessing.imputer_KNN(encoded)
    for encoded in (X_train_encoded, X_valid_encoded, X_test_encoded)
)

# Stage 2: winsorizing. The call on the training split also returns the
# lower/upper bounds, which are then passed to the calls on the other splits.
X_train_filled_win, lower_bound_filled, upper_bound_filled = preprocessing.winsorize_X(X_train_filled)
X_valid_filled_win, X_test_filled_win = (
    preprocessing.winsorize_X(filled, lower_bound_filled, upper_bound_filled)
    for filled in (X_valid_filled, X_test_filled)
)

# Stage 3: z-score standardization, called independently on each split.
X_train_filled_cleaned, X_valid_filled_cleaned, X_test_filled_cleaned = (
    preprocessing.zscore_standardization(winsorized)
    for winsorized in (X_train_filled_win, X_valid_filled_win, X_test_filled_win)
)

end_time = datetime.now()
print("结束时间为:", end_time)
print("处理时间为：", end_time - start_time)

开始时间为: 2024-04-10 11:15:37.869860
结束时间为: 2024-04-10 11:30:19.263335
处理时间为： 0:14:41.393475


预处理2-2：对训练集、验证集和测试集编码后，不填补缺失值，直接按训练集得到的上下界进行缩尾，并在截面上做Z-Score标准化；所得数据用于可以处理缺失值的模型。

In [9]:
# Wall-clock timing of the "unfilled" preprocessing branch (no imputation).
start_time = datetime.now()
print("开始时间为:", start_time)

# Winsorizing starts directly from the encoded features. The call on the
# training split returns the bounds that the other two calls receive.
X_train_unfilled_win, lower_bound_unfilled, upper_bound_unfilled = preprocessing.winsorize_X(X_train_encoded)
X_valid_unfilled_win, X_test_unfilled_win = (
    preprocessing.winsorize_X(encoded, lower_bound_unfilled, upper_bound_unfilled)
    for encoded in (X_valid_encoded, X_test_encoded)
)

# Z-score standardization, called independently on each split.
X_train_unfilled_cleaned, X_valid_unfilled_cleaned, X_test_unfilled_cleaned = (
    preprocessing.zscore_standardization(winsorized)
    for winsorized in (X_train_unfilled_win, X_valid_unfilled_win, X_test_unfilled_win)
)

end_time = datetime.now()
print("结束时间为:", end_time)
print("处理时间为：", end_time - start_time)

开始时间为: 2024-04-10 11:30:19.319911
结束时间为: 2024-04-10 11:33:34.068937
处理时间为： 0:03:14.749026


预处理3：对于填充了缺失值的数据，用互信息法尝试进行特征选择，选出10个特征，留作与未经选择的特征的效果进行比较。

In [10]:
# Wall-clock timing of the feature-selection step.
start_time = datetime.now()
print("开始时间为:", start_time)

# The call on the training split receives the target and returns the selector
# alongside the reduced feature set.
X_train_filled_selected, mi_selector = preprocessing.mutual_info_selection(X_train_filled_cleaned, y_train)
# Validation and test splits are passed that same selector instead of a target.
X_valid_filled_selected, X_test_filled_selected = (
    preprocessing.mutual_info_selection(cleaned, selector=mi_selector)
    for cleaned in (X_valid_filled_cleaned, X_test_filled_cleaned)
)

end_time = datetime.now()
print("结束时间为:", end_time)
print("处理时间为：", end_time - start_time)

开始时间为: 2024-04-10 11:33:34.104939
结束时间为: 2024-04-10 12:02:55.749086
处理时间为： 0:29:21.644147


保存所得数据。

In [11]:
# Every object to persist, keyed by the stem of its output file.
# Each entry is written to ./data/<name>.pkl, in the order listed here.
# A single mapping replaces eleven copy-pasted open/dump stanzas, so a new
# artefact is one extra line and a name can never drift from its file.
datasets_to_save = {
    'X_train_filled_cleaned': X_train_filled_cleaned,
    'X_valid_filled_cleaned': X_valid_filled_cleaned,
    'X_test_filled_cleaned': X_test_filled_cleaned,
    'X_train_unfilled_cleaned': X_train_unfilled_cleaned,
    'X_valid_unfilled_cleaned': X_valid_unfilled_cleaned,
    'X_test_unfilled_cleaned': X_test_unfilled_cleaned,
    'X_train_filled_selected': X_train_filled_selected,
    'X_valid_filled_selected': X_valid_filled_selected,
    'X_test_filled_selected': X_test_filled_selected,
    'y_train': y_train,
    'y_valid': y_valid,
}

for dataset_name, dataset in datasets_to_save.items():
    with open(f'./data/{dataset_name}.pkl', 'wb') as f:
        pickle.dump(dataset, f)