In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_selection import mutual_info_regression
from joblib import Parallel, delayed
from tqdm import tqdm


In [2]:
# Read the training split from its parquet file.
train_path: str = "../kaggle/input/drw-crypto-market-prediction/train.parquet"
train_data = pd.read_parquet(train_path)

# Read the test split from its parquet file.
test_path: str = "../kaggle/input/drw-crypto-market-prediction/test.parquet"
test_data = pd.read_parquet(test_path)


In [3]:
# Remove zero-variance factors: a column with exactly one distinct value in the
# training data carries no information. The decision is made on the training
# split only, and the same columns are then removed from the test split.
unique_counts = train_data.nunique()
zero_var_cols = train_data.columns[unique_counts == 1]

train_data = train_data.drop(columns=zero_var_cols)
test_data = test_data.drop(columns=zero_var_cols)


In [4]:
# Prepare the data: the target is the 'label' column; the features are every
# other column, with missing values replaced by 0.
y = train_data['label']
X = train_data.drop(columns=['label']).fillna(0)

# Seed passed to mutual_info_regression so that the scores (and therefore the
# selected factors) are reproducible across runs.
MI_RANDOM_STATE = 42
# Number of top-ranked factors to keep.
N_SELECTED_FACTORS = 20


# Score each column separately so the work can be spread over joblib workers.
def compute_mi_for_column(col, random_state=MI_RANDOM_STATE):
    """Estimate the mutual information between one feature column and the target.

    Parameters
    ----------
    col : str
        Name of a column of the module-level feature frame ``X``.
    random_state : int, optional
        Seed forwarded to ``mutual_info_regression``; defaults to
        ``MI_RANDOM_STATE``.

    Returns
    -------
    float
        The mutual-information estimate for ``X[col]`` against ``y``.
    """
    # X[[col]] keeps the 2-D (n_samples, 1) shape; the result holds one score
    # per feature column, so element 0 is the score for `col`.
    return mutual_info_regression(
        X[[col]], y, discrete_features='auto', random_state=random_state
    )[0]


# tqdm wraps the iterable of column names to show a progress bar.
mi_scores = Parallel(n_jobs=-1)(
    delayed(compute_mi_for_column)(col) for col in tqdm(X.columns, desc="Computing Mutual Information")
)

# Collect the scores into a Series ranked from highest to lowest, then keep the
# names of the top-ranked factors (the Series is already sorted, so no re-sort).
mi_ranking = pd.Series(mi_scores, index=X.columns).sort_values(ascending=False)
selected_factor_names = mi_ranking.head(N_SELECTED_FACTORS).index.tolist()

Computing Mutual Information: 100%|██████████| 868/868 [06:20<00:00,  2.28it/s]


In [5]:
# Build the modelling inputs: the target column plus the selected factors,
# with missing values replaced by 0, for both splits.
y_train = train_data['label']
X_train = train_data[selected_factor_names].fillna(0)
y_test = test_data['label']
X_test = test_data[selected_factor_names].fillna(0)


import pandas as pd
import os

# Output directory; created if it does not exist yet.
output_dir = '../kaggle/input'
os.makedirs(output_dir, exist_ok=True)

# Write everything as parquet files without the index. The targets are Series,
# so they are first converted to single-column DataFrames named 'label'.
outputs = {
    'X_train.parquet': X_train,
    'X_test.parquet': X_test,
    'y_train.parquet': y_train.to_frame('label'),
    'y_test.parquet': y_test.to_frame('label'),
}
for file_name, frame in outputs.items():
    frame.to_parquet(os.path.join(output_dir, file_name), index=False)

In [6]:
# Print a summary of the selected training features (columns, non-null counts, dtypes).
X_train.info()


<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 525887 entries, 2023-03-01 00:00:00 to 2024-02-29 23:59:00
Data columns (total 20 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   X853    525887 non-null  float64
 1   X854    525887 non-null  float64
 2   X862    525887 non-null  float64
 3   X881    525887 non-null  float64
 4   X880    525887 non-null  float64
 5   X879    525887 non-null  float64
 6   X137    525887 non-null  float64
 7   X882    525887 non-null  float64
 8   X873    525887 non-null  float64
 9   X95     525887 non-null  float64
 10  X179    525887 non-null  float64
 11  X140    525887 non-null  float64
 12  X98     525887 non-null  float64
 13  X182    525887 non-null  float64
 14  X889    525887 non-null  float64
 15  X883    525887 non-null  float64
 16  X878    525887 non-null  float64
 17  X181    525887 non-null  float64
 18  X384    525887 non-null  float64
 19  X387    525887 non-null  float64
dtypes: float64(20)

In [7]:
# Keep the selected feature names as a plain Python list.
X_train_columns = list(X_train.columns)


In [8]:
# Display the list of selected feature names.
X_train_columns

['X853',
 'X854',
 'X862',
 'X881',
 'X880',
 'X879',
 'X137',
 'X882',
 'X873',
 'X95',
 'X179',
 'X140',
 'X98',
 'X182',
 'X889',
 'X883',
 'X878',
 'X181',
 'X384',
 'X387']

In [9]:
# NOTE: this entire cell is commented out and does not run.
# import pandas as pd
# import os

# # List of column names
# name = ['X534', 'X520', 'X866', 'X856', 'X635', 'X552', 'X632', 'X584', 'X763', 'X850', 'X783', 'X388', 'X524', 'X755', 'X647', 'X589', 'X868', 'X597', 'X795', 'X346', 'X819', 'X394', 'X839', 'X556', 'X739', 'X569', 'X515', 'X543', 'X815', 'X861', 'X585', 'X512', 'X557', 'X573', 'X851', 'X614', 'X546', 'X536', 'X617', 'X854', 'X145', 'X759', 'X142', 'X598', 'X505', 'X563', 'X577', 'X542', 'X527', 'X396', 'X862', 'X545', 'X863', 'X811', 'X735', 'X559', 'X791', 'X747', 'X595', 'X144', 'X519', 'X582', 'X522', 'X564', 'X551', 'X537', 'X855', 'X561', 'X593', 'X576', 'X587', 'X844', 'X143', 'X592', 'X849', 'X779', 'X571', 'X807', 'X97', 'X562', 'X568', 'X857', 'X503', 'X767', 'X540', 'X827', 'X495', 'X521', 'X847', 'X591', 'X541', 'X141', 'X586', 'X391', 'ask_qty', 'X775', 'X501', 'X247', 'X648', 'X860', 'X554', 'X535', 'X835', 'X496', 'X528', 'X566', 'X189', 'X578', 'X517', 'X859', 'X548', 'X544', 'label', 'X743', 'X771', 'X843', 'X550', 'X500', 'X623', 'X570', 'X507', 'X846', 'X508', 'X787', 'X499', 'X852', 'X498', 'X538', 'X858', 'X506', 'X555', 'X823', 'X620', 'X848', 'X590', 'X583', 'X549', 'X799', 'X547', 'X831', 'X853', 'X751', 'X803', 'X845', 'X865', 'X514', 'X390', 'X392', 'X575', 'X645', 'X502', 'X529', 'X644']

# # File paths
# X_train_path = "../kaggle/input/X_train.parquet"
# X_test_path = "../kaggle/input/X_test.parquet"

# # Check that the files exist
# for file_path in [X_train_path, X_test_path]:
#     if not os.path.exists(file_path):
#         print(f"错误: 文件 {file_path} 不存在")
#         exit(1)

# try:
#     # Read the files
#     X_train = pd.read_parquet(X_train_path)
#     X_test = pd.read_parquet(X_test_path)
    
#     # Get the column names that actually exist
#     X_train_columns = X_train.columns.tolist()
#     X_test_columns = X_test.columns.tolist()
    
#     # Filter the column names
#     X_train_common = [col for col in name if col in X_train_columns]
#     X_test_common = [col for col in name if col in X_test_columns]
    
#     # Filter the data
#     if X_train_common:
#         X_train = X_train[X_train_common].fillna(0)
#         print(f"X_train 筛选出 {len(X_train_common)} 列")
#     else:
#         print("警告: X_train 中未找到匹配的列")
    
#     if X_test_common:
#         X_test = X_test[X_test_common].fillna(0)
#         print(f"X_test 筛选出 {len(X_test_common)} 列")
#     else:
#         print("警告: X_test 中未找到匹配的列")
    
#     # Overwrite the saved files
#     X_train.to_parquet(X_train_path)
#     X_test.to_parquet(X_test_path)
    
#     print("操作完成，文件已成功覆盖保存")

# except Exception as e:
#     print(f"发生错误: {e}")

# X_train.shape

In [10]:
# Display the column names of X_train as a plain list.
X_train.columns.tolist()

['X853',
 'X854',
 'X862',
 'X881',
 'X880',
 'X879',
 'X137',
 'X882',
 'X873',
 'X95',
 'X179',
 'X140',
 'X98',
 'X182',
 'X889',
 'X883',
 'X878',
 'X181',
 'X384',
 'X387']