In [29]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder,normalize
from sklearn.ensemble import GradientBoostingClassifier,RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
import imblearn
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
import xgboost
import inspect
from collections import defaultdict
from tabpfn import TabPFNClassifier
import warnings

# Notebook-wide configuration: silence every warning and lift pandas'
# display truncation so wide/long frames are printed in full.
warnings.filterwarnings("ignore")
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

In [30]:
# Load the four competition tables (train, test, sample submission, greeks).
train, test, sample, greeks = (
    pd.read_csv(csv_path)
    for csv_path in (
        './icr-identify-age-related-conditions/train.csv',
        './icr-identify-age-related-conditions/test.csv',
        './icr-identify-age-related-conditions/sample_submission.csv',
        './icr-identify-age-related-conditions/greeks.csv',
    )
)

# Encode EJ as 0/1: 1 where the value equals the first category that
# appears in the training data, 0 otherwise. The same reference category is
# applied to the test set so both frames share one encoding.
first_category = train['EJ'].unique()[0]
for frame in (train, test):
    frame['EJ'] = (frame['EJ'] == first_category).astype('int')

In [31]:
# Per-column divisors. Each listed column is divided by its divisor and then
# rounded to one decimal place.
# NOTE(review): judging by the name, the divisors were presumably chosen so the
# rescaled values land on (near-)integer steps -- confirm their provenance.
# Some keys carry a trailing space ('BD ', 'CD ', 'CW ', 'FD '); they are
# looked up verbatim as column labels below.
int_denominators = {
    'AB': 0.004273, 'AF': 0.00242, 'AH': 0.008709, 'AM': 0.003097,
    'AR': 0.005244, 'AX': 0.008859, 'AY': 0.000609, 'AZ': 0.006302,
    'BC': 0.007028, 'BD ': 0.00799, 'BN': 0.3531, 'BP': 0.004239,
    'BQ': 0.002605, 'BR': 0.006049, 'BZ': 0.004267, 'CB': 0.009191,
    'CC': 6.12e-06, 'CD ': 0.007928, 'CF': 0.003041, 'CH': 0.000398,
    'CL': 0.006365, 'CR': 7.5e-05, 'CS': 0.003487, 'CU': 0.005517,
    'CW ': 9.2e-05, 'DA': 0.00388, 'DE': 0.004435, 'DF': 0.000351,
    'DH': 0.002733, 'DI': 0.003765, 'DL': 0.00212, 'DN': 0.003412,
    'DU': 0.0013794, 'DV': 0.00259, 'DY': 0.004492, 'EB': 0.007068,
    'EE': 0.004031, 'EG': 0.006025, 'EH': 0.006084, 'EL': 0.000429,
    'EP': 0.009269, 'EU': 0.005064, 'FC': 0.005712, 'FD ': 0.005937,
    'FE': 0.007486, 'FI': 0.005513, 'FR': 0.00058, 'FS': 0.006773,
    'GB': 0.009302, 'GE': 0.004417, 'GF': 0.004374, 'GH': 0.003721,
    'GI': 0.002572,
}

# Apply the same rescaling to train and test, overwriting the columns in place.
for column, denominator in int_denominators.items():
    for frame in (train, test):
        frame[column] = (frame[column] / denominator).round(1)

# ff = ['Id', 'AH', 'BN', 'BQ', 'CB', 'CC', 'CR', 'CU', 'DA', 'DE', 'DN', 'EE', 'EP', 'FI', 'GF', 'Class']
# gg = ['Id', 'AH', 'BN', 'BQ', 'CB', 'CC', 'CR', 'CU', 'DA', 'DE', 'DN', 'EE', 'EP', 'FI', 'GF']
# train = train[ff]
# test = test[gg]

In [32]:
# Every column except the target ('Class') and the row identifier ('Id') is
# treated as a feature to impute.
columns_to_select = [name for name in train.columns if name not in ('Class', 'Id')]

# Fill missing values with the per-column median. The medians are learned on
# the training features only and then re-used for the test features.
Imp = SimpleImputer(missing_values=np.nan, strategy='median')
train_data = pd.DataFrame(
    Imp.fit_transform(train[columns_to_select]),
    columns=columns_to_select,
)
test_data = pd.DataFrame(
    Imp.transform(test[columns_to_select]),
    columns=columns_to_select,
)

# Re-attach the untouched 'Id' (and, for train, 'Class') columns around the
# imputed features.
train_filled = pd.concat([train['Id'], train_data, train['Class']], axis=1)
test_filled = pd.concat([test['Id'], test_data], axis=1)

# From here on `train` / `test` refer to the imputed frames.
train = train_filled.copy()
test = test_filled.copy()
# print(train.shape, test.shape)

In [33]:
from sklearn.cluster import KMeans

# Discretise the BN column into bins whose edges are derived from k-means
# cluster centres; the result is stored in a new 'BN_binning' column while the
# original 'BN' column is kept.

# Number of clusters to fit.
k = 5

# Cluster BN over train and test together so both frames share one set of edges.
BNpd = pd.concat([train['BN'], test['BN']], axis=0, ignore_index=True)
data = BNpd.values.reshape(-1, 1)

# random_state / n_init are pinned so the bin edges are reproducible across
# runs and scikit-learn versions.
kmodel = KMeans(n_clusters=k, n_init=10, random_state=1)
kmodel.fit(data)

# Sorted cluster centres with 0 prepended, so the first split point lies
# halfway between 0 and the smallest centre.
centers = np.sort(np.append(kmodel.cluster_centers_.ravel(), 0.0))

# Split points are the midpoints between neighbouring centres. This is computed
# vectorised on purpose: the former `c.iloc[i]['cc'] = ...` loop was a chained
# assignment, which only updates the frame when pandas returns a view and is a
# silent no-op under Copy-on-Write.
split_points = (centers[:-1] + centers[1:]) / 2

# Outer edges: 0 at the bottom and five times the largest observed BN at the top.
upper_edge = max(train['BN'].max(), test['BN'].max()) * 5
c = pd.Series(np.concatenate(([0.0], split_points, [upper_edge])), name='cc')

# Round edges to integers and drop duplicates created by rounding
# (pd.cut requires strictly increasing edges).
c = c.round().astype(int).unique()
range_num = c.shape[0] - 1
c = c.tolist()

# Keep the old BN column and add BN_binning (integer-labelled categorical).
train_BN = train['BN'].values
train_binning = pd.cut(train_BN, c, labels=range(range_num), include_lowest=True)
train['BN_binning'] = train_binning

test_BN = test['BN'].values
test_binning = pd.cut(test_BN, c, labels=range(range_num), include_lowest=True)
test['BN_binning'] = test_binning

In [None]:
# Feature matrix (everything but the identifier and the target) and the
# binary target taken from the training frame.
X = train.drop(['Id', 'Class'], axis=1)
y = train['Class']

# 'Alpha' labels from the greeks table as a NumPy array.
# NOTE(review): presumably row-aligned with `train` -- confirm before use.
y_meta = greeks['Alpha'].values

In [34]:
# Recursive feature elimination (RFE): find the features that can be dropped.

from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier

y = train['Class']
X = train.drop(columns=['Id', 'Class'])

# Rank features with a seeded random forest and keep the 45 best.
model = RandomForestClassifier(random_state=1)
rfe = RFE(model, n_features_to_select=45).fit(X, y)

all_features = X.columns
features_selected = all_features[rfe.support_]

# Everything RFE eliminated.
not_selected_features = set(all_features) - set(features_selected)

print(features_selected)
print(not_selected_features)

# Reduce train and test to the selected columns (returned as NumPy arrays).
X_new = rfe.transform(X)
test_new = rfe.transform(test.drop(columns=['Id']))

Index(['AB', 'AF', 'AM', 'AX', 'AZ', 'BC', 'BD ', 'BN', 'BP', 'BQ', 'BR', 'CB',
       'CC', 'CD ', 'CF', 'CH', 'CR', 'CS', 'CU', 'DA', 'DE', 'DF', 'DH', 'DI',
       'DL', 'DN', 'DU', 'DY', 'EB', 'EE', 'EG', 'EH', 'EL', 'EP', 'FC', 'FD ',
       'FE', 'FI', 'FL', 'FR', 'GB', 'GE', 'GF', 'GH', 'GL'],
      dtype='object')
{'EU', 'EJ', 'AY', 'BZ', 'AH', 'FS', 'CW ', 'GI', 'CL', 'BN_binning', 'AR', 'DV'}


In [35]:
# Use linear regression on degree-2 polynomial expansions to find feature pairs
# that are candidates for polynomial / interaction features.

import itertools
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression

# errors='ignore' keeps this cell re-runnable: on a second run 'Id' is already
# gone and a plain drop would raise KeyError.
train = train.drop(columns=['Id'], errors='ignore')

# All remaining columns (features plus the 'Class' target).
all_features = train.columns

# Minimum in-sample R^2 of the pairwise polynomial fit for a pair to be reported.
R2_THRESHOLD = 0.26

# Only real features are paired up; the target is excluded up front instead of
# fitting models on it and discarding them afterwards.
candidate_features = [name for name in all_features if name != 'Class']

# Collected (feature_1, feature_2, label) rows for pairs above the threshold.
result = []

# Visit every unordered pair of features.
for feature_1, feature_2 in itertools.combinations(candidate_features, 2):
    # Two-column frame holding just this pair.
    features_df = train[[feature_1, feature_2]]

    # Degree-2 expansion: x1, x2, x1^2, x1*x2, x2^2 (no bias column).
    poly = PolynomialFeatures(degree=2, include_bias=False)
    poly_features = poly.fit_transform(features_df)

    # Fit a linear model of the target on the expanded pair.
    model = LinearRegression()
    model.fit(poly_features, train['Class'])

    # The score is R^2 on the same data the model was fitted on, so it is a
    # screening heuristic rather than a validated estimate.
    if model.score(poly_features, train['Class']) > R2_THRESHOLD:
        result.append((feature_1, feature_2, '非线性/交互关系'))

# Convert the hits to a DataFrame and show them.
result_df = pd.DataFrame(result, columns=['Feature 1', 'Feature 2', '关系类型'])

print(result_df)


  Feature 1 Feature 2      关系类型
0        AF        DU  非线性/交互关系
1        DI        DU  非线性/交互关系
