In [1]:
#csvファイルの読み込み
import csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
df_train = pd.read_csv(r"C:\Users\eikik\python\python_dat\equity-post-HCT-survival-predictions\train.csv")

In [2]:
#データの確認
df_train.head()

Unnamed: 0,ID,dri_score,psych_disturb,cyto_score,diabetes,hla_match_c_high,hla_high_res_8,tbi_status,arrhythmia,hla_low_res_6,...,tce_div_match,donor_related,melphalan_dose,hla_low_res_8,cardiac,hla_match_drb1_high,pulm_moderate,hla_low_res_10,efs,efs_time
0,0,N/A - non-malignant indication,No,,No,,,No TBI,No,6.0,...,,Unrelated,"N/A, Mel not given",8.0,No,2.0,No,10.0,0.0,42.356
1,1,Intermediate,No,Intermediate,No,2.0,8.0,"TBI +- Other, >cGy",No,6.0,...,Permissive mismatched,Related,"N/A, Mel not given",8.0,No,2.0,Yes,10.0,1.0,4.672
2,2,N/A - non-malignant indication,No,,No,2.0,8.0,No TBI,No,6.0,...,Permissive mismatched,Related,"N/A, Mel not given",8.0,No,2.0,No,10.0,0.0,19.793
3,3,High,No,Intermediate,No,2.0,8.0,No TBI,No,6.0,...,Permissive mismatched,Unrelated,"N/A, Mel not given",8.0,No,2.0,No,10.0,0.0,102.349
4,4,High,No,,No,2.0,8.0,No TBI,No,6.0,...,Permissive mismatched,Related,MEL,8.0,No,2.0,No,10.0,0.0,16.223


In [3]:
#データの次元の確認
df_train.shape

(28800, 60)

In [4]:
#データの情報
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28800 entries, 0 to 28799
Data columns (total 60 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   ID                      28800 non-null  int64  
 1   dri_score               28646 non-null  object 
 2   psych_disturb           26738 non-null  object 
 3   cyto_score              20732 non-null  object 
 4   diabetes                26681 non-null  object 
 5   hla_match_c_high        24180 non-null  float64
 6   hla_high_res_8          22971 non-null  float64
 7   tbi_status              28800 non-null  object 
 8   arrhythmia              26598 non-null  object 
 9   hla_low_res_6           25530 non-null  float64
 10  graft_type              28800 non-null  object 
 11  vent_hist               28541 non-null  object 
 12  renal_issue             26885 non-null  object 
 13  pulm_severe             26665 non-null  object 
 14  prim_disease_hct        28800 non-null

In [5]:
#dri_scoreのデータ欠損割合を表示
df_train["dri_score"].isnull().sum()/df_train.shape[0]

0.005347222222222222

In [7]:
#欠損値割合が小さいので、driver_scoreの欠損値を試しに最頻値で埋めてみる

# 最頻値を計算
dri_score_mode = df_train['dri_score'].mode()[0]

# 欠損値を最頻値で補完
df_train['dri_score'] = df_train['dri_score'].fillna(dri_score_mode)

# 補完後の欠損値の確認
missing_after = df_train['dri_score'].isnull().sum()
print(f"補完後の'dri_score'の欠損値数: {missing_after}")


補完後の'dri_score'の欠損値数: 0


In [8]:
#df_trainのデータの欠損割合の表示
missing_rate = df_train.isnull().sum() / len(df_train)
#多い順に並べ替え
missing_rate = missing_rate.sort_values(ascending=False)
missing_rate = missing_rate[missing_rate > 0]
missing_rate


tce_match                 0.659583
mrd_hct                   0.576285
cyto_score_detail         0.413993
tce_div_match             0.395694
tce_imm_match             0.386562
cyto_score                0.280139
hla_high_res_10           0.248715
hla_high_res_8            0.202396
hla_high_res_6            0.183472
hla_match_dqb1_high       0.180521
hla_low_res_10            0.175833
conditioning_intensity    0.166285
hla_match_c_high          0.160417
hla_match_a_high          0.149340
hla_nmdp_6                0.145729
hla_match_dqb1_low        0.145625
hla_match_b_high          0.141944
hla_low_res_8             0.126840
hla_match_drb1_high       0.116389
hla_low_res_6             0.113542
hla_match_c_low           0.097222
hla_match_drb1_low        0.091771
hla_match_b_low           0.089063
cardiac                   0.088264
peptic_ulcer              0.083993
hla_match_a_low           0.082986
arrhythmia                0.076458
rheum_issue               0.075799
rituximab           

In [9]:
import lifelines
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier, CatBoostRegressor
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from lifelines import KaplanMeierFitter
from sklearn.metrics import mean_squared_error
import warnings

In [10]:
# 生存確率を計算する関数
# efs：無イベント生存率
# efs_time：無イベント生存までの時間
def transform_survival_probability(df, time_col='efs_time', event_col='efs'):
    kmf = KaplanMeierFitter() # インスタンスを作成
    kmf.fit(df[time_col], event_observed=df[event_col]) # カプラン・マイヤー推定量にデータをフィット
    survival_probabilities = kmf.survival_function_at_times(df[time_col]).values.flatten()
    censored_mask = df[event_col] == 0
    return survival_probabilities

In [11]:
# 生存確率を目的変数として代入
df_train["target"] = transform_survival_probability(df_train, time_col='efs_time', event_col='efs')
df_train.head(  )

Unnamed: 0,ID,dri_score,psych_disturb,cyto_score,diabetes,hla_match_c_high,hla_high_res_8,tbi_status,arrhythmia,hla_low_res_6,...,donor_related,melphalan_dose,hla_low_res_8,cardiac,hla_match_drb1_high,pulm_moderate,hla_low_res_10,efs,efs_time,target
0,0,N/A - non-malignant indication,No,,No,,,No TBI,No,6.0,...,Unrelated,"N/A, Mel not given",8.0,No,2.0,No,10.0,0.0,42.356,0.458687
1,1,Intermediate,No,Intermediate,No,2.0,8.0,"TBI +- Other, >cGy",No,6.0,...,Related,"N/A, Mel not given",8.0,No,2.0,Yes,10.0,1.0,4.672,0.847759
2,2,N/A - non-malignant indication,No,,No,2.0,8.0,No TBI,No,6.0,...,Related,"N/A, Mel not given",8.0,No,2.0,No,10.0,0.0,19.793,0.462424
3,3,High,No,Intermediate,No,2.0,8.0,No TBI,No,6.0,...,Unrelated,"N/A, Mel not given",8.0,No,2.0,No,10.0,0.0,102.349,0.456661
4,4,High,No,,No,2.0,8.0,No TBI,No,6.0,...,Related,MEL,8.0,No,2.0,No,10.0,0.0,16.223,0.464674


In [12]:
# 不要になったカラム[ID,efs,efs_time]を削除
drop_cols = ["ID", 'efs', 'efs_time']

# drop_colsに含まれるカラムを削除
df_train = df_train.drop(columns=[col for col in drop_cols if col in df_train.columns])
df_train.head()

Unnamed: 0,dri_score,psych_disturb,cyto_score,diabetes,hla_match_c_high,hla_high_res_8,tbi_status,arrhythmia,hla_low_res_6,graft_type,...,hepatic_mild,tce_div_match,donor_related,melphalan_dose,hla_low_res_8,cardiac,hla_match_drb1_high,pulm_moderate,hla_low_res_10,target
0,N/A - non-malignant indication,No,,No,,,No TBI,No,6.0,Bone marrow,...,No,,Unrelated,"N/A, Mel not given",8.0,No,2.0,No,10.0,0.458687
1,Intermediate,No,Intermediate,No,2.0,8.0,"TBI +- Other, >cGy",No,6.0,Peripheral blood,...,No,Permissive mismatched,Related,"N/A, Mel not given",8.0,No,2.0,Yes,10.0,0.847759
2,N/A - non-malignant indication,No,,No,2.0,8.0,No TBI,No,6.0,Bone marrow,...,No,Permissive mismatched,Related,"N/A, Mel not given",8.0,No,2.0,No,10.0,0.462424
3,High,No,Intermediate,No,2.0,8.0,No TBI,No,6.0,Bone marrow,...,Yes,Permissive mismatched,Unrelated,"N/A, Mel not given",8.0,No,2.0,No,10.0,0.456661
4,High,No,,No,2.0,8.0,No TBI,No,6.0,Peripheral blood,...,No,Permissive mismatched,Related,MEL,8.0,No,2.0,No,10.0,0.464674


欠損値を埋める基準として以下のものを考える。

・欠損割合が0.5を超えるものは削除
・欠損割合が0.1を下回るものは最頻値で補完（全体に対する影響が小さいと判断）
・欠損割合が上記の間に入っていて、かつデータ型がintかfloatのものは、特徴量間で相関係数を計算する。
その相関係数が0.8以上くらいで高ければ、単回帰分析をして、それぞれ欠損値を補完
・上記のいずれにも入らないものは別途考える。(knnで補完してみたかったが、エラーになってしまったので、大体最頻値になってしまった)

In [13]:
#欠損の割合が0.5以上の列を削除
df_train = df_train.dropna(thresh=len(df_train)*0.5, axis=1)

In [14]:
#欠損の割合が0.10以下の列を抽出
low_missing_rate = missing_rate[missing_rate < 0.10]
low_missing_rate

hla_match_c_low       0.097222
hla_match_drb1_low    0.091771
hla_match_b_low       0.089063
cardiac               0.088264
peptic_ulcer          0.083993
hla_match_a_low       0.082986
arrhythmia            0.076458
rheum_issue           0.075799
rituximab             0.074583
pulm_severe           0.074132
diabetes              0.073576
psych_disturb         0.071597
pulm_moderate         0.071076
hepatic_mild          0.066562
renal_issue           0.066493
hepatic_severe        0.064965
donor_age             0.062778
obesity               0.061111
prior_tumor           0.058264
melphalan_dose        0.048785
karnofsky_score       0.030208
cmv_status            0.022014
ethnicity             0.020382
comorbidity_score     0.016563
sex_match             0.009062
vent_hist             0.008993
in_vivo_tcd           0.007812
gvhd_proph            0.007812
donor_related         0.005486
dtype: float64

In [15]:
#low_missing_rateの列は最頻値で補完
for column in low_missing_rate.index:
    mode = df_train[column].mode()[0]
    df_train[column] = df_train[column].fillna(mode)

In [16]:
#更新したdf_trainの欠損率の確認
missing_rate = df_train.isnull().sum() / len(df_train)

#欠損率が0.5以上のもの、0.1以下のものがないことを確認
missing_rate_2 = missing_rate[missing_rate > 0]
missing_rate_2 = missing_rate_2.sort_values(ascending=False)
missing_rate_2


cyto_score_detail         0.413993
tce_div_match             0.395694
tce_imm_match             0.386562
cyto_score                0.280139
hla_high_res_10           0.248715
hla_high_res_8            0.202396
hla_high_res_6            0.183472
hla_match_dqb1_high       0.180521
hla_low_res_10            0.175833
conditioning_intensity    0.166285
hla_match_c_high          0.160417
hla_match_a_high          0.149340
hla_nmdp_6                0.145729
hla_match_dqb1_low        0.145625
hla_match_b_high          0.141944
hla_low_res_8             0.126840
hla_match_drb1_high       0.116389
hla_low_res_6             0.113542
dtype: float64

In [17]:
# df_trainからmissing_rateの列を抽出
missing_rate_columns = missing_rate_2.index

# missing_rate_columnsの列のうち、df_trainから同じ列名の列を抽出
df_missing_rate = df_train[missing_rate_columns]

df_missing_rate

Unnamed: 0,cyto_score_detail,tce_div_match,tce_imm_match,cyto_score,hla_high_res_10,hla_high_res_8,hla_high_res_6,hla_match_dqb1_high,hla_low_res_10,conditioning_intensity,hla_match_c_high,hla_match_a_high,hla_nmdp_6,hla_match_dqb1_low,hla_match_b_high,hla_low_res_8,hla_match_drb1_high,hla_low_res_6
0,,,,,,,6.0,2.0,10.0,,,2.0,6.0,2.0,2.0,8.0,2.0,6.0
1,Intermediate,Permissive mismatched,P/P,Intermediate,10.0,8.0,6.0,2.0,10.0,MAC,2.0,2.0,6.0,2.0,2.0,8.0,2.0,6.0
2,,Permissive mismatched,P/P,,10.0,8.0,6.0,2.0,10.0,,2.0,2.0,6.0,2.0,2.0,8.0,2.0,6.0
3,Intermediate,Permissive mismatched,P/P,Intermediate,10.0,8.0,6.0,2.0,10.0,MAC,2.0,2.0,6.0,2.0,2.0,8.0,2.0,6.0
4,,Permissive mismatched,,,10.0,8.0,6.0,2.0,10.0,MAC,2.0,2.0,5.0,2.0,2.0,8.0,2.0,6.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28795,Intermediate,Bi-directional non-permissive,P/P,Favorable,10.0,8.0,6.0,2.0,10.0,MAC,2.0,2.0,6.0,2.0,2.0,8.0,2.0,6.0
28796,TBD,GvH non-permissive,G/G,Poor,6.0,4.0,3.0,2.0,8.0,RIC,1.0,1.0,4.0,2.0,1.0,6.0,1.0,5.0
28797,Poor,GvH non-permissive,G/G,Poor,10.0,8.0,6.0,2.0,10.0,MAC,2.0,2.0,6.0,2.0,2.0,8.0,2.0,6.0
28798,,Permissive mismatched,P/P,Poor,5.0,4.0,3.0,1.0,5.0,NMA,1.0,1.0,3.0,1.0,1.0,4.0,1.0,3.0


In [18]:
# 数値型の列のみを選択
numeric_df_train = df_missing_rate.select_dtypes(include=[np.number])

numeric_df_train

Unnamed: 0,hla_high_res_10,hla_high_res_8,hla_high_res_6,hla_match_dqb1_high,hla_low_res_10,hla_match_c_high,hla_match_a_high,hla_nmdp_6,hla_match_dqb1_low,hla_match_b_high,hla_low_res_8,hla_match_drb1_high,hla_low_res_6
0,,,6.0,2.0,10.0,,2.0,6.0,2.0,2.0,8.0,2.0,6.0
1,10.0,8.0,6.0,2.0,10.0,2.0,2.0,6.0,2.0,2.0,8.0,2.0,6.0
2,10.0,8.0,6.0,2.0,10.0,2.0,2.0,6.0,2.0,2.0,8.0,2.0,6.0
3,10.0,8.0,6.0,2.0,10.0,2.0,2.0,6.0,2.0,2.0,8.0,2.0,6.0
4,10.0,8.0,6.0,2.0,10.0,2.0,2.0,5.0,2.0,2.0,8.0,2.0,6.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
28795,10.0,8.0,6.0,2.0,10.0,2.0,2.0,6.0,2.0,2.0,8.0,2.0,6.0
28796,6.0,4.0,3.0,2.0,8.0,1.0,1.0,4.0,2.0,1.0,6.0,1.0,5.0
28797,10.0,8.0,6.0,2.0,10.0,2.0,2.0,6.0,2.0,2.0,8.0,2.0,6.0
28798,5.0,4.0,3.0,1.0,5.0,1.0,1.0,3.0,1.0,1.0,4.0,1.0,3.0


In [19]:
# 変数間の相関係数を計算
correlation = numeric_df_train.corr()
#変数の相関係数が0.8以上の組み合わせを抽出
high_correlation = correlation[correlation > 0.8]
#high_correlationの組み合わせを表示
high_correlation


Unnamed: 0,hla_high_res_10,hla_high_res_8,hla_high_res_6,hla_match_dqb1_high,hla_low_res_10,hla_match_c_high,hla_match_a_high,hla_nmdp_6,hla_match_dqb1_low,hla_match_b_high,hla_low_res_8,hla_match_drb1_high,hla_low_res_6
hla_high_res_10,1.0,0.986125,0.968703,0.808341,0.929522,0.853098,0.804656,0.886625,,0.872777,0.926091,0.87995,0.912479
hla_high_res_8,0.986125,1.0,0.983351,,0.922255,0.858282,0.822836,0.88325,,0.8838,0.922293,0.879812,0.909537
hla_high_res_6,0.968703,0.983351,1.0,,0.902408,,0.854499,0.870521,,0.884266,0.903878,0.886769,0.899229
hla_match_dqb1_high,0.808341,,,1.0,,,,,,,,,
hla_low_res_10,0.929522,0.922255,0.902408,,1.0,,,0.890334,,,0.98543,0.821292,0.968904
hla_match_c_high,0.853098,0.858282,,,,1.0,,,,,,,
hla_match_a_high,0.804656,0.822836,0.854499,,,,1.0,,,,,,
hla_nmdp_6,0.886625,0.88325,0.870521,,0.890334,,,1.0,,,0.889669,,0.882176
hla_match_dqb1_low,,,,,,,,,1.0,,,,
hla_match_b_high,0.872777,0.8838,0.884266,,,,,,,1.0,,,


In [20]:
for column in high_correlation.columns:
    # 自分自身を除いた相関を取得
    other_corr = high_correlation[column].drop(column)
    max_corr = other_corr.max()
    most_correlated_feature = other_corr.idxmax()  # 最大相関の特徴量を取得
    print(f"{column}と相関が最も高い特徴量: {most_correlated_feature} (相関係数: {max_corr})")


hla_high_res_10と相関が最も高い特徴量: hla_high_res_8 (相関係数: 0.9861252716917301)
hla_high_res_8と相関が最も高い特徴量: hla_high_res_10 (相関係数: 0.9861252716917301)
hla_high_res_6と相関が最も高い特徴量: hla_high_res_8 (相関係数: 0.9833507797663403)
hla_match_dqb1_highと相関が最も高い特徴量: hla_high_res_10 (相関係数: 0.8083406740540625)
hla_low_res_10と相関が最も高い特徴量: hla_low_res_8 (相関係数: 0.9854304122808194)
hla_match_c_highと相関が最も高い特徴量: hla_high_res_8 (相関係数: 0.8582820487584473)
hla_match_a_highと相関が最も高い特徴量: hla_high_res_6 (相関係数: 0.8544986780482423)
hla_nmdp_6と相関が最も高い特徴量: hla_low_res_10 (相関係数: 0.8903341252416828)
hla_match_dqb1_lowと相関が最も高い特徴量: nan (相関係数: nan)
hla_match_b_highと相関が最も高い特徴量: hla_high_res_6 (相関係数: 0.8842655963868669)
hla_low_res_8と相関が最も高い特徴量: hla_low_res_10 (相関係数: 0.9854304122808194)
hla_match_drb1_highと相関が最も高い特徴量: hla_high_res_6 (相関係数: 0.8867687385181369)
hla_low_res_6と相関が最も高い特徴量: hla_low_res_8 (相関係数: 0.9838123742279061)


  most_correlated_feature = other_corr.idxmax()  # 最大相関の特徴量を取得


In [21]:
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer

# 先に high_correlation の結果を利用して、各特徴量に対して相方を決定
most_correlated = {}
for column in high_correlation.columns:
    other_corr = high_correlation[column].drop(column).dropna()  # 自分自身を除外し NaN を除く
    if other_corr.empty:
        print(f"{column}: 他の特徴量との相関係数が有効な値ではありません（すべて NaN）。スキップします。")
        continue
    most_correlated_feature = other_corr.idxmax()  # 最大相関の特徴量
    most_correlated[column] = most_correlated_feature

# 欠損値の補完処理
for target_column, related_column in most_correlated.items():
    print(f"補完処理: {target_column} を {related_column} を使って補完中")

    # 欠損値がない行をモデルの訓練データとする
    train_data = df_train.dropna(subset=[target_column, related_column])

    # 欠損値がある行を予測対象データとする
    missing_data = df_train[df_train[target_column].isnull()]

    if missing_data.empty:
        print(f"{target_column} に欠損値はありません。スキップします。")
        continue

    # モデルの訓練
    X_train = train_data[[related_column]]
    y_train = train_data[target_column]

    # 欠損値を補完するためのインプーターを作成
    imputer = SimpleImputer(strategy='mean')
    X_train_imputed = imputer.fit_transform(X_train)

    model = LinearRegression()
    model.fit(X_train_imputed, y_train)

    # 欠損値の補完
    X_missing = missing_data[[related_column]]
    X_missing_imputed = imputer.transform(X_missing)
    predicted_values = model.predict(X_missing_imputed)

    # 補完結果をデータに反映
    df_train.loc[missing_data.index, target_column] = predicted_values

# 結果の確認
print("補完後の欠損値数:")
print(df_train.isnull().sum())



hla_match_dqb1_low: 他の特徴量との相関係数が有効な値ではありません（すべて NaN）。スキップします。
補完処理: hla_high_res_10 を hla_high_res_8 を使って補完中
補完処理: hla_high_res_8 を hla_high_res_10 を使って補完中
補完処理: hla_high_res_6 を hla_high_res_8 を使って補完中
補完処理: hla_match_dqb1_high を hla_high_res_10 を使って補完中
補完処理: hla_low_res_10 を hla_low_res_8 を使って補完中
補完処理: hla_match_c_high を hla_high_res_8 を使って補完中
補完処理: hla_match_a_high を hla_high_res_6 を使って補完中
補完処理: hla_nmdp_6 を hla_low_res_10 を使って補完中
補完処理: hla_match_b_high を hla_high_res_6 を使って補完中
補完処理: hla_low_res_8 を hla_low_res_10 を使って補完中
補完処理: hla_match_drb1_high を hla_high_res_6 を使って補完中
補完処理: hla_low_res_6 を hla_low_res_8 を使って補完中
補完後の欠損値数:
dri_score                     0
psych_disturb                 0
cyto_score                 8068
diabetes                      0
hla_match_c_high              0
hla_high_res_8                0
tbi_status                    0
arrhythmia                    0
hla_low_res_6                 0
graft_type                    0
vent_hist                     0
renal_issue  

In [22]:
#hla_match_dqb1_lowの統計量の表示
df_train['hla_match_dqb1_low'].describe()

count    24606.000000
mean         1.773795
std          0.427130
min          0.000000
25%          2.000000
50%          2.000000
75%          2.000000
max          2.000000
Name: hla_match_dqb1_low, dtype: float64

In [23]:
#hla_match_dqb1_lowを最頻値で補完
mode = df_train['hla_match_dqb1_low'].mode()[0]
df_train['hla_match_dqb1_low'] = df_train['hla_match_dqb1_low'].fillna(mode)


In [24]:
#df_trainの欠損割合の表示
missing_rate_3 = df_train.isnull().sum() / len(df_train)
#多い順に並べ替え
missing_rate_3 = missing_rate_3.sort_values(ascending=False)
missing_rate_3= missing_rate_3[missing_rate_3 > 0]
missing_rate_3


cyto_score_detail         0.413993
tce_div_match             0.395694
tce_imm_match             0.386562
cyto_score                0.280139
conditioning_intensity    0.166285
dtype: float64

In [25]:
#missing_rate_3の列を表示
missing_rate_columns_3 = missing_rate_3.index
df_missing_rate_3 = df_train[missing_rate_columns_3]
df_missing_rate_3

Unnamed: 0,cyto_score_detail,tce_div_match,tce_imm_match,cyto_score,conditioning_intensity
0,,,,,
1,Intermediate,Permissive mismatched,P/P,Intermediate,MAC
2,,Permissive mismatched,P/P,,
3,Intermediate,Permissive mismatched,P/P,Intermediate,MAC
4,,Permissive mismatched,,,MAC
...,...,...,...,...,...
28795,Intermediate,Bi-directional non-permissive,P/P,Favorable,MAC
28796,TBD,GvH non-permissive,G/G,Poor,RIC
28797,Poor,GvH non-permissive,G/G,Poor,MAC
28798,,Permissive mismatched,P/P,Poor,NMA


In [26]:
#cyto_score_detailの統計量の表示
df_train['cyto_score_detail'].describe()
#要素ごとの個数を表示
df_train['cyto_score_detail'].value_counts()

cyto_score_detail
Intermediate    11158
Poor             3323
Favorable        1208
TBD              1043
Not tested        145
Name: count, dtype: int64

In [27]:
#cyto_score_detailを最頻値で補完
mode = df_train['cyto_score_detail'].mode()[0]
df_train['cyto_score_detail'] = df_train['cyto_score_detail'].fillna(mode) 

In [28]:
#tce_div_matchの統計量の表示
df_train['tce_div_match'].describe()
#要素ごとの個数を表示
df_train['tce_div_match'].value_counts()

tce_div_match
Permissive mismatched            12936
GvH non-permissive                2458
HvG non-permissive                1417
Bi-directional non-permissive      593
Name: count, dtype: int64

In [29]:
#tce_div_matchを最頻値で補完
mode = df_train['tce_div_match'].mode()[0]
df_train['tce_div_match'] = df_train['tce_div_match'].fillna(mode)

In [30]:
#tce_imm_matchの統計量の表示
df_train['tce_imm_match'].describe()
#要素ごとの個数を表示
df_train['tce_imm_match'].value_counts()

tce_imm_match
P/P    13114
G/G     2522
H/H     1084
G/B      544
H/B      229
P/H       83
P/B       66
P/G       25
Name: count, dtype: int64

In [31]:
#tce_imm_matchを最頻値で補完
mode = df_train['tce_imm_match'].mode()[0]
df_train['tce_imm_match'] = df_train['tce_imm_match'].fillna(mode)

In [32]:
#cyto_scoreの統計量の表示
df_train['cyto_score'].describe()
#要素ごとの個数を表示
df_train['cyto_score'].value_counts()

cyto_score
Poor            8802
Intermediate    6376
Favorable       3011
TBD             1341
Normal           643
Other            504
Not tested        55
Name: count, dtype: int64

In [33]:
#cyto_scoreを度数を元に作成した確率を用いて補完
prob = df_train['cyto_score'].value_counts(normalize=True)
df_train['cyto_score'] = df_train['cyto_score'].fillna(np.random.choice(prob.index, p=prob.values))

In [34]:
#conditioning_intensityの統計量の表示
df_train['conditioning_intensity'].describe()
#要素ごとの個数を表示
df_train['conditioning_intensity'].value_counts()

conditioning_intensity
MAC                              12288
RIC                               7722
NMA                               3479
TBD                                373
No drugs reported                   87
N/A, F(pre-TED) not submitted       62
Name: count, dtype: int64

データのばらつきが大きい。。最頻値で補完は良くなさそう？
かといって何が良いのかわからないけど一旦

In [35]:
#conditioning_intensityを度数を元に作成した確率を用いて補完
prob = df_train['conditioning_intensity'].value_counts(normalize=True)
df_train['conditioning_intensity'] = df_train['conditioning_intensity'].fillna(np.random.choice(prob.index, p=prob.values))

In [36]:
#train.csvのobject型の列を抽出
object_columns = df_train.select_dtypes(include=object)
object_columns


Unnamed: 0,dri_score,psych_disturb,cyto_score,diabetes,tbi_status,arrhythmia,graft_type,vent_hist,renal_issue,pulm_severe,...,gvhd_proph,rheum_issue,sex_match,race_group,hepatic_mild,tce_div_match,donor_related,melphalan_dose,cardiac,pulm_moderate
0,N/A - non-malignant indication,No,Poor,No,No TBI,No,Bone marrow,No,No,No,...,FKalone,No,M-F,More than one race,No,Permissive mismatched,Unrelated,"N/A, Mel not given",No,No
1,Intermediate,No,Intermediate,No,"TBI +- Other, >cGy",No,Peripheral blood,No,No,No,...,Other GVHD Prophylaxis,No,F-F,Asian,No,Permissive mismatched,Related,"N/A, Mel not given",No,Yes
2,N/A - non-malignant indication,No,Poor,No,No TBI,No,Bone marrow,No,No,No,...,Cyclophosphamide alone,No,F-M,More than one race,No,Permissive mismatched,Related,"N/A, Mel not given",No,No
3,High,No,Intermediate,No,No TBI,No,Bone marrow,No,No,No,...,FK+ MMF +- others,No,M-M,White,Yes,Permissive mismatched,Unrelated,"N/A, Mel not given",No,No
4,High,No,Poor,No,No TBI,No,Peripheral blood,No,No,No,...,TDEPLETION +- other,No,M-F,American Indian or Alaska Native,No,Permissive mismatched,Related,MEL,No,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28795,Intermediate - TED AML case <missing cytogenetics,No,Favorable,No,No TBI,No,Peripheral blood,No,No,No,...,FK+ MTX +- others(not MMF),No,M-F,More than one race,No,Bi-directional non-permissive,Related,"N/A, Mel not given",No,No
28796,High,No,Poor,Yes,No TBI,No,Peripheral blood,No,No,No,...,Cyclophosphamide +- others,No,M-F,Native Hawaiian or other Pacific Islander,No,GvH non-permissive,Related,"N/A, Mel not given",Yes,Yes
28797,TBD cytogenetics,No,Poor,No,No TBI,No,Peripheral blood,No,No,No,...,FK+ MMF +- others,No,M-F,Native Hawaiian or other Pacific Islander,No,GvH non-permissive,Unrelated,"N/A, Mel not given",No,No
28798,N/A - non-malignant indication,No,Poor,No,No TBI,No,Peripheral blood,No,No,No,...,Cyclophosphamide alone,No,M-M,Black or African-American,No,Permissive mismatched,Related,MEL,No,No


In [37]:
# ojbect型をcategory型に変換
def convert_object_to_category(df):
    for col in df.select_dtypes(include=['object']).columns:
        df[col] = df[col].astype('category')
    return df

df_train = convert_object_to_category(df_train)
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28800 entries, 0 to 28799
Data columns (total 56 columns):
 #   Column                  Non-Null Count  Dtype   
---  ------                  --------------  -----   
 0   dri_score               28800 non-null  category
 1   psych_disturb           28800 non-null  category
 2   cyto_score              28800 non-null  category
 3   diabetes                28800 non-null  category
 4   hla_match_c_high        28800 non-null  float64 
 5   hla_high_res_8          28800 non-null  float64 
 6   tbi_status              28800 non-null  category
 7   arrhythmia              28800 non-null  category
 8   hla_low_res_6           28800 non-null  float64 
 9   graft_type              28800 non-null  category
 10  vent_hist               28800 non-null  category
 11  renal_issue             28800 non-null  category
 12  pulm_severe             28800 non-null  category
 13  prim_disease_hct        28800 non-null  category
 14  hla_high_res_6        

In [38]:
#データの欠損の確認
missing_rate_4 = df_train.isnull().sum() / len(df_train)
missing_rate_4 = missing_rate_4.sort_values(ascending=False)
missing_rate_4 = missing_rate_4[missing_rate_4 > 0]
missing_rate_4

Series([], dtype: float64)

よくわかんないけど、とりあえず欠損値はなくなった！

In [1]:
from sklearn.preprocessing import StandardScaler

# 数値列だけを抽出
numeric_columns = df_train.select_dtypes(include=['float64', 'int64']).columns
numeric_data = df_train[numeric_columns]

# 正規化 (StandardScalerを使用)
scaler = StandardScaler()
normalized_numeric_data = scaler.fit_transform(numeric_data)

# 正規化されたデータをデータフレームに変換
normalized_numeric_df = pd.DataFrame(normalized_numeric_data, columns=numeric_columns, index=df_train.index)

# 元のカテゴリ変数と結合
df_train_normalized = pd.concat([df_train.drop(columns=numeric_columns), normalized_numeric_df], axis=1)


NameError: name 'df_train' is not defined

In [43]:
#欠損値がなくなり、正規化を行ったdf_trainのデータをCSVファイルに保存
df_train_preprocessed = pd.DataFrame(df_train_normalized, columns=df_train.columns)
df_train_preprocessed.to_csv(r"C:\Users\eikik\python\python_dat\equity-post-HCT-survival-predictions\train_preprocessed.csv", index=False)

テストデータは３つしかないから一旦置いておきます。