In [8]:
import pandas as pd

# 加载数据
df_normal = pd.read_excel("./excel/2010-2016单胎正常样本（脱敏）.xlsx")
df_abnormal = pd.read_excel("./excel/2010-2016早流(异常)样本(脱敏).xlsx")

In [9]:
df_normal['妊娠结果'] = 0  # 0 表示正常
df_abnormal['妊娠结果'] = 1  # 1 表示异常
df = pd.concat([df_normal, df_abnormal], ignore_index=True)

In [10]:
selected_features = ['年龄', 'em', 'Ⅲ线', '助孕方式', '1ET天', '孕囊', '胚芽', '卵黄囊', '胎心', '妊娠结果']
df_selected = df[selected_features].copy()

In [11]:
# Ⅲ线独热编码
iii_line_mapping = {
    'Ⅲ线_Ⅰ': 0, 'Ⅲ线_Ⅱ': 1, 'Ⅲ线_Ⅱ-Ⅲ': 2, 'Ⅲ线_Ⅲ': 3, 'Ⅲ线_有积液': 4, 'Ⅲ线_欠清': 5
}
df_selected['Ⅲ线'] = df_selected['Ⅲ线'].map(iii_line_mapping)
df_iii_line = pd.get_dummies(df_selected['Ⅲ线'], prefix='Ⅲ线')
df_selected = pd.concat([df_selected, df_iii_line], axis=1)
df_selected.drop('Ⅲ线', axis=1, inplace=True)

# 助孕方式独热编码
assistance_mapping = {
    '鲜胚': 0, '冻胚': 1
}
df_selected['助孕方式'] = df_selected['助孕方式'].map(assistance_mapping)
df_assistance = pd.get_dummies(df_selected['助孕方式'], prefix='助孕方式')
df_selected = pd.concat([df_selected, df_assistance], axis=1)
df_selected.drop('助孕方式', axis=1, inplace=True)

df_selected.head()

Unnamed: 0,年龄,em,1ET天,孕囊,胚芽,卵黄囊,胎心,妊娠结果,助孕方式_1.0
0,34,无,28,23X15,5.6,5.1,有,0,False
1,33,无,28,24X16,4.4,4.1,有,0,False
2,27,无,28,26X12,3.8,3.6,有,0,True
3,28,无,31,26X20,5.6,4.9,有,0,False
4,29,无,34,37X22,8.8,4.7,有,0,False


In [16]:
from sklearn.linear_model import LinearRegression
import numpy as np

df_selected.replace(r'[^\d.]+', '', regex=True)

df_selected['年龄'] = df_selected['年龄'].replace(r'[^\d.]+', '', regex=True).astype(float)
df_selected['em'] = df_selected['em'].replace('无', '0').astype(float)
df_selected['1ET天'] = df_selected['1ET天'].replace(r'[^\d.]+', '', regex=True).replace('', 0).astype(float)

# 提取正常样本的胎心数据
normal_heartbeat = df_selected[df_selected['妊娠结果'] == 0]['胎心'].replace(r'\s+', '', regex=True).replace('有', '1').replace('无', '0').values.reshape(-1, 1)

# 提取正常样本的其他特征作为自变量
normal_features = (df_selected[df_selected['妊娠结果'] == 0][['年龄', 'em', '1ET天']]
                   .values)

# 训练线性回归模型
model = LinearRegression()
model.fit(normal_features[~pd.isnull(normal_heartbeat).flatten()],
          normal_heartbeat[~pd.isnull(normal_heartbeat).flatten()])

# 预测缺失值
missing_heartbeat_index = pd.isnull(normal_heartbeat).flatten()
predicted_heartbeat = model.predict(normal_features[missing_heartbeat_index])

# 填补缺失值
df_selected.loc[df_selected['妊娠结果'] == 0, '胎心'] = df_selected.loc[df_selected['妊娠结果'] == 0, '胎心'].fillna(
    pd.Series(predicted_heartbeat.flatten()))

ValueError: Input X contains NaN.
LinearRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [None]:
import numpy as np

# 提取异常样本的胎心数据
abnormal_heartbeat = df_selected[df_selected['妊娠结果'] == 1]['胎心'].values

# 提取异常样本的胎心非缺失值
abnormal_heartbeat_notnull = abnormal_heartbeat[~pd.isnull(abnormal_heartbeat)]

# 计算缺失值数量
missing_count = pd.isnull(abnormal_heartbeat).sum()

# 分层随机抽样填补缺失值
filled_values = np.random.choice(abnormal_heartbeat_notnull, size=missing_count)
df_selected.loc[df_selected['妊娠结果'] == 1, '胎心'] = df_selected.loc[df_selected['妊娠结果'] == 1, '胎心'].fillna(
    pd.Series(filled_values))

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# 可视化特征分布
sns.pairplot(df_selected, hue='妊娠结果')
plt.show()

# 计算特征之间的相关性
correlation_matrix = df_selected.corr()
plt.figure(figsize=(12, 10))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
plt.show()