In [1]:
import pandas as pd
import numpy as np
from imblearn.combine import SMOTEENN
from sklearn.utils import shuffle
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import StandardScaler
from imblearn.under_sampling import RandomUnderSampler
import matplotlib.pyplot as plt
import seaborn as sns
np.set_printoptions(threshold=np.inf)
pd.set_option('display.width',None) 
pd.set_option('display.max_columns', None)
plt.rcParams['font.family'] = 'Times New Roman'
plt.rcParams['mathtext.fontset'] = 'stix'
plt.rcParams['axes.unicode_minus'] = False
from sklearn.utils import resample
import warnings 
warnings.filterwarnings ('ignore') 
import random
from sklearn.utils import shuffle

In [2]:
df_train_result=pd.read_csv("./forvisualization0622.csv",low_memory=False,encoding="utf-8")
print(df_train_result.shape)

(25709, 17)


In [3]:
X = df_train_result.drop('LABEL', axis=1)
y = df_train_result['LABEL']

In [4]:
y.info()

<class 'pandas.core.series.Series'>
RangeIndex: 25709 entries, 0 to 25708
Series name: LABEL
Non-Null Count  Dtype
--------------  -----
25709 non-null  int64
dtypes: int64(1)
memory usage: 201.0 KB


In [5]:
X.columns = ['BMI', 'Age', 'Height','Chronic Hypertension','Morning Sickness',
             'History of Preeclampsia','Gravidity','Pre-gestational Diabetes','Multiple Pregnancy',
            'Menstrual Regularity','Previous Abortion','Scarred Uterus','Stillbirth','Family History of Hypertension',
            'Chronic Renal Disease','Assisted Reproductive Technology']

In [6]:
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.pairwise import pairwise_distances
import numpy as np

def calculate_ibi3(k, data, pos_data, neg_data, rr, pos_num, label):
    nbrs = NearestNeighbors(n_neighbors=k + 1, algorithm='ball_tree').fit(data)
    distances, knn_idx = nbrs.kneighbors(pos_data)

    fn = np.zeros(pos_num)
    fp = np.zeros(pos_num)
    f_prime_p = np.zeros(pos_num)
    knn_idx = np.delete(knn_idx, 0, 1)
    for i in range(pos_num):
        M = np.isin(knn_idx[i], neg_data.index).sum()
        if M == 0:
            dist = pairwise_distances(pos_data.iloc[i].values.reshape(1, -1), data).reshape(-1)
            sort_idx = np.argsort(dist)
            nearest_pos_idx = sort_idx[label[sort_idx] == 1][1]
            M = np.isin(sort_idx[:nearest_pos_idx], neg_data.index).sum()
            k = M + 1
        # else:
        #     k = k0
        fn[i] = M / k
        fp[i] = (k - M) / k
        f_prime_p[i] = rr * (k - M) / k
    ibi3 = f_prime_p / (fn + f_prime_p) - fp / (fn + fp)

    return ibi3

def imbalance_impact_knn1(data, label, k):
    pos_num = int((label == 1).sum())
    neg_num = int((1 - label).sum())

    pos_data = data.loc[label == 1]
    neg_data = data.loc[label == 0]
    rr = neg_num / pos_num

    ibi3_values = calculate_ibi3(k, data, pos_data, neg_data, rr, pos_num, label)

    bi3 = np.mean(ibi3_values)
    return ibi3_values, bi3


In [5]:
from sklearn.mixture import GaussianMixture
import pandas as pd
import numpy as np

def calculate_alpha(alpha, X, y):
    n_components = 15 
    gmm = GaussianMixture(n_components=n_components, random_state=42)
    gmm.fit(X[y == 1])
    weights = gmm.weights_
    covariances = gmm.covariances_
    means = gmm.means_
    
    inverse_weights = 1 / weights**alpha
    inverse_weights /= np.sum(inverse_weights)
    gmm.weights_=inverse_weights
    num_positive_samples = int(np.sum(y == 1) * (1 - min(inverse_weights)))
    np.random.seed(42)
    new_positive_samples = gmm.sample(num_positive_samples)[0]
    X_positive_resampled = np.vstack((X[y == 1].values, new_positive_samples))
    y_positive_resampled = np.ones(X_positive_resampled.shape[0])
    X_resampled1 = np.vstack((X[y == 0].values, X_positive_resampled))
    y_resampled1 = np.hstack((y[y == 0].values, y_positive_resampled))
    
    X_resampled_df = pd.DataFrame(X_resampled1)
    y_resampled_series = pd.Series(y_resampled1)
    aRibi3, aRbi3 = imbalance_impact_knn1(X_resampled_df, y_resampled_series,5)
    print(alpha,aRbi3)
    return alpha, aRbi3
alphas = np.arange(0.1, 1.91, 0.01)
# 遍历所有的alpha值并计算aRbi3
results = [calculate_alpha(alpha, X, y) for alpha in alphas]
# 找到aRbi3最小的alpha值
min_aRbi3 = min(results, key=lambda x: x[1])
print('最低的aRbi3为：', min_aRbi3[1])
print('对应的alpha为：', min_aRbi3[0])

#### 计算GMM NUM 

# 采样处理

## alpha-IW-GMM+RUS

In [11]:
# 数据预处理 - 标准化
# Inverse Weighted-GMM
n_components = 15 # 设置高斯混合模型的组件数量，可以根据您的数据集进行调整
gmm = GaussianMixture(n_components=n_components, random_state=42)
gmm.fit(X[y == 1])
weights = gmm.weights_
covariances = gmm.covariances_
means = gmm.means_
alpha=1.84
inverse_weights = 1 / weights**alpha
inverse_weights /= np.sum(inverse_weights)
gmm.weights_=inverse_weights
# 使用GMM生成新的阳性样本
num_positive_samples = int(np.sum(y == 1) * (1 - min(inverse_weights)))
np.random.seed(42)
new_positive_samples = gmm.sample(num_positive_samples)[0]
# 将新的阳性样本添加到原始数据集
X_positive_resampled = np.vstack((X[y == 1].values, new_positive_samples))
y_positive_resampled = np.ones(X_positive_resampled.shape[0])
X_resampled1 = np.vstack((X[y == 0].values, X_positive_resampled))
y_resampled1 = np.hstack((y[y == 0].values, y_positive_resampled))

from imblearn.under_sampling import RandomUnderSampler

# 计算每个类别的样本数量
unique_classes, counts = np.unique(y_resampled1, return_counts=True)

# 假设类别0是阴性，类别1是阳性
count_class_0 = counts[np.where(unique_classes == 0)[0][0]]
count_class_1 = counts[np.where(unique_classes == 1)[0][0]]
undersample = RandomUnderSampler(sampling_strategy={0: count_class_1*3, 1: count_class_1}, random_state=42)

aX_resampled, ay_resampled = undersample.fit_resample(X_resampled1, y_resampled1)
aX_resampled, ay_resampled = shuffle(aX_resampled, ay_resampled, random_state=42)


## IW-GMM+RUS

In [12]:
n_components = 15 
gmm = GaussianMixture(n_components=n_components, random_state=42)
gmm.fit(X[y == 1])
weights = gmm.weights_
covariances = gmm.covariances_
means = gmm.means_
inverse_weights = 1 / weights
inverse_weights /= np.sum(inverse_weights)
gmm.weights_=inverse_weights
num_positive_samples = int(np.sum(y == 1) * (1 - min(inverse_weights)))
np.random.seed(42)
new_positive_samples = gmm.sample(num_positive_samples)[0]
X_positive_resampled = np.vstack((X[y == 1].values, new_positive_samples))
y_positive_resampled = np.ones(X_positive_resampled.shape[0])
X_resampled1 = np.vstack((X[y == 0].values, X_positive_resampled))
y_resampled1 = np.hstack((y[y == 0].values, y_positive_resampled))

from imblearn.under_sampling import RandomUnderSampler

# 计算每个类别的样本数量
unique_classes, counts = np.unique(y_resampled1, return_counts=True)

# 假设类别0是阴性，类别1是阳性
count_class_0 = counts[np.where(unique_classes == 0)[0][0]]
count_class_1 = counts[np.where(unique_classes == 1)[0][0]]
undersample = RandomUnderSampler(sampling_strategy={0: count_class_1*3, 1: count_class_1}, random_state=42)

X_resampled, y_resampled = undersample.fit_resample(X_resampled1, y_resampled1)

## SMOTENN

In [17]:
from imblearn.combine import SMOTEENN
# SMOTEENN
sampling_strategy = 1/3 
smoteenn = SMOTEENN(random_state=42,sampling_strategy=sampling_strategy)
X_resampled_SMN, y_resampled_SMN = smoteenn.fit_resample(X, y)

## IW-GMM+SMOTENN

In [18]:
n_components = 15 # 
gmm = GaussianMixture(n_components=n_components, random_state=42)
gmm.fit(X[y == 1])
weights = gmm.weights_
covariances = gmm.covariances_
means = gmm.means_
inverse_weights = 1 / weights
inverse_weights /= np.sum(inverse_weights)
gmm.weights_=inverse_weights

num_positive_samples = int(np.sum(y == 1) * (1 - min(inverse_weights)))
np.random.seed(42)
new_positive_samples = gmm.sample(num_positive_samples)[0]
X_positive_resampled = np.vstack((X[y == 1].values, new_positive_samples))
y_positive_resampled = np.ones(X_positive_resampled.shape[0])
X_resampled1 = np.vstack((X[y == 0].values, X_positive_resampled))
y_resampled1 = np.hstack((y[y == 0].values, y_positive_resampled))

from imblearn.combine import SMOTEENN
sampling_strategy = 1/3
smoteenn = SMOTEENN(random_state=42,sampling_strategy=sampling_strategy)
X_resampled_GSMN, y_resampled_GSMN = smoteenn.fit_resample(X_resampled1, y_resampled1)

## gmm+RUS 

In [13]:
# GMM
n_components = 15 
gmm = GaussianMixture(n_components=n_components, random_state=42)
gmm.fit(X[y == 1])
weights = gmm.weights_
num_positive_samples = int(np.sum(y == 1)* (1 - min(weights)))  
np.random.seed(42)
new_positive_samples = gmm.sample(num_positive_samples)[0]
X_positive_resampled = np.vstack((X[y == 1].values, new_positive_samples))
y_positive_resampled = np.ones(X_positive_resampled.shape[0])
X_resampled = np.vstack((X[y == 0].values, X_positive_resampled))
y_resampled = np.hstack((y[y == 0].values, y_positive_resampled))
unique_classes, counts = np.unique(y_resampled, return_counts=True)
count_class_0 = counts[np.where(unique_classes == 0)[0][0]]
count_class_1 = counts[np.where(unique_classes == 1)[0][0]]
undersample = RandomUnderSampler(sampling_strategy={0: count_class_1*3, 1: count_class_1}, random_state=42)
X_resampled_rus, y_resampled_rus = undersample.fit_resample(X_resampled, y_resampled)

# 计算IBI3

In [5]:
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.pairwise import pairwise_distances
import numpy as np
import multiprocessing as mp

def calculate_pm(k, data, pos_data, neg_data, rr, pos_num, label):
    nbrs = NearestNeighbors(n_neighbors=k + 1, algorithm='ball_tree').fit(data)
    _, knn_idx = nbrs.kneighbors(pos_data)

    p2 = np.isin(knn_idx[:, 1:], neg_data.index).sum(axis=1) / k
    p2old = p2.copy()
    zero_indices = np.where(p2 == 0)[0]
    if len(zero_indices) > 0:
        dist = pairwise_distances(pos_data.iloc[zero_indices].values, data)
        sort_idx = np.argsort(dist, axis=1)
        nearest_pos = np.array([idx[label[idx] == 1][1] for idx in sort_idx])
        p2[zero_indices] = nearest_pos / (nearest_pos + 1)
    p1 = 1 - p2
    px = (rr * p1 / (p2 + rr * p1) - p1)
    pm = np.mean(px)
    return pm

def imbalance_impact_knn(data, label):
    pos_num = int((label == 1).sum())
    neg_num = int((1 - label).sum())

    pos_data = data.loc[label == 1]
    neg_data = data.loc[label == 0]
    rr = neg_num / pos_num

    with mp.Pool(mp.cpu_count()) as pool:
        pm_values = pool.starmap(calculate_pm, [(k, data, pos_data, neg_data, rr, pos_num, label) for k in range(2, 51)])

    derivative = np.diff(pm_values)
    best_k = np.argmax(np.abs(derivative[1:])) + 3 

    pm_best_k = calculate_pm(best_k, data, pos_data, neg_data, rr, pos_num, label)
    return pm_values, pm_best_k, best_k


In [None]:
ibi3, bi3,ork = imbalance_impact_knn(X, y)
print(bi3,ork)

In [16]:
X_resampled_df = pd.DataFrame(aX_resampled)
y_resampled_series = pd.Series(ay_resampled)
aRibi3, aRbi3,ork = imbalance_impact_knn(X_resampled_df, y_resampled_series)
print(aRbi3,ork)

0.10121842557896309 3


In [18]:
# Convert numpy arrays to pandas DataFrame/Series
X_resampled_df = pd.DataFrame(X_resampled)
y_resampled_series = pd.Series(y_resampled)
# 计算 IBI3 值
Ribi3, Rbi3 ,ork = imbalance_impact_knn(X_resampled_df, y_resampled_series)
print(Rbi3,ork)

0.15581860471365178 3


In [36]:
# Convert numpy arrays to pandas DataFrame/Series
X_resampled_df = pd.DataFrame(X_resampled_SMN)
y_resampled_series = pd.Series(y_resampled_SMN)

# 计算 IBI3 值
sRibi3, sRbi3 ,ork = imbalance_impact_knn(X_resampled_df, y_resampled_series)
print(sRbi3,ork)

0.05987920491522192 5


In [37]:
X_resampled_df = pd.DataFrame(X_resampled_GSMN)
y_resampled_series = pd.Series(y_resampled_GSMN)
GSibi3, GSbi3,ork = imbalance_impact_knn(X_resampled_df, y_resampled_series)
print(GSbi3,ork)

0.03616099294273061 3


In [30]:
X_resampled_df = pd.DataFrame(X_resampled_rus)
y_resampled_series = pd.Series(y_resampled_rus)
RUSibi3, RUSbi3,ork = imbalance_impact_knn(X_resampled_df, y_resampled_series)
print(RUSbi3,ork)

0.13481601146817068 3


## 显示数值

## 对比多种采样的效果

In [1]:
sns.set_style("white")
plt.rcParams['font.family'] = 'Times New Roman'
plt.rcParams.update({'font.size': 8})
fig, ax = plt.subplots(figsize=(5, 4), dpi=600)

sns.kdeplot(ibi3, color='grey',linewidth=2, linestyle='-', alpha=0.5, label=f'Original dataset ibi3: {np.mean(ibi3):.4f}({ibi3.var():.2f})')
sns.kdeplot(aRibi3, color='red', linewidth=1, linestyle='-', alpha=0.9,label=f'after α-IW-GMM+RUS ibi3: {np.mean(aRibi3):.4f}({aRibi3.var():.2f})')
sns.kdeplot(Ribi3, color='green', linewidth=1, linestyle='--', alpha=0.7,label=f'after IW-GMM+RUS ibi3: {np.mean(Ribi3):.4f}({Ribi3.var():.2f})')
sns.kdeplot(RUSibi3, color='blue',linewidth=1, linestyle='--', alpha=0.7,  label=f'after GMM+RUS ibi3: {np.mean(RUSibi3):.4f}({RUSibi3.var():.2f})')
plt.xlim(0,)  

plt.legend()
plt.show()

In [2]:
print(ibi3.var(),aRibi3.var(),Ribi3.var(),RUSibi3.var())

In [3]:
X1=X
y1=y
X2=aX_resampled
X3=X_resampled
X4=X_resampled_rus
y2=ay_resampled
y3=y_resampled
y4=y_resampled_rus

import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import StratifiedKFold
import matplotlib.pyplot as plt

datasets_X = [X1, X2, X3, X4]
datasets_y = [y1, y2, y3, y4]

plt.figure(figsize=(10, 8),dpi=600)

for i, (X, y) in enumerate(zip(datasets_X, datasets_y)):
    cv = StratifiedKFold(n_splits=5)

    tprs = []
    aucs = []
    mean_fpr = np.linspace(0, 1, 100)

    closest_tpr = []
    min_distance = float('inf')

    for train, test in cv.split(X, y):
        classifier = RandomForestClassifier()
        probas_ = classifier.fit(X[train], y[train]).predict_proba(X[test])
        fpr, tpr, thresholds = roc_curve(y[test], probas_[:, 1])
        roc_auc = auc(fpr, tpr)
        tprs.append(np.interp(mean_fpr, fpr, tpr))
        tprs[-1][0] = 0.0
        aucs.append(roc_auc)

    mean_tpr = np.mean(tprs, axis=0)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)

    for tpr in tprs:
        distance = np.linalg.norm(mean_tpr - tpr)
        if distance < min_distance:
            min_distance = distance
            closest_tpr = tpr

    plt.plot(mean_fpr, closest_tpr, label='Dataset {0} ROC (area = {1:0.4f})'.format(i+1, mean_auc))

plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc="lower right")
plt.show()




In [4]:
import umap

In [None]:
from sklearn.cluster import KMeans

umap = UMAP(n_components=2, random_state=42)
X_umap = umap.fit_transform(X)
mask1 = X_umap[:, 0] <= 10
X_umap_filtered = X_umap[mask1]
y_filtered = y[mask1]

kmeans = KMeans(n_clusters=2, random_state=42)
clusters = kmeans.fit_predict(X_umap_filtered)
from matplotlib import cm

mask1_pos = mask1[y==1]
ibi3_filtered = ibi3[mask1_pos]


ibi3_normalized = (ibi3_filtered - np.min(ibi3_filtered)) / (np.max(ibi3_filtered) - np.min(ibi3_filtered))

cmap = cm.get_cmap('coolwarm') 

colors = cmap(ibi3_normalized)

plt.figure(figsize=(9,6),dpi=600) 

sm = cm.ScalarMappable(cmap=cmap)
sm.set_array(colors)

for cluster in np.unique(clusters):
    for label in np.unique(y_filtered):
        mask = (clusters == cluster) & (y_filtered == label)
        if label == 1:
            marker = '+'
            markersize = 30  
            alpha = 0.8  
            mask_pos_samples = mask[y_filtered == 1]  
            color = colors[mask_pos_samples]  
            plt.scatter(X_umap_filtered[mask, 0], X_umap_filtered[mask, 1], marker=marker, alpha=alpha, s=markersize, color=color)
        if label == 0:
            if cluster == 0:
                marker = '.'
                markersize = 2  
                alpha = 0.3  
                color = 'silver'  
            if cluster == 1:
                marker = '.'
                markersize = 2 
                alpha = 0.3  
                color = 'silver'
            plt.scatter(X_umap_filtered[mask, 0], X_umap_filtered[mask, 1], marker=marker, alpha=alpha, s=markersize, color=color)
cbar = plt.colorbar(sm, drawedges=False)
cbar.outline.set_visible(False)  
plt.xticks([])
plt.yticks([])
plt.show()



In [None]:
X_resampled_umap = umap.transform(X_resampled)
mask2 = X_resampled_umap[:, 0] <= 10
X_resampled_umap_filtered = X_resampled_umap[mask2]
y_resampled_filtered = y_resampled[mask2]

clusters_resampled = kmeans.predict(X_resampled_umap_filtered)

mask2_pos = mask2[y_resampled==1]
Ribi3_filtered = Ribi3[mask2_pos]


Ribi3_normalized = (Ribi3_filtered - np.min(Ribi3_filtered)) / (np.max(Ribi3_filtered) - np.min(Ribi3_filtered))

cmap = cm.get_cmap('coolwarm')  

colors = cmap(Ribi3_normalized)

plt.figure(figsize=(9,6),dpi=600)  

sm = cm.ScalarMappable(cmap=cmap)
sm.set_array(colors)


for cluster in np.unique(clusters_resampled):
    for label in np.unique(y_resampled_filtered):
        mask = (clusters_resampled == cluster) & (y_resampled_filtered == label)
        if label == 1:
            marker = '+'
            markersize = 30  
            alpha = 0.8  
            mask_pos_samples = mask[y_resampled_filtered == 1]  
            color = colors[mask_pos_samples]  
            plt.scatter(X_resampled_umap_filtered[mask, 0], X_resampled_umap_filtered[mask, 1], marker=marker, alpha=alpha, s=markersize, color=color)
        if label == 0:
            if cluster == 0:
                marker = '.'
                markersize = 2  
                alpha = 0.3  
                color = 'silver'  
            if cluster == 1:
                marker = '.'
                markersize = 2  
                alpha = 0.3  
                color = 'silver'
            plt.scatter(X_resampled_umap_filtered[mask, 0], X_resampled_umap_filtered[mask, 1], marker=marker, alpha=alpha, s=markersize, color=color)
cbar = plt.colorbar(sm, drawedges=False)
cbar.outline.set_visible(False)  
plt.xticks([])
plt.yticks([])
plt.show()
