In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sklearn.preprocessing import MinMaxScaler  # 使用Min-Max归一化的库

# Load the four CSV datasets, keeping only rows without missing values,
# then separate every frame into features (all columns except the last)
# and label (the last column).
def _read_complete_rows(csv_path):
    """Read ``csv_path`` with pandas and drop every row containing a NaN."""
    return pd.read_csv(csv_path).dropna()


def _features_and_label(frame):
    """Return ``(features, label)``: all columns but the last, and the last column."""
    return frame.iloc[:, :-1], frame.iloc[:, -1]


simulate = _read_complete_rows('./credit_data_simulate.csv')
risk = _read_complete_rows('./credit_risk_small_data.csv')
cyber = _read_complete_rows('./cybersecurity_data.csv')
taiwan_bankrupt = _read_complete_rows('./taiwan_bankrupt_data.csv')

X_simulate, y_simulate = _features_and_label(simulate)
X_risk, y_risk = _features_and_label(risk)
X_cyber, y_cyber = _features_and_label(cyber)
X_taiwan, y_taiwan = _features_and_label(taiwan_bankrupt)

# # (Disabled) Normalise the features with MinMaxScaler before running t-SNE
# scaler = MinMaxScaler()
# X_simulate = scaler.fit_transform(X_simulate)
# X_risk = scaler.fit_transform(X_risk)
# X_cyber = scaler.fit_transform(X_cyber)
# X_taiwan = scaler.fit_transform(X_taiwan)

# Reduce every feature matrix to two dimensions with t-SNE.
# A single estimator (fixed random_state=42) is reused; fit_transform is
# called once per dataset, in the same order as the targets on the left.
tsne = TSNE(n_components=2, random_state=42)
X_tsne_simulate, X_tsne_risk, X_tsne_cyber, X_tsne_taiwan = [
    tsne.fit_transform(feature_matrix)
    for feature_matrix in (X_simulate, X_risk, X_cyber, X_taiwan)
]

# Split each 2-D embedding into majority-class and minority-class points.
# The majority label is taken from the data (most frequent value) instead of
# being hard-coded per dataset, so the "Major"/"Minor" naming stays correct
# whichever label value happens to be the frequent one.
def _split_major_minor(embedding, labels):
    """Partition ``embedding`` rows by how frequent their label is.

    Parameters
    ----------
    embedding : numpy.ndarray
        t-SNE coordinates, one row per sample, in the same row order as
        ``labels``.
    labels : pandas.Series
        Class label of every sample.

    Returns
    -------
    tuple
        ``(major_points, minor_points)``: rows whose label is the most
        frequent one, and all remaining rows. For a binary label this is
        exactly the two classes.
    """
    major_label = labels.value_counts().idxmax()
    # Convert to a plain boolean array so the mask is applied by position;
    # the Series index is no longer contiguous after dropna().
    is_major = (labels == major_label).to_numpy()
    return embedding[is_major], embedding[~is_major]


major_class_simulate, minor_class_simulate = _split_major_minor(X_tsne_simulate, y_simulate)
major_class_risk, minor_class_risk = _split_major_minor(X_tsne_risk, y_risk)
major_class_cyber, minor_class_cyber = _split_major_minor(X_tsne_cyber, y_cyber)
major_class_taiwan, minor_class_taiwan = _split_major_minor(X_tsne_taiwan, y_taiwan)
# Draw the four t-SNE embeddings as a 2x2 grid of scatter plots.
# Majority-class points are blue, minority-class points are red.
tsne_fig, tsne_axes = plt.subplots(2, 2, figsize=(24, 18))

# One entry per panel, in row-major order (top-left, top-right,
# bottom-left, bottom-right): (title, major-class points, minor-class points).
tsne_panels = [
    ('t-SNE credit_simulate.csv', major_class_simulate, minor_class_simulate),
    ('t-SNE credit_risk_small', major_class_risk, minor_class_risk),
    ('t-SNE cybersecurity', major_class_cyber, minor_class_cyber),
    ('t-SNE taiwan_bankrupt', major_class_taiwan, minor_class_taiwan),
]

for panel_ax, (panel_title, major_points, minor_points) in zip(tsne_axes.flat, tsne_panels):
    panel_ax.scatter(major_points[:, 0], major_points[:, 1], color='blue', label='Major class')
    panel_ax.scatter(minor_points[:, 0], minor_points[:, 1], color='red', label='Minor class')
    panel_ax.set_title(panel_title, fontsize=20)
    panel_ax.legend(fontsize=20)

plt.show()


In [None]:
from sklearn.model_selection import train_test_split
# NOTE: no train/test split is performed in this cell (the train_test_split
# call is left disabled); the statistics below cover each full dataset as
# loaded, after rows with missing values were dropped.

# Count the number of samples per class in every dataset.
class_counts_simulate, class_counts_risk, class_counts_cyber, class_counts_taiwan = [
    label_column.value_counts()
    for label_column in (y_simulate, y_risk, y_cyber, y_taiwan)
]

# Draw one pie chart per dataset in a 2x2 grid showing the class proportions.
textprops = {'fontsize': 50}
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(42, 24))

pie_panels = (
    (ax1, class_counts_simulate, 'Class Distribution in simulate Set'),
    (ax2, class_counts_risk, 'Class Distribution in risk Set'),
    (ax3, class_counts_cyber, 'Class Distribution in cyber Set'),
    (ax4, class_counts_taiwan, 'Class Distribution in taiwan_bankrupt Set'),
)

for pie_ax, class_counts, pie_title in pie_panels:
    # Wedges are labelled with the class value and annotated with its share.
    pie_ax.pie(
        class_counts,
        labels=class_counts.index,
        autopct='%1.1f%%',
        startangle=90,
        textprops=textprops,
    )
    pie_ax.set_title(pie_title, fontsize=50)
    # Equal aspect ratio so every pie is drawn as a circle.
    pie_ax.axis('equal')

plt.tight_layout()
plt.show()