# 使用AutoEncoder降维

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
import pandas as pd
# 定义 AutoEncoder 网络
class AutoEncoder(nn.Module):
    def __init__(self, input_dim, encoding_dim):
        super(AutoEncoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 1024),
            nn.ReLU(True),
            nn.Linear(1024, encoding_dim),
            nn.ReLU(True)
        )
        self.decoder = nn.Sequential(
            nn.Linear(encoding_dim, 1024),
            nn.ReLU(True),
            nn.Linear(1024, input_dim),
            nn.Sigmoid()
        )

    def forward(self, x):
        x = self.encoder(x)
        x = self.decoder(x)
        return x

# 定义训练函数
def train(model, dataloader, num_epochs, learning_rate):
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    for epoch in range(num_epochs):
        for data in dataloader:
            inputs, _ = data
            inputs = inputs.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, inputs)
            loss.backward()
            optimizer.step()

        # 打印损失
        if epoch % 100 == 0:
            print('Epoch [{}/{}], Loss: {:.4f}'.format(epoch+1, num_epochs, loss.item()))





In [2]:
def min_max_normalize(tensor):
    min_val = tensor.min()
    max_val = tensor.max()
    normalized_tensor = (tensor - min_val) / (max_val - min_val)
    return normalized_tensor, min_val, max_val

def min_max_denormalize(normalized_tensor, min_val, max_val):
    denormalized_tensor = normalized_tensor * (max_val - min_val) + min_val
    return denormalized_tensor

In [3]:
def setup_seed(seed):
     torch.manual_seed(seed)
     torch.cuda.manual_seed_all(seed)
     torch.cuda.manual_seed(seed)
     np.random.seed(seed)
     torch.backends.cudnn.deterministic = True

In [4]:
setup_seed(42)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [5]:
# 加载数据
data = pd.read_csv('GSE20194_merge.csv')
data = data[data['group'] == 0]
data.pop('group')
data_tensor = torch.from_numpy(data.values).to(torch.float32).to(device)
nor_data_tensor, ae_min,ae_max = min_max_normalize(data_tensor)
print(nor_data_tensor)
dataset = TensorDataset(nor_data_tensor,nor_data_tensor)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

# 创建 AutoEncoder 模型
input_dim = 22283
encoding_dim = 512  # 降维后的维度
model = AutoEncoder(input_dim, encoding_dim).to(device)

# 训练模型
num_epochs = 10
learning_rate = 0.0001


tensor([[0.6650, 0.5107, 0.4731,  ..., 0.2427, 0.2466, 0.2064],
        [0.6187, 0.5577, 0.4205,  ..., 0.2052, 0.2323, 0.2133],
        [0.6538, 0.4788, 0.4836,  ..., 0.2346, 0.2912, 0.2595],
        ...,
        [0.6410, 0.5668, 0.4387,  ..., 0.2674, 0.2398, 0.2291],
        [0.6574, 0.4747, 0.4780,  ..., 0.3744, 0.3257, 0.2875],
        [0.6593, 0.4650, 0.4086,  ..., 0.2413, 0.3710, 0.2600]],
       device='cuda:0')


In [6]:
dataset

<torch.utils.data.dataset.TensorDataset at 0x7fd144343a90>

In [7]:
from pathlib import Path
model_file_name = 'GSE20194_AE.pth'
model_file = Path(model_file_name)
if model_file.exists():
    # 指定的文件存在
    print(f'{model_file_name}:read model params!')
    model.load_state_dict(torch.load(model_file_name))
else:
    print(f'{model_file_name}:not exist!')

GSE20194_AE.pth:read model params!


In [8]:
train(model, dataloader, 1000, learning_rate)

Epoch [1/1000], Loss: 0.0134
Epoch [101/1000], Loss: 0.0021
Epoch [201/1000], Loss: 0.0018
Epoch [301/1000], Loss: 0.0016
Epoch [401/1000], Loss: 0.0014
Epoch [501/1000], Loss: 0.0012
Epoch [601/1000], Loss: 0.0010
Epoch [701/1000], Loss: 0.0009
Epoch [801/1000], Loss: 0.0007
Epoch [901/1000], Loss: 0.0008


In [8]:
# 使用训练好的模型对数据进行降维
encoded_data = model.encoder(nor_data_tensor)

In [9]:
nor_data_tensor

tensor([[0.6650, 0.5107, 0.4731,  ..., 0.2427, 0.2466, 0.2064],
        [0.6187, 0.5577, 0.4205,  ..., 0.2052, 0.2323, 0.2133],
        [0.6538, 0.4788, 0.4836,  ..., 0.2346, 0.2912, 0.2595],
        ...,
        [0.6410, 0.5668, 0.4387,  ..., 0.2674, 0.2398, 0.2291],
        [0.6574, 0.4747, 0.4780,  ..., 0.3744, 0.3257, 0.2875],
        [0.6593, 0.4650, 0.4086,  ..., 0.2413, 0.3710, 0.2600]],
       device='cuda:0')

In [10]:
data_tensor

tensor([[12.1237,  8.7101,  7.8795,  ...,  2.7827,  2.8683,  1.9794],
        [11.0980,  9.7493,  6.7152,  ...,  1.9527,  2.5523,  2.1316],
        [11.8757,  8.0035,  8.1103,  ...,  2.6036,  3.8544,  3.1530],
        ...,
        [11.5913,  9.9503,  7.1183,  ...,  3.3294,  2.7174,  2.4806],
        [11.9540,  7.9140,  7.9862,  ...,  5.6962,  4.6191,  3.7743],
        [11.9979,  7.7001,  6.4521,  ...,  2.7505,  5.6207,  3.1651]],
       device='cuda:0')

In [11]:
encoded_data

tensor([[1.2239, 1.5627, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [1.7487, 1.8209, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [1.8622, 1.2694, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        ...,
        [1.2293, 1.7510, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [1.5951, 0.7724, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [2.5384, 1.1817, 0.0000,  ..., 0.0000, 0.0000, 0.0000]],
       device='cuda:0', grad_fn=<ReluBackward0>)

In [12]:
model.decoder(encoded_data)

tensor([[0.6791, 0.5121, 0.4758,  ..., 0.2429, 0.3030, 0.2372],
        [0.6220, 0.5610, 0.4084,  ..., 0.2332, 0.2191, 0.2064],
        [0.6579, 0.4779, 0.4621,  ..., 0.2542, 0.3092, 0.2532],
        ...,
        [0.6447, 0.5388, 0.4544,  ..., 0.2225, 0.2562, 0.2234],
        [0.6345, 0.4721, 0.4656,  ..., 0.2968, 0.3223, 0.2650],
        [0.6566, 0.4712, 0.4105,  ..., 0.2574, 0.3918, 0.2325]],
       device='cuda:0', grad_fn=<SigmoidBackward0>)

In [14]:
decoder_data_tensor = min_max_denormalize(model.decoder(encoded_data),ae_min,ae_max)
decoder_data_tensor

tensor([[12.4342,  8.7421,  7.9385,  ...,  2.7860,  4.1166,  2.6613],
        [11.1724,  9.8223,  6.4471,  ...,  2.5730,  2.2602,  1.9801],
        [11.9670,  7.9854,  7.6356,  ...,  3.0362,  4.2534,  3.0137],
        ...,
        [11.6732,  9.3309,  7.4641,  ...,  2.3362,  3.0812,  2.3547],
        [11.4491,  7.8553,  7.7126,  ...,  3.9801,  4.5421,  3.2760],
        [11.9367,  7.8369,  6.4934,  ...,  3.1080,  6.0812,  2.5570]],
       device='cuda:0', grad_fn=<AddBackward0>)

In [16]:
data_tensor, decoder_data_tensor

(tensor([[12.1237,  8.7101,  7.8795,  ...,  2.7827,  2.8683,  1.9794],
         [11.0980,  9.7493,  6.7152,  ...,  1.9527,  2.5523,  2.1316],
         [11.8757,  8.0035,  8.1103,  ...,  2.6036,  3.8544,  3.1530],
         ...,
         [11.5913,  9.9503,  7.1183,  ...,  3.3294,  2.7174,  2.4806],
         [11.9540,  7.9140,  7.9862,  ...,  5.6962,  4.6191,  3.7743],
         [11.9979,  7.7001,  6.4521,  ...,  2.7505,  5.6207,  3.1651]],
        device='cuda:0'),
 tensor([[12.4342,  8.7421,  7.9385,  ...,  2.7860,  4.1166,  2.6613],
         [11.1724,  9.8223,  6.4471,  ...,  2.5730,  2.2602,  1.9801],
         [11.9670,  7.9854,  7.6356,  ...,  3.0362,  4.2534,  3.0137],
         ...,
         [11.6732,  9.3309,  7.4641,  ...,  2.3362,  3.0812,  2.3547],
         [11.4491,  7.8553,  7.7126,  ...,  3.9801,  4.5421,  3.2760],
         [11.9367,  7.8369,  6.4934,  ...,  3.1080,  6.0812,  2.5570]],
        device='cuda:0', grad_fn=<AddBackward0>))

In [20]:
np.savez('GSE20194_AutoEncoder.npz', origin_data=data_tensor.cpu().detach().numpy(), decoder_data=decoder_data_tensor.cpu().detach().numpy())

In [15]:
torch.save(model.state_dict(), model_file_name)

# 生成对抗网络

In [16]:
import torch
import torch.nn as nn
import torch.optim as optim

# 定义生成器（Generator）
class Generator(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(Generator, self).__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, 256),
            nn.ReLU(),
            nn.Linear(256, 512),
            nn.ReLU(),
            nn.Linear(512, output_dim),
            nn.Tanh()
        )

    def forward(self, x):
        return self.net(x)

# 定义判别器（Discriminator）
class Discriminator(nn.Module):
    def __init__(self, input_dim):
        super(Discriminator, self).__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, 256),
            nn.ReLU(),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Linear(128, 1),
            nn.Sigmoid()
        )

    def forward(self, x):
        return self.net(x)

# 超参数设置
input_dim = 64
data_dim = 512
lr = 0.00001
epochs = 1000
batch_size = 64

# 初始化生成器和判别器
generator = Generator(input_dim, data_dim).to(device)
discriminator = Discriminator(data_dim).to(device)

# 设置优化器
g_optimizer = optim.Adam(generator.parameters(), lr=lr)
d_optimizer = optim.Adam(discriminator.parameters(), lr=lr)

# 设置损失函数
loss_func = nn.BCELoss()



In [17]:
encoded_data.shape

torch.Size([56, 512])

In [18]:
gan_dataset = TensorDataset(encoded_data,encoded_data)
gan_dataloader = DataLoader(gan_dataset, batch_size=batch_size, shuffle=True)

In [20]:
discriminator.train()
generator.train()
min_loss = 1000
gan_model_name = 'generator_pCR.pth'
for epoch in range(epochs):
    for data in gan_dataloader:
        # print("********")
        real_data, _ = data
        real_data = real_data.to(device)
        # 训练判别器
        d_optimizer.zero_grad()
        real_label = torch.ones(real_data.shape[0], 1).to(device)

        fake_data = generator(torch.randn(real_data.shape[0], input_dim).to(device)).detach()
        fake_label = torch.zeros(real_data.shape[0], 1).to(device)
        real_out = discriminator(real_data)
        fake_out = discriminator(fake_data)
        real_loss = loss_func(real_out, real_label)
        fake_loss = loss_func(fake_out, fake_label)
        d_loss = real_loss + fake_loss
        d_loss.backward(retain_graph=True)
        d_optimizer.step()

        # 训练生成器
        g_optimizer.zero_grad()
        gen_input = torch.randn(real_data.shape[0], input_dim).to(device)
        gen_output = generator(gen_input)
        dis_output = discriminator(gen_output)

        g_loss = loss_func(dis_output, real_label)

        g_loss.backward()
        g_optimizer.step()
        if g_loss.item() < min_loss:
            torch.save(generator.state_dict(), gan_model_name)
    if epoch % 10 == 0:
        print("Epoch: {}, G_Loss: {:.4f}, D_Loss: {:.4f}".format(epoch, g_loss.item(), d_loss.item()))


Epoch: 0, G_Loss: 0.5871, D_Loss: 1.0054
Epoch: 10, G_Loss: 0.5849, D_Loss: 1.0103
Epoch: 20, G_Loss: 0.5805, D_Loss: 1.0124
Epoch: 30, G_Loss: 0.5787, D_Loss: 1.0138
Epoch: 40, G_Loss: 0.5776, D_Loss: 1.0148
Epoch: 50, G_Loss: 0.5790, D_Loss: 1.0180
Epoch: 60, G_Loss: 0.5804, D_Loss: 1.0193
Epoch: 70, G_Loss: 0.5782, D_Loss: 1.0245
Epoch: 80, G_Loss: 0.5746, D_Loss: 1.0274
Epoch: 90, G_Loss: 0.5674, D_Loss: 1.0279
Epoch: 100, G_Loss: 0.5705, D_Loss: 1.0258
Epoch: 110, G_Loss: 0.5702, D_Loss: 1.0203
Epoch: 120, G_Loss: 0.5748, D_Loss: 1.0132
Epoch: 130, G_Loss: 0.5810, D_Loss: 1.0047
Epoch: 140, G_Loss: 0.5820, D_Loss: 0.9982
Epoch: 150, G_Loss: 0.5876, D_Loss: 0.9915
Epoch: 160, G_Loss: 0.5901, D_Loss: 0.9886
Epoch: 170, G_Loss: 0.5940, D_Loss: 0.9816
Epoch: 180, G_Loss: 0.5984, D_Loss: 0.9788
Epoch: 190, G_Loss: 0.6028, D_Loss: 0.9731
Epoch: 200, G_Loss: 0.6045, D_Loss: 0.9700
Epoch: 210, G_Loss: 0.6069, D_Loss: 0.9685
Epoch: 220, G_Loss: 0.6060, D_Loss: 0.9679
Epoch: 230, G_Loss: 0.

# 生成样本数据

## 查看需要生成多少样本数据

In [21]:
Liver_data = pd.read_csv('GSE20194_merge.csv')
Liver_data

Unnamed: 0,MIR4640...DDR1,RFC2,HSPA6,PAX8,GUCA1A,MIR5193...UBA7,THRA,PTPN21,CCL5,CYP2E1,...,X.1118,X.1119,X.1120,X.1121,X.1122,X.1123,X.1124,X.1125,X.1126,group
0,12.1237,8.7101,7.8795,10.6651,6.3526,8.2374,7.7251,5.5228,7.7670,6.8468,...,9.1148,15.9891,15.8078,3.7453,5.4397,2.8710,2.7827,2.8683,1.9794,0
1,11.0980,9.7493,6.7152,10.4700,6.1595,8.0962,6.8715,6.9222,7.4830,5.4643,...,7.6801,15.0246,14.7647,3.8925,2.9347,4.1339,1.9527,2.5523,2.1316,0
2,11.8757,8.0035,8.1103,10.6567,5.7084,8.7424,7.2557,5.0792,7.1722,5.6942,...,8.7441,16.0024,15.8855,3.2907,3.0822,5.7312,2.6036,3.8544,3.1530,0
3,12.5807,8.0104,7.5885,10.1383,6.7935,9.2063,8.0392,5.0627,8.2889,6.2767,...,8.6348,16.1373,15.8473,3.0491,3.2856,6.0368,4.6105,3.4868,3.6321,0
4,12.4304,7.6919,8.0937,10.1545,5.2033,8.8423,8.6325,6.1289,9.7141,6.2651,...,7.9115,15.7183,15.6917,3.5532,3.9991,5.7407,4.1208,5.6764,3.3179,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
273,11.8023,7.8182,8.0532,11.2096,6.7603,10.0453,8.3714,4.8747,10.3120,7.1426,...,10.9442,16.0837,15.6543,5.2000,4.1865,4.5330,3.1679,5.9392,3.5784,1
274,11.4574,8.0488,8.2875,11.0754,6.8105,9.0332,8.4071,5.3538,11.2255,6.3486,...,10.9484,16.1763,15.8729,4.3030,4.0989,3.8379,5.6705,5.8172,2.1837,1
275,11.9181,7.3890,8.2965,10.8386,6.7350,8.9390,8.2275,5.9537,11.9945,6.1400,...,9.9604,15.6575,15.3779,5.2125,3.8788,4.0721,4.4384,2.3209,2.6036,1
276,11.7012,7.7375,8.5987,11.3588,7.5786,9.5624,8.5748,8.1305,6.7621,6.8786,...,10.8120,16.4401,15.9739,5.3790,4.3518,4.8250,2.7099,4.8099,3.5340,1


In [22]:
gen_num = len(Liver_data[Liver_data['group'] == 1]) - len(Liver_data[Liver_data['group'] == 0])
gen_num

166

In [23]:
gen_z = torch.randn(gen_num, input_dim).to(device)
gen_z.shape

torch.Size([166, 64])

In [24]:
gen_data = generator(gen_z)
gen_data

tensor([[ 0.2306,  0.1498,  0.1071,  ...,  0.0028, -0.1453,  0.0810],
        [ 0.0316,  0.2035, -0.0397,  ..., -0.2219, -0.1080,  0.0519],
        [ 0.0489,  0.2132, -0.0508,  ...,  0.1057, -0.0746,  0.1546],
        ...,
        [ 0.1381,  0.3370, -0.0667,  ..., -0.0547, -0.0470, -0.0187],
        [-0.0165,  0.2081, -0.2139,  ..., -0.0531, -0.0848,  0.1195],
        [ 0.1605,  0.0727, -0.0980,  ..., -0.0684,  0.0398,  0.1790]],
       device='cuda:0', grad_fn=<TanhBackward0>)

## 解码

In [25]:
gen_normal_data = model.decoder(gen_data)
gen_normal_data

tensor([[0.5215, 0.4825, 0.4941,  ..., 0.4170, 0.4519, 0.4414],
        [0.5195, 0.4907, 0.5002,  ..., 0.4165, 0.4381, 0.4443],
        [0.5186, 0.4932, 0.5027,  ..., 0.4354, 0.4611, 0.4470],
        ...,
        [0.5149, 0.4848, 0.4819,  ..., 0.4146, 0.4501, 0.4357],
        [0.5167, 0.4859, 0.4912,  ..., 0.4218, 0.4488, 0.4374],
        [0.5252, 0.4926, 0.5043,  ..., 0.4129, 0.4604, 0.4388]],
       device='cuda:0', grad_fn=<SigmoidBackward0>)

## minmax反归一化

In [26]:
gen_denormal_data = min_max_denormalize(gen_normal_data, ae_min, ae_max)
gen_denormal_data

tensor([[8.9493, 8.0855, 8.3436,  ..., 6.6367, 7.4098, 7.1763],
        [8.9037, 8.2686, 8.4786,  ..., 6.6260, 7.1040, 7.2411],
        [8.8844, 8.3239, 8.5336,  ..., 7.0453, 7.6135, 7.3016],
        ...,
        [8.8031, 8.1380, 8.0731,  ..., 6.5842, 7.3690, 7.0504],
        [8.8437, 8.1619, 8.2782,  ..., 6.7434, 7.3415, 7.0887],
        [9.0309, 8.3089, 8.5683,  ..., 6.5480, 7.5966, 7.1200]],
       device='cuda:0', grad_fn=<AddBackward0>)

In [27]:
gen_data_np = gen_denormal_data.cpu().detach().numpy()
gen_data_np

array([[8.949321 , 8.08549  , 8.34357  , ..., 6.6366873, 7.4098005,
        7.176321 ],
       [8.903719 , 8.268575 , 8.478637 , ..., 6.6259947, 7.1039953,
        7.241062 ],
       [8.884363 , 8.323922 , 8.533588 , ..., 7.0453157, 7.613474 ,
        7.3015747],
       ...,
       [8.803086 , 8.138028 , 8.073134 , ..., 6.5841513, 7.3689575,
        7.0503817],
       [8.84372  , 8.161943 , 8.27822  , ..., 6.743368 , 7.341528 ,
        7.088687 ],
       [9.030913 , 8.308897 , 8.568295 , ..., 6.5480127, 7.5966454,
        7.12002  ]], dtype=float32)

In [28]:
zeros_column = np.zeros((gen_data_np.shape[0], 1))

# 将列向量与原始数组水平堆叠
gen_data_np = np.hstack((gen_data_np, zeros_column))
gen_data_np, gen_data_np.shape

(array([[8.94932079, 8.08549023, 8.34356976, ..., 7.40980053, 7.17632103,
         0.        ],
        [8.90371895, 8.26857471, 8.47863674, ..., 7.10399532, 7.24106216,
         0.        ],
        [8.88436317, 8.32392216, 8.53358841, ..., 7.61347389, 7.30157471,
         0.        ],
        ...,
        [8.80308628, 8.13802814, 8.07313442, ..., 7.36895752, 7.05038166,
         0.        ],
        [8.84372044, 8.16194344, 8.27822018, ..., 7.34152794, 7.08868694,
         0.        ],
        [9.03091335, 8.30889702, 8.56829453, ..., 7.59664536, 7.12001991,
         0.        ]]),
 (166, 22284))

In [29]:
# 将NumPy数组转换为DataFrame
arr_df = pd.DataFrame(gen_data_np, columns=Liver_data.columns.values)

# 将两个DataFrame对象沿着行的方向连接
new_Liver_df = pd.concat([Liver_data, arr_df], axis=0)

In [30]:
new_Liver_df

Unnamed: 0,MIR4640...DDR1,RFC2,HSPA6,PAX8,GUCA1A,MIR5193...UBA7,THRA,PTPN21,CCL5,CYP2E1,...,X.1118,X.1119,X.1120,X.1121,X.1122,X.1123,X.1124,X.1125,X.1126,group
0,12.123700,8.710100,7.879500,10.665100,6.352600,8.237400,7.725100,5.522800,7.767000,6.846800,...,9.114800,15.989100,15.807800,3.745300,5.439700,2.871000,2.782700,2.868300,1.979400,0.0
1,11.098000,9.749300,6.715200,10.470000,6.159500,8.096200,6.871500,6.922200,7.483000,5.464300,...,7.680100,15.024600,14.764700,3.892500,2.934700,4.133900,1.952700,2.552300,2.131600,0.0
2,11.875700,8.003500,8.110300,10.656700,5.708400,8.742400,7.255700,5.079200,7.172200,5.694200,...,8.744100,16.002400,15.885500,3.290700,3.082200,5.731200,2.603600,3.854400,3.153000,0.0
3,12.580700,8.010400,7.588500,10.138300,6.793500,9.206300,8.039200,5.062700,8.288900,6.276700,...,8.634800,16.137300,15.847300,3.049100,3.285600,6.036800,4.610500,3.486800,3.632100,0.0
4,12.430400,7.691900,8.093700,10.154500,5.203300,8.842300,8.632500,6.128900,9.714100,6.265100,...,7.911500,15.718300,15.691700,3.553200,3.999100,5.740700,4.120800,5.676400,3.317900,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
161,8.954011,8.289443,8.580814,8.549145,7.949219,9.047162,8.509266,7.866513,9.219642,8.381490,...,8.531967,11.467782,11.163027,6.889524,7.166822,7.628172,6.407483,7.211306,6.980609,0.0
162,8.946689,8.129368,8.495572,8.366379,7.989347,8.805641,8.322453,7.760155,9.048155,8.497838,...,8.406697,10.815388,10.679202,7.216028,7.344364,7.687232,6.824277,7.449362,7.382874,0.0
163,8.803086,8.138028,8.073134,8.381591,7.858770,8.747678,8.156619,7.512436,8.891790,8.120791,...,8.654006,11.218925,11.202312,6.629396,7.373271,7.605550,6.584151,7.368958,7.050382,0.0
164,8.843720,8.161943,8.278220,8.440951,7.993417,8.722024,8.345293,7.674944,8.720527,8.445460,...,8.291999,10.823193,10.648807,7.005806,7.229563,7.693204,6.743368,7.341528,7.088687,0.0


In [31]:
new_Liver_df.to_csv('gan_GSE20194.csv', index=False)

In [32]:
gan_Liver_N = new_Liver_df[new_Liver_df['group'] == 0].drop('group',axis=1)

In [33]:
Liver_N_df = pd.read_csv('GSE20194_pCR.csv')
Liver_N_df.columns.values[0] = 'gene_name'
Liver_N_df

Unnamed: 0,gene_name,GSM505335,GSM505336,GSM505339,GSM505341,GSM505342,GSM505344,GSM505349,GSM505351,GSM505356,...,GSM505540,GSM505545,GSM505559,GSM505562,GSM505564,GSM505566,GSM505580,GSM505583,GSM505584,GSM505598
0,MIR4640...DDR1,12.1237,11.0980,11.8757,12.5807,12.4304,12.1154,12.3641,11.2810,12.8908,...,11.2111,10.7914,12.0608,11.5006,11.9590,12.3984,12.3869,11.5913,11.9540,11.9979
1,RFC2,8.7101,9.7493,8.0035,8.0104,7.6919,7.9201,8.9391,8.6218,8.0940,...,7.3230,8.6287,7.8334,9.1396,9.4844,8.5418,8.4071,9.9503,7.9140,7.7001
2,HSPA6,7.8795,6.7152,8.1103,7.5885,8.0937,6.2070,7.8820,8.3951,9.3695,...,8.6958,6.9254,8.1746,7.8892,7.2895,7.4417,8.0141,7.1183,7.9862,6.4521
3,PAX8,10.6651,10.4700,10.6567,10.1383,10.1545,10.5313,10.1676,10.6988,10.7094,...,11.0041,10.3318,10.8744,10.3686,10.5525,10.7718,10.5859,10.4632,11.2998,10.9807
4,GUCA1A,6.3526,6.1595,5.7084,6.7935,5.2033,5.2639,5.9152,6.4474,3.5129,...,7.2929,7.0566,6.8828,5.5763,6.4399,5.9748,6.1521,5.9959,7.1580,7.1452
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22278,X.1122,5.4397,2.9347,3.0822,3.2856,3.9991,3.4611,3.0957,5.5843,2.7191,...,2.5796,4.2308,3.8831,3.0919,4.0226,3.6437,3.1812,2.7237,4.6803,3.3462
22279,X.1123,2.8710,4.1339,5.7312,6.0368,5.7407,3.8886,3.0149,5.7290,6.0254,...,3.9401,6.3050,3.6722,5.5273,5.2616,4.1453,4.2605,5.3282,3.9274,5.9248
22280,X.1124,2.7827,1.9527,2.6036,4.6105,4.1208,3.8786,4.4895,2.4082,3.5314,...,1.7461,4.0598,3.1572,1.7910,2.6986,3.1844,1.7573,3.3294,5.6962,2.7505
22281,X.1125,2.8683,2.5523,3.8544,3.4868,5.6764,3.4884,3.1887,4.0703,5.1805,...,5.1508,4.3344,5.2353,2.8005,6.3545,5.8272,3.3362,2.7174,4.6191,5.6207


In [34]:
# 使用reset_index方法重新排序索引
gan_Liver_N = gan_Liver_N.reset_index(drop=True)

In [35]:
# 使用rename方法修改列名
gan_Liver_N = gan_Liver_N.T.rename(columns=lambda x: 'GSM' + str(x))
gan_Liver_N

Unnamed: 0,GSM0,GSM1,GSM2,GSM3,GSM4,GSM5,GSM6,GSM7,GSM8,GSM9,...,GSM212,GSM213,GSM214,GSM215,GSM216,GSM217,GSM218,GSM219,GSM220,GSM221
MIR4640...DDR1,12.1237,11.0980,11.8757,12.5807,12.4304,12.1154,12.3641,11.2810,12.8908,12.0125,...,8.736911,8.891892,8.845691,8.902140,9.018620,8.954011,8.946689,8.803086,8.843720,9.030913
RFC2,8.7101,9.7493,8.0035,8.0104,7.6919,7.9201,8.9391,8.6218,8.0940,8.3706,...,8.259678,8.063509,8.299884,8.183553,8.069152,8.289443,8.129368,8.138028,8.161943,8.308897
HSPA6,7.8795,6.7152,8.1103,7.5885,8.0937,6.2070,7.8820,8.3951,9.3695,7.5099,...,8.400273,8.311166,8.225692,8.206296,8.392560,8.580814,8.495572,8.073134,8.278220,8.568295
PAX8,10.6651,10.4700,10.6567,10.1383,10.1545,10.5313,10.1676,10.6988,10.7094,10.5047,...,8.407107,8.544320,8.370914,8.387035,8.551643,8.549145,8.366379,8.381591,8.440951,8.513543
GUCA1A,6.3526,6.1595,5.7084,6.7935,5.2033,5.2639,5.9152,6.4474,3.5129,6.6090,...,7.882220,7.911414,7.892449,7.940697,7.906001,7.949219,7.989347,7.858770,7.993417,8.151786
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
X.1122,5.4397,2.9347,3.0822,3.2856,3.9991,3.4611,3.0957,5.5843,2.7191,4.2405,...,7.613745,7.056196,7.430583,7.357050,7.326297,7.166822,7.344364,7.373271,7.229563,7.102919
X.1123,2.8710,4.1339,5.7312,6.0368,5.7407,3.8886,3.0149,5.7290,6.0254,3.0093,...,7.633843,7.598717,7.669730,7.593400,7.694119,7.628172,7.687232,7.605550,7.693204,7.600895
X.1124,2.7827,1.9527,2.6036,4.6105,4.1208,3.8786,4.4895,2.4082,3.5314,3.4970,...,6.780201,6.506226,6.853326,6.522325,6.695195,6.407483,6.824277,6.584151,6.743368,6.548013
X.1125,2.8683,2.5523,3.8544,3.4868,5.6764,3.4884,3.1887,4.0703,5.1805,5.1359,...,7.503563,7.293713,7.515451,7.330578,7.375084,7.211306,7.449362,7.368958,7.341528,7.596645


In [36]:
gan_Liver_N.to_csv('gan_GSE20194_pCR.csv')

In [37]:
gan_Liver_T = new_Liver_df[new_Liver_df['group'] == 1].drop('group',axis=1)
gan_Liver_T

Unnamed: 0,MIR4640...DDR1,RFC2,HSPA6,PAX8,GUCA1A,MIR5193...UBA7,THRA,PTPN21,CCL5,CYP2E1,...,X.1117,X.1118,X.1119,X.1120,X.1121,X.1122,X.1123,X.1124,X.1125,X.1126
56,12.4440,8.3774,6.7866,10.2851,5.9064,8.3767,8.0356,6.6745,6.2325,6.8450,...,7.4678,9.3738,15.6236,15.2785,3.2915,3.6526,2.6412,1.2652,3.0690,2.0271
57,12.2005,7.8592,8.0963,10.4624,4.9582,9.2973,7.0581,6.4607,6.9047,5.8878,...,9.6656,8.8500,15.3234,15.1286,3.3811,2.5880,4.4798,4.8098,3.1637,2.4758
58,12.6709,8.6762,7.4812,10.1887,5.2332,9.1721,8.6061,7.0932,6.5940,5.6843,...,7.6012,8.2567,15.4604,15.2674,3.1665,3.9743,5.2597,4.3815,2.8034,2.4669
59,11.6619,8.2557,7.9923,10.7705,6.3296,9.3777,8.4776,6.5878,6.0877,6.5169,...,7.6331,9.0089,15.5185,15.1655,4.0045,3.8503,5.9114,0.7882,3.1831,3.4820
60,11.8397,8.7971,7.8321,10.2869,5.8389,7.0841,7.3419,7.3167,6.3456,6.1708,...,8.0249,9.2004,15.3143,14.9506,3.0514,3.2946,5.1537,3.9179,3.1881,2.9769
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
273,11.8023,7.8182,8.0532,11.2096,6.7603,10.0453,8.3714,4.8747,10.3120,7.1426,...,9.5479,10.9442,16.0837,15.6543,5.2000,4.1865,4.5330,3.1679,5.9392,3.5784
274,11.4574,8.0488,8.2875,11.0754,6.8105,9.0332,8.4071,5.3538,11.2255,6.3486,...,9.6936,10.9484,16.1763,15.8729,4.3030,4.0989,3.8379,5.6705,5.8172,2.1837
275,11.9181,7.3890,8.2965,10.8386,6.7350,8.9390,8.2275,5.9537,11.9945,6.1400,...,7.7356,9.9604,15.6575,15.3779,5.2125,3.8788,4.0721,4.4384,2.3209,2.6036
276,11.7012,7.7375,8.5987,11.3588,7.5786,9.5624,8.5748,8.1305,6.7621,6.8786,...,8.8167,10.8120,16.4401,15.9739,5.3790,4.3518,4.8250,2.7099,4.8099,3.5340


In [57]:
gan_Liver_T.T.to_csv('gan_Liver_T.csv')

In [38]:
gan_Liver_N.shape

(22283, 222)

In [39]:
gan_Liver_T.shape

(222, 22283)