In [1]:
import pandas as pd
import numpy as np
import os

此流程是将上游定量后的counts，批量转为FPKM、TPM值

In [2]:
# Species keyword; it names the per-species working directory and prefixes
# the output file names written later in the notebook.
Keyname = 'Nc'
# NOTE(review): absolute, machine-specific base directory; adjust before
# running on another machine.
path = ''.join(['/Users/dongjiacheng/Desktop/coder/mtd/RNA-seq/', Keyname, '/'])

print(path)

/Users/dongjiacheng/Desktop/coder/mtd/RNA-seq/Nc/


1. 将Counts文件夹的readcount的txt文件，转为FPKM，保存为csv文件

In [3]:
def rpkm(data):
    """Convert a read-count table to FPKM/RPKM values.

    The first six columns are carried over unchanged. Every later column is
    treated as one sample's read counts and converted with
    ``count / (column_total / 1e6) * 1000 / Length``.

    Parameters
    ----------
    data : pandas.DataFrame
        Table with six leading annotation columns followed by one read-count
        column per sample. It must contain a column named ``"Length"``
        holding the gene length of each row.

    Returns
    -------
    pandas.DataFrame
        A new table with the same column labels and order as ``data``; the
        sample columns hold the converted values.

    Raises
    ------
    KeyError
        If ``data`` has no ``"Length"`` column.
    """
    annotation = data.iloc[:, :6]  # leading annotation columns, kept as-is
    counts = data.iloc[:, 6:]  # one column of read counts per sample

    # Library size of every sample, expressed in millions of reads.
    per_million = counts.sum() / 1000000

    # Reads-per-million first, then per kilobase of gene length. All sample
    # columns are processed at once instead of concatenating one column per
    # loop iteration, which re-copied the growing table every time.
    fpkm = (
        counts.div(per_million, axis="columns")
        .mul(1000)
        .div(data["Length"], axis="index")
    )

    return pd.concat([annotation, fpkm], axis=1)



def txt_to_csv_rpkm(txt_read_path, csv_output_path):
    """
    Batch-convert read-count text files to FPKM CSV files.

    Every regular, non-hidden file in ``txt_read_path`` is read as a
    tab-separated table whose header is on the second line (``header=1``),
    converted with ``rpkm``, rounded to 2 decimals and written to
    ``csv_output_path`` as ``<name before the first dot>.csv``. ``.bam`` and
    double quotes are stripped from the column names.

    Parameters
    ----------
    txt_read_path : str
        Directory containing the read-count text files.
    csv_output_path : str
        Directory receiving the CSV files; it is created when missing.

    Returns
    -------
    bool
        ``True`` when every file was converted. ``False`` as soon as any
        step fails; the error is printed and the remaining files are not
        processed.
    """
    try:
        os.makedirs(csv_output_path, exist_ok=True)

        # Sorted so the processing order does not depend on the file system.
        for file_name in sorted(os.listdir(txt_read_path)):
            source_file = os.path.join(txt_read_path, file_name)
            # Hidden entries (e.g. macOS ".DS_Store") and sub-directories are
            # not count tables; trying to parse them would abort the batch.
            if file_name.startswith(".") or not os.path.isfile(source_file):
                continue

            data = pd.read_table(source_file, sep="\t", header=1)
            RPKM_data = rpkm(data)
            # Keep 2 decimal places.
            RPKM_data = RPKM_data.round(2)
            # Clean up the sample names.
            RPKM_data = RPKM_data.rename(
                columns=lambda x: x.replace(".bam", "").replace('"', "")
            )
            csv_name = file_name.split(".")[0] + ".csv"
            RPKM_data.to_csv(
                os.path.join(csv_output_path, csv_name), sep=",", index=False
            )

    except Exception as e:
        print(e)
        return False

    return True

# Use the configured `path` (as the read-count conversion cell does) so the
# result does not depend on the notebook's current working directory.
txt_to_csv_rpkm(path + 'Counts/', path + 'FPKM/')

2. 合并FPKM的csv文件

In [4]:
def merge_csv_col(first_csv_path, csv_output_path):
    """
    Merge the 7th column of every CSV file in a directory into one table.

    Parameters
    ----------
    first_csv_path : str
        CSV file whose first six columns become the leading columns of the
        merged table.
    csv_output_path : str
        Directory holding the CSV files to merge, with or without a trailing
        path separator. Only names ending in ``.csv`` are read, in sorted
        order.

    Returns
    -------
    pandas.DataFrame
        The six leading columns followed by the 7th column of each CSV file.

    Raises
    ------
    IndexError
        If one of the CSV files has fewer than seven columns.
    """
    first = pd.read_csv(first_csv_path, sep=",")
    pieces = [first.iloc[:, 0:6]]  # leading columns of the merged table

    file_list = sorted(
        name for name in os.listdir(csv_output_path) if name.endswith(".csv")
    )

    for file_name in file_list:
        # os.path.join works whether or not the directory ends with a
        # separator; plain string concatenation required a trailing "/".
        data = pd.read_csv(os.path.join(csv_output_path, file_name), sep=",")
        pieces.append(data.iloc[:, 6])

    # Concatenate once instead of re-copying the growing table per file.
    return pd.concat(pieces, axis=1)


# Merge the per-sample FPKM tables. The directory is built from the
# configured `path` so the cell does not depend on the notebook's current
# working directory.
df_merge = merge_csv_col(path + 'FPKM/SRR10078920.csv', path + 'FPKM/')

# Save the merged table.
df_merge.to_csv(path + Keyname + '_1_FPKM.csv', sep=',', index=False)

df_merge

Unnamed: 0,Geneid,Chr,Start,End,Strand,Length,SRR10078920,SRR10078921,SRR1025973,SRR1025974,...,SRR9214968,SRR959443,SRR959444,SRR9601987,SRR9601988,SRR999602,SRR999604,SRR999605,SRR999606,SRR999607
0,NCU10129,NC_026501.1,1151,2878,-,1728,21.84,20.15,0.47,0.63,...,6.81,2.28,36.58,0.71,0.86,6.13,7.55,3.87,1.91,3.10
1,NCU09901,NC_026501.1,3178,6443,+,3266,7.08,9.90,15.15,15.25,...,11.44,10.97,30.22,0.66,1.48,11.97,12.40,8.75,11.83,10.59
2,NCU09903,NC_026501.1,9177,13334,+,4158,0.73,2.16,42.54,47.06,...,68.48,26.35,35.37,0.97,2.09,15.79,15.86,11.41,30.68,26.72
3,NCU11134,NC_026501.1,15930,16647,+,718,0.00,0.00,0.00,0.12,...,0.38,0.34,20.66,0.00,0.13,0.13,0.20,0.00,0.19,0.26
4,NCU09904,NC_026501.1,17441,19615,+,2175,8.12,14.51,0.34,0.73,...,0.96,0.34,12.31,0.38,0.17,18.59,20.32,10.27,1.22,1.05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10585,NCU16025,NC_026614.1,55087,57247,+,2161,5.12,4.89,4.10,5.64,...,0.43,1.99,237.59,3.66,7.15,4.54,4.02,4.82,2.29,4.81
10586,NCU16026,NC_026614.1,55087,56529,+,1443,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
10587,NCU16027,NC_026614.1,58608,58832,+,225,58.73,38.26,7.17,21.87,...,1.22,33.38,1184.66,8.69,13.00,59.41,40.75,30.80,49.71,79.21
10588,NCU16028,NC_026614.1,59529,60281,+,753,58.04,51.44,6.82,14.40,...,2.70,30.90,757.43,13.13,23.55,65.50,53.66,28.06,43.04,75.57


3. 将合并的FPKM文件，计算出TPM值

In [5]:
# Compute TPM values from the merged FPKM table.
def tpm(data):
    """Rescale every sample column so that it sums to one million.

    The first six columns are carried over unchanged. Every later column is
    divided by ``column_total / 1e6``; applied to FPKM values this yields
    TPM.

    Parameters
    ----------
    data : pandas.DataFrame
        Table with six leading annotation columns followed by one numeric
        column per sample.

    Returns
    -------
    pandas.DataFrame
        A new table with the same column labels and order as ``data``; the
        sample columns hold the rescaled values.
    """
    annotation = data.iloc[:, :6]  # leading annotation columns, kept as-is
    values = data.iloc[:, 6:]  # one numeric column per sample

    # Column totals expressed in millions.
    per_million = values.sum() / 1000000

    # All sample columns are rescaled at once instead of concatenating one
    # column per loop iteration, which re-copied the growing table each time.
    scaled = values.div(per_million, axis="columns")

    return pd.concat([annotation, scaled], axis=1)

# Convert the merged FPKM table to TPM, keep 2 decimals, then save and
# display the result.
df_tpm = tpm(df_merge).round(2)
df_tpm.to_csv(path + Keyname + '_2_TPM.csv', index=False, sep=',')
df_tpm

Unnamed: 0,Geneid,Chr,Start,End,Strand,Length,SRR10078920,SRR10078921,SRR1025973,SRR1025974,...,SRR9214968,SRR959443,SRR959444,SRR9601987,SRR9601988,SRR999602,SRR999604,SRR999605,SRR999606,SRR999607
0,NCU10129,NC_026501.1,1151,2878,-,1728,47.38,44.05,1.00,1.43,...,14.81,5.45,75.13,1.14,1.44,14.87,18.25,9.33,4.65,7.58
1,NCU09901,NC_026501.1,3178,6443,+,3266,15.36,21.64,32.18,34.58,...,24.88,26.23,62.07,1.06,2.48,29.03,29.98,21.09,28.82,25.88
2,NCU09903,NC_026501.1,9177,13334,+,4158,1.58,4.72,90.35,106.70,...,148.91,63.00,72.65,1.56,3.50,38.30,38.35,27.50,74.75,65.30
3,NCU11134,NC_026501.1,15930,16647,+,718,0.00,0.00,0.00,0.27,...,0.83,0.81,42.43,0.00,0.22,0.32,0.48,0.00,0.46,0.64
4,NCU09904,NC_026501.1,17441,19615,+,2175,17.61,31.72,0.72,1.66,...,2.09,0.81,25.28,0.61,0.29,45.09,49.13,24.75,2.97,2.57
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10585,NCU16025,NC_026614.1,55087,57247,+,2161,11.11,10.69,8.71,12.79,...,0.94,4.76,487.98,5.88,11.99,11.01,9.72,11.62,5.58,11.76
10586,NCU16026,NC_026614.1,55087,56529,+,1443,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
10587,NCU16027,NC_026614.1,58608,58832,+,225,127.40,83.64,15.23,49.59,...,2.65,79.81,2433.13,13.95,21.80,144.09,98.53,74.22,121.11,193.59
10588,NCU16028,NC_026614.1,59529,60281,+,753,125.90,112.45,14.49,32.65,...,5.87,73.88,1555.66,21.08,39.49,158.86,129.74,67.62,104.86,184.69


4. 对readcount进行合并

In [6]:
def txt_to_csv(txt_read_path, csv_output_path):
    """
    Batch-convert tab-separated text files to CSV files.

    Every regular, non-hidden file in ``txt_read_path`` is read as a
    tab-separated table whose header is on the second line (``header=1``)
    and written to ``csv_output_path`` as ``<name before the first dot>.csv``.

    Parameters
    ----------
    txt_read_path : str
        Directory containing the text files.
    csv_output_path : str
        Directory receiving the CSV files; it is created when missing.

    Returns
    -------
    bool
        ``True`` when every file was converted. ``False`` as soon as any
        step fails; the error is printed and the remaining files are not
        processed.
    """
    try:
        os.makedirs(csv_output_path, exist_ok=True)

        # Sorted so the processing order does not depend on the file system.
        for file_name in sorted(os.listdir(txt_read_path)):
            source_file = os.path.join(txt_read_path, file_name)
            # Hidden entries (e.g. macOS ".DS_Store") and sub-directories are
            # not tables; trying to parse them would abort the batch.
            if file_name.startswith(".") or not os.path.isfile(source_file):
                continue

            data = pd.read_table(source_file, sep="\t", header=1)
            csv_name = file_name.split(".")[0] + ".csv"
            data.to_csv(
                os.path.join(csv_output_path, csv_name), sep=",", index=False
            )

    except Exception as e:
        print(e)
        return False

    return True


# Convert the raw read-count text files under `path` to CSV files.
txt_to_csv(
    txt_read_path=path + 'Counts/',
    csv_output_path=path + 'Counts_csv/',
)

In [7]:
# Merge the per-sample read-count CSV files. They are read from the same
# `path`-based directory the conversion cell wrote them to, so the cell does
# not depend on the notebook's current working directory.
df_merge_counts = merge_csv_col(
    path + 'Counts_csv/SRR10078920.csv', path + 'Counts_csv/'
)
# Clean up the sample names.
df_merge_counts = df_merge_counts.rename(columns=lambda x: x.replace(".bam","").replace('"',''))

df_merge_counts.to_csv(path + Keyname + '_3_Counts.csv', sep=',', index=False)
df_merge_counts

Unnamed: 0,Geneid,Chr,Start,End,Strand,Length,SRR10078920,SRR10078921,SRR1025973,SRR1025974,...,SRR9214968,SRR959443,SRR959444,SRR9601987,SRR9601988,SRR999602,SRR999604,SRR999605,SRR999606,SRR999607
0,NCU10129,NC_026501.1,1151,2878,-,1728,297,267,11,13,...,214,32,392,53,48,230,269,138,72,117
1,NCU09901,NC_026501.1,3178,6443,+,3266,182,248,675,597,...,680,291,612,94,155,848,835,590,843,755
2,NCU09903,NC_026501.1,9177,13334,+,4158,24,69,2413,2346,...,5180,890,912,176,279,1424,1359,979,2783,2425
3,NCU11134,NC_026501.1,15930,16647,+,718,0,0,0,1,...,5,2,92,0,3,2,3,0,3,4
4,NCU09904,NC_026501.1,17441,19615,+,2175,139,242,10,19,...,38,6,166,36,12,877,911,461,58,50
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10585,NCU16025,NC_026614.1,55087,57247,+,2161,87,81,121,146,...,17,35,3184,344,497,213,179,215,108,227
10586,NCU16026,NC_026614.1,55087,56529,+,1443,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10587,NCU16027,NC_026614.1,58608,58832,+,225,104,66,22,59,...,5,61,1653,85,94,290,189,143,244,389
10588,NCU16028,NC_026614.1,59529,60281,+,753,344,297,70,130,...,37,189,3537,430,570,1070,833,436,707,1242
