In [8]:
import pandas as pd
import numpy as np
import os

此流程是将上游定量后的counts，批量转为FPKM、TPM值

In [9]:
# Species keyword: selects the per-species sub-directory this notebook works on.
Keyname = 'Mt'
# Base directory for the selected species. Later cells read `Counts/` beneath it
# and write `FPKM/`, `Counts_csv/` and the merged result tables.
# NOTE(review): this is an absolute, machine-specific path — adjust it before
# running the notebook on another machine.
path = '/Users/dongjiacheng/Desktop/coder/mtd/RNA-seq/'+ Keyname +'/'

# Echo the resolved directory so the reader can verify it before running the pipeline.
print(path)

/Users/dongjiacheng/Desktop/coder/mtd/RNA-seq/Mt/


1. 将Counts文件夹的readcount的txt文件，转为FPKM，保存为csv文件

In [10]:
def rpkm(data):
    """Convert a read-count table to FPKM/RPKM values.

    The first six columns are treated as annotation and passed through
    unchanged; every remaining column is treated as one sample's read counts
    and is replaced by

        count / (column_sum / 1e6) * 1000 / Length

    Args:
        data: DataFrame whose first six columns are annotation and whose
            remaining columns hold per-sample read counts. It must contain a
            column named "Length" holding the gene length.

    Returns:
        A new DataFrame with the same column names and row index as ``data``.
        ``data`` itself is not modified. Division by zero is not
        special-cased, so a sample whose counts sum to zero (or a gene of
        length zero) produces NaN/inf entries.

    Raises:
        KeyError: if ``data`` has no "Length" column.
    """
    annotation = data.iloc[:, :6]
    counts = data.iloc[:, 6:]

    # Library size of every sample, expressed in millions of reads.
    per_million = counts.sum(axis=0) / 1000000

    # All samples are normalised in one vectorised pass instead of growing the
    # table with one pd.concat per column. The operation order (depth, then
    # x1000, then gene length) is kept so the floating-point results match the
    # per-column formula exactly.
    fpkm = (
        counts.div(per_million, axis="columns")
        .mul(1000)
        .div(data["Length"], axis="index")
    )

    table = pd.concat([annotation, fpkm], axis=1)
    table.columns = data.columns  # restore the original column labels
    return table



def txt_to_csv_rpkm(txt_read_path, csv_output_path):
    """Batch-convert tab-separated read-count files to FPKM csv files.

    Every regular, non-hidden file in ``txt_read_path`` is read as a
    tab-separated table whose header is on the second line, converted with
    ``rpkm``, rounded to two decimals, and written to ``csv_output_path`` as
    ``<name up to the first dot>.csv``. ``.bam`` and double quotes are
    stripped from the column names.

    Args:
        txt_read_path: Directory containing the read-count txt files.
        csv_output_path: Directory that receives the csv files; it is created
            if it does not exist.

    Returns:
        None when all files were converted, False if any error occurred (the
        error is printed and the remaining files are not processed).
    """
    try:
        os.makedirs(csv_output_path, exist_ok=True)

        # Sorted so the processing order does not depend on the file system.
        for file_name in sorted(os.listdir(txt_read_path)):
            source_file = os.path.join(txt_read_path, file_name)
            # Skip hidden entries (e.g. macOS ".DS_Store") and sub-directories:
            # they are not count tables, and trying to parse one would abort
            # the whole batch through the except clause below.
            if file_name.startswith(".") or not os.path.isfile(source_file):
                continue

            data = pd.read_table(source_file, sep="\t", header=1)
            RPKM_data = rpkm(data)
            # Keep two decimal places.
            RPKM_data = RPKM_data.round(2)
            # Clean up the sample names used as column labels.
            RPKM_data = RPKM_data.rename(
                columns=lambda x: x.replace(".bam", "").replace('"', "")
            )
            csv_name = file_name.split(".")[0] + ".csv"
            RPKM_data.to_csv(
                os.path.join(csv_output_path, csv_name), sep=",", index=False
            )

    except Exception as e:
        print(e)
        return False

# Convert every read-count txt file under Counts/ into an FPKM csv under FPKM/.
txt_to_csv_rpkm(path + 'Counts/', path + 'FPKM/')

2. 合并FPKM的csv文件

In [11]:
def merge_csv_col(first_csv_path, csv_output_path):
    """Merge the 7th column of every csv file in a directory into one table.

    The first six columns of ``first_csv_path`` form the left part of the
    result. The 7th column of each ``*.csv`` file in ``csv_output_path``
    (taken in sorted file-name order) is appended to the right.

    Columns are combined by row index, not by gene identifier, so all files
    are expected to list the same rows in the same order; this is not checked.

    Args:
        first_csv_path: csv file that supplies the first six columns.
        csv_output_path: Directory holding the csv files to merge. A trailing
            path separator is optional.

    Returns:
        DataFrame with the six leading columns followed by one column per
        csv file.

    Raises:
        IndexError: if a csv file has fewer than seven columns.
    """
    first = pd.read_csv(first_csv_path, sep=",")
    pieces = [first.iloc[:, 0:6]]  # leading columns of the merged table

    file_list = sorted(
        file for file in os.listdir(csv_output_path) if file.endswith(".csv")
    )

    for file_name in file_list:
        # os.path.join instead of plain concatenation, so the directory works
        # with or without a trailing slash.
        data = pd.read_csv(os.path.join(csv_output_path, file_name), sep=",")
        pieces.append(data.iloc[:, 6])

    # One concat for all pieces instead of re-copying the table per file.
    return pd.concat(pieces, axis=1)


# Merge the per-sample FPKM csv files written above. The directory is derived
# from `path` (the same location txt_to_csv_rpkm wrote to) rather than from a
# working-directory-relative './Mt/...' literal, so the cell follows `Keyname`.
fpkm_dir = path + 'FPKM/'
# Any file can supply the six annotation columns; take the first one in sorted order.
first_fpkm_csv = sorted(
    name for name in os.listdir(fpkm_dir) if name.endswith('.csv')
)[0]
df_merge = merge_csv_col(fpkm_dir + first_fpkm_csv, fpkm_dir)

# Save the merged table.
df_merge.to_csv(path + Keyname + '_1_FPKM.csv', sep=',', index=False)

df_merge

Unnamed: 0,Geneid,Chr,Start,End,Strand,Length,SRR10099850,SRR10099851,SRR10099852,SRR10099853,...,SRR7121120,SRR7121121,SRR7121122,SRR7121123,SRR7121124,SRR7121125,SRR7121126,SRR7121127,SRR7121128,SRR7121129
0,MYCTH_2114025,NC_016472.1,1381,2142,+,762,4.92,7.73,8.38,7.09,...,3.22,2.69,2.96,4.12,8.42,7.92,1.48,2.12,4.35,3.83
1,MYCTH_2293935,NC_016472.1,2744,3343,-,600,2.97,4.12,2.16,2.89,...,1.02,3.09,1.59,2.15,2.89,5.24,0.82,1.11,2.12,1.66
2,MYCTH_2293936,NC_016472.1,3344,3817,-,474,0.94,0.80,0.59,0.43,...,0.60,3.71,1.05,1.36,0.97,1.41,0.41,0.59,0.27,0.65
3,MYCTH_2051335,NC_016472.1,5330,6385,-,1056,12.33,9.44,6.83,8.88,...,8.86,8.28,13.23,11.89,13.55,17.53,6.56,8.59,11.52,9.29
4,MYCTH_2121898,NC_016472.1,15458,25533,+,10076,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9287,MYCTH_103797,NC_016478.1,4089438,4091337,+,1900,0.00,0.10,0.54,0.32,...,1.42,2.24,2.78,2.23,1.26,1.06,0.28,0.85,1.07,0.95
9288,MYCTH_2071098,NC_016478.1,4095568,4097284,+,1717,2.96,3.54,2.69,2.43,...,5.42,5.24,7.42,6.75,8.78,8.53,8.93,8.72,8.01,7.05
9289,MYCTH_2114022,NC_016478.1,4098122,4098907,-,786,0.23,0.00,0.00,0.00,...,0.00,0.31,0.05,0.00,0.06,0.00,0.00,0.07,0.08,0.08
9290,MYCTH_2114023,NC_016478.1,4100606,4101293,-,688,0.13,0.14,0.00,0.00,...,0.41,0.43,0.17,0.60,0.67,0.97,0.36,0.97,0.37,0.27


3. 将合并的FPKM文件，计算出TPM值

In [5]:
# Compute TPM values from the merged FPKM table.
def tpm(data):
    """Convert a merged FPKM table to TPM values.

    The first six columns are treated as annotation and passed through
    unchanged; every remaining column is rescaled so that it sums to one
    million:

        value / (column_sum / 1e6)

    Args:
        data: DataFrame whose first six columns are annotation and whose
            remaining columns hold per-sample FPKM values.

    Returns:
        A new DataFrame with the same column names and row index as ``data``.
        ``data`` itself is not modified. A column that sums to zero produces
        NaN entries, as division by zero is not special-cased.
    """
    annotation = data.iloc[:, :6]
    fpkm = data.iloc[:, 6:]

    # Per-sample scaling factor: column total expressed in millions.
    per_million = fpkm.sum(axis=0) / 1000000

    # One vectorised division over all samples instead of one pd.concat per column.
    tpm_values = fpkm.div(per_million, axis="columns")

    table = pd.concat([annotation, tpm_values], axis=1)
    table.columns = data.columns  # restore the original column labels
    return table

# Derive TPM from the merged FPKM table, keep two decimals, then save and display it.
df_tpm = tpm(df_merge).round(2)
df_tpm.to_csv(path + Keyname + '_2_TPM.csv', sep=',', index=False)
df_tpm

Unnamed: 0,Geneid,Chr,Start,End,Strand,Length,SRR10099850,SRR10099851,SRR10099852,SRR10099853,...,SRR7121120,SRR7121121,SRR7121122,SRR7121123,SRR7121124,SRR7121125,SRR7121126,SRR7121127,SRR7121128,SRR7121129
0,MYCTH_2114025,NC_016472.1,1381,2142,+,762,8.73,14.08,15.02,13.02,...,6.00,5.04,5.44,7.43,15.25,14.10,2.67,3.73,7.60,6.57
1,MYCTH_2293935,NC_016472.1,2744,3343,-,600,5.27,7.50,3.87,5.31,...,1.90,5.79,2.92,3.88,5.24,9.33,1.48,1.96,3.70,2.85
2,MYCTH_2293936,NC_016472.1,3344,3817,-,474,1.67,1.46,1.06,0.79,...,1.12,6.95,1.93,2.45,1.76,2.51,0.74,1.04,0.47,1.12
3,MYCTH_2051335,NC_016472.1,5330,6385,-,1056,21.89,17.19,12.24,16.31,...,16.50,15.50,24.33,21.43,24.55,31.20,11.82,15.13,20.12,15.94
4,MYCTH_2121898,NC_016472.1,15458,25533,+,10076,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9287,MYCTH_103797,NC_016478.1,4089438,4091337,+,1900,0.00,0.18,0.97,0.59,...,2.64,4.19,5.11,4.02,2.28,1.89,0.50,1.50,1.87,1.63
9288,MYCTH_2071098,NC_016478.1,4095568,4097284,+,1717,5.25,6.45,4.82,4.46,...,10.09,9.81,13.64,12.17,15.91,15.18,16.09,15.36,13.99,12.10
9289,MYCTH_2114022,NC_016478.1,4098122,4098907,-,786,0.41,0.00,0.00,0.00,...,0.00,0.58,0.09,0.00,0.11,0.00,0.00,0.12,0.14,0.14
9290,MYCTH_2114023,NC_016478.1,4100606,4101293,-,688,0.23,0.25,0.00,0.00,...,0.76,0.81,0.31,1.08,1.21,1.73,0.65,1.71,0.65,0.46


4. 对readcount进行合并

In [6]:
def txt_to_csv(txt_read_path, csv_output_path):
    """Batch-convert tab-separated read-count files to csv files.

    Every regular, non-hidden file in ``txt_read_path`` is read as a
    tab-separated table whose header is on the second line and written
    unchanged to ``csv_output_path`` as ``<name up to the first dot>.csv``.

    Args:
        txt_read_path: Directory containing the read-count txt files.
        csv_output_path: Directory that receives the csv files; it is created
            if it does not exist.

    Returns:
        None when all files were converted, False if any error occurred (the
        error is printed and the remaining files are not processed).
    """
    try:
        os.makedirs(csv_output_path, exist_ok=True)

        # Sorted so the processing order does not depend on the file system.
        for file_name in sorted(os.listdir(txt_read_path)):
            source_file = os.path.join(txt_read_path, file_name)
            # Skip hidden entries (e.g. macOS ".DS_Store") and sub-directories:
            # they are not count tables, and trying to parse one would abort
            # the whole batch through the except clause below.
            if file_name.startswith(".") or not os.path.isfile(source_file):
                continue

            data = pd.read_table(source_file, sep="\t", header=1)
            csv_name = file_name.split(".")[0] + ".csv"
            data.to_csv(os.path.join(csv_output_path, csv_name), sep=",", index=False)

    except Exception as e:
        print(e)
        return False


# Convert every read-count txt file under Counts/ into a csv under Counts_csv/.
txt_to_csv(path + 'Counts/', path + 'Counts_csv/')

In [7]:
# Merge the per-sample read-count csv files. The directory is derived from
# `path` (the same location txt_to_csv wrote to), so switching species only
# requires changing `Keyname`; no per-species line needs to be edited here.
counts_csv_dir = path + 'Counts_csv/'
# Any file can supply the six annotation columns; take the first one in sorted order.
first_counts_csv = sorted(
    name for name in os.listdir(counts_csv_dir) if name.endswith('.csv')
)[0]
df_merge_counts = merge_csv_col(counts_csv_dir + first_counts_csv, counts_csv_dir)

# Clean up the sample names used as column labels, then save and display the table.
df_merge_counts = df_merge_counts.rename(columns=lambda x: x.replace(".bam","").replace('"',''))
df_merge_counts.to_csv(path + Keyname + '_3_Counts.csv', sep=',', index=False)
df_merge_counts

Unnamed: 0,Geneid,Chr,Start,End,Strand,Length,SRR10099850,SRR10099851,SRR10099852,SRR10099853,...,SRR7121120,SRR7121121,SRR7121122,SRR7121123,SRR7121124,SRR7121125,SRR7121126,SRR7121127,SRR7121128,SRR7121129
0,MYCTH_2114025,NC_016472.1,1381,2142,+,762,42,62,69,53,...,52,42,59,68,126,117,23,29,52,47
1,MYCTH_2293935,NC_016472.1,2744,3343,-,600,20,26,14,17,...,13,38,25,28,34,61,10,12,20,16
2,MYCTH_2293936,NC_016472.1,3344,3817,-,474,5,4,3,2,...,6,36,13,14,9,13,4,5,2,5
3,MYCTH_2051335,NC_016472.1,5330,6385,-,1056,146,105,78,92,...,198,179,365,272,281,359,141,163,191,158
4,MYCTH_2121898,NC_016472.1,15458,25533,+,10076,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9287,MYCTH_103797,NC_016478.1,4089438,4091337,+,1900,0,2,11,6,...,57,87,138,92,47,39,11,29,32,29
9288,MYCTH_2071098,NC_016478.1,4095568,4097284,+,1717,57,64,50,41,...,197,184,333,251,296,284,312,269,216,195
9289,MYCTH_2114022,NC_016478.1,4098122,4098907,-,786,2,0,0,0,...,0,5,1,0,1,0,0,1,1,1
9290,MYCTH_2114023,NC_016478.1,4100606,4101293,-,688,1,1,0,0,...,6,6,3,9,9,13,5,12,4,3
