<a href="https://colab.research.google.com/github/codebuzer/RNA-seq-transcript-coverage-/blob/main/GeneCoverage.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pysam
import time
import pandas as pd
import numpy as np
import subprocess
import pandas as pd
import time
import os

# Gene annotation table. calculate_gene_coverage() reads the 'Region',
# 'Start', 'End', 'Region_name' and 'Description' columns of every row.
df = pd.read_csv(
    "gtffile.csv",
    index_col=0,
    low_memory=False,
)


# Depth cut-offs reported for all four bases combined and for each single base.
_OVERALL_DEPTHS = (1, 5, 10, 15, 20, 25)
_PER_BASE_DEPTHS = (1, 5, 10, 15, 20)
# Order in which count_coverage() hands back its per-base depth arrays.
_BASES = ('A', 'C', 'G', 'T')
# The per-base percentage columns are written in this order (kept so the CSV
# layout stays identical to earlier runs).
_PERCENT_BASE_ORDER = ('G', 'C', 'T', 'A')


def _coverage_columns():
    """Return the output column names, in the order rows are assembled."""
    columns = ['Region_name', 'gene_id_name', 'chrom', 'start', 'end']
    columns += ['covpercent_depth_{}'.format(d) for d in _OVERALL_DEPTHS]
    columns += ['covbases_depth_{}'.format(d) for d in _OVERALL_DEPTHS]
    for base in _PERCENT_BASE_ORDER:
        columns += ['covpercent_depth_{}_{}'.format(base, d) for d in _PER_BASE_DEPTHS]
    for base in _BASES:
        columns += ['covbases_depth_{}_{}'.format(base, d) for d in _PER_BASE_DEPTHS]
    columns += ['coverage_{}_all'.format(base) for base in _BASES]
    columns += ['num_reads_mapped_{}'.format(base) for base in _BASES]
    return columns


def _positions_at_depth(depths, cutoffs):
    """Count, for each cut-off, the positions whose depth is >= that cut-off."""
    return [int((depths >= cutoff).sum()) for cutoff in cutoffs]


def _gene_coverage_row(bam, gene):
    """Build one output row (ordered like _coverage_columns()) for one gene."""
    chrom = gene['Region']
    start = gene['Start']
    end = gene['End']
    region_name = gene['Region_name']
    # First attribute of the description looks like: gene_id "XYZ"; keep XYZ.
    gene_id_name = gene['Description'].split(";")[0].split(" ")[1].replace('"', '')

    # Start/End are treated as inclusive, so the region spans this many bases.
    region = end - start + 1

    # pysam expects 0-based, half-open coordinates. Shifting the inclusive
    # start down by one makes the returned arrays exactly `region` long
    # (assumes the table holds 1-based GTF coordinates).
    coverage = bam.count_coverage(chrom, max(start - 1, 0), end)
    base_depths = [np.asarray(track, dtype=np.int64) for track in coverage]

    # Depth of a position is the sum over the four bases; counting each base
    # separately and adding the counts would count one position several times.
    total_depth = np.sum(base_depths, axis=0)
    overall_counts = _positions_at_depth(total_depth, _OVERALL_DEPTHS)

    base_counts = {
        base: _positions_at_depth(depths, _PER_BASE_DEPTHS)
        for base, depths in zip(_BASES, base_depths)
    }

    row = [region_name, gene_id_name, chrom, start, end]
    row += [(count / region) * 100 for count in overall_counts]
    row += overall_counts
    for base in _PERCENT_BASE_ORDER:
        row += [(count / region) * 100 for count in base_counts[base]]
    for base in _BASES:
        row += base_counts[base]
    # Raw per-position depth arrays, exactly as returned by pysam.
    row.extend(coverage)
    # Sum of the per-position depths of each base over the region.
    row += [int(depths.sum()) for depths in base_depths]
    return tuple(row)


def calculate_gene_coverage(bam_file, df):
    """Compute per-gene coverage statistics for one BAM file.

    The BAM file is indexed with ``samtools index`` and every row of ``df``
    is then queried with ``count_coverage``. For each gene the result holds
    the number and percentage of positions reaching each depth cut-off, both
    for all bases combined and for A, C, G and T individually, plus the raw
    per-base depth arrays and their sums.

    Args:
        bam_file: Path (or bare file name) of the BAM file to analyse.
        df: Annotation table with the columns 'Region', 'Start', 'End',
            'Region_name' and 'Description'.

    Returns:
        A DataFrame with one row per row of ``df``. The same table is written
        next to the BAM file as
        ``<name up to the first dot>updated_gene_wise_coverage.csv``.
    """
    # Index the BAM file so regions can be fetched.
    start_index = time.time()
    index_result = subprocess.run(["samtools", "index", bam_file])
    end_index = time.time()
    if index_result.returncode == 0:
        print('Indexing Done For', bam_file)
    else:
        # Not fatal here: a usable index may already exist. If none does,
        # the coverage query below will fail.
        print('samtools index exited with status', index_result.returncode,
              'for', bam_file)
    print('Indexing takes', end_index - start_index, 'secs')

    start_align = time.time()
    # The context manager closes the file even if a gene lookup raises.
    with pysam.AlignmentFile(bam_file, "rb") as bam:
        end_align = time.time()
        print('Bam alignment take:', end_align - start_align, 'secs')

        coverage_data = [_gene_coverage_row(bam, gene) for _, gene in df.iterrows()]

    coverage_df = pd.DataFrame(coverage_data, columns=_coverage_columns())

    print("Gene coverage calculation completed.")

    # Cut the name at its first dot, but only within the file name itself, so
    # that dots in directory names do not truncate the path.
    out_dir, bam_name = os.path.split(bam_file)
    output_csv = os.path.join(
        out_dir, bam_name.split('.')[0] + 'updated_gene_wise_coverage.csv')
    coverage_df.to_csv(output_csv)

    return coverage_df

directory = "/lustre/priti.devi/abuzar/DR/BAM/DR"
bam_files = [file_name for file_name in os.listdir(directory) if file_name.endswith(".bam")]
for bam_file in bam_files:
    bam_path = os.path.join(directory, bam_file)

    # Pass the full path so the BAM file is found regardless of the current
    # working directory; the annotation table is the DataFrame loaded above.
    result_df = calculate_gene_coverage(bam_path, df)