In [None]:
!pip install pysam
!apt-get install samtools

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Reading package lists... Done
Building dependency tree       
Reading state information... Done
samtools is already the newest version (1.10-3).
0 upgraded, 0 newly installed, 0 to remove and 38 not upgraded.


In [None]:
import pysam
import pandas as pd
import numpy as np
import subprocess
import time

In [None]:
df = pd.read_csv('/content/drive/MyDrive/gene wise coverage/gtffile.csv', index_col = 0, low_memory = False)

In [None]:
df.head()

Unnamed: 0,Region,Source,Region_name,Start,End,Description,Region_length
0,1,ensembl_havana,gene,1211340,1214153,"gene_id ""ENSG00000186827""; gene_version ""11""; ...",2813
1,1,ensembl_havana,transcript,1211340,1214153,"gene_id ""ENSG00000186827""; gene_version ""11""; ...",2813
2,1,ensembl_havana,exon,1213983,1214153,"gene_id ""ENSG00000186827""; gene_version ""11""; ...",170
3,1,ensembl_havana,CDS,1213983,1214127,"gene_id ""ENSG00000186827""; gene_version ""11""; ...",144
4,1,ensembl_havana,start_codon,1214125,1214127,"gene_id ""ENSG00000186827""; gene_version ""11""; ...",2


In [6]:
def calculate_gene_coverage(bam_file, df):
    total_time_start = time.time()
    print("Indexing BAM file...")
    index_start_time = time.time()
    subprocess.run(["samtools", "index", bam_file])
    index_end_time = time.time()
    print('Indexing took', index_end_time - index_start_time, "Seconds")
    print('Smile Indexing Done...')
    print("Now  Aligning BAM file...")
    bam_proc_start = time.time()
    bam = pysam.AlignmentFile(bam_file, "rb")
    bam_proc_end = time.time()
    print('BAM aligning took',bam_proc_end - bam_proc_start, 'seconds' )
    print('Sit back and relax we are working...')
    coverage_data = []
    start_count_coverage = time.time()
    
    
    for _, gene in df.iterrows():
        chrom = gene['Region']
        start = gene['Start']
        end = gene['End']
        region = end - start + 1
        coverage = bam.count_coverage(chrom, start, end)
        
        covbases = sum(sum(depth >= 1 for depth in depth_array) for depth_array in coverage)
        covpercent = (covbases / region) * 100

        gene_name = gene['Description'].split(";")[0].split(" ")[1].replace('"', '')
        
        # Adjust covbases and covpercent if necessary
        covbases = np.where(covbases > region, region, covbases)
        covpercent = np.where(covpercent > 100, 100.0, covpercent)
        
        coverage_data.append((gene_name, chrom, start, end, covbases, covpercent))
    end_count_coverage = time.time()
    print('Counting coverage took', end_count_coverage - start_count_coverage, 'seconds')
        
    df = pd.DataFrame(coverage_data, columns=['Gene', 'Chromosome', 'Start', 'End', 'CovBases', 'CovPercent'])
    bam.close()
    
    print("Gene coverage calculation completed.")
    total_time_end = time.time()
    print("Total time taken to execute this entire process :", total_time_end - total_time_start, 'seconds')


    return df



Indexing BAM file...
Indexing took 17.26744771003723 Seconds
Smile Indexing Done...
Now  Aligning BAM file...
BAM aligning took 0.06750249862670898 seconds
Sit back and relax we are working...
Counting coverage took 4646.558126926422 seconds
Gene coverage calculation completed.
Total time taken to execute this entire process : 4666.788514852524 seconds


In [None]:
bam_file = "/path_to_file.bam"

result_df = calculate_gene_coverage(bam_file, df)


In [7]:
result_df.to_csv('your_desired_file_name.csv')

In [8]:
result_df


Unnamed: 0,Gene,Chromosome,Start,End,CovBases,CovPercent
0,ENSG00000186827,1,1211340,1214153,623,22.139303482587064
1,ENSG00000186827,1,1211340,1214153,623,22.139303482587064
2,ENSG00000186827,1,1213983,1214153,102,59.64912280701754
3,ENSG00000186827,1,1213983,1214127,76,52.41379310344828
4,ENSG00000186827,1,1214125,1214127,2,66.66666666666666
...,...,...,...,...,...,...
3276509,ENSG00000277475,KI270713.1,32373,32528,38,24.358974358974358
3276510,ENSG00000277475,KI270713.1,31698,31841,27,18.75
3276511,ENSG00000275405,KI270713.1,21861,22024,163,99.39024390243902
3276512,ENSG00000275405,KI270713.1,21861,22024,163,99.39024390243902
