In [2]:
import pysam
import time
import pandas as pd
import numpy as np
import subprocess
import pandas as pd

In [3]:
df = pd.read_csv("gtffile.csv", index_col = 0, low_memory = False)

In [4]:
def calculate_gene_coverage(bam_file, df):
    total_time_start = time.time()
    print("Indexing", {bam_file})
    index_start_time = time.time()
    subprocess.run(["samtools", "index", bam_file])
    index_end_time = time.time()
    print('Indexing take', index_end_time - index_start_time, "Seconds")
    print('Indexing Done!')
    print("Now Aligning", {bam_file})
    bam_proc_start = time.time()
    bam = pysam.AlignmentFile(bam_file, "rb")
    bam_proc_end = time.time()
    print({bam_file}, 'Aligning take',bam_proc_end - bam_proc_start, 'seconds' )
    print('I know you are running out of time but keep calm and Sit back we are working...')
    coverage_data = []
    start_count_coverage = time.time()
    print(f"Processing {bam_file}")
    
    
    for _, gene in df.iterrows():
        chrom = gene['Region']
        start = gene['Start']
        end = gene['End']
        region = end - start + 1
        coverage = bam.count_coverage(chrom, start, end)
        
        covbases = sum(sum(depth >= 1 for depth in depth_array) for depth_array in coverage)
        covpercent = (covbases / region) * 100

        gene_name = gene['Description'].split(";")[0].split(" ")[1].replace('"', '')
        
        # Adjust covbases and covpercent if necessary
        covbases = np.where(covbases > region, region, covbases)
        covpercent = np.where(covpercent > 100, 100.0, covpercent)
        
        coverage_data.append((gene_name, chrom, start, end, covbases, covpercent))
    end_count_coverage = time.time()
    print('Counting coverage took', end_count_coverage - start_count_coverage, 'seconds')
        
    df = pd.DataFrame(coverage_data, columns=['Gene', 'Chromosome', 'Start', 'End', 'CovBases', 'CovPercent'])
    bam.close()
    
    print("Gene coverage calculation completed.")
    total_time_end = time.time()
    print("Total time taken to execute this entire process :", total_time_end - total_time_start, 'seconds')
    df.to_csv(bam_file.split('.')[0] + 'OM_gene_wise_coverage.csv')


    return df



In [5]:
bam_file ='/home/abuzarkhan_123/Gene Wise Coverage Using Pysam/done/IGIB1130806804.bam'

In [7]:
df_om = calculate_gene_coverage(bam_file,df)

Indexing {'/home/abuzarkhan_123/Gene Wise Coverage Using Pysam/done/IGIB1130806804.bam'}
Indexing take 0.643240213394165 Seconds
Indexing Done!
Now Aligning {'/home/abuzarkhan_123/Gene Wise Coverage Using Pysam/done/IGIB1130806804.bam'}
{'/home/abuzarkhan_123/Gene Wise Coverage Using Pysam/done/IGIB1130806804.bam'} Aligning take 0.007635354995727539 seconds
I know you are running out of time but keep calm and Sit back we are working...
Processing /home/abuzarkhan_123/Gene Wise Coverage Using Pysam/done/IGIB1130806804.bam
Counting coverage took 1855.3561754226685 seconds
Gene coverage calculation completed.
Total time taken to execute this entire process : 1857.587964296341 seconds


In [9]:
df_om.to_csv('IGIB1130806804_gene_Wise_coverage.csv')

In [15]:
#bam_file = "IGIB1130412732V.bam"
#result_df = calculate_gene_coverage(bam_file, df)
#result_df.to_csv('IGIB1130412732V_gene_wise_coverage.csv')
#

In [12]:
import os
directory = "/home/abuzarkhan_123/Gene Wise Coverage Using Pysam/done"
bam_files = [file_name for file_name in os.listdir(directory) if file_name.endswith(".bam")]




In [13]:
bam_files 

['IGIB1130371022V.bam',
 'IGIB113094002V.bam',
 'IGIB113050041V.bam',
 'IGIB1130519238V.bam',
 'IGIB113023885V.bam',
 'IGIB1130111V.bam']

In [None]:
for bam_file in bam_files:
    bam_path = os.path.join(directory, bam_file)

    # Call the function x with the BAM and GTF file paths
    result_df = calculate_gene_coverage(bam_file, df)


Indexing {'IGIB1130371022V.bam'}
Indexing take 8.262508869171143 Seconds
Indexing Done!
Now Aligning {'IGIB1130371022V.bam'}
{'IGIB1130371022V.bam'} Aligning take 0.004045009613037109 seconds
I know you are running out of time but keep calm and Sit back we are working...
Processing IGIB1130371022V.bam


In [20]:
result_df

Unnamed: 0,Gene,Chromosome,Start,End,CovBases,CovPercent
0,ENSG00000186827,1,1211340,1214153,583,20.717839374555794
1,ENSG00000186827,1,1211340,1214153,583,20.717839374555794
2,ENSG00000186827,1,1213983,1214153,0,0.0
3,ENSG00000186827,1,1213983,1214127,0,0.0
4,ENSG00000186827,1,1214125,1214127,0,0.0
...,...,...,...,...,...,...
3276509,ENSG00000277475,KI270713.1,32373,32528,127,81.41025641025641
3276510,ENSG00000277475,KI270713.1,31698,31841,0,0.0
3276511,ENSG00000275405,KI270713.1,21861,22024,141,85.97560975609755
3276512,ENSG00000275405,KI270713.1,21861,22024,141,85.97560975609755


In [19]:
os.getcwd()

'/home/abuzarkhan_123/Gene Wise Coverage Using Pysam/done'