In [1]:
import pandas as pd
import pysam
from intervaltree import Interval, IntervalTree
import os

In [2]:
# --- Configuration: input/output locations used by the cells below ---
# CSV holding the transcript table that is loaded into `df_transcript`.
transcript_file_path = "/data/private/pdutta/PSB_Data/all_tc_input.csv"
# Directory that receives one `merged_<chromosome>.csv` file per chromosome.
output_path = "/data/private/pdutta/VCF_data/MergedTransVCF"
# Directory whose entries are listed and read as dbSNP CSV files.
dbsnp_path = "/data/private/pdutta/VCF_data/Merged_DBSNP_Chrwise"

In [3]:
# Load the transcript table, then prefix every chromosome label with "chr"
# (the displayed frame shows labels such as "chrY" and "chr18" afterwards).
df_transcript = pd.read_csv(transcript_file_path, sep=",")
df_transcript = df_transcript.assign(
    chromosome_name="chr" + df_transcript["chromosome_name"].astype(str)
)
df_transcript

Unnamed: 0.1,Unnamed: 0,transcription_start_site,chromosome_name,transcript_start,transcript_end,strand,ensembl_gene_id,ensembl_transcript_id
0,7,22441458,chrY,22439593,22441458,-1,ENSG00000131538,ENST00000253838
1,8,6390431,chrY,6390431,6411564,1,ENSG00000129816,ENST00000250776
2,9,9774289,chrY,9753156,9774289,-1,ENSG00000129845,ENST00000250805
3,10,22144966,chrY,22144966,22146831,1,ENSG00000131548,ENST00000253848
4,11,11981507,chr18,11981507,12030877,1,ENSG00000141401,ENST00000269159
...,...,...,...,...,...,...,...,...
252697,274574,181732566,chr1,181732566,181767884,1,ENSG00000198216,ENST00000700189
252698,274575,181793668,chr1,181793668,181813262,1,ENSG00000198216,ENST00000700190
252699,274576,182329213,chr1,182278254,182329213,-1,ENSG00000228918,ENST00000702781
252700,274577,84621034,chr1,84614068,84621034,-1,ENSG00000289881,ENST00000701339


In [4]:
# Number of transcript rows carrying each chromosome label.
(
    df_transcript
    .groupby("chromosome_name")["chromosome_name"]
    .count()
)

chromosome_name
chr1     22477
chr10     9151
chr11    14817
chr12    13744
chr13     4678
chr14     9237
chr15     9118
chr16    11632
chr17    14689
chr18     4710
chr19    14424
chr2     18869
chr20     6063
chr21     3283
chr22     5353
chr3     15975
chr4     10361
chr5     11969
chr6     11961
chr7     12201
chr8     10300
chr9      8748
chrX      8102
chrY       840
Name: chromosome_name, dtype: int64

In [5]:
# Start every transcript with an empty 'mutations' string; the annotation
# cell further down fills it in.
df_transcript = df_transcript.assign(mutations='')

# One group of transcript rows per chromosome label.
transcripts_by_chromosome = df_transcript.groupby(by='chromosome_name')

In [6]:
 # Sanity check: show the transcript rows that were grouped under 'chr1'.
 transcripts_by_chromosome.get_group('chr1')

Unnamed: 0.1,Unnamed: 0,transcription_start_site,chromosome_name,transcript_start,transcript_end,strand,ensembl_gene_id,ensembl_transcript_id,mutations
322,337,3069203,chr1,3069203,3438621,1,ENSG00000142611,ENST00000270722,
391,406,12277121,chr1,12277121,12510361,1,ENSG00000048707,ENST00000011700,
466,481,27155125,chr1,27098809,27155125,-1,ENSG00000090020,ENST00000263980,
479,494,28233029,chr1,28200278,28233029,-1,ENSG00000126698,ENST00000263697,
484,499,28259518,chr1,28259518,28282491,1,ENSG00000130766,ENST00000253063,
...,...,...,...,...,...,...,...,...,...
252697,274574,181732566,chr1,181732566,181767884,1,ENSG00000198216,ENST00000700189,
252698,274575,181793668,chr1,181793668,181813262,1,ENSG00000198216,ENST00000700190,
252699,274576,182329213,chr1,182278254,182329213,-1,ENSG00000228918,ENST00000702781,
252700,274577,84621034,chr1,84614068,84621034,-1,ENSG00000289881,ENST00000701339,


In [None]:
# Annotate every transcript with the names of the dbSNP records that overlap
# it, one dbSNP file (= one chromosome) at a time, and write one merged CSV
# per chromosome into `output_path`.


def _chromosome_from_filename(file_name, known_chromosomes):
    """Return the chromosome label encoded in a dbSNP file name.

    The part of the name before the first '.' is split on '_' and the first
    token that is a known chromosome label (e.g. 'chr1') is returned, so both
    "chr1_<anything>.csv" and "DBSNP_chr1.csv" resolve to 'chr1'.

    Returns None when no token matches, which lets the caller skip files that
    do not belong to a chromosome present in the transcript table.
    """
    stem = file_name.split('.')[0]
    return next((token for token in stem.split('_') if token in known_chromosomes), None)


def _build_snp_tree(df_snp):
    """Build an IntervalTree from the 'start', 'end' and 'name' columns of a dbSNP frame.

    Each record becomes the half-open interval [start, end) carrying its
    'name' as payload.
    NOTE(review): whether 'end' is meant to be inclusive depends on the
    coordinate convention of the dbSNP export -- confirm against the data.
    """
    tree = IntervalTree()
    for start, end, name in zip(df_snp['start'], df_snp['end'], df_snp['name']):
        # IntervalTree does not accept zero-length intervals, so point records
        # (start == end) are widened by one position.
        tree.addi(start, end + 1 if start == end else end, name)
    return tree


# Make sure the destination exists before the first to_csv call.
os.makedirs(output_path, exist_ok=True)
known_chromosomes = set(transcripts_by_chromosome.groups)

# Sorted so that runs process (and log) the files in a reproducible order.
for snp_file_name in sorted(os.listdir(dbsnp_path)):
    chr_name = _chromosome_from_filename(snp_file_name, known_chromosomes)
    if chr_name is None:
        # Either not a per-chromosome file or a chromosome without transcripts.
        print(f"Skipping {snp_file_name}: no chromosome with transcripts in its name")
        continue

    snp_file_path = os.path.join(dbsnp_path, snp_file_name)
    print(snp_file_path)
    print("DBSNP Chrosome", chr_name)

    # Load the dbSNP records of this chromosome and index them by position.
    tree = _build_snp_tree(pd.read_csv(snp_file_path))

    chr_transcripts = transcripts_by_chromosome.get_group(chr_name)

    # For every transcript, collect the names of all overlapping dbSNP records.
    # The matches come back as a set, so they are sorted to keep the joined
    # string stable between runs.
    mutations = [
        ', '.join(str(match.data) for match in sorted(tree[start:end]))
        for start, end in zip(chr_transcripts['transcript_start'],
                              chr_transcripts['transcript_end'])
    ]
    # get_group keeps the original row labels, so they address the same rows
    # in the full transcript table.
    df_transcript.loc[chr_transcripts.index, 'mutations'] = mutations

    # Persist only the rows of the chromosome that was just annotated.
    df_transcript.loc[df_transcript['chromosome_name'] == chr_name].to_csv(
        os.path.join(output_path, f'merged_{chr_name}.csv'), index=False
    )