In [1]:
import os, glob
import pandas as pd
import pysam
import numpy as np
import pickle

In [2]:
cancer_type = "Brain"
intersected_base_path = "/data/private/pdutta_new/GDC_cancer_wise/{}/Generated_files/Intersected_files/100bp_TFBS/TFBS_test".format(cancer_type)
reference_genome_path="/data/projects/Resources/Gencode_genome_annotation/GRCh38.primary_assembly.genome.fa"

In [3]:
output_path = "/data/projects/GDC_Cancer_Wise/New_data/{}/Generated_files/DNABERT_Data/Somatic/TFBS_300bp".format(cancer_type)
if not os.path.exists(output_path):
    os.makedirs(output_path)

In [4]:
top_models_df= pd.read_csv("/home/campus.stonybrook.edu/pdutta/Github/Postdoc/DNABERT_data_processing/TFBS/important_tfbs.txt", sep="\t", header=None)
top_models_df

Unnamed: 0,0
0,FOS
1,H2AFZ
2,H2AK5ac
3,H2BK12ac
4,H2BK5ac
5,H3F3A
6,H3K14ac
7,H3K27ac
8,H3K27me3
9,H3K36me3


In [5]:
top_models_df = pd.DataFrame({0: ['CBFA2T3', 'POLR2A']})

top_models_df

Unnamed: 0,0
0,CBFA2T3
1,POLR2A


In [6]:
def seq2kmer(seq):
    """
    Convert original sequence to kmers
    
    Arguments:
    seq -- str, original sequence.
    k -- int, kmer of length k specified.
    
    Returns:
    kmers -- str, kmers separated by space
    """
    k=6
    kmer = [seq[x:x+k] for x in range(len(seq)+1-k)]
    kmers = " ".join(kmer)
    return kmers

In [7]:
def get_sequences(df, reference_fasta):
    data = []
    for idx, row in df.iterrows():
        #print(row)
        chrom = row[0]
        ref_start = row[1]
        ref_end = row[2]
        variant_start = row['START_POS']
        variant_end = row['END_POS']
        ref_nucleotide = row['REF']
        alt = row["ALT"]
        
        # Adjust for 0-based indexing in python
        variant_pos_start = variant_start - ref_start
        variant_pos_end = variant_end - ref_start
        #print(ref_nucleotide , alt, variant_pos_start, variant_pos_end)
        
        
        # Get reference sequence
        #print(chrom)
        ref_seq = reference_fasta.fetch(chrom, ref_start, ref_end)
        #print(ref_seq)
        
        # Handle insertion and deletion to get the correct alt sequence
        # Identify if the variant is an insertion or deletion
        if len(ref_nucleotide) < len(alt):  # Insertion
            delete_size =  len(alt) - len(ref_nucleotide)
            #print(variant_pos_start, variant_pos_end ,delete_size)
            alt_seq = ref_seq[:variant_pos_start] + alt + ref_seq[variant_pos_end:len(ref_seq) - delete_size]

        elif len(ref_nucleotide) > len(alt):  # Deletion
            insert_size = len(ref_nucleotide) - len(alt)
            #print(insert_size)
            extra_bases = reference_fasta.fetch(chrom, ref_end, ref_end + insert_size)
            #print(extra_bases)
            alt_seq = ref_seq[:variant_pos_start] + alt + ref_seq[variant_pos_end:] + extra_bases

        else:  # SNV
            alt_seq = ref_seq[:variant_pos_start] + alt + ref_seq[variant_pos_end:]
        # print(ref_seq)
        # print(alt_seq)
        # input()


        data.append({
            'chr': chrom,
            #'strand': row['strand'],
            #'Transcript_ID': row['transcript_id'],
            'Chip_Seq_start': row[1],
            'Chip_Seq_end': row[2],
            'varinat_start': variant_start,
            'variant_end': variant_end,
            'ref_neucleotide': ref_nucleotide,
            'alternative_neucleotide': alt,
            'reference_seq': ref_seq,
            'alt_seq': alt_seq
        })
            
    
    # Convert the list of dictionaries to a DataFrame
    new_df = pd.DataFrame(data)
    # print(new_df.shape)
    # print(new_df)
    new_df = new_df.drop_duplicates().reset_index()
    #print(new_df.shape)
    #print("&*")
    data = []
    merged_list = list(zip(new_df['reference_seq'], new_df['alt_seq']))
    merged_list = [item.upper() for tup in merged_list for item in tup]
    #print(merged_list)
    kmer_lst = list(map(seq2kmer, merged_list))
    df_kmer = pd.DataFrame(kmer_lst, columns=['Sequence'])
    df_kmer['Label'] = np.random.choice([0, 1], size=len(df_kmer))
    #print(df_kmer.shape)
    return new_df, df_kmer

In [20]:
# def apply_mutations_sorted(ref_seq, starts, ends, refs, alts):
#     mutations = sorted(zip(starts, ends, refs, alts), key=lambda x: x[0])  # Sort by start position
#     seq_list = list(ref_seq)
#     print(mutations)
#     print(seq_list)
#     # Apply each mutation
#     for start, end, ref, alt in mutations:
#         pos = start - 1  # Adjust for 0-based index
#         print(seq_list[pos:pos+len(ref)])
#         print(pos)
#         if ''.join(seq_list[pos:pos+len(ref)]) == ref:  # Check if the reference matches
#             seq_list[pos:pos+len(ref)] = list(alt)  # Replace with the alternative
#         else:
#             # If the reference doesn't match, log an error or handle it accordingly
#             print(f"Expected ref '{ref}' at position {start}, but found '{''.join(seq_list[pos:pos+len(ref)])}'")
    
#     print(''.join(seq_list))
#     input()
#     return ''.join(seq_list)  # Convert list back to string

def apply_mutations_absolute(ref_seq, chip_start, starts, ends, refs, alts):
    mutations = sorted(zip(starts, ends, refs, alts), key=lambda x: x[0])  # Sort by start position
    seq_list = list(ref_seq)
    offset = chip_start  # This is the absolute starting position of the sequence on the chromosome
    print(mutations, seq_list, offset)
    input()

    # Apply each mutation
    for start, end, ref, alt in mutations:
        rel_pos = start - offset  # Calculate the relative position in the sequence
        print(rel_pos)
        if ''.join(seq_list[rel_pos:rel_pos+len(ref)]) == ref:  # Check if the reference matches
            seq_list[rel_pos:rel_pos+len(ref)] = list(alt)  # Replace with the alternative
        else:
            # If the reference doesn't match, log an error or handle it accordingly
            print(f"Expected ref '{ref}' at position {rel_pos}, but found '{''.join(seq_list[rel_pos:rel_pos+len(ref)])}'")
    print(''.join(seq_list))
    input()

    return ''.join(seq_list)  # Convert list back to string

In [21]:
## Load the reference file
reference_fasta = pysam.FastaFile(reference_genome_path)
print("#####The reference file is loaded######")

#####The reference file is loaded######


In [None]:
missing_files = []



for index, row in top_models_df.iterrows():
    intersected_data = f"{intersected_base_path}/{row[0]}/intersected_vcf_data.pkl"
    print(intersected_data)
    
    try:
        with open(intersected_data, "rb") as file:
            loaded_dictionary = pickle.load(file)
        print(f"****All the VCF files are loaded for {row[0]}***") 

        dnabert_raw_data = {}
        for key, value in loaded_dictionary.items():
            new_df, df_kmer = get_sequences(value, reference_fasta)
            dnabert_raw_data[key] = new_df
            display(df_kmer)
            display(new_df)
            grouped_df = new_df.groupby(['chr', 'Chip_Seq_start', 'Chip_Seq_end']).agg({
                                    'varinat_start': lambda x: list(set(x)),  # Get unique starts
                                    'variant_end': lambda x: list(set(x)),    # Get unique ends
                                    'ref_neucleotide': lambda x: list(set(x)),  # Unique reference nucleotides
                                    'alternative_neucleotide': lambda x: list(set(x)),  # Unique alternative nucleotides
                                    'reference_seq': 'first',  # Assuming you might want just the first sequence
                                    'alt_seq':  lambda x: list(set(x))  # Assuming you might want all the alt sequence
                                }).reset_index()

            display(grouped_df)
            # Apply mutations
            # Assuming grouped_df is already created and sorted properly
            # Modify your grouped DataFrame construction and apply the new function
            grouped_df['mutated_sequence'] = grouped_df.apply(lambda row: apply_mutations_absolute(
                row['reference_seq'], row['Chip_Seq_start'], row['varinat_start'],
                row['variant_end'], row['ref_neucleotide'], row['alternative_neucleotide']), axis=1)
            display(grouped_df)
            input()

            out_folder_path = f"{output_path}/{row[0]}/Patient_wise/{key}"
            if not os.path.exists(out_folder_path):
                os.makedirs(out_folder_path)
            df_kmer.to_csv(out_folder_path + "/dev.tsv", sep="\t", index=False)
        
        with open(f"{output_path}/{row[0]}/raw_vcf_data.pkl", "wb") as file:
            pickle.dump(dnabert_raw_data, file)

    except FileNotFoundError:
        print(f"File not found for {row[0]}, skipping...")
        missing_files.append(row[0])

# Optionally, save the list of missing files/tags to a text file
missing_files_path = f"{output_path}/missing_files.txt"
with open(missing_files_path, "w") as f:
    for tag in missing_files:
        f.write(f"{tag}\n")

print(f"Missing files recorded at {missing_files_path}")

/data/private/pdutta_new/GDC_cancer_wise/Brain/Generated_files/Intersected_files/100bp_TFBS/TFBS_test/CBFA2T3/intersected_vcf_data.pkl
****All the VCF files are loaded for CBFA2T3***


Unnamed: 0,Sequence,Label
0,TGCATC GCATCC CATCCT ATCCTC TCCTCA CCTCAG CTCA...,0
1,TGCATT GCATTC CATTCT ATTCTC TTCTCA TCTCAG CTCA...,0
2,GGCCGA GCCGAG CCGAGT CGAGTC GAGTCT AGTCTC GTCT...,0
3,GGCCGA GCCGAG CCGAGT CGAGTC GAGTCT AGTCTC GTCT...,1
4,GGCCGA GCCGAG CCGAGT CGAGTC GAGTCT AGTCTC GTCT...,0
...,...,...
215,CTAATC TAATCT AATCTT ATCTTG TCTTGA CTTGAG TTGA...,1
216,CTAATC TAATCT AATCTT ATCTTG TCTTGA CTTGAG TTGA...,0
217,CTAATC TAATCT AATCTT ATCTTG TCTTGA CTTGAG TTGA...,0
218,GGCGAT GCGATG CGATGG GATGGT ATGGTA TGGTAG GGTA...,1


Unnamed: 0,index,chr,Chip_Seq_start,Chip_Seq_end,varinat_start,variant_end,ref_neucleotide,alternative_neucleotide,reference_seq,alt_seq
0,0,chr1,54707858,54707958,54707863,54707864,C,T,TGCATCCTCAGATGGATCCTCAGATCCTTGCCCTCCCACTGTCACC...,TGCATTCTCAGATGGATCCTCAGATCCTTGCCCTCCCACTGTCACC...
1,1,chr5,415490,415590,415508,415509,G,T,GGCCGAGTCTCCCTGGTCGGGTGGGAGGCCTAGGGGCCGAGTCTGC...,GGCCGAGTCTCCCTGGTCTGGTGGGAGGCCTAGGGGCCGAGTCTGC...
2,2,chr5,415490,415590,415510,415511,G,C,GGCCGAGTCTCCCTGGTCGGGTGGGAGGCCTAGGGGCCGAGTCTGC...,GGCCGAGTCTCCCTGGTCGGCTGGGAGGCCTAGGGGCCGAGTCTGC...
3,3,chr5,415490,415590,415564,415565,A,G,GGCCGAGTCTCCCTGGTCGGGTGGGAGGCCTAGGGGCCGAGTCTGC...,GGCCGAGTCTCCCTGGTCGGGTGGGAGGCCTAGGGGCCGAGTCTGC...
4,4,chr16,89068861,89068961,89068902,89068903,A,G,TGGTGATGATGATGGTGGTGATGATGATGGTGGTGATGGTGATGAT...,TGGTGATGATGATGGTGGTGATGATGATGGTGGTGATGGTGGTGAT...
...,...,...,...,...,...,...,...,...,...,...
105,105,chr19,41789861,41789962,41789887,41789888,G,A,AGAAAGAAAGAAAGAAAGAAAGAAAAGAAAGAAAGAAAGAAAGAAA...,AGAAAGAAAGAAAGAAAGAAAGAAAAAAAAGAAAGAAAGAAAGAAA...
106,106,chr13,57431125,57431226,57431173,57431174,C,T,CTGAAGGAACATACCGTAAGATAAGAAGAATCATATATGTCAGACC...,CTGAAGGAACATACCGTAAGATAAGAAGAATCATATATGTCAGACC...
107,107,chr20,29508463,29508564,29508498,29508499,G,A,CTAATCTTGAGCTCCTGGCCTCTAGTGATTCTCCTGCCTTGGGCTC...,CTAATCTTGAGCTCCTGGCCTCTAGTGATTCTCCTACCTTGGGCTC...
108,108,chr20,29508463,29508564,29508500,29508501,C,G,CTAATCTTGAGCTCCTGGCCTCTAGTGATTCTCCTGCCTTGGGCTC...,CTAATCTTGAGCTCCTGGCCTCTAGTGATTCTCCTGCGTTGGGCTC...


Unnamed: 0,chr,Chip_Seq_start,Chip_Seq_end,varinat_start,variant_end,ref_neucleotide,alternative_neucleotide,reference_seq,alt_seq
0,chr1,16732666,16732766,[16732746],[16732747],[G],[T],GGAAGAAGAAAAAAACCTGTGGCAATTGGAACATAGTTTATTCTTT...,[GGAAGAAGAAAAAAACCTGTGGCAATTGGAACATAGTTTATTCTT...
1,chr1,54707858,54707958,[54707863],[54707864],[C],[T],TGCATCCTCAGATGGATCCTCAGATCCTTGCCCTCCCACTGTCACC...,[TGCATTCTCAGATGGATCCTCAGATCCTTGCCCTCCCACTGTCAC...
2,chr1,111040309,111040410,[111040379],[111040380],[C],[T],TTGGGAGGCCAAGGCGGGCGGATCACGAGGTCAGGAGATCGAGACC...,[TTGGGAGGCCAAGGCGGGCGGATCACGAGGTCAGGAGATCGAGAC...
3,chr1,121657530,121657631,[121657570],[121657571],[G],[C],AGAGAGGTCCAAATATCCACCTGCAGAGTCTACAAAAAGTGTGTTT...,[AGAGAGGTCCAAATATCCACCTGCAGAGTCTACAAAAAGTCTGTT...
4,chr1,123086175,123086276,"[123086218, 123086260, 123086246]","[123086219, 123086261, 123086247]","[T, A]","[G, C, A]",ACAGAGAGCAGACTTGAAACACTCTTTTTGTGGAATTTGCAAGTGG...,[ACAGAGAGCAGACTTGAAACACTCTTTTTGTGGAATTTGCAAGTG...
...,...,...,...,...,...,...,...,...,...
79,chr9,106762858,106762959,[106762872],[106762873],[T],[C],TTAGGAGAAAGAAGTAGGGGGAAGCTTAGTACTTGGAGGGAGCAGT...,[TTAGGAGAAAGAAGCAGGGGGAAGCTTAGTACTTGGAGGGAGCAG...
80,chr9,130633198,130633299,[130633242],[130633243],[T],[C],CTTGCATGCTCTGAAATTGTATAGTTGTGTTTCTTCCCTTTTTTTC...,[CTTGCATGCTCTGAAATTGTATAGTTGTGTTTCTTCCCTTTTTTC...
81,chrX,1348653,1348754,[1348693],[1348694],[T],[C],CTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTTTCTTT...,[CTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTCTT...
82,chrX,1570146,1570247,[1570207],[1570208],[C],[A],TGCAGTGAGCCAAGTTTGCACTGATGCACTCCAGCCTGGGTGACAG...,[TGCAGTGAGCCAAGTTTGCACTGATGCACTCCAGCCTGGGTGACA...


[(16732746, 16732747, 'G', 'T')] ['G', 'G', 'A', 'A', 'G', 'A', 'A', 'G', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'C', 'C', 'T', 'G', 'T', 'G', 'G', 'C', 'A', 'A', 'T', 'T', 'G', 'G', 'A', 'A', 'C', 'A', 'T', 'A', 'G', 'T', 'T', 'T', 'A', 'T', 'T', 'C', 'T', 'T', 'T', 'A', 'A', 'G', 'T', 'T', 'G', 'A', 'G', 'G', 'G', 'G', 'C', 'G', 'C', 'A', 'T', 'G', 'A', 'C', 'T', 'C', 'C', 'A', 'C', 'C', 'T', 'T', 'G', 'C', 'C', 'T', 'G', 'G', 'C', 'G', 'T', 'G', 'G', 'C', 'C', 'T', 'T', 'A', 'G', 'G', 'T', 'C', 'T', 'C', 'G', 'T', 'T', 'T', 'A'] 16732666


 


80
GGAAGAAGAAAAAAACCTGTGGCAATTGGAACATAGTTTATTCTTTAAGTTGAGGGGCGCATGACTCCACCTTGCCTGGCTTGGCCTTAGGTCTCGTTTA


 


[(54707863, 54707864, 'C', 'T')] ['T', 'G', 'C', 'A', 'T', 'C', 'C', 'T', 'C', 'A', 'G', 'A', 'T', 'G', 'G', 'A', 'T', 'C', 'C', 'T', 'C', 'A', 'G', 'A', 'T', 'C', 'C', 'T', 'T', 'G', 'C', 'C', 'C', 'T', 'C', 'C', 'C', 'A', 'C', 'T', 'G', 'T', 'C', 'A', 'C', 'C', 'C', 'C', 'T', 'C', 'A', 'G', 'T', 'G', 'G', 'A', 'T', 'C', 'T', 'A', 'A', 'G', 'A', 'T', 'T', 'A', 'A', 'G', 'T', 'A', 'A', 'C', 'T', 'T', 'C', 'T', 'G', 'G', 'T', 'C', 'T', 'G', 'G', 'T', 'G', 'G', 'G', 'A', 'G', 'A', 'T', 'A', 'A', 'A', 'A', 'T', 'T', 'C', 'A', 'T'] 54707858


 


5
TGCATTCTCAGATGGATCCTCAGATCCTTGCCCTCCCACTGTCACCCCTCAGTGGATCTAAGATTAAGTAACTTCTGGTCTGGTGGGAGATAAAATTCAT


 


[(111040379, 111040380, 'C', 'T')] ['T', 'T', 'G', 'G', 'G', 'A', 'G', 'G', 'C', 'C', 'A', 'A', 'G', 'G', 'C', 'G', 'G', 'G', 'C', 'G', 'G', 'A', 'T', 'C', 'A', 'C', 'G', 'A', 'G', 'G', 'T', 'C', 'A', 'G', 'G', 'A', 'G', 'A', 'T', 'C', 'G', 'A', 'G', 'A', 'C', 'C', 'C', 'T', 'C', 'C', 'T', 'G', 'G', 'C', 'T', 'A', 'A', 'C', 'A', 'C', 'G', 'G', 'T', 'G', 'A', 'A', 'A', 'C', 'C', 'C', 'C', 'G', 'T', 'C', 'T', 'C', 'T', 'A', 'C', 'T', 'A', 'A', 'G', 'A', 'A', 'T', 'A', 'C', 'A', 'A', 'A', 'A', 'A', 'A', 'T', 'T', 'A', 'G', 'C', 'C', 'G'] 111040309


 


70
TTGGGAGGCCAAGGCGGGCGGATCACGAGGTCAGGAGATCGAGACCCTCCTGGCTAACACGGTGAAACCCTGTCTCTACTAAGAATACAAAAAATTAGCCG


 


[(121657570, 121657571, 'G', 'C')] ['A', 'G', 'A', 'G', 'A', 'G', 'G', 'T', 'C', 'C', 'A', 'A', 'A', 'T', 'A', 'T', 'C', 'C', 'A', 'C', 'C', 'T', 'G', 'C', 'A', 'G', 'A', 'G', 'T', 'C', 'T', 'A', 'C', 'A', 'A', 'A', 'A', 'A', 'G', 'T', 'G', 'T', 'G', 'T', 'T', 'T', 'C', 'A', 'A', 'A', 'A', 'C', 'T', 'G', 'C', 'T', 'C', 'C', 'A', 'C', 'C', 'C', 'A', 'A', 'A', 'G', 'G', 'A', 'A', 'T', 'G', 'T', 'T', 'C', 'A', 'G', 'C', 'T', 'C', 'T', 'G', 'T', 'G', 'A', 'G', 'T', 'T', 'G', 'A', 'A', 'C', 'T', 'C', 'A', 'A', 'T', 'C', 'A', 'T', 'C', 'C'] 121657530


 


40
AGAGAGGTCCAAATATCCACCTGCAGAGTCTACAAAAAGTCTGTTTCAAAACTGCTCCACCCAAAGGAATGTTCAGCTCTGTGAGTTGAACTCAATCATCC


 


[(123086218, 123086219, 'T', 'G'), (123086260, 123086261, 'A', 'C')] ['A', 'C', 'A', 'G', 'A', 'G', 'A', 'G', 'C', 'A', 'G', 'A', 'C', 'T', 'T', 'G', 'A', 'A', 'A', 'C', 'A', 'C', 'T', 'C', 'T', 'T', 'T', 'T', 'T', 'G', 'T', 'G', 'G', 'A', 'A', 'T', 'T', 'T', 'G', 'C', 'A', 'A', 'G', 'T', 'G', 'G', 'A', 'G', 'A', 'T', 'T', 'T', 'C', 'A', 'G', 'C', 'C', 'G', 'C', 'T', 'T', 'T', 'G', 'A', 'G', 'G', 'T', 'C', 'A', 'A', 'T', 'A', 'G', 'T', 'A', 'G', 'A', 'A', 'A', 'A', 'G', 'G', 'A', 'A', 'A', 'T', 'A', 'A', 'C', 'T', 'T', 'C', 'G', 'T', 'A', 'G', 'A', 'A', 'A', 'A', 'A'] 123086175


 


43
85
Expected ref 'A' at position 85, but found 'T'
ACAGAGAGCAGACTTGAAACACTCTTTTTGTGGAATTTGCAAGGGGAGATTTCAGCCGCTTTGAGGTCAATAGTAGAAAAGGAAATAACTTCGTAGAAAAA


 


[(123693931, 123693932, 'G', 'C')] ['C', 'T', 'T', 'T', 'T', 'T', 'G', 'T', 'G', 'G', 'A', 'A', 'T', 'T', 'T', 'G', 'C', 'A', 'A', 'G', 'T', 'G', 'G', 'A', 'G', 'A', 'T', 'T', 'T', 'C', 'A', 'A', 'G', 'C', 'G', 'C', 'T', 'T', 'T', 'G', 'A', 'G', 'G', 'C', 'C', 'A', 'A', 'A', 'G', 'G', 'C', 'A', 'G', 'A', 'A', 'A', 'A', 'G', 'G', 'A', 'A', 'A', 'T', 'A', 'T', 'C', 'T', 'T', 'C', 'G', 'T', 'T', 'T', 'C', 'A', 'A', 'A', 'A', 'C', 'T', 'A', 'G', 'A', 'C', 'A', 'G', 'A', 'A', 'T', 'C', 'A', 'T', 'T', 'C', 'T', 'C', 'A', 'G', 'A', 'A', 'A'] 123693892


 


39
CTTTTTGTGGAATTTGCAAGTGGAGATTTCAAGCGCTTTCAGGCCAAAGGCAGAAAAGGAAATATCTTCGTTTCAAAACTAGACAGAATCATTCTCAGAAA


 


[(179350700, 179350701, 'A', 'T')] ['T', 'T', 'G', 'A', 'A', 'G', 'T', 'A', 'G', 'G', 'G', 'A', 'A', 'A', 'T', 'G', 'A', 'A', 'C', 'T', 'A', 'G', 'G', 'T', 'A', 'A', 'C', 'C', 'T', 'C', 'A', 'T', 'G', 'A', 'A', 'G', 'A', 'C', 'C', 'A', 'T', 'T', 'G', 'A', 'T', 'A', 'A', 'C', 'A', 'C', 'T', 'G', 'A', 'T', 'T', 'T', 'G', 'A', 'T', 'A', 'G', 'G', 'T', 'C', 'A', 'T', 'A', 'A', 'A', 'A', 'G', 'T', 'T', 'A', 'T', 'C', 'T', 'A', 'T', 'A', 'T', 'A', 'A', 'A', 'C', 'A', 'G', 'C', 'T', 'T', 'T', 'C', 'C', 'C', 'A', 'T', 'T', 'C', 'G', 'T'] 179350623


 


77
TTGAAGTAGGGAAATGAACTAGGTAACCTCATGAAGACCATTGATAACACTGATTTGATAGGTCATAAAAGTTATCTTTATAAACAGCTTTCCCATTCGT


 


[(182949844, 182949845, 'G', 'T')] ['T', 'T', 'T', 'A', 'C', 'A', 'T', 'T', 'C', 'C', 'T', 'T', 'T', 'G', 'T', 'A', 'A', 'T', 'C', 'T', 'C', 'T', 'G', 'T', 'G', 'A', 'C', 'T', 'G', 'A', 'A', 'A', 'A', 'T', 'A', 'T', 'G', 'G', 'T', 'A', 'T', 'T', 'C', 'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T', 'G', 'A', 'G', 'A', 'T', 'G', 'G', 'G', 'G', 'C', 'C', 'T', 'C', 'A', 'C', 'T', 'A', 'T', 'G', 'T', 'C', 'A', 'C', 'C', 'C', 'A', 'G', 'G', 'C', 'T', 'C', 'A', 'C', 'T', 'G', 'C', 'A', 'A', 'C', 'C', 'T'] 182949784


 


60
TTTACATTCCTTTGTAATCTCTGTGACTGAAAATATGGTATTCTTTTTTTTTTTTTTTTTTAGATGGGGCCTCACTATGTCACCCAGGCTCACTGCAACCT


 


[(236097312, 236097313, 'T', 'C')] ['A', 'A', 'T', 'G', 'G', 'A', 'G', 'C', 'A', 'G', 'G', 'C', 'G', 'G', 'C', 'C', 'A', 'G', 'G', 'G', 'G', 'T', 'G', 'A', 'C', 'T', 'C', 'A', 'G', 'A', 'A', 'T', 'G', 'G', 'A', 'G', 'C', 'A', 'G', 'G', 'C', 'G', 'G', 'C', 'C', 'A', 'G', 'G', 'G', 'G', 'T', 'G', 'A', 'C', 'T', 'C', 'A', 'G', 'A', 'A', 'T', 'G', 'G', 'A', 'G', 'C', 'A', 'G', 'G', 'T', 'G', 'G', 'C', 'C', 'A', 'G', 'G', 'G', 'G', 'T', 'G', 'A', 'C', 'T', 'C', 'A', 'G', 'A', 'A', 'T', 'G', 'G', 'A', 'G', 'C', 'A', 'G', 'G', 'T', 'G'] 236097214


 


98
AATGGAGCAGGCGGCCAGGGGTGACTCAGAATGGAGCAGGCGGCCAGGGGTGACTCAGAATGGAGCAGGTGGCCAGGGGTGACTCAGAATGGAGCAGGCG


 


[(38647446, 38647447, 'G', 'A')] ['A', 'C', 'T', 'T', 'C', 'C', 'C', 'C', 'A', 'A', 'T', 'T', 'A', 'A', 'A', 'A', 'G', 'G', 'T', 'T', 'A', 'C', 'A', 'G', 'G', 'A', 'A', 'A', 'G', 'G', 'C', 'C', 'C', 'C', 'T', 'A', 'T', 'T', 'G', 'G', 'C', 'C', 'T', 'A', 'C', 'A', 'T', 'T', 'A', 'C', 'T', 'C', 'T', 'G', 'G', 'C', 'T', 'C', 'T', 'T', 'C', 'T', 'C', 'C', 'A', 'G', 'G', 'A', 'A', 'T', 'T', 'T', 'G', 'G', 'A', 'T', 'G', 'A', 'A', 'G', 'A', 'G', 'A', 'T', 'T', 'T', 'C', 'A', 'G', 'T', 'C', 'T', 'C', 'A', 'G', 'T', 'C', 'T', 'G', 'G', 'G'] 38647347


 


99
ACTTCCCCAATTAAAAGGTTACAGGAAAGGCCCCTATTGGCCTACATTACTCTGGCTCTTCTCCAGGAATTTGGATGAAGAGATTTCAGTCTCAGTCTGAG


 


[(46844260, 46844261, 'T', 'A')] ['C', 'C', 'A', 'C', 'G', 'G', 'T', 'G', 'G', 'C', 'T', 'C', 'A', 'T', 'G', 'C', 'C', 'T', 'G', 'T', 'A', 'A', 'T', 'C', 'C', 'C', 'A', 'G', 'C', 'A', 'C', 'T', 'T', 'T', 'G', 'G', 'G', 'A', 'G', 'G', 'C', 'C', 'G', 'A', 'G', 'G', 'C', 'A', 'G', 'G', 'C', 'G', 'G', 'A', 'T', 'C', 'T', 'T', 'G', 'G', 'G', 'G', 'T', 'C', 'A', 'G', 'G', 'A', 'G', 'A', 'T', 'C', 'A', 'A', 'G', 'A', 'C', 'C', 'A', 'T', 'C', 'C', 'T', 'G', 'C', 'C', 'C', 'A', 'A', 'C', 'A', 'T', 'G', 'G', 'T', 'G', 'A', 'A', 'A', 'C', 'C'] 46844204


 


56
CCACGGTGGCTCATGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCAGGCGGATCATGGGGTCAGGAGATCAAGACCATCCTGCCCAACATGGTGAAACC


 


[(126678985, 126678986, 'C', 'A')] ['C', 'C', 'A', 'A', 'G', 'T', 'G', 'G', 'A', 'C', 'C', 'T', 'A', 'A', 'T', 'A', 'G', 'A', 'C', 'A', 'T', 'C', 'T', 'A', 'C', 'A', 'G', 'A', 'A', 'C', 'T', 'C', 'T', 'C', 'C', 'A', 'C', 'C', 'C', 'C', 'A', 'A', 'A', 'T', 'C', 'A', 'A', 'C', 'A', 'G', 'A', 'A', 'T', 'A', 'T', 'A', 'C', 'A', 'T', 'T', 'T', 'T', 'T', 'T', 'T', 'C', 'A', 'G', 'C', 'A', 'C', 'C', 'A', 'C', 'A', 'C', 'C', 'A', 'C', 'A', 'C', 'C', 'T', 'A', 'T', 'T', 'C', 'C', 'A', 'A', 'A', 'A', 'T', 'T', 'G', 'A', 'C', 'C', 'A', 'C', 'A'] 126678947


 


38
CCAAGTGGACCTAATAGACATCTACAGAACTCTCCACCACAAATCAACAGAATATACATTTTTTTCAGCACCACACCACACCTATTCCAAAATTGACCACA


 


[(3509724, 3509725, 'T', 'A')] ['G', 'C', 'G', 'T', 'T', 'A', 'T', 'C', 'A', 'G', 'T', 'T', 'G', 'G', 'A', 'T', 'G', 'A', 'A', 'T', 'T', 'C', 'C', 'T', 'G', 'G', 'G', 'A', 'A', 'C', 'T', 'G', 'C', 'A', 'G', 'A', 'T', 'A', 'T', 'T', 'G', 'C', 'T', 'C', 'G', 'C', 'C', 'A', 'C', 'A', 'G', 'T', 'A', 'T', 'C', 'T', 'T', 'A', 'T', 'C', 'A', 'G', 'T', 'T', 'A', 'A', 'T', 'T', 'G', 'C', 'A', 'T', 'T', 'C', 'T', 'T', 'C', 'G', 'A', 'T', 'G', 'T', 'G', 'C', 'T', 'G', 'G', 'G', 'A', 'G', 'T', 'C', 'A', 'G', 'C', 'T', 'T', 'G', 'C', 'A'] 3509634


 


90
GCGTTATCAGTTGGATGAATTCCTGGGAACTGCAGATATTGCTCGCCACAGTATCTTATCAGTTAATTGCATTCTTCGATGTGCTGGGAGACAGCTTGCA


 


[(127340841, 127340842, 'C', 'T')] ['A', 'A', 'A', 'C', 'A', 'A', 'C', 'T', 'C', 'A', 'C', 'T', 'C', 'C', 'T', 'G', 'G', 'C', 'C', 'A', 'G', 'C', 'A', 'A', 'T', 'C', 'T', 'G', 'T', 'C', 'T', 'T', 'C', 'T', 'C', 'A', 'T', 'T', 'T', 'T', 'T', 'C', 'C', 'C', 'C', 'C', 'C', 'A', 'T', 'C', 'C', 'T', 'T', 'C', 'T', 'A', 'C', 'C', 'C', 'T', 'A', 'T', 'C', 'C', 'T', 'T', 'C', 'T', 'T', 'T', 'T', 'T', 'C', 'T', 'A', 'C', 'C', 'T', 'T', 'C', 'A', 'A', 'T', 'A', 'T', 'T', 'T', 'A', 'T', 'T', 'G', 'A', 'G', 'C', 'A', 'C', 'T', 'T', 'A', 'C', 'C'] 127340800


 


41
AAACAACTCACTCCTGGCCAGCAATCTGTCTTCTCATTTTTTCCCCCATCCTTCTACCCTATCCTTCTTTTTCTACCTTCAATATTTATTGAGCACTTACC


 


[(54210015, 54210016, 'G', 'A')] ['T', 'G', 'A', 'G', 'G', 'G', 'C', 'G', 'A', 'G', 'A', 'A', 'G', 'A', 'G', 'G', 'T', 'T', 'A', 'A', 'T', 'T', 'G', 'A', 'G', 'T', 'C', 'T', 'G', 'T', 'C', 'T', 'G', 'C', 'A', 'G', 'G', 'C', 'A', 'A', 'G', 'A', 'T', 'A', 'A', 'G', 'G', 'C', 'T', 'C', 'C', 'C', 'A', 'C', 'A', 'G', 'A', 'G', 'C', 'T', 'G', 'G', 'A', 'G', 'C', 'T', 'A', 'G', 'T', 'T', 'C', 'C', 'C', 'T', 'G', 'G', 'A', 'G', 'T', 'G', 'T', 'C', 'T', 'T', 'C', 'T', 'A', 'G', 'A', 'T', 'A', 'A', 'C', 'A', 'C', 'T', 'C', 'A', 'G', 'A'] 54210008


 


7
TGAGGGCAAGAAGAGGTTAATTGAGTCTGTCTGCAGGCAAGATAAGGCTCCCACAGAGCTGGAGCTAGTTCCCTGGAGTGTCTTCTAGATAACACTCAGA


 


[(129549371, 129549372, 'C', 'T')] ['T', 'C', 'C', 'A', 'C', 'G', 'T', 'G', 'T', 'T', 'G', 'T', 'G', 'G', 'G', 'A', 'A', 'G', 'G', 'A', 'C', 'C', 'T', 'G', 'G', 'T', 'G', 'G', 'G', 'A', 'G', 'A', 'T', 'A', 'A', 'G', 'T', 'G', 'A', 'A', 'T', 'C', 'A', 'T', 'G', 'G', 'G', 'G', 'T', 'G', 'G', 'T', 'T', 'T', 'C', 'C', 'C', 'C', 'C', 'C', 'T', 'C', 'C', 'T', 'G', 'T', 'T', 'C', 'T', 'C', 'A', 'T', 'G', 'G', 'T', 'A', 'G', 'T', 'G', 'A', 'A', 'T', 'A', 'A', 'G', 'T', 'C', 'T', 'C', 'A', 'T', 'G', 'G', 'G', 'A', 'T', 'C', 'T', 'G', 'A', 'T'] 129549367


 


4
TCCATGTGTTGTGGGAAGGACCTGGTGGGAGATAAGTGAATCATGGGGTGGTTTCCCCCCTCCTGTTCTCATGGTAGTGAATAAGTCTCATGGGATCTGAT


 


[(16403083, 16403084, 'T', 'G')] ['C', 'G', 'T', 'T', 'T', 'C', 'A', 'G', 'A', 'G', 'A', 'C', 'C', 'A', 'G', 'C', 'T', 'T', 'T', 'G', 'A', 'A', 'G', 'C', 'A', 'C', 'T', 'C', 'T', 'T', 'T', 'T', 'T', 'G', 'T', 'A', 'G', 'T', 'A', 'T', 'G', 'T', 'G', 'C', 'A', 'C', 'G', 'T', 'G', 'G', 'A', 'T', 'A', 'T', 'T', 'T', 'G', 'G', 'A', 'G', 'C', 'G', 'C', 'T', 'C', 'T', 'G', 'A', 'G', 'G', 'C', 'C', 'T', 'A', 'C', 'G', 'G', 'T', 'G', 'A', 'A', 'A', 'A', 'A', 'G', 'C', 'A', 'A', 'A', 'T', 'A', 'T', 'C', 'T', 'T', 'C', 'C', 'C', 'A', 'T', 'A'] 16403006


 


77
CGTTTCAGAGACCAGCTTTGAAGCACTCTTTTTGTAGTATGTGCACGTGGATATTTGGAGCGCTCTGAGGCCTACGGGGAAAAAGCAAATATCTTCCCATA


 


[(16647754, 16647755, 'G', 'C')] ['A', 'C', 'A', 'G', 'A', 'G', 'G', 'T', 'G', 'G', 'A', 'T', 'C', 'T', 'T', 'T', 'C', 'T', 'T', 'T', 'T', 'G', 'A', 'T', 'A', 'G', 'A', 'G', 'C', 'A', 'G', 'T', 'T', 'G', 'T', 'G', 'A', 'A', 'A', 'A', 'A', 'C', 'A', 'C', 'T', 'T', 'T', 'T', 'T', 'G', 'T', 'T', 'G', 'A', 'T', 'T', 'A', 'T', 'G', 'C', 'A', 'A', 'G', 'T', 'G', 'G', 'A', 'C', 'A', 'T', 'T', 'T', 'G', 'G', 'A', 'T', 'A', 'G', 'A', 'T', 'T', 'T', 'G', 'A', 'A', 'G', 'A', 'T', 'T', 'T', 'C', 'G', 'T', 'T', 'G', 'G', 'A', 'A', 'A', 'C', 'G'] 16647729


 


25
ACAGAGGTGGATCTTTCTTTTGATACAGCAGTTGTGAAAAACACTTTTTGTTGATTATGCAAGTGGACATTTGGATAGATTTGAAGATTTCGTTGGAAACG


 


[(17219833, 17219834, 'T', 'A')] ['C', 'A', 'T', 'T', 'T', 'G', 'G', 'A', 'T', 'A', 'G', 'A', 'T', 'A', 'T', 'G', 'A', 'A', 'G', 'A', 'T', 'T', 'T', 'C', 'G', 'T', 'T', 'G', 'G', 'A', 'A', 'A', 'C', 'G', 'G', 'G', 'A', 'A', 'T', 'A', 'T', 'C', 'T', 'T', 'C', 'A', 'T', 'A', 'T', 'C', 'A', 'A', 'A', 'T', 'C', 'T', 'A', 'G', 'A', 'C', 'A', 'G', 'A', 'A', 'G', 'C', 'A', 'T', 'T', 'C', 'T', 'C', 'A', 'G', 'A', 'A', 'A', 'C', 'G', 'T', 'C', 'T', 'T', 'T', 'G', 'T', 'G', 'A', 'T', 'G', 'T', 'T', 'T', 'G', 'C', 'A', 'T', 'T', 'C', 'A', 'A'] 17219793


 


40
CATTTGGATAGATATGAAGATTTCGTTGGAAACGGGAATAACTTCATATCAAATCTAGACAGAAGCATTCTCAGAAACGTCTTTGTGATGTTTGCATTCAA


 


[(17434557, 17434558, 'T', 'A'), (17434610, 17434611, 'G', 'T')] ['T', 'T', 'C', 'T', 'A', 'T', 'T', 'G', 'A', 'T', 'A', 'G', 'A', 'G', 'C', 'A', 'G', 'T', 'T', 'T', 'T', 'G', 'A', 'A', 'A', 'C', 'A', 'C', 'T', 'C', 'T', 'T', 'T', 'T', 'T', 'G', 'T', 'G', 'G', 'A', 'A', 'T', 'C', 'T', 'G', 'C', 'A', 'A', 'G', 'T', 'G', 'G', 'A', 'T', 'A', 'T', 'T', 'T', 'G', 'G', 'A', 'T', 'A', 'G', 'C', 'T', 'T', 'G', 'G', 'A', 'G', 'G', 'T', 'T', 'T', 'T', 'C', 'G', 'T', 'T', 'G', 'G', 'A', 'A', 'G', 'A', 'G', 'G', 'G', 'A', 'A', 'T', 'T', 'C', 'A', 'A', 'A', 'T', 'A', 'A', 'A'] 17434519


 


38
Expected ref 'T' at position 38, but found 'G'
91
Expected ref 'G' at position 91, but found 'T'
TTCTATTGATAGAGCAGTTTTGAAACACTCTTTTTGTGGAATCTGCAAGTGGATATTTGGATAGCTTGGAGGTTTTCGTTGGAAGAGGGAATTCAAATAAA


In [42]:
len(missing_files)

100

In [12]:
grouped_df

Unnamed: 0,chr,Chip_Seq_start,Chip_Seq_end,varinat_start,variant_end,ref_neucleotide,alternative_neucleotide,reference_seq,alt_seq,mutated_sequence
0,chr1,16732666,16732766,[16732746],[16732747],[G],[T],GGAAGAAGAAAAAAACCTGTGGCAATTGGAACATAGTTTATTCTTT...,[GGAAGAAGAAAAAAACCTGTGGCAATTGGAACATAGTTTATTCTT...,GGAAGAAGAAAAAAACCTGTGGCAATTGGAACATAGTTTATTCTTT...
1,chr1,54707858,54707958,[54707863],[54707864],[C],[T],TGCATCCTCAGATGGATCCTCAGATCCTTGCCCTCCCACTGTCACC...,[TGCATTCTCAGATGGATCCTCAGATCCTTGCCCTCCCACTGTCAC...,TGCATTCTCAGATGGATCCTCAGATCCTTGCCCTCCCACTGTCACC...
2,chr1,111040309,111040410,[111040379],[111040380],[C],[T],TTGGGAGGCCAAGGCGGGCGGATCACGAGGTCAGGAGATCGAGACC...,[TTGGGAGGCCAAGGCGGGCGGATCACGAGGTCAGGAGATCGAGAC...,TTGGGAGGCCAAGGCGGGCGGATCACGAGGTCAGGAGATCGAGACC...
3,chr1,121657530,121657631,[121657570],[121657571],[G],[C],AGAGAGGTCCAAATATCCACCTGCAGAGTCTACAAAAAGTGTGTTT...,[AGAGAGGTCCAAATATCCACCTGCAGAGTCTACAAAAAGTCTGTT...,AGAGAGGTCCAAATATCCACCTGCAGAGTCTACAAAAAGTCTGTTT...
4,chr1,123086175,123086276,"[123086218, 123086260, 123086246]","[123086219, 123086261, 123086247]","[T, A]","[G, C, A]",ACAGAGAGCAGACTTGAAACACTCTTTTTGTGGAATTTGCAAGTGG...,[ACAGAGAGCAGACTTGAAACACTCTTTTTGTGGAATTTGCAAGTG...,ACAGAGAGCAGACTTGAAACACTCTTTTTGTGGAATTTGCAAGGGG...
...,...,...,...,...,...,...,...,...,...,...
79,chr9,106762858,106762959,[106762872],[106762873],[T],[C],TTAGGAGAAAGAAGTAGGGGGAAGCTTAGTACTTGGAGGGAGCAGT...,[TTAGGAGAAAGAAGCAGGGGGAAGCTTAGTACTTGGAGGGAGCAG...,TTAGGAGAAAGAAGCAGGGGGAAGCTTAGTACTTGGAGGGAGCAGT...
80,chr9,130633198,130633299,[130633242],[130633243],[T],[C],CTTGCATGCTCTGAAATTGTATAGTTGTGTTTCTTCCCTTTTTTTC...,[CTTGCATGCTCTGAAATTGTATAGTTGTGTTTCTTCCCTTTTTTC...,CTTGCATGCTCTGAAATTGTATAGTTGTGTTTCTTCCCTTTTTTCC...
81,chrX,1348653,1348754,[1348693],[1348694],[T],[C],CTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTTTCTTT...,[CTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTCTT...,CTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTCTTT...
82,chrX,1570146,1570247,[1570207],[1570208],[C],[A],TGCAGTGAGCCAAGTTTGCACTGATGCACTCCAGCCTGGGTGACAG...,[TGCAGTGAGCCAAGTTTGCACTGATGCACTCCAGCCTGGGTGACA...,TGCAGTGAGCCAAGTTTGCACTGATGCACTCCAGCCTGGGTGACAG...


In [13]:
grouped_df.iloc[4]

chr                                                                     chr1
Chip_Seq_start                                                     123086175
Chip_Seq_end                                                       123086276
varinat_start                              [123086218, 123086260, 123086246]
variant_end                                [123086219, 123086261, 123086247]
ref_neucleotide                                                       [T, A]
alternative_neucleotide                                            [G, C, A]
reference_seq              ACAGAGAGCAGACTTGAAACACTCTTTTTGTGGAATTTGCAAGTGG...
alt_seq                    [ACAGAGAGCAGACTTGAAACACTCTTTTTGTGGAATTTGCAAGTG...
mutated_sequence           ACAGAGAGCAGACTTGAAACACTCTTTTTGTGGAATTTGCAAGGGG...
Name: 4, dtype: object

In [14]:
grouped_df.iloc[4]['alt_seq']

['ACAGAGAGCAGACTTGAAACACTCTTTTTGTGGAATTTGCAAGTGGAGATTTCAGCCGCTTTGAGGTCAATAGTAGAAAAGGAAACAACTTCGTAGAAAAA',
 'ACAGAGAGCAGACTTGAAACACTCTTTTTGTGGAATTTGCAAGTGGAGATTTCAGCCGCTTTGAGGTCAATGGTAGAAAAGGAAATAACTTCGTAGAAAAA',
 'ACAGAGAGCAGACTTGAAACACTCTTTTTGTGGAATTTGCAAGAGGAGATTTCAGCCGCTTTGAGGTCAATAGTAGAAAAGGAAATAACTTCGTAGAAAAA']

In [15]:
grouped_df.iloc[4]['mutated_sequence']

'ACAGAGAGCAGACTTGAAACACTCTTTTTGTGGAATTTGCAAGGGGAGATTTCAGCCGCTTTGAGGTCAATAGTAGAAAAGGAAATAACTTCGTAGAAAAA'

In [16]:
grouped_df.iloc[4]['reference_seq']

'ACAGAGAGCAGACTTGAAACACTCTTTTTGTGGAATTTGCAAGTGGAGATTTCAGCCGCTTTGAGGTCAATAGTAGAAAAGGAAATAACTTCGTAGAAAAA'