In [1]:
import pysam
import pandas as pd
#import pyBigWig
import pybedtools

In [2]:
# Input VCF; opened with pysam.VariantFile in the next cell.
# NOTE(review): absolute, user-specific path — adjust when running elsewhere.
vcf_file_path = "/data/private/pdutta/Collab_data/Oliver_data/APOE_TREM2/TREM2_regions.vcf"
# Reference FASTA; opened with pysam.FastaFile and queried with
# "chr"-prefixed contig names when the sequence windows are extracted.
reference_file_path = "/data/projects/Resources/HumanReferenceGenome/chr6.fa"
# Directory prefix for the exported tables (all_data.tsv and dev.tsv).
output_path = "/data/private/pdutta/Collab_data/Oliver_data/DNABERT_data/TREM2"

In [3]:
# Open the VCF; later cells iterate over its records with vcf_file.fetch().
vcf_file = pysam.VariantFile(vcf_file_path)
# Open the reference FASTA; later cells call reference_fasta.fetch(...) to
# pull the bases surrounding each variant. Both handles stay open because
# they are reused across cells.
reference_fasta = pysam.FastaFile(reference_file_path)

In [4]:
# Quick look at the VCF: print the core fields of each record in turn.
for rec in vcf_file.fetch():
    summary = (
        ('Chromosome:', rec.chrom),
        ('Position:', rec.pos),
        ('Reference allele:', rec.ref),
        ('Alternative alleles:', rec.alts),
    )
    for label, value in summary:
        print(label, value)

Chromosome: 6
Position: 41158747
Reference allele: C
Alternative alleles: ('T',)
Chromosome: 6
Position: 41158767
Reference allele: C
Alternative alleles: ('T',)
Chromosome: 6
Position: 41158768
Reference allele: G
Alternative alleles: ('A',)
Chromosome: 6
Position: 41158875
Reference allele: G
Alternative alleles: ('A',)
Chromosome: 6
Position: 41158881
Reference allele: G
Alternative alleles: ('A',)
Chromosome: 6
Position: 41158904
Reference allele: A
Alternative alleles: ('C',)
Chromosome: 6
Position: 41159042
Reference allele: G
Alternative alleles: ('A',)
Chromosome: 6
Position: 41159052
Reference allele: C
Alternative alleles: ('T',)
Chromosome: 6
Position: 41159063
Reference allele: G
Alternative alleles: ('C',)
Chromosome: 6
Position: 41159068
Reference allele: T
Alternative alleles: ('G',)
Chromosome: 6
Position: 41159805
Reference allele: G
Alternative alleles: ('A',)
Chromosome: 6
Position: 41159823
Reference allele: C
Alternative alleles: ('T',)
Chromosome: 6
Position: 4115

In [5]:
# Number of flanking bases kept on each side of the variant position.
context = 50
# One row per (variant, allele): [chrom, pos, id, tag, sequence].
data = []

for rec in vcf_file.fetch():
    # Records without an ALT allele have nothing to substitute (rec.alts is
    # None for them, so indexing it would raise a TypeError).
    if not rec.alts:
        continue

    # The FASTA is queried with "chr"-prefixed contig names; only add the
    # prefix when the VCF contig does not already carry it.
    contig = rec.chrom if rec.chrom.startswith("chr") else "chr" + rec.chrom

    # rec.pos is 1-based while FastaFile.fetch takes 0-based, half-open
    # coordinates, hence the "- 1".
    variant_start = rec.pos - 1
    # Clamp the window at the contig start so variants closer than `context`
    # bases to it do not produce a negative start coordinate, and remember
    # where the variant actually sits inside the fetched window.
    window_start = max(0, variant_start - context)
    offset = variant_start - window_start

    # Get the sequence context around the variant
    seq_context = reference_fasta.fetch(contig, window_start, rec.pos + context)

    # Sanity check: the reference genome must agree with the VCF REF allele,
    # otherwise the VCF and FASTA are from different builds/contigs and the
    # generated sequences would be silently wrong. Only the part of REF that
    # falls inside the window is compared; case is ignored because FASTA
    # files may be soft-masked.
    observed_ref = seq_context[offset:offset + len(rec.ref)]
    if observed_ref.upper() != rec.ref[:len(observed_ref)].upper():
        raise ValueError(
            "Reference mismatch at %s:%d: VCF REF is %r but FASTA has %r"
            % (contig, rec.pos, rec.ref, observed_ref)
        )

    # Replace the reference allele with the alternative allele.
    # Only the first ALT allele is used for multi-allelic records.
    alt_seq_context = (
        seq_context[:offset]
        + str(rec.alts[0])
        + seq_context[offset + len(rec.ref):]
    )

    print('Reference Allele and alterna', rec.ref , rec.alts)
    print('Position:', rec.pos)
    print("ID", rec.id)
    data.append([rec.chrom, rec.pos, rec.id, "Reference", seq_context])
    data.append([rec.chrom, rec.pos, rec.id, "Alternative", alt_seq_context])
    print(seq_context)
    print(alt_seq_context)

Reference Allele and alterna C ('T',)
Position: 41158747
ID None
GCAAGTATGCAGGCTGGGCTGGTCCCTGGTGGGACTTCTCCTGGGCTTTTCCTCCCATCATCTTCCTTCACGTGTCTCTCAGCCCTGCAATAGTCCAAGGA
GCAAGTATGCAGGCTGGGCTGGTCCCTGGTGGGACTTCTCCTGGGCTTTTTCTCCCATCATCTTCCTTCACGTGTCTCTCAGCCCTGCAATAGTCCAAGGA
Reference Allele and alterna C ('T',)
Position: 41158767
ID rs199795809
GGTCCCTGGTGGGACTTCTCCTGGGCTTTTCCTCCCATCATCTTCCTTCACGTGTCTCTCAGCCCTGCAATAGTCCAAGGACTCATGTGGCCCCTCTCGGC
GGTCCCTGGTGGGACTTCTCCTGGGCTTTTCCTCCCATCATCTTCCTTCATGTGTCTCTCAGCCCTGCAATAGTCCAAGGACTCATGTGGCCCCTCTCGGC
Reference Allele and alterna G ('A',)
Position: 41158768
ID rs368921728
GTCCCTGGTGGGACTTCTCCTGGGCTTTTCCTCCCATCATCTTCCTTCACGTGTCTCTCAGCCCTGCAATAGTCCAAGGACTCATGTGGCCCCTCTCGGCA
GTCCCTGGTGGGACTTCTCCTGGGCTTTTCCTCCCATCATCTTCCTTCACATGTCTCTCAGCCCTGCAATAGTCCAAGGACTCATGTGGCCCCTCTCGGCA
Reference Allele and alterna G ('A',)
Position: 41158875
ID None
CCCATCCCTGCCCAGTCCACCCTTGATGGCTGTGCTCTCCAAGCCCACCTGGCAGAGTTTGGAGCTGATACCCTGGGTCATGGCCACAGTCCAGTTCACTG
CCCATCCCTGCC

In [6]:
# Turn the collected reference/alternative rows into a table, one row per
# (variant, allele) pair.
column_names = ["CHROMOSOME", "POS", "ID", "TAG", "SEQUENCE"]
df_TREM2 = pd.DataFrame(data=data, columns=column_names)
df_TREM2

Unnamed: 0,CHROMOSOME,POS,ID,TAG,SEQUENCE
0,6,41158747,,Reference,GCAAGTATGCAGGCTGGGCTGGTCCCTGGTGGGACTTCTCCTGGGC...
1,6,41158747,,Alternative,GCAAGTATGCAGGCTGGGCTGGTCCCTGGTGGGACTTCTCCTGGGC...
2,6,41158767,rs199795809,Reference,GGTCCCTGGTGGGACTTCTCCTGGGCTTTTCCTCCCATCATCTTCC...
3,6,41158767,rs199795809,Alternative,GGTCCCTGGTGGGACTTCTCCTGGGCTTTTCCTCCCATCATCTTCC...
4,6,41158768,rs368921728,Reference,GTCCCTGGTGGGACTTCTCCTGGGCTTTTCCTCCCATCATCTTCCT...
5,6,41158768,rs368921728,Alternative,GTCCCTGGTGGGACTTCTCCTGGGCTTTTCCTCCCATCATCTTCCT...
6,6,41158875,,Reference,CCCATCCCTGCCCAGTCCACCCTTGATGGCTGTGCTCTCCAAGCCC...
7,6,41158875,,Alternative,CCCATCCCTGCCCAGTCCACCCTTGATGGCTGTGCTCTCCAAGCCC...
8,6,41158881,rs138355759,Reference,CCTGCCCAGTCCACCCTTGATGGCTGTGCTCTCCAAGCCCACCTGG...
9,6,41158881,rs138355759,Alternative,CCTGCCCAGTCCACCCTTGATGGCTGTGCTCTCCAAGCCCACCTGG...


In [7]:
def seq2kmer(seq, k):
    """
    Split a sequence into its overlapping k-mers.

    A window of width ``k`` slides over ``seq`` one character at a time and
    every window is emitted in order.

    Arguments:
    seq -- str, the sequence to tokenize.
    k -- int, length of each k-mer.

    Returns:
    str -- the k-mers joined by single spaces; an empty string when ``seq``
    is shorter than ``k``.
    """
    n_windows = len(seq) - k + 1
    return " ".join(seq[start:start + k] for start in range(n_windows))

In [8]:
# Tokenize every sequence into overlapping 6-mers and store the
# space-separated result alongside the raw sequence.
kmer_size = 6
df_TREM2['6-mer'] = df_TREM2['SEQUENCE'].apply(
    lambda sequence: seq2kmer(sequence, kmer_size)
)
df_TREM2

Unnamed: 0,CHROMOSOME,POS,ID,TAG,SEQUENCE,6-mer
0,6,41158747,,Reference,GCAAGTATGCAGGCTGGGCTGGTCCCTGGTGGGACTTCTCCTGGGC...,GCAAGT CAAGTA AAGTAT AGTATG GTATGC TATGCA ATGC...
1,6,41158747,,Alternative,GCAAGTATGCAGGCTGGGCTGGTCCCTGGTGGGACTTCTCCTGGGC...,GCAAGT CAAGTA AAGTAT AGTATG GTATGC TATGCA ATGC...
2,6,41158767,rs199795809,Reference,GGTCCCTGGTGGGACTTCTCCTGGGCTTTTCCTCCCATCATCTTCC...,GGTCCC GTCCCT TCCCTG CCCTGG CCTGGT CTGGTG TGGT...
3,6,41158767,rs199795809,Alternative,GGTCCCTGGTGGGACTTCTCCTGGGCTTTTCCTCCCATCATCTTCC...,GGTCCC GTCCCT TCCCTG CCCTGG CCTGGT CTGGTG TGGT...
4,6,41158768,rs368921728,Reference,GTCCCTGGTGGGACTTCTCCTGGGCTTTTCCTCCCATCATCTTCCT...,GTCCCT TCCCTG CCCTGG CCTGGT CTGGTG TGGTGG GGTG...
5,6,41158768,rs368921728,Alternative,GTCCCTGGTGGGACTTCTCCTGGGCTTTTCCTCCCATCATCTTCCT...,GTCCCT TCCCTG CCCTGG CCTGGT CTGGTG TGGTGG GGTG...
6,6,41158875,,Reference,CCCATCCCTGCCCAGTCCACCCTTGATGGCTGTGCTCTCCAAGCCC...,CCCATC CCATCC CATCCC ATCCCT TCCCTG CCCTGC CCTG...
7,6,41158875,,Alternative,CCCATCCCTGCCCAGTCCACCCTTGATGGCTGTGCTCTCCAAGCCC...,CCCATC CCATCC CATCCC ATCCCT TCCCTG CCCTGC CCTG...
8,6,41158881,rs138355759,Reference,CCTGCCCAGTCCACCCTTGATGGCTGTGCTCTCCAAGCCCACCTGG...,CCTGCC CTGCCC TGCCCA GCCCAG CCCAGT CCAGTC CAGT...
9,6,41158881,rs138355759,Alternative,CCTGCCCAGTCCACCCTTGATGGCTGTGCTCTCCAAGCCCACCTGG...,CCTGCC CTGCCC TGCCCA GCCCAG CCCAGT CCAGTC CAGT...


In [9]:
# Persist the full table (metadata, sequences and 6-mers) as a
# tab-separated file without the row index.
df_TREM2.to_csv(f"{output_path}/all_data.tsv", index=False, sep="\t")

In [10]:
# Two-column table: the 6-mer string as 'Sequence' and a constant 0 'Label'
# on every row.
df_predict = pd.DataFrame({'Sequence': df_TREM2['6-mer']}).assign(Label=0)

In [11]:
# Write the Sequence/Label table as a tab-separated file without the row index.
df_predict.to_csv(f"{output_path}/dev.tsv", index=False, sep="\t")