In [2]:
import pysam
import pandas as pd
#import pyBigWig
import pybedtools

In [10]:
# --- Input / output locations used by the rest of the notebook ---

# FASTA file opened with pysam.FastaFile to pull sequence context.
reference_file_path = "/data/projects/Resources/HumanReferenceGenome/chr19.fa"

# VCF file opened with pysam.VariantFile; one record per variant.
vcf_file_path = "/data/private/pdutta/Collab_data/Oliver_data/APOE_TREM2/APOE_regions.vcf"

# Directory that receives the generated TSV files (all_data.tsv, dev.tsv).
output_path = "/data/private/pdutta/Collab_data/Oliver_data/DNABERT_data/APOE"

In [4]:
# Open both inputs once; the handles are reused by the cells below
# (vcf_file is iterated with .fetch(), reference_fasta is queried with .fetch()).
# NOTE(review): neither handle is closed anywhere in the visible notebook.
vcf_file = pysam.VariantFile(vcf_file_path)
reference_fasta = pysam.FastaFile(reference_file_path)

In [5]:
# Sanity check: walk through every record of the VCF and print its core
# fields, one labelled line per field.
for rec in vcf_file.fetch():
    for label, value in (
        ('Chromosome:', rec.chrom),
        ('Position:', rec.pos),
        ('Reference allele:', rec.ref),
        ('Alternative alleles:', rec.alts),
    ):
        print(label, value)

Chromosome: 19
Position: 44905879
Reference allele: G
Alternative alleles: ('A',)
Chromosome: 19
Position: 44905881
Reference allele: G
Alternative alleles: ('T',)
Chromosome: 19
Position: 44905886
Reference allele: T
Alternative alleles: ('C',)
Chromosome: 19
Position: 44905910
Reference allele: C
Alternative alleles: ('G',)
Chromosome: 19
Position: 44905923
Reference allele: G
Alternative alleles: ('A',)
Chromosome: 19
Position: 44906639
Reference allele: G
Alternative alleles: ('A',)
Chromosome: 19
Position: 44906646
Reference allele: T
Alternative alleles: ('C',)
Chromosome: 19
Position: 44907785
Reference allele: G
Alternative alleles: ('A',)
Chromosome: 19
Position: 44907788
Reference allele: G
Alternative alleles: ('A',)
Chromosome: 19
Position: 44907934
Reference allele: A
Alternative alleles: ('G',)


In [6]:
context = 50  # number of flanking bases requested on each side of the variant position
data = []     # rows of [chrom, pos, id, tag, sequence]; one "Reference" and one "Alternative" row per variant

# Contig names available in the FASTA, used to decide whether the VCF
# chromosome name needs a "chr" prefix before it can be looked up.
fasta_contigs = set(reference_fasta.references)

for rec in vcf_file.fetch():
    # A record without any ALT allele has nothing to substitute; indexing
    # rec.alts would fail, so report it and move on.
    if not rec.alts:
        print('Skipping record without ALT allele at position:', rec.pos)
        continue

    # Use the VCF contig name as-is when the FASTA knows it, otherwise fall
    # back to the "chr"-prefixed form.
    contig = rec.chrom if rec.chrom in fasta_contigs else "chr" + rec.chrom

    # rec.pos - 1 is the 0-based start of the variant. Clamp the window so a
    # variant closer than `context` bases to the contig start does not
    # produce a negative fetch coordinate, and remember where the variant
    # actually sits inside the fetched window.
    variant_start = rec.pos - 1
    window_start = max(0, variant_start - context)
    offset = variant_start - window_start

    # Get the sequence context around the variant
    seq_context = reference_fasta.fetch(contig, window_start, rec.pos + context)

    # Replace the reference allele with the alternative allele.
    # Only the first ALT allele is used; further alleles of a multi-allelic
    # record are not emitted.
    alt_seq_context = seq_context[:offset] + str(rec.alts[0]) + seq_context[offset + len(rec.ref):]

    print('Reference Allele and alterna', rec.ref , rec.alts)
    print('Position:', rec.pos)
    print("ID", rec.id)
    data.append([rec.chrom, rec.pos, rec.id, "Reference", seq_context ])
    data.append([rec.chrom, rec.pos, rec.id, "Alternative", alt_seq_context])
    print(seq_context)
    print(alt_seq_context)

Reference Allele and alterna G ('A',)
Position: 44905879
ID rs373985746
TCCCCAGGAGCCGGTGAGAAGCGCAGTCGGGGGCACGGGGATGAGCTCAGGGGCCTCTAGAAAGAGCTGGGACCCTGGGAACCCCTGGCCTCCAGGTAGTC
TCCCCAGGAGCCGGTGAGAAGCGCAGTCGGGGGCACGGGGATGAGCTCAGAGGCCTCTAGAAAGAGCTGGGACCCTGGGAACCCCTGGCCTCCAGGTAGTC
Reference Allele and alterna G ('T',)
Position: 44905881
ID None
CCCAGGAGCCGGTGAGAAGCGCAGTCGGGGGCACGGGGATGAGCTCAGGGGCCTCTAGAAAGAGCTGGGACCCTGGGAACCCCTGGCCTCCAGGTAGTCTC
CCCAGGAGCCGGTGAGAAGCGCAGTCGGGGGCACGGGGATGAGCTCAGGGTCCTCTAGAAAGAGCTGGGACCCTGGGAACCCCTGGCCTCCAGGTAGTCTC
Reference Allele and alterna T ('C',)
Position: 44905886
ID None
GAGCCGGTGAGAAGCGCAGTCGGGGGCACGGGGATGAGCTCAGGGGCCTCTAGAAAGAGCTGGGACCCTGGGAACCCCTGGCCTCCAGGTAGTCTCAGGAG
GAGCCGGTGAGAAGCGCAGTCGGGGGCACGGGGATGAGCTCAGGGGCCTCCAGAAAGAGCTGGGACCCTGGGAACCCCTGGCCTCCAGGTAGTCTCAGGAG
Reference Allele and alterna C ('G',)
Position: 44905910
ID rs440446
GGCACGGGGATGAGCTCAGGGGCCTCTAGAAAGAGCTGGGACCCTGGGAACCCCTGGCCTCCAGGTAGTCTCAGGAGAGCTACTCGGGGTCGGGCTTGGGG
GGCACGGGGATGAGC

In [7]:
# Turn the collected rows into a table. The column names follow the order
# in which the fields were appended to `data`.
column_names = ["CHROMOSOME", "POS", 'ID', "TAG", 'SEQUENCE']
df_APOE = pd.DataFrame(data=data, columns=column_names)
df_APOE

Unnamed: 0,CHROMOSOME,POS,ID,TAG,SEQUENCE
0,19,44905879,rs373985746,Reference,TCCCCAGGAGCCGGTGAGAAGCGCAGTCGGGGGCACGGGGATGAGC...
1,19,44905879,rs373985746,Alternative,TCCCCAGGAGCCGGTGAGAAGCGCAGTCGGGGGCACGGGGATGAGC...
2,19,44905881,,Reference,CCCAGGAGCCGGTGAGAAGCGCAGTCGGGGGCACGGGGATGAGCTC...
3,19,44905881,,Alternative,CCCAGGAGCCGGTGAGAAGCGCAGTCGGGGGCACGGGGATGAGCTC...
4,19,44905886,,Reference,GAGCCGGTGAGAAGCGCAGTCGGGGGCACGGGGATGAGCTCAGGGG...
5,19,44905886,,Alternative,GAGCCGGTGAGAAGCGCAGTCGGGGGCACGGGGATGAGCTCAGGGG...
6,19,44905910,rs440446,Reference,GGCACGGGGATGAGCTCAGGGGCCTCTAGAAAGAGCTGGGACCCTG...
7,19,44905910,rs440446,Alternative,GGCACGGGGATGAGCTCAGGGGCCTCTAGAAAGAGCTGGGACCCTG...
8,19,44905923,,Reference,GCTCAGGGGCCTCTAGAAAGAGCTGGGACCCTGGGAACCCCTGGCC...
9,19,44905923,,Alternative,GCTCAGGGGCCTCTAGAAAGAGCTGGGACCCTGGGAACCCCTGGCC...


In [8]:
def seq2kmer(seq, k):
    """
    Convert a sequence into its overlapping k-mers.

    Every window of length k is taken with a stride of one, so a sequence
    of length n yields n - k + 1 k-mers.

    Arguments:
    seq -- str, original sequence.
    k -- int, length of each k-mer; must be at least 1.

    Returns:
    kmers -- str, the k-mers separated by single spaces. An empty string
             is returned when seq is shorter than k.

    Raises:
    ValueError -- if k is smaller than 1 (the slicing below would otherwise
                  silently produce empty or malformed tokens).
    """
    if k < 1:
        raise ValueError("k must be a positive integer, got {!r}".format(k))
    # range() is empty when len(seq) < k, which gives the empty-string result.
    kmer = [seq[x:x+k] for x in range(len(seq)+1-k)]
    kmers = " ".join(kmer)
    return kmers

In [9]:
# Tokenise every sequence into space-separated overlapping 6-mers and
# store the result next to the raw sequence.
kmer_column = df_APOE['SEQUENCE'].apply(seq2kmer, k=6)
df_APOE['6-mer'] = kmer_column
df_APOE

Unnamed: 0,CHROMOSOME,POS,ID,TAG,SEQUENCE,6-mer
0,19,44905879,rs373985746,Reference,TCCCCAGGAGCCGGTGAGAAGCGCAGTCGGGGGCACGGGGATGAGC...,TCCCCA CCCCAG CCCAGG CCAGGA CAGGAG AGGAGC GGAG...
1,19,44905879,rs373985746,Alternative,TCCCCAGGAGCCGGTGAGAAGCGCAGTCGGGGGCACGGGGATGAGC...,TCCCCA CCCCAG CCCAGG CCAGGA CAGGAG AGGAGC GGAG...
2,19,44905881,,Reference,CCCAGGAGCCGGTGAGAAGCGCAGTCGGGGGCACGGGGATGAGCTC...,CCCAGG CCAGGA CAGGAG AGGAGC GGAGCC GAGCCG AGCC...
3,19,44905881,,Alternative,CCCAGGAGCCGGTGAGAAGCGCAGTCGGGGGCACGGGGATGAGCTC...,CCCAGG CCAGGA CAGGAG AGGAGC GGAGCC GAGCCG AGCC...
4,19,44905886,,Reference,GAGCCGGTGAGAAGCGCAGTCGGGGGCACGGGGATGAGCTCAGGGG...,GAGCCG AGCCGG GCCGGT CCGGTG CGGTGA GGTGAG GTGA...
5,19,44905886,,Alternative,GAGCCGGTGAGAAGCGCAGTCGGGGGCACGGGGATGAGCTCAGGGG...,GAGCCG AGCCGG GCCGGT CCGGTG CGGTGA GGTGAG GTGA...
6,19,44905910,rs440446,Reference,GGCACGGGGATGAGCTCAGGGGCCTCTAGAAAGAGCTGGGACCCTG...,GGCACG GCACGG CACGGG ACGGGG CGGGGA GGGGAT GGGA...
7,19,44905910,rs440446,Alternative,GGCACGGGGATGAGCTCAGGGGCCTCTAGAAAGAGCTGGGACCCTG...,GGCACG GCACGG CACGGG ACGGGG CGGGGA GGGGAT GGGA...
8,19,44905923,,Reference,GCTCAGGGGCCTCTAGAAAGAGCTGGGACCCTGGGAACCCCTGGCC...,GCTCAG CTCAGG TCAGGG CAGGGG AGGGGC GGGGCC GGGC...
9,19,44905923,,Alternative,GCTCAGGGGCCTCTAGAAAGAGCTGGGACCCTGGGAACCCCTGGCC...,GCTCAG CTCAGG TCAGGG CAGGGG AGGGGC GGGGCC GGGC...


In [11]:
# Persist the full table (metadata, sequences and 6-mers) as a
# tab-separated file without the row index.
all_data_path = output_path + "/all_data.tsv"
df_APOE.to_csv(all_data_path, sep="\t", index=False)

In [12]:
# Two-column prediction table: the 6-mer string of every row plus a
# constant placeholder label of 0.
df_predict = (
    df_APOE[['6-mer']]
    .rename(columns={'6-mer': 'Sequence'})
    .assign(Label=0)
)

In [13]:
# Write the prediction table as a tab-separated file without the row index.
dev_path = output_path + "/dev.tsv"
df_predict.to_csv(dev_path, sep="\t", index=False)