In [40]:
import pysam
import pandas as pd
#import pyBigWig
import pybedtools

In [48]:
# Input VCF holding the variants that the cells below iterate over.
vcf_file_path = "/data/private/pdutta/Collab_data/Oliver_data/APOE_TREM2/APOE_regions.vcf"
# Reference FASTA that sequence windows are fetched from.
# NOTE(review): the file name suggests it contains chromosome 19 only — confirm
# its genome build matches the one the VCF was called against.
reference_file_path = "/data/projects/Resources/HumanReferenceGenome/chr19.fa"
# Directory that receives all_data.tsv and dev.tsv; it is not created by this
# notebook, so it presumably has to exist beforehand — TODO confirm.
output_path = "/data/private/pdutta/Collab_data/Oliver_data/DNABERT_data/APOE_Core_Prom_90BP"

In [42]:
# Open the variant file and the reference FASTA once; both handles are reused
# by every later cell that calls .fetch() on them.
# NOTE(review): neither handle is closed anywhere in this notebook — call
# vcf_file.close() / reference_fasta.close() once the extraction is finished.
vcf_file = pysam.VariantFile(vcf_file_path)
reference_fasta = pysam.FastaFile(reference_file_path)

In [43]:
# Sanity check: list the core fields of every record found in the VCF.
for rec in vcf_file.fetch():
    # One (label, value) pair per printed line, in display order.
    summary = (
        ('Chromosome:', rec.chrom),
        ('Position:', rec.pos),
        ('Reference allele:', rec.ref),
        ('Alternative alleles:', rec.alts),
    )
    for label, value in summary:
        print(label, value)

Chromosome: 19
Position: 44905879
Reference allele: G
Alternative alleles: ('A',)
Chromosome: 19
Position: 44905881
Reference allele: G
Alternative alleles: ('T',)
Chromosome: 19
Position: 44905886
Reference allele: T
Alternative alleles: ('C',)
Chromosome: 19
Position: 44905910
Reference allele: C
Alternative alleles: ('G',)
Chromosome: 19
Position: 44905923
Reference allele: G
Alternative alleles: ('A',)
Chromosome: 19
Position: 44906639
Reference allele: G
Alternative alleles: ('A',)
Chromosome: 19
Position: 44906646
Reference allele: T
Alternative alleles: ('C',)
Chromosome: 19
Position: 44907785
Reference allele: G
Alternative alleles: ('A',)
Chromosome: 19
Position: 44907788
Reference allele: G
Alternative alleles: ('A',)
Chromosome: 19
Position: 44907934
Reference allele: A
Alternative alleles: ('G',)


In [44]:
# Build, for every variant, the reference window around it and the same window
# with the first alternative allele substituted in. Each variant contributes
# two rows to `data`: [chrom, pos, id, "Reference"|"Alternative", sequence].
context = 45  # bases kept on each side of the variant's first reference base
data = []

for rec in vcf_file.fetch():
    # A record without ALT alleles exposes rec.alts as None; there is nothing
    # to substitute, so indexing it would raise a TypeError.
    if not rec.alts:
        print('Skipping record without alternative allele at position', rec.pos)
        continue

    # The FASTA contig carries a "chr" prefix; accept VCF contig names with or
    # without it instead of blindly prepending (which would yield "chrchr19").
    contig = rec.chrom if rec.chrom.startswith("chr") else "chr" + rec.chrom

    # rec.pos is 1-based while FastaFile.fetch takes 0-based, half-open
    # coordinates. Clamp the window at the contig start (pysam rejects a
    # negative start) and remember where the variant sits inside the window.
    window_start = max(0, rec.pos - 1 - context)
    offset = rec.pos - 1 - window_start

    # Get the sequence context around the variant
    seq_context = reference_fasta.fetch(contig, window_start, rec.pos + context)

    # The fetched bases at the variant position should equal the VCF REF
    # allele; a mismatch usually means the FASTA and VCF use different builds.
    fasta_ref = seq_context[offset:offset + len(rec.ref)]
    if fasta_ref.upper() != rec.ref.upper():
        print('WARNING: FASTA has', fasta_ref, 'but VCF REF is', rec.ref, 'at position', rec.pos)

    # Replace the reference allele with the first alternative allele.
    # Only rec.alts[0] is used; further alleles of multi-allelic records are ignored.
    alt_seq_context = seq_context[:offset] + str(rec.alts[0]) + seq_context[offset + len(rec.ref):]

    print('Reference Allele and alterna', rec.ref , rec.alts)
    print('Position:', rec.pos)
    print("ID", rec.id)
    data.append([rec.chrom, rec.pos, rec.id, "Reference", seq_context ])
    data.append([rec.chrom, rec.pos, rec.id, "Alternative", alt_seq_context])
    print(seq_context)
    print(alt_seq_context)

Reference Allele and alterna G ('A',)
Position: 44905879
ID rs373985746
AGGAGCCGGTGAGAAGCGCAGTCGGGGGCACGGGGATGAGCTCAGGGGCCTCTAGAAAGAGCTGGGACCCTGGGAACCCCTGGCCTCCAGG
AGGAGCCGGTGAGAAGCGCAGTCGGGGGCACGGGGATGAGCTCAGAGGCCTCTAGAAAGAGCTGGGACCCTGGGAACCCCTGGCCTCCAGG
Reference Allele and alterna G ('T',)
Position: 44905881
ID None
GAGCCGGTGAGAAGCGCAGTCGGGGGCACGGGGATGAGCTCAGGGGCCTCTAGAAAGAGCTGGGACCCTGGGAACCCCTGGCCTCCAGGTA
GAGCCGGTGAGAAGCGCAGTCGGGGGCACGGGGATGAGCTCAGGGTCCTCTAGAAAGAGCTGGGACCCTGGGAACCCCTGGCCTCCAGGTA
Reference Allele and alterna T ('C',)
Position: 44905886
ID None
GGTGAGAAGCGCAGTCGGGGGCACGGGGATGAGCTCAGGGGCCTCTAGAAAGAGCTGGGACCCTGGGAACCCCTGGCCTCCAGGTAGTCTC
GGTGAGAAGCGCAGTCGGGGGCACGGGGATGAGCTCAGGGGCCTCCAGAAAGAGCTGGGACCCTGGGAACCCCTGGCCTCCAGGTAGTCTC
Reference Allele and alterna C ('G',)
Position: 44905910
ID rs440446
GGGGATGAGCTCAGGGGCCTCTAGAAAGAGCTGGGACCCTGGGAACCCCTGGCCTCCAGGTAGTCTCAGGAGAGCTACTCGGGGTCGGGCT
GGGGATGAGCTCAGGGGCCTCTAGAAAGAGCTGGGACCCTGGGAAGCCCTGGCCTCCAGGTAGTCTCAGGAGAGCTACTCGGGGT

In [45]:
# Turn the collected reference/alternative rows into a table, one row per sequence.
apoe_columns = ["CHROMOSOME", "POS", 'ID', "TAG", 'SEQUENCE']
df_APOE = pd.DataFrame(data=data, columns=apoe_columns)
df_APOE

Unnamed: 0,CHROMOSOME,POS,ID,TAG,SEQUENCE
0,19,44905879,rs373985746,Reference,AGGAGCCGGTGAGAAGCGCAGTCGGGGGCACGGGGATGAGCTCAGG...
1,19,44905879,rs373985746,Alternative,AGGAGCCGGTGAGAAGCGCAGTCGGGGGCACGGGGATGAGCTCAGA...
2,19,44905881,,Reference,GAGCCGGTGAGAAGCGCAGTCGGGGGCACGGGGATGAGCTCAGGGG...
3,19,44905881,,Alternative,GAGCCGGTGAGAAGCGCAGTCGGGGGCACGGGGATGAGCTCAGGGT...
4,19,44905886,,Reference,GGTGAGAAGCGCAGTCGGGGGCACGGGGATGAGCTCAGGGGCCTCT...
5,19,44905886,,Alternative,GGTGAGAAGCGCAGTCGGGGGCACGGGGATGAGCTCAGGGGCCTCC...
6,19,44905910,rs440446,Reference,GGGGATGAGCTCAGGGGCCTCTAGAAAGAGCTGGGACCCTGGGAAC...
7,19,44905910,rs440446,Alternative,GGGGATGAGCTCAGGGGCCTCTAGAAAGAGCTGGGACCCTGGGAAG...
8,19,44905923,,Reference,GGGGCCTCTAGAAAGAGCTGGGACCCTGGGAACCCCTGGCCTCCAG...
9,19,44905923,,Alternative,GGGGCCTCTAGAAAGAGCTGGGACCCTGGGAACCCCTGGCCTCCAA...


In [46]:
def seq2kmer(seq, k):
    """
    Split a sequence into its overlapping k-mers (stride 1).

    Arguments:
    seq -- str, the sequence to tokenize.
    k -- int, length of every k-mer.

    Returns:
    kmers -- str, all k-mers in order of appearance, joined by single spaces
             (an empty string when the sequence is shorter than k).
    """
    # Number of sliding-window positions that fit inside the sequence.
    window_count = len(seq) + 1 - k
    pieces = []
    for start in range(window_count):
        pieces.append(seq[start:start + k])
    return " ".join(pieces)

In [47]:
# Tokenize every sequence into space-separated overlapping 6-mers.
kmer_size = 6
df_APOE['6-mer'] = df_APOE['SEQUENCE'].apply(lambda sequence: seq2kmer(sequence, kmer_size))
df_APOE

Unnamed: 0,CHROMOSOME,POS,ID,TAG,SEQUENCE,6-mer
0,19,44905879,rs373985746,Reference,AGGAGCCGGTGAGAAGCGCAGTCGGGGGCACGGGGATGAGCTCAGG...,AGGAGC GGAGCC GAGCCG AGCCGG GCCGGT CCGGTG CGGT...
1,19,44905879,rs373985746,Alternative,AGGAGCCGGTGAGAAGCGCAGTCGGGGGCACGGGGATGAGCTCAGA...,AGGAGC GGAGCC GAGCCG AGCCGG GCCGGT CCGGTG CGGT...
2,19,44905881,,Reference,GAGCCGGTGAGAAGCGCAGTCGGGGGCACGGGGATGAGCTCAGGGG...,GAGCCG AGCCGG GCCGGT CCGGTG CGGTGA GGTGAG GTGA...
3,19,44905881,,Alternative,GAGCCGGTGAGAAGCGCAGTCGGGGGCACGGGGATGAGCTCAGGGT...,GAGCCG AGCCGG GCCGGT CCGGTG CGGTGA GGTGAG GTGA...
4,19,44905886,,Reference,GGTGAGAAGCGCAGTCGGGGGCACGGGGATGAGCTCAGGGGCCTCT...,GGTGAG GTGAGA TGAGAA GAGAAG AGAAGC GAAGCG AAGC...
5,19,44905886,,Alternative,GGTGAGAAGCGCAGTCGGGGGCACGGGGATGAGCTCAGGGGCCTCC...,GGTGAG GTGAGA TGAGAA GAGAAG AGAAGC GAAGCG AAGC...
6,19,44905910,rs440446,Reference,GGGGATGAGCTCAGGGGCCTCTAGAAAGAGCTGGGACCCTGGGAAC...,GGGGAT GGGATG GGATGA GATGAG ATGAGC TGAGCT GAGC...
7,19,44905910,rs440446,Alternative,GGGGATGAGCTCAGGGGCCTCTAGAAAGAGCTGGGACCCTGGGAAG...,GGGGAT GGGATG GGATGA GATGAG ATGAGC TGAGCT GAGC...
8,19,44905923,,Reference,GGGGCCTCTAGAAAGAGCTGGGACCCTGGGAACCCCTGGCCTCCAG...,GGGGCC GGGCCT GGCCTC GCCTCT CCTCTA CTCTAG TCTA...
9,19,44905923,,Alternative,GGGGCCTCTAGAAAGAGCTGGGACCCTGGGAACCCCTGGCCTCCAA...,GGGGCC GGGCCT GGCCTC GCCTCT CCTCTA CTCTAG TCTA...


In [49]:
# Save the complete table (metadata, sequences and 6-mers) as a tab-separated file.
all_data_file = output_path + "/all_data.tsv"
df_APOE.to_csv(all_data_file, sep="\t", index=False)

In [50]:
# One label per row of df_APOE: the first half of the rows get 0, the rest get 1.
# NOTE(review): the labels are assigned by row position, not by the TAG column,
# so they look like placeholders for the label column — confirm.
n_rows = len(df_APOE)
half = n_rows // 2

# If the DataFrame has an odd number of rows, the extra row is labelled 1, so
# the list length always equals the row count. This is deterministic and does
# not rely on numpy, which this notebook never imports.
values = [0] * half + [1] * (n_rows - half)

In [51]:
# Two-column table for the model: the k-mer sentence and its label.
df_predict = pd.DataFrame({
    'Sequence': df_APOE['6-mer'],
    'Label': values,
})

In [52]:
# Write the sequence/label table as a tab-separated dev.tsv without the index column.
dev_file = output_path + "/dev.tsv"
df_predict.to_csv(dev_file, sep="\t", index=False)

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]