In [24]:
import io, os
import pysam
import pandas as pd
#import pyBigWig
import pybedtools
from tqdm import tqdm

In [23]:
!conda install -c conda-forge tqdm --yes

Retrieving notices: ...working... done
Collecting package metadata (current_repodata.json): done
Solving environment: done

## Package Plan ##

  environment location: /shared/software/anaconda/anaconda3/envs/sequence_processing

  added / updated specs:
    - tqdm


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    colorama-0.4.6             |     pyhd8ed1ab_0          25 KB  conda-forge
    tqdm-4.65.0                |     pyhd8ed1ab_1          86 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         111 KB

The following NEW packages will be INSTALLED:

  colorama           conda-forge/noarch::colorama-0.4.6-pyhd8ed1ab_0 
  tqdm               conda-forge/noarch::tqdm-4.65.0-pyhd8ed1ab_1 



Downloading and Extracting Packages
colorama-0.4.6       | 25 KB     |                                       |   0

In [26]:
vcf_file_path = "/data/projects/VCF_files/DBSNP/00-All.vcf"
#reference_folder_path = "/data/projects/Resources/HumanReferenceGenome/"

## Covert VCF file to dataframe to visualize

In [None]:
def read_vcf(path):
    with open(path, 'r') as f:
        lines = [l for l in f if not l.startswith('##')]
    return pd.read_csv(
        io.StringIO(''.join(lines)),
        dtype={'#CHROM': str, 'POS': int, 'ID': str, 'REF': str, 'ALT': str,
               'QUAL': str, 'FILTER': str, 'INFO': str},
        sep='\t'
    ).rename(columns={'#CHROM': 'CHROM'})

In [None]:
read_vcf(vcf_file_path)

## Processing the VCF file using PySAM 

In [25]:
vcf_file = pysam.VariantFile(vcf_file_path)
#reference_fasta = pysam.FastaFile(reference_file_path)

In [None]:
total = sum(1 for _ in vcf_file.fetch())
total

In [15]:
for variant in vcf_file:
    print('Variant:', variant)
    print('Chromosome:', variant.chrom)
    print('Position:', variant.pos)
    print('ID:', variant.id)
    print('Reference base(s):', variant.ref)
    print('Alternative base(s):', variant.alts)
    print('Quality:', variant.qual)
    print('Filter:', variant.filter.keys())
    print('Info keys:', variant.info.keys())
    print('Info values:', variant.info.values())
    print('Format:', variant.format.keys())
    print('Samples:', variant.samples.keys())
    input()

Variant: 1	10055	rs768019142	T	TA	.	.	RS=768019142;RSPOS=10055;dbSNPBuildID=144;SSR=0;SAO=0;VP=0x050000020005000002000200;GENEINFO=DDX11L1:100287102;WGT=1;VC=DIV;R5;ASP

Chromosome: 1
Position: 10055
ID: rs768019142
Reference base(s): T
Alternative base(s): ('TA',)
Quality: None
Filter: []
Info keys: ['RS', 'RSPOS', 'dbSNPBuildID', 'SSR', 'SAO', 'VP', 'GENEINFO', 'WGT', 'VC', 'R5', 'ASP']
Info values: [768019142, 10055, 144, 0, 0, '0x050000020005000002000200', 'DDX11L1:100287102', 1, 'DIV', True, True]
Format: []
Samples: []


 


Variant: 1	10055	rs892501864	T	A	.	.	RS=892501864;RSPOS=10055;dbSNPBuildID=150;SSR=0;SAO=0;VP=0x050000020005000002000100;GENEINFO=DDX11L1:100287102;WGT=1;VC=SNV;R5;ASP

Chromosome: 1
Position: 10055
ID: rs892501864
Reference base(s): T
Alternative base(s): ('A',)
Quality: None
Filter: []
Info keys: ['RS', 'RSPOS', 'dbSNPBuildID', 'SSR', 'SAO', 'VP', 'GENEINFO', 'WGT', 'VC', 'R5', 'ASP']
Info values: [892501864, 10055, 150, 0, 0, '0x050000020005000002000100', 'DDX11L1:100287102', 1, 'SNV', True, True]
Format: []
Samples: []


KeyboardInterrupt: Interrupted by user

In [None]:
for rec in vcf_file.fetch():
    # You can access all data about the variant like this:
    print('Chromosome:', rec.chrom)
    print('Position:', rec.pos)
    print('Reference allele:', rec.ref)
    print('Alternative alleles:', rec.alts, rec.info)
    input()

In [20]:
context = 2
data = []

for rec in vcf_file.fetch():
    # Get the sequence context around the variant
    reference_fasta = pysam.FastaFile(reference_f_path+"chr"+str(rec.chrom)+".fa")
    seq_context = reference_fasta.fetch("chr"+rec.chrom, rec.pos - 1 - context, rec.pos + context)

    # Replace the reference allele with the alternative allele
    alt_seq_context = seq_context[:context] + str(rec.alts[0]) + seq_context[context + len(rec.ref):]

    print('Reference Allele and alterna', rec.ref , rec.alts)
    print('Position:', rec.pos)
    print("ID", rec.id)
    data.append([rec.chrom, rec.pos, rec.id, seq_context, alt_seq_context])

Reference Allele and alterna TA ('T',)
Position: 10019
ID rs775809821
cctaa
ccTa


 


Reference Allele and alterna A ('C',)
Position: 10039
ID rs978760828
taacc
taCcc


 


Reference Allele and alterna T ('A',)
Position: 10043
ID rs1008829651
cctaa
ccAaa


 


Reference Allele and alterna A ('G',)
Position: 10051
ID rs1052373574
taacc
taGcc


 


Reference Allele and alterna A ('AC',)
Position: 10051
ID rs1326880612
taacc
taACcc


 


Reference Allele and alterna T ('TA',)
Position: 10055
ID rs768019142
cctaa
ccTAaa


KeyboardInterrupt: Interrupted by user

In [None]:
# Create a dataframe from your data
df_APOE = pd.DataFrame(data, columns=["CHROMOSOME", "POS",'ID', 'REFERENCE_SEQUENCE', 'ALTERNATIVE_SEQUENCE'])
df_APOE

In [None]:
data= []
for rec in vcf.fetch():
    for sample in rec.samples:
        sample_data = rec.samples[sample]
        data.append([rec.chrom, rec.pos, rec.id, rec.ref, ','.join(rec.alts), rec.qual,
                     ';'.join(rec.filter.keys()), dict(rec.info), sample, sample_data['GT']])

# Create a pandas DataFrame from the data
df = pd.DataFrame(data, columns=['CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'SAMPLE', 'GT'])


In [None]:
df

In [None]:
df[df["SAMPLE"]=="A-ACT-AC000007-BL-UPN-6888"]

In [None]:
fasta.fetch('chr19', 10, 20)

In [None]:
def get_flanking_sequence(fasta, chrom, position, flanking_bp):
    start = max(0, position - flanking_bp - 1)
    end = position + flanking_bp
    seq = fasta.fetch(chrom, start, end)
    return seq

In [None]:
# replace with your files
vcf_filename = file_path
reference_fasta_path = "/data/projects/Resources/HumanReferenceGenome/chr19.fa"

In [None]:
vcf = pysam.VariantFile(vcf_filename)
fasta = pysam.FastaFile(reference_fasta_path)
flanking_bp = 20

for rec in vcf:
    ref_seq = rec.ref
    alt_seq = rec.alts[0] # assuming there is always at least one alternate allele
    seq = get_flanking_sequence(fasta, "chr"+rec.chrom, rec.pos, flanking_bp)

    print(f'Position: {rec.pos}')
    print(f'Reference allele: {ref_seq}')
    print(f'Alternate allele: {alt_seq}')
    print(f'Flanking sequence: {seq}')
    input()

In [None]:
vcf = pysam.VariantFile(vcf_filename)
fasta = pysam.FastaFile(reference_fasta_path)
flanking_bp = 20

for rec in vcf:
    for sample in rec.samples:
        genotype = rec.samples[sample]['GT']
        if genotype is None:  # genotype information not available for this sample
            continue
        alleles = [rec.ref] + list(rec.alts) if rec.alts else [rec.ref]
        sample_alleles = [alleles[gt] for gt in genotype if gt is not None]

        print(f'Sample: {sample}')
        print(f'Genotype: {sample_alleles}')

        for allele in sample_alleles:
            seq = get_flanking_sequence(fasta, "chr"rec.chrom, rec.pos, flanking_bp)
            print(f'Position: {rec.pos}')
            print(f'Allele: {allele}')
            print(f'Flanking sequence: {seq}')
            input()

In [None]:
df_test.columns.to_list()