# Allele frequencies in 1000 Genomes Phase 3

Data source: ftp://ftp-trace.ncbi.nih.gov/1000genomes/ftp/release/20130502/


In [1]:
import os
import gzip
import csv
import vcf

In [2]:
# The genotype VCFs were fetched beforehand from
# ftp://ftp-trace.ncbi.nih.gov/1000genomes/ftp/release/20130502/
# into the local 'download' directory; gather their names in sorted order.
vcf_files = [
    filename
    for filename in os.listdir('download')
    if filename.endswith('a.20130502.genotypes.vcf.gz')
]
vcf_files.sort()
len(vcf_files)

24

In [3]:
def get_majar_af(aafs):
    """
    Return the major allele frequency for a site.

    The reference allele frequency is computed as one minus the sum of
    the alternate allele frequencies; the result is the largest value
    among the reference and alternate frequencies.

    Parameters
    ----------
    aafs : iterable of float
        Alternate allele frequencies. Any iterable is accepted (list,
        tuple, generator, ...); it is copied and never mutated.

    Returns
    -------
    number
        The largest allele frequency. For an empty input this is ``1``,
        since the reference allele then accounts for everything.

    Notes
    -----
    The spelling of the function name ("majar") is kept as-is because
    existing callers refer to it by that name.
    """
    # Materialize once so generators can be both summed and scanned, and so
    # non-list sequences (e.g. tuples) can be combined with the reference.
    alt_freqs = list(aafs)
    ref_freq = 1 - sum(alt_freqs)
    return max(alt_freqs + [ref_freq])

In [4]:
# Upper bound on the major allele frequency: sites whose major allele
# frequency exceeds this value are skipped when the SNP table is written.
major_af_max = 0.95

In [None]:
# Stream every VCF in `vcf_files` and write the SNPs that pass all checks
# to a gzipped, tab-delimited table.
#
# The output is opened in a `with` block so it is flushed and closed even
# if reading one of the VCFs raises part-way through. `newline=''` is what
# the csv module asks for on files it writes to, so that its own line
# terminator is not translated a second time by the text layer.
with gzip.open('data/common-SNPs.tsv.gz', 'wt', newline='') as write_file:
    writer = csv.writer(write_file, delimiter='\t')
    writer.writerow(('chromosome', 'position', 'major_allele_frequency', 'rsid'))
    for vcf_file in vcf_files:
        print('Beginning', vcf_file)
        path = os.path.join('download', vcf_file)

        for r in vcf.Reader(filename=path):
            # Quality control: skip records with a non-empty FILTER field
            if r.FILTER:
                continue

            # Exclude non-SNPs
            if not r.is_snp:
                continue

            # Major allele frequency check: skip sites above the threshold
            major_af = get_majar_af(r.INFO['AF'])
            if major_af > major_af_max:
                continue

            # Write SNP to tsv
            row = r.CHROM, r.POS, round(major_af, 6), r.ID
            writer.writerow(row)

In [None]:
# Bare expression: as the last line of a notebook cell this displays the
# list of VCF filenames gathered from the 'download' directory.
vcf_files