In [1]:
import numpy as np
import pandas as pd
import gzip

In [None]:
# download data if needed
chr_num = 2 # which chromosome's data to download
!wget -P data/ http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20130502/ALL.chr{chrom_num}.phase3_shapeit2_mvncall_integrated_v5b.20130502.genotypes.vcf.gz

In [2]:
# Open the gzip-compressed VCF as a text stream so it can be read line by line.
vcf_path = 'data/ALL.chr1.phase3_shapeit2_mvncall_integrated_v5b.20130502.genotypes.vcf.gz'
vcf_file = gzip.open(vcf_path, mode='rt')
print('File opened!')

File opened!


In [3]:
variant_count = 1000000 # number of variants to use

# Extract the sample ids (from the '#CHROM' header line) and the first
# `variant_count` variant records (every non-'#' line, split into fields).
samples = []
variants = []

# try/finally guarantees the file handle is released even if this long-running
# loop is interrupted or a line fails to parse.
try:
    for count, line in enumerate(vcf_file):
        if count % 10000 == 0:
            print(f'Lines processed: {count}')

        if line.startswith('#CHROM'): # get sample ids (columns after the 9 fixed VCF columns)
            samples = line.strip().split()[9:]

        elif not line.startswith('#'): # get genotypes at each variant
            variants.append(line.strip().split())

        # stop as soon as enough variant records have been collected
        if len(variants) >= variant_count:
            break
finally:
    vcf_file.close()

print('Done extracting!')

Lines processed: 0
Lines processed: 10000
Lines processed: 20000
Lines processed: 30000
Lines processed: 40000
Lines processed: 50000
Lines processed: 60000
Lines processed: 70000
Lines processed: 80000
Lines processed: 90000
Lines processed: 100000
Lines processed: 110000
Lines processed: 120000
Lines processed: 130000
Lines processed: 140000
Lines processed: 150000
Lines processed: 160000
Lines processed: 170000
Lines processed: 180000
Lines processed: 190000
Lines processed: 200000
Lines processed: 210000
Lines processed: 220000
Lines processed: 230000
Lines processed: 240000
Lines processed: 250000
Lines processed: 260000
Lines processed: 270000
Lines processed: 280000
Lines processed: 290000
Lines processed: 300000
Lines processed: 310000
Lines processed: 320000
Lines processed: 330000
Lines processed: 340000
Lines processed: 350000
Lines processed: 360000
Lines processed: 370000
Lines processed: 380000
Lines processed: 390000
Lines processed: 400000
Lines processed: 410000
Lines 

In [4]:
"""
Create matrix containing a numeric representation of genotypes for each individual at each variant site
Genotype of 0: Homozygous reference genotype, both alleles at the variant site are identical to the reference allele
Genotype of 1: Heterozygous genotype, only one allele at the variant site is identical to the reference allele
Genotype of 2: Homozygous alternate genotype, both alleles at the variant site are different from the reference allele
"""
def _alt_allele_count(call):
    """Return the number of non-reference alleles in a VCF GT string.

    Accepts phased ('|') and unphased ('/') separators as well as haploid
    calls. Returns NaN when any allele is missing ('.') or not an integer
    allele index, so the caller can drop the affected sample.
    """
    alleles = call.replace('/', '|').split('|')
    if not all(allele.isdigit() for allele in alleles):
        return np.nan
    # Count alleles that differ from the reference (index 0). Summing the raw
    # indices would give values above 2 at multi-allelic sites (e.g. '2|2' -> 4).
    return float(sum(allele != '0' for allele in alleles))


n_samples = len(samples)

# Guard against stale state: `samples` is filtered at the end of this cell, so
# re-running it without re-running the extraction cell would misalign columns.
if variants and len(variants[0]) - 9 != n_samples:
    raise RuntimeError(
        'samples does not match the genotype columns in variants; '
        're-run the extraction cell before rebuilding the matrix'
    )

# rows = individuals, columns = variant sites
genotypes = np.zeros((n_samples, len(variants)), dtype=np.float32)

# Only a handful of distinct GT strings occur, so cache the parsed value per string.
dosage_by_call = {}
missing_calls = 0
for i, variant in enumerate(variants):
    for j in range(n_samples):
        # sample columns start after the 9 fixed VCF columns; GT is the first ':' subfield
        call = variant[9 + j].split(':', 1)[0]
        dosage = dosage_by_call.get(call)
        if dosage is None:
            dosage = dosage_by_call[call] = _alt_allele_count(call)
        if dosage != dosage: # NaN is the only value not equal to itself
            missing_calls += 1
        genotypes[j, i] = dosage

if missing_calls:
    # report a single summary instead of one line per missing call
    print(f'Missing/invalid genotype calls encountered: {missing_calls}')

# filter out samples with missing/invalid genotypes at any variant site,
# and drop the same individuals from `samples` so ids stay aligned with matrix rows
complete = ~np.isnan(genotypes).any(axis=1)
genotypes = genotypes[complete, :]
samples = [sample for sample, keep in zip(samples, complete) if keep]

print('Matrix created!')

Matrix created!


In [6]:
chr_num = 1 # chromosome label used in the output file name

# The saved sample ids are only meaningful if there is exactly one id per matrix
# row; refuse to write mismatched files rather than silently mislabel individuals.
if len(samples) != genotypes.shape[0]:
    raise ValueError(
        f'{len(samples)} sample ids but {genotypes.shape[0]} genotype rows; '
        'samples and genotypes are out of sync'
    )

# dimensions of genotypes matrix
print('Genotypes matrix shape:', genotypes.shape)

# inspect subset of matrix values
print('Subset of matrix values:')
print(genotypes[:20, :20])

# save genotypes matrix and samples to use in pca.ipynb
# (np.save appends '.npy' to 'data/samples' because the name has no extension)
np.save(f'data/chr{chr_num}_genotypes{variant_count}.npy', genotypes)
np.save('data/samples', samples)

Genotypes matrix shape: (2504, 1000000)
Subset of matrix values:
[[1. 0. 1. 0. 0. 0. 0. 0. 0. 2. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [1. 0. 1. 0. 0. 0. 0. 0. 0. 2. 0. 0. 0. 0. 0. 1. 1. 1. 0. 0.]
 [1. 0. 1. 0. 0. 0. 0. 0. 0. 2. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [1. 0. 1. 0. 0. 0. 0. 0. 0. 2. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 2. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0.]
 [1. 0. 1. 0. 0. 0. 0. 0. 0. 2. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0.]
 [1. 0. 1. 0. 0. 0. 0. 0. 0. 2. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 2. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0.]
 [1. 0. 1. 0. 0. 0. 0. 0. 0. 2. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 2. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 2. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 2. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 2. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 2. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 2. 0. 0. 0. 0. 0. 0. 0. 