In [2]:
import numpy as np
import pandas as pd
import gzip

In [None]:
# download data if needed
chr_num = 2 # which chromosome's data to download
!wget -P data/ http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20130502/ALL.chr{chrom_num}.phase3_shapeit2_mvncall_integrated_v5b.20130502.genotypes.vcf.gz

In [3]:
# open the VCF file
# Build the path from chr_num (set in the download cell) so the chromosome that is read
# always matches the chromosome used to name the saved genotype matrix later on.
vcf_path = f'data/ALL.chr{chr_num}.phase3_shapeit2_mvncall_integrated_v5b.20130502.genotypes.vcf.gz'
vcf_file = gzip.open(vcf_path, 'rt')  # 'rt': decompress and read as text lines
print('File opened!')

File opened!


In [None]:
variant_count = 500000 # number of variants to use

# extract the samples and variants
samples = []   # sample ids: columns 10 onward of the '#CHROM' header line
variants = []  # one whitespace-split field list per non-header record

# try/finally: the loop is long-running, so make sure the gzip handle is released
# even if it is interrupted or a line fails to parse.
try:
    for count, line in enumerate(vcf_file):
        if count % 10000 == 0:
            print(f'Lines processed: {count}')

        if line.startswith('#CHROM'): # get sample ids
            samples = line.strip().split()[9:]

        elif not line.startswith('#'): # get genotypes at each variant
            variants.append(line.strip().split())

        # stop as soon as the requested number of variants has been collected
        if len(variants) == variant_count:
            break
finally:
    vcf_file.close()

print('Done extracting!')

In [None]:
# NOTE: this cell used to repeat the extraction cell above line for line. By the time it
# runs, that cell has already consumed and closed `vcf_file`, so repeating the loop reset
# `samples` and `variants` to empty lists and then failed with
# "ValueError: I/O operation on closed file". It now only reports what was extracted.
print(f'Samples extracted: {len(samples)}')
print(f'Variants extracted: {len(variants)}')

In [None]:
"""
Create matrix containing a numeric representation of genotypes for each individual at each variant site
Genotype of 0: Homozygous reference genotype, both alleles at the variant site are identical to the reference allele
Genotype of 1: Heterozygous genotype, only one allele at the variant site is identical to the reference allele
Genotype of 2: Homozygous alternate genotype, both alleles at the variant site are different from the reference allele
"""
def _alt_allele_count(genotype_field):
    """Return the number of non-reference alleles in one sample's genotype field.

    Only the GT sub-field (the text before the first ':') is used. Both phased ('|')
    and unphased ('/') separators are accepted. Any non-zero allele index counts as one
    alternate allele, so a multi-allelic call such as '2|0' gives 1 (heterozygous)
    instead of 2. Returns np.nan when any allele is missing ('.').
    """
    call = genotype_field.split(':')[0]
    alleles = call.replace('/', '|').split('|')
    if any(allele in ('.', '') for allele in alleles):
        return np.nan
    return sum(int(allele) != 0 for allele in alleles)


# This cell shrinks `samples` when it drops rows (see below). If it is re-run after that,
# `samples` no longer lines up with the VCF genotype columns, so stop early with a hint.
if variants and len(variants[0]) - 9 != len(samples):
    raise RuntimeError(
        f'samples has {len(samples)} entries but the VCF records have '
        f'{len(variants[0]) - 9} genotype columns; re-run the extraction cell first'
    )

n_samples = len(samples)
# rows = samples, columns = variants
genotypes = np.zeros((n_samples, len(variants)), dtype=np.float32)
for i, variant in enumerate(variants):
    # genotype fields start at column index 9, one per sample, in header order
    genotypes[:, i] = [_alt_allele_count(field) for field in variant[9:9 + n_samples]]

# filter out samples with missing/invalid genotypes at any variant site,
# and drop the same entries from `samples` so ids stay aligned with matrix rows
has_missing = np.isnan(genotypes).any(axis=1)
if has_missing.any():
    print(f'Dropping {int(has_missing.sum())} sample(s) with at least one missing call')
genotypes = genotypes[~has_missing, :]
samples = [sample for sample, drop in zip(samples, has_missing) if not drop]

print('Matrix created!')

In [None]:
# dimensions of genotypes matrix
print('Genotypes matrix shape:', genotypes.shape)

# inspect subset of matrix values
print('Subset of matrix values:')
print(genotypes[:20, :20])

# Row k of the matrix must belong to samples[k]; the two are saved separately, so refuse
# to write files whose lengths disagree rather than silently saving misaligned data.
if len(samples) != genotypes.shape[0]:
    raise ValueError(
        f'{len(samples)} sample ids but {genotypes.shape[0]} genotype rows; '
        'sample ids would not line up with matrix rows'
    )

# save genotypes matrix and samples to use in pca.ipynb
np.save(f'data/chr{chr_num}_genotypes{variant_count}.npy', genotypes)
np.save('data/samples', samples)  # np.save appends the .npy extension