# Get the subset of African Abundant and African Specific variants from 1000 Genomes

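In this notebook we extract African Abundant variants (African allele frequency, AFR_AF >= 20%) and African Specific variants (AFR_AF >= 20% and an overrepresentation ratio >= 8x, where the ratio is AFR_AF divided by the highest allele frequency among the EAS, EUR, AMR and SAS superpopulations).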

In [8]:
import csv
import numpy as np
from tqdm import tqdm
import os

# Minimum African allele frequency for a variant to be kept ("abundant").
ABUNDANCE_PROPORTION = 0.2
# Minimum ratio of the African AF to the largest non-African AF for a kept
# variant to be additionally flagged as "specific".
SPECIFICITY_OVERREPRESENTATION = 8

# Define the result folder path
result_folder = 'results'

file_path = 'subset_snvs_protein_coding_1kGPhg38.tsv'

# Columns
AFRICAN_AF = 'AFR_AF'
OTHER_AF = ['EAS_AF', 'EUR_AF', 'AMR_AF', 'SAS_AF']

# Read the file, keeping only rows whose African AF reaches the abundance
# threshold. Each kept row gets two extra integer fields appended:
# is_abundant (always 1 for kept rows) and is_specific (1 or 0).
R = []
# newline='' lets the csv module do its own newline handling, as the csv
# documentation requires for file objects passed to csv.reader.
with open(file_path, 'r', newline='') as file:
    reader = csv.reader(file, delimiter='\t')
    header = next(reader)
    print(header)
    afr_af_idx = header.index(AFRICAN_AF)
    other_af_idx = [header.index(o) for o in OTHER_AF]
    for r in tqdm(reader):
        afr_af = float(r[afr_af_idx])
        if afr_af < ABUNDANCE_PROPORTION:
            continue
        # Largest non-African AF, floored at 1e-8 so the division below never
        # divides by zero. np.max is kept (rather than the builtin max) so that
        # a value parsing as NaN propagates and the row is not marked specific.
        other_af = max(np.max([float(r[o]) for o in other_af_idx]), 1e-8)
        overrepresentation = afr_af / other_af
        is_abundant = 1
        is_specific = int(overrepresentation >= SPECIFICITY_OVERREPRESENTATION)
        R.append(r + [is_abundant, is_specific])

# Create the result folder if it doesn't exist (exist_ok avoids the
# check-then-create race of a separate os.path.exists test).
os.makedirs(result_folder, exist_ok=True)

# Define the result file path
result_file_path = os.path.join(result_folder, 'subset_snvs_protein_coding_1kGPhg38_afr_abundant_specific.tsv')

# Write the file with the additional columns.
# newline='' prevents the platform from translating the writer's own '\r\n'
# row terminator a second time (which yields '\r\r\n' on Windows).
with open(result_file_path, 'w', newline='') as file:
    writer = csv.writer(file, delimiter='\t')
    # NOTE(review): 'IS_ARF_ABUNDANT' looks like a typo for 'IS_AFR_ABUNDANT'.
    # It is left unchanged here because downstream readers of this TSV may
    # depend on the existing column name -- confirm before renaming.
    writer.writerow(header + ['IS_ARF_ABUNDANT', 'IS_AFR_SPECIFIC'])
    writer.writerows(R)



['SAMPLE', 'CHROM', 'POS', 'REF', 'ALT', 'ANN[*].GENE', 'ANN[*].GENEID', 'ANN[*].FEATURE', 'ANN[*].FEATUREID', 'ANN[*].BIOTYPE', 'ANN[*].EFFECT', 'ANN[*].IMPACT', 'ANN[*].RANK', 'ANN[*].HGVS_C', 'ANN[*].HGVS_P', 'ANN[*].CDNA_POS', 'ANN[*].CDNA_LEN', 'ANN[*].CDS_POS', 'ANN[*].CDS_LEN', 'ANN[*].AA_POS', 'ANN[*].AA_LEN', 'ANN[*].DISTANCE', 'ANN[*].ALLELE', 'ANN[*].ERRORS', 'ID', 'AF', 'AC', 'NS', 'AN', 'EAS_AF', 'EUR_AF', 'AFR_AF', 'AMR_AF', 'SAS_AF', 'dbNSFP_gnomAD_exomes_AC', 'dbNSFP_gnomAD_exomes_AN', 'dbNSFP_gnomAD_exomes_AF', 'dbNSFP_gnomAD_exomes_POPMAX_AC', 'dbNSFP_gnomAD_exomes_POPMAX_AN', 'dbNSFP_gnomAD_exomes_POPMAX_AF', 'dbNSFP_gnomAD_exomes_AFR_AC', 'dbNSFP_gnomAD_exomes_AFR_AN', 'dbNSFP_gnomAD_exomes_AFR_AF', 'dbNSFP_gnomAD_exomes_NFE_AC', 'dbNSFP_gnomAD_exomes_NFE_AN', 'dbNSFP_gnomAD_exomes_NFE_AF', 'dbNSFP_gnomAD_exomes_AMR_AC', 'dbNSFP_gnomAD_exomes_AMR_AN', 'dbNSFP_gnomAD_exomes_AMR_AF', 'dbNSFP_gnomAD_exomes_ASJ_AC', 'dbNSFP_gnomAD_exomes_ASJ_AN', 'dbNSFP_gnomAD_exomes_A

32577572it [02:43, 199749.37it/s]
