# Gene-level summaries of variant files

In [11]:
from tqdm import tqdm
import collections
import csv
import numpy as np

# Classification thresholds used by the counting loop below:
#   - a variant is "AFR abundant" when its AFR_AF is >= AFR_ABUNDANT;
#   - an abundant variant is additionally "AFR specific" when AFR_AF divided by
#     the highest of the EAS/EUR/AMR/SAS allele frequencies is
#     >= AFR_OVERREPRESENTED.
AFR_OVERREPRESENTED = 8
AFR_ABUNDANT = 0.2

# gene id -> gene name, filled in while the annotation table is read.
genes = dict()

# Every counter maps gene id -> number of variants; a missing gene reads as 0.
# Each name is bound to its own independent defaultdict.

# All variants, then broken down by effect.
(
    total_variants,
    intron_variants,
    missense_variants,
    other_variants,
) = (collections.defaultdict(int) for _ in range(4))

# Same breakdown, restricted to AFR-abundant variants.
(
    afr_abundant_variants,
    afr_abundant_intron_variants,
    afr_abundant_missense_variants,
    afr_abundant_other_variants,
) = (collections.defaultdict(int) for _ in range(4))

# Same breakdown, restricted to AFR-specific variants.
(
    afr_specific_variants,
    afr_specific_intron_variants,
    afr_specific_missense_variants,
    afr_specific_other_variants,
) = (collections.defaultdict(int) for _ in range(4))

# gene id -> list of HGVS_P strings of its AFR-specific missense variants.
afr_specific_missense_variants_mutations = collections.defaultdict(list)

# Tab-separated annotation table to summarise (path relative to the working
# directory). The output file name is derived from this path further below.
snpeff_file = "results/subset_snvs_protein_coding_1kGPhg38.tsv"

# newline="" hands line-ending handling to the csv module, as the csv
# documentation recommends for any file object passed to csv.reader.
with open(snpeff_file, 'r', newline="") as f:
    reader = csv.reader(f, delimiter="\t")
    header = next(reader)
    # Look the columns up by name so the script does not depend on their order;
    # header.index raises ValueError if an expected column is missing.
    biotype_index = header.index("ANN[*].BIOTYPE")
    gene_id_index = header.index("ANN[*].GENEID")
    gene_index = header.index("ANN[*].GENE")
    effect_index = header.index("ANN[*].EFFECT")
    missense_mutations_index = header.index("ANN[*].HGVS_P")
    afr_af_idx = header.index("AFR_AF")
    other_af_idx = [header.index(af) for af in ['EAS_AF', 'EUR_AF', 'AMR_AF', 'SAS_AF']]
    for row in tqdm(reader):
        # Rows whose biotype is not exactly "protein_coding" are ignored.
        biotype = row[biotype_index]
        if biotype != "protein_coding":
            continue
        gene_id = row[gene_id_index]
        genes[gene_id] = row[gene_index]

        # Variant type. These are exact string comparisons, so a row whose
        # effect is none of the three labels is counted only in the totals.
        effect = row[effect_index]
        is_intron_variant = effect == "intron_variant"
        is_missense_variant = effect == "missense_variant"
        # NOTE(review): confirm that the input really contains the literal
        # label "other_variant". If it does not, every *_other_variants
        # counter stays at zero; the intent may have been "neither intron
        # nor missense".
        is_other_variant = effect == "other_variant"

        # African abundance and specificity.
        afr_af = float(row[afr_af_idx])
        is_afr_abundant = afr_af >= AFR_ABUNDANT
        is_afr_specific = False
        if is_afr_abundant:
            # Highest allele frequency outside AFR, floored at 1e-8 so the
            # ratio below never divides by zero.
            other_af = max(1e-8, *(float(row[o]) for o in other_af_idx))
            overrepresentation = afr_af / other_af
            is_afr_specific = overrepresentation >= AFR_OVERREPRESENTED

        # Counters: overall, then the AFR-abundant and AFR-specific subsets.
        total_variants[gene_id] += 1
        if is_intron_variant:
            intron_variants[gene_id] += 1
        if is_missense_variant:
            missense_variants[gene_id] += 1
        if is_other_variant:
            other_variants[gene_id] += 1
        if is_afr_abundant:
            afr_abundant_variants[gene_id] += 1
            if is_intron_variant:
                afr_abundant_intron_variants[gene_id] += 1
            if is_missense_variant:
                afr_abundant_missense_variants[gene_id] += 1
            if is_other_variant:
                afr_abundant_other_variants[gene_id] += 1
        if is_afr_specific:
            afr_specific_variants[gene_id] += 1
            if is_intron_variant:
                afr_specific_intron_variants[gene_id] += 1
            if is_missense_variant:
                afr_specific_missense_variants[gene_id] += 1
                # Keep the protein-level change so it can be listed per gene.
                afr_specific_missense_variants_mutations[gene_id].append(row[missense_mutations_index])
            if is_other_variant:
                afr_specific_other_variants[gene_id] += 1

# Collapse each gene's list of AFR-specific missense mutations into a single
# ";"-separated string, dropping "." entries. Genes without any such mutation
# get an empty string. This rebinds the name from a defaultdict of lists to a
# plain dict of strings.
afr_specific_missense_variants_mutations = {
    gene_id: ";".join(
        m for m in afr_specific_missense_variants_mutations.get(gene_id, ()) if m != "."
    )
    for gene_id in genes
}

# Output columns. Each counter column takes its name from the counter that
# fills it, in the same order.
# NOTE(review): "ensemble_id" is probably meant to be "ensembl_id"; it is left
# unchanged because it is part of the output file format.
summary_counters = [
    total_variants,
    intron_variants,
    missense_variants,
    other_variants,
    afr_abundant_variants,
    afr_abundant_intron_variants,
    afr_abundant_missense_variants,
    afr_abundant_other_variants,
    afr_specific_variants,
    afr_specific_intron_variants,
    afr_specific_missense_variants,
    afr_specific_other_variants,
]
summary_header = [
    "ensemble_id",
    "gene",
    "total_variants",
    "intron_variants",
    "missense_variants",
    "other_variants",
    "afr_abundant_variants",
    "afr_abundant_intron_variants",
    "afr_abundant_missense_variants",
    "afr_abundant_other_variants",
    "afr_specific_variants",
    "afr_specific_intron_variants",
    "afr_specific_missense_variants",
    "afr_specific_other_variants",
    "afr_specific_missense_variants_mutations",
]

# One output row per gene, in the order genes were first seen in the input.
R = [
    [
        gene_id,
        gene_name,
        *(counter[gene_id] for counter in summary_counters),
        afr_specific_missense_variants_mutations[gene_id],
    ]
    for gene_id, gene_name in genes.items()
]

# newline="" is required for csv.writer output: without it, platforms that
# translate "\n" on write (Windows) produce "\r\r\n" row endings.
with open(snpeff_file.split(".tsv")[0]+"_gene_level.tsv", 'w', newline="") as f:
    writer = csv.writer(f, delimiter="\t")
    writer.writerow(summary_header)
    writer.writerows(R)

1706781it [00:14, 121248.23it/s]
