In [2]:
import pandas as pd
from collections import defaultdict
import numpy as np

In [24]:
# Load sample metadata.
# - The pedigree table is whitespace-delimited with a header row; only the
#   SampleID, Superpopulation and Population columns are used below.
# - The H3Africa table has no header; column 0 is used as the sample key and
#   column 1 as that sample's population label.
# `sep=r"\s+"` is the supported spelling of the deprecated `delim_whitespace=True`.
pedigree = pd.read_csv("/expanse/projects/gymreklab/helia/TR_1000G/1000G.ped", sep=r"\s+")
H3Africa_names = pd.read_csv("/expanse/projects/gymreklab/helia/H3Africa/names/H3A_Baylor_sample_country.txt", header=None, sep=r"\s+")
pedigree = pedigree[['SampleID','Superpopulation', 'Population']]

# sample -> superpopulation label, and sample -> fine-grained population label.
samp_to_pop = dict(zip(pedigree.SampleID, pedigree.Superpopulation))
samp_to_pop_afr = dict(zip(pedigree.SampleID, pedigree.Population))

# Register every H3Africa sample in both maps in a single pass.
for h3a_sample, h3a_label in zip(H3Africa_names[0], H3Africa_names[1]):
    samp_to_pop[h3a_sample] = "H3Africa"
    samp_to_pop_afr[h3a_sample] = h3a_label

# Collapse the coarse labels into two groups: H3Africa and AFR samples become
# "AFR", every other label becomes "NON_AFR".
samp_to_pop = {
    samp: "AFR" if label in ("H3Africa", "AFR") else "NON_AFR"
    for samp, label in samp_to_pop.items()
}

pops = ['AFR', 'NON_AFR']


In [26]:
def _midpoint_percentile(values, q):
    """Return the q-th percentile of ``values`` using midpoint interpolation."""
    try:
        return np.percentile(values, q, method="midpoint")
    except TypeError:
        # Older NumPy releases only accept the previous keyword name.
        return np.percentile(values, q, interpolation="midpoint")


def find_outlier(row, populations=None):
    """Compute a strong-outlier threshold for one row and per-group outlier rates.

    Parameters
    ----------
    row : mapping or pandas.Series
        ``row[pop]`` must yield an iterable of genotype strings of the form
        ``"a/b"`` (integers separated by ``/``) or ``"."`` for a missing call.
    populations : sequence of str, optional
        Group labels to look up in ``row``. Defaults to the module-level
        ``pops`` list.

    Returns
    -------
    tuple
        ``(threshold, freq_0, freq_1, ...)`` with one frequency per entry of
        ``populations``, in order. With the default two groups this is the
        3-tuple ``(threshold, pops[0] frequency, pops[1] frequency)``.
        ``threshold`` is ``Q3 + 3 * IQR`` over the values pooled across all
        groups; each frequency is the fraction of that group's values strictly
        above the threshold. A group without any called value gets ``nan``
        instead of raising ``ZeroDivisionError``; if no group has any called
        value the threshold is ``nan`` as well.
    """
    if populations is None:
        populations = pops

    # Flatten every called genotype of each group into a list of integers.
    values_by_pop = {}
    for pop in populations:
        values_by_pop[pop] = [
            int(allele)
            for gbs in list(row[pop])
            if gbs != "."
            for allele in gbs.split("/")
        ]

    nan = float("nan")
    all_cns = [value for pop in populations for value in values_by_pop[pop]]
    if not all_cns:
        # Nothing was called anywhere: no quartiles can be defined.
        return (nan,) * (1 + len(populations))

    ### Define outlier
    Q1 = _midpoint_percentile(all_cns, 25)
    Q3 = _midpoint_percentile(all_cns, 75)
    IQR = Q3 - Q1
    strong_outlier_threshold = Q3 + 3 * IQR

    pop_outlier_freqs = []
    for pop in populations:
        values = values_by_pop[pop]
        if values:
            n_outliers = sum(1 for value in values if value > strong_outlier_threshold)
            pop_outlier_freqs.append(n_outliers / len(values))
        else:
            pop_outlier_freqs.append(nan)

    return (strong_outlier_threshold, *pop_outlier_freqs)

In [30]:
def read_file(gene_name, samp_to_pop_dict):
    """Load ``{gene_name}_diff.txt`` as a DataFrame whose sample columns are
    labelled by population.

    The file is read in two parts: leading lines (one sample identifier per
    line) up to the first line that starts with ``"chr"``, followed by
    tab-separated data rows. The first three data columns are named ``CHROM``,
    ``POS`` and ``MOTIF``; each remaining column is named after the population
    that ``samp_to_pop_dict`` assigns to the corresponding sample, so column
    names repeat when several samples share a population.

    The number of lines to skip and the position of the optional trailing
    empty column (produced when data rows end with a tab) are derived from the
    header that was actually read, rather than assuming a fixed sample count.

    Parameters
    ----------
    gene_name : str
        Prefix of the input file, resolved relative to the working directory.
    samp_to_pop_dict : dict
        Maps every sample identifier in the file header to a population label.

    Returns
    -------
    pandas.DataFrame

    Raises
    ------
    KeyError
        If a header sample is missing from ``samp_to_pop_dict``.
    ValueError
        If the number of data columns does not match the header.
    """
    path = f"{gene_name}_diff.txt"

    header = []
    with open(path) as f:
        for line in f:
            if line.startswith("chr"):
                break
            header.append(line.strip())

    header_pop = ["CHROM", "POS", "MOTIF"] + [samp_to_pop_dict[x] for x in header]

    df = pd.read_csv(path, skiprows=len(header), sep="\t", header=None)
    # A trailing tab on the data rows yields one extra column; discard it.
    if df.shape[1] == len(header_pop) + 1:
        df = df.drop(columns=df.columns[-1])
    df.columns = header_pop

    return df

In [29]:
def freq_per_afr_pop(df, threshold):
    """Return, for each listed population, the fraction of values above ``threshold``.

    Only the first row of ``df`` is examined. For every population label in
    the fixed list below, all columns carrying that label are collected, their
    genotype strings (``"a/b"``, or ``"."`` for a missing call) are split into
    integers, and the share of integers strictly greater than ``threshold`` is
    reported.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column names include every population label listed below
        (labels may repeat, one column per sample).
    threshold : number
        Values strictly above this count as outliers.

    Returns
    -------
    dict
        Population label -> outlier fraction, in list order. A population with
        no called values maps to ``nan`` instead of raising
        ``ZeroDivisionError``.

    Raises
    ------
    KeyError
        If a listed population has no column in ``df``.
    """
    pops_afr = ['Benin','Botswana','Burkina_Faso-Ghana','Cameroon_24', 'Cameroon_26', 'Mali','Nigeria','Zambia'] + \
           ["LWK","MSL","YRI","ESN","GWD"]

    dict_pop = {}
    for pop in pops_afr:
        if pop not in df.columns:
            raise KeyError(pop)

        # A boolean column mask always yields a Series for the first row, even
        # when the label occurs only once; plain df[pop].iloc[0] would then be
        # a single string and get split into characters.
        first_row = df.loc[:, df.columns == pop].iloc[0]
        cn_dif = [
            int(x)
            for gbs in first_row
            if gbs != "."
            for x in gbs.split("/")
        ]

        if cn_dif:
            dict_pop[pop] = sum(1 for x in cn_dif if x > threshold) / len(cn_dif)
        else:
            dict_pop[pop] = float("nan")

    return dict_pop
    

In [34]:
# ca10: outlier threshold and AFR vs. non-AFR outlier frequencies, computed on
# the frame whose sample columns carry the two-group labels.
ca10 = read_file("ca10", samp_to_pop)
ca10_out = ca10.apply(find_outlier, axis = 1)
print("threshold, AFR frequency, Non-AFR frequency:", list(ca10_out))

# Take the threshold from the first row's result instead of retyping the
# printed number; freq_per_afr_pop also looks at the first row only.
ca10_threshold = ca10_out.iloc[0][0]

# Reload with per-population column labels for the per-population breakdown.
ca10 = read_file("ca10", samp_to_pop_afr)
display(freq_per_afr_pop(ca10, ca10_threshold))

threshold, AFR frequency, Non-AFR frequency: [(90.0, 0.02135374697824335, 0.004343850022862369)]


{'Benin': 0.0,
 'Botswana': 0.010416666666666666,
 'Burkina_Faso-Ghana': 0.016666666666666666,
 'Cameroon_24': 0.020833333333333332,
 'Cameroon_26': 0.0,
 'Mali': 0.07,
 'Nigeria': 0.01020408163265306,
 'Zambia': 0.024390243902439025,
 'LWK': 0.015151515151515152,
 'MSL': 0.020202020202020204,
 'YRI': 0.019662921348314606,
 'ESN': 0.03355704697986577,
 'GWD': 0.025280898876404494}

In [37]:
# nexn: outlier threshold and AFR vs. non-AFR outlier frequencies, computed on
# the frame whose sample columns carry the two-group labels.
nexn = read_file("nexn", samp_to_pop)
nexn_out = nexn.apply(find_outlier, axis = 1)
print("threshold, AFR frequency, Non-AFR frequency:", list(nexn_out))

# Take the threshold from the first row's result instead of retyping the
# printed number; freq_per_afr_pop also looks at the first row only.
nexn_threshold = nexn_out.iloc[0][0]

# Reload with per-population column labels for the per-population breakdown.
nexn = read_file("nexn", samp_to_pop_afr)
display(freq_per_afr_pop(nexn, nexn_threshold))

threshold, AFR frequency, Non-AFR frequency: [(81.0, 0.1095890410958904, 0.014508445214378518)]


{'Benin': 0.2,
 'Botswana': 0.08333333333333333,
 'Burkina_Faso-Ghana': 0.09166666666666666,
 'Cameroon_24': 0.08333333333333333,
 'Cameroon_26': 0.17307692307692307,
 'Mali': 0.09,
 'Nigeria': 0.08163265306122448,
 'Zambia': 0.06097560975609756,
 'LWK': 0.13131313131313133,
 'MSL': 0.06565656565656566,
 'YRI': 0.14887640449438203,
 'ESN': 0.1174496644295302,
 'GWD': 0.11235955056179775}