In [2]:
import pandas as pd
import numpy as np
from collections import defaultdict
import re

In [3]:

pops = ['AFR', 'AMR', 'EUR', 'EAS', 'SAS']
pop_names = defaultdict(list)
for pop in pops:
    with open(f"/expanse/projects/gymreklab/helia/ensembl/1000Genomes-TR-Analysis/phasing/validation/{pop}_names.txt") as f:
        for line in f:
            pop_names[pop].append(line.strip())
    f.close()

In [4]:
def check_concordance(result):
    gts = [int(gt) for gt in result[1:3]]
    imp = [int(im) for im in result[3:]]
    gts.sort()
    imp.sort() 
    if gts == imp:
        return 1
    if gts[0] in imp or gts[1] in imp:
        return 0.5
    return 0

In [33]:
##### Load pathogenic loci #####
capillary = pd.read_csv("capillary_loci.txt", header=None)
capillary.columns = ['Chrom', 'Start', 'End']
full_info = pd.read_csv("/expanse/projects/gymreklab/helia/ensembl/1000Genomes-TR-Analysis/capillary/1000g_loci.csv")
def find_name(coordinates):
    chrom, start, end = coordinates
    candidates = full_info[full_info['Chrom'] == chrom]
    for index,row in candidates.iterrows():
        if abs(row['Start'] - start) < 100:
            return row['LocusID']
capillary['ID'] = capillary.apply(lambda row: find_name([row['Chrom'], row['Start'], row['End']]), axis = 1)
capillary

Unnamed: 0,Chrom,Start,End,ID
0,chr2,176093058,176093099,HOXD13
1,chr3,63912685,63912716,SCA7
2,chr4,3074877,3074968,HTT
3,chr4,41745976,41746022,PHOX2B
4,chr5,146878728,146878759,PPP2R2B
5,chr6,16327634,16327723,SCA1
6,chr6,45422749,45422794,RUNX2
7,chr9,27573485,27573546,C9orf72
8,chr9,69037287,69037304,FXN
9,chr12,6936717,6936773,ATN1


In [18]:
impute_df = pd.DataFrame(columns = ['locus', 'AFR_conc', 'AMR_conc', 'SAS_conc', 'EAS_conc'])
for chrom in range(1,23):
    if chrom == 13 or chrom == 21: # No pathogenic record
        continue
    pathogenic_loci = pd.read_csv(f"/expanse/projects/gymreklab/helia/ensembl/1000Genomes-TR-Analysis/phasing/pathogenic_imputation/pathogenic_loci/loci_chr{chrom}.txt",
                                 header = None, sep = "\t")
    impute_results_per_chr = defaultdict(list)
    for index, row in pathogenic_loci.iterrows():
        locus = f"{row[0]}:{row[1]}-{row[2]}"
        for pop in pops:
            impute_per_locus_per_pop = defaultdict(list)
            for sample in pop_names[pop][0:len(pop_names[pop])-1]: 
                with open(f"/expanse/projects/gymreklab/helia/ensembl/1000Genomes-TR-Analysis/phasing/pathogenic_imputation/diff_{pop}/{chrom}/{sample}.{locus}.diff.txt") as f:
                    impute_result = f.readlines()
                    if len(impute_result) > 1:
                        print("shit happened here")
                    elif len(impute_result) == 0:
                        impute_per_locus_per_pop[locus].append(np.nan)
                    else:
                        impute_result = impute_result[0].strip().split()
                        impute_per_locus_per_pop[locus].append(check_concordance(impute_result))
            for locus in impute_per_locus_per_pop:
                impute_results_per_chr[locus].append(np.nanmean(impute_per_locus_per_pop[locus]))
                
    df = pd.DataFrame(impute_results_per_chr)
    df = df.transpose().reset_index()
    df.columns = ['locus', 'AFR_conc', 'AMR_conc', 'EUR_conc' , 'EAS_conc','SAS_conc']
    impute_df = pd.concat([impute_df, df])    

  impute_results_per_chr[locus].append(np.nanmean(impute_per_locus_per_pop[locus]))


In [38]:
def find_ID(locus):
    locus = locus.split(":")
    start = locus[1].split("-")[0]
    if int(start) in list(capillary['Start']):
        return capillary[capillary['Start'] == int(start)].iloc[0]['ID']
    else:
        return "Blood_trait"
not_na_impute_df = impute_df.reset_index(drop=True).dropna()
not_na_impute_df['ID'] = not_na_impute_df.apply(lambda row: find_ID(row['locus']), axis = 1)
not_na_impute_df[not_na_impute_df['ID'] != "Blood_trait"]
not_na_impute_df = not_na_impute_df[['locus', 'ID', 'AFR_conc', 'AMR_conc', 'SAS_conc', 'EAS_conc', 'EUR_conc']]
not_na_impute_df
not_na_impute_df.to_csv("pathogenic_impute.csv", index = False)

In [39]:
with open("missed_loci.txt", "w") as f:
    for locus in list(impute_df[impute_df['AFR_conc'].isna()]['locus']):
        chrom, pos, end = re.split('[:-]', locus)
        f.write(f"{chrom}\t{pos}\t{end}")
        f.write("\n")
f.close()
    

In [48]:
not_na_impute_df.sort_values('AFR_conc')

Unnamed: 0,locus,ID,AFR_conc,AMR_conc,SAS_conc,EAS_conc,EUR_conc
71,chr14:92071009-92071053,SCA3,0.681818,0.904040,0.777778,0.767677,0.888889
107,chr22:45795355-45795424,ATXN10,0.737374,0.676768,0.671717,0.575758,0.737374
4,chr1:153927590-153927642,Blood_trait,0.767677,0.808081,0.722222,0.651515,0.737374
39,chr4:3074877-3074968,HTT,0.782828,0.803030,0.717172,0.813131,0.747475
96,chr18:69868803-69868841,Blood_trait,0.792929,0.909091,0.868687,0.959596,0.888889
...,...,...,...,...,...,...,...
72,chr14:23321472-23321492,PAPBN1,1.000000,0.994949,0.994949,1.000000,1.000000
73,chr14:64247333-64247356,Blood_trait,1.000000,0.994949,1.000000,1.000000,0.994949
12,chr2:176093058-176093099,HOXD13,1.000000,1.000000,0.994949,1.000000,1.000000
13,chr2:21043880-21043925,Blood_trait,1.000000,0.994949,0.994949,0.994949,0.989899
