In [4]:
import matplotlib.pyplot as plt
import matplotlib
matplotlib.rcParams['pdf.fonttype'] = 42
matplotlib.rcParams['ps.fonttype'] = 42
import numpy as np
import os
import pandas as pd
import re
from collections import defaultdict

In [2]:
######### Load data of Mendelian Inheritance #######
path = f"/expanse/projects/gymreklab/helia/ensembl/1000Genomes-TR-Analysis/mend/result/mend_gt_chr1_extra_info_r2.tab"
data = pd.read_csv(path, sep="\t", 
    names=["chrom","pos","motif","sample","MI","Score","gbs", "method", "raw_calls", "allele_support"])
data

Unnamed: 0,chrom,pos,motif,sample,MI,Score,gbs,method,raw_calls,allele_support
0,chr1,899025,A,HG00405,True,1.00,"1/0,1/0,1/0",0|0|1|1,"hipstr=23.0,24.0|gangstr=21.0,22.0;hipstr=23.0...","0|0|1|1,0|0|1|1,0|0|1|1"
1,chr1,899025,A,HG00408,True,0.55,"0/0,1/0,1/0",0|0|1|1,"hipstr=23.0,23.0|gangstr=20.0,21.0;hipstr=23.0...","0|0|1|0,0|0|1|1,0|0|1|1"
2,chr1,899025,A,HG00423,True,1.00,"0/0,1/0,0/0",0|0|1|1,"hipstr=23.0,23.0|gangstr=21.0,21.0;hipstr=23.0...","0|0|1|1,0|0|1|1,0|0|1|1"
3,chr1,899025,A,HG00438,True,1.00,"1/0,1/0,0/0",0|0|1|1,"hipstr=23.0,24.0|gangstr=21.0,22.0;hipstr=23.0...","0|0|1|1,0|0|1|1,0|0|1|1"
4,chr1,899025,A,HG00450,True,1.00,"1/0,0/0,1/0",0|0|1|1,"hipstr=23.0,24.0|gangstr=21.0,22.0;hipstr=23.0...","0|0|1|1,0|0|1|1,0|0|1|1"
...,...,...,...,...,...,...,...,...,...,...
31585950,chr1,248916402,A,HG03825,True,1.00,"0/0,0/0,0/-1",0|0|1|0,"hipstr=11.0,11.0;hipstr=11.0,11.0;hipstr=11.0,...","0|0|1|0,0|0|1|0,0|0|1|0"
31585951,chr1,248916402,A,HG04157,True,1.00,"0/0,0/-1,0/0",0|0|1|0,"hipstr=11.0,11.0;hipstr=11.0,10.0;hipstr=11.0,...","0|0|1|0,0|0|1|0,0|0|1|0"
31585952,chr1,248916402,A,NA12801,True,1.00,"0/0,0/0,0/-1",0|0|1|0,"hipstr=11.0,11.0;hipstr=11.0,11.0;hipstr=11.0,...","0|0|1|0,0|0|1|0,0|0|1|0"
31585953,chr1,248916402,A,NA12802,True,1.00,"0/-1,0/-1,0/0",0|0|1|0,"hipstr=11.0,10.0;hipstr=11.0,10.0;hipstr=11.0,...","0|0|1|0,0|0|1|0,0|0|1|0"


In [3]:
####### Select calls that are genotyped by at least two callers, and those callers have conflict ######
def check_conflict(allele_supports, method):
    allele_supports = allele_supports.split(",")
    for a_s in allele_supports:
        if a_s != method:
            return True
    return False
called_by_at_least_two = data[~data['method'].isin(["0|0|1|0", "0|0|0|1", "1|0|0|0", "0|1|0|0"])]
called_by_at_least_two['conflict'] = called_by_at_least_two.apply(lambda row: check_conflict(row['allele_support'], 
                                                                                 row['method']), axis = 1)
conflict = called_by_at_least_two[called_by_at_least_two['conflict'] == True]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  called_by_at_least_two['conflict'] = called_by_at_least_two.apply(lambda row: check_conflict(row['allele_support'],


In [10]:
######## Calculate the Mendelian consistenct of initial callers #########
def compute_mendelian(calls):
    if np.nan in calls:
        return np.nan
    else:
        sample, mother, father = calls
        if (sample[0] in mother and sample[1] in father) or (sample[0] in father and sample[1] in mother):
            return True
        return False
def analyse_raw_calls_mendelian(raw_calls):
    raw_calls = raw_calls.split(";")
    callers = defaultdict(list)
    for raw_call in raw_calls:
        raw_call = raw_call.split("|")
        for pair in raw_call:
            caller, call = pair.split("=")
            if call == ".":
                callers[caller].append(np.nan)
            else:
                callers[caller].append(call.split(","))
    mendelian_consistency = {}
    for key in callers:
        mendelian_consistency[key] = compute_mendelian(callers[key])
    return mendelian_consistency

called_by_at_least_two['raw_mend'] = called_by_at_least_two.apply(lambda x: analyse_raw_calls_mendelian(x['raw_calls']), axis = 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  called_by_at_least_two['raw_mend'] = called_by_at_least_two.apply(lambda x: analyse_raw_calls_mendelian(x['raw_calls']), axis = 1)


In [11]:
###### Calculate the length consistency on EnsembleTR results ######
def gbs_MI(gbs):
    sample, mom, dad = gbs.split(",")
    sample = sample.split("/")
    mom = mom.split("/")
    dad = dad.split("/")
    if (sample[0] in mom and sample[1] in dad) or (sample[1] in mom and sample[0] in dad):
        return True
    return False
called_by_at_least_two['gbs_MI'] = called_by_at_least_two.apply(lambda x: gbs_MI(x['gbs']), axis = 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  called_by_at_least_two['gbs_MI'] = called_by_at_least_two.apply(lambda x: gbs_MI(x['gbs']), axis = 1)


In [14]:
######### Analyze mendelian consistency in combinations of tools ########
def check(raw, caller1, caller2, caller3 = None):
    if caller3:
        if len(raw.keys()) == 3 and caller1 in raw.keys() and caller3 in raw.keys() and caller2 in raw.keys() and ~np.isnan(raw[caller1]) and ~np.isnan(raw[caller2]) and ~np.isnan(raw[caller3]):
            return True
        else:
            return False
    else:
        if len(raw.keys()) == 2 and caller1 in raw.keys() and caller2 in raw.keys() and ~np.isnan(raw[caller1]) and ~np.isnan(raw[caller2]):
            return True
    return False

def analyse_combinations(caller1, caller2, caller3 = None):
    mutual_calls = called_by_at_least_two[called_by_at_least_two.apply(lambda x: check(x['raw_mend'], caller1, caller2, caller3), axis = 1)]    
    
    Ensemble_true = len(mutual_calls[mutual_calls['gbs_MI'] == True])/ len(mutual_calls)
    caller1_true = len(mutual_calls[mutual_calls['raw_mend'].apply(lambda x: (x.get(caller1) == True))])/ len(mutual_calls)
    caller2_true = len(mutual_calls[mutual_calls['raw_mend'].apply(lambda x: (x.get(caller2) == True))])/ len(mutual_calls)
    caller3_true = None
    
    if caller3:
        caller3_true = len(mutual_calls[mutual_calls['raw_mend'].apply(lambda x: (x.get(caller3) == True))])/ len(mutual_calls)
    return [len(mutual_calls), Ensemble_true, caller1_true, caller2_true, caller3_true]


In [17]:
combinations = [['hipstr', 'gangstr'], ['hipstr', 'eh'], ['eh', 'gangstr'], ['hipstr', 'gangstr', 'eh']]
for comb in combinations:
    print(comb)
    if len(comb) == 2:
        print(analyse_combinations(comb[0], comb[1]))
    if len(comb) == 3:
        print(analyse_combinations(comb[0], comb[1], comb[2]))

['hipstr', 'gangstr']
[15250413, 0.9931508740123955, 0.9915409503991793, 0.9813119159461452, None]
['hipstr', 'eh']
[385536, 0.995722837815405, 0.9959899983399735, 0.9894847692563081, None]
['eh', 'gangstr']
[76117, 0.9660522616498286, 0.9859821064939501, 0.9465559599038322, None]
['hipstr', 'gangstr', 'eh']
[1009758, 0.9959019883972199, 0.9959584375662287, 0.9818055415257914, 0.993241945099717]


In [18]:
def no_nan(raw_mend):
    for v in raw_mend.values():
        if np.isnan(v):
            return False
    return True
def had_false(raw_mend):
    for v in raw_mend.values():
        if v == False:
            return True
    return False
called_by_at_least_two_no_nan = called_by_at_least_two[called_by_at_least_two.apply(lambda x: no_nan(x['raw_mend']), 
                                                                                    axis = 1)]
called_by_at_least_two_no_nan_had_false = called_by_at_least_two_no_nan[called_by_at_least_two_no_nan.apply(lambda x: had_false(x['raw_mend']), 
                                                                                                            axis = 1)]

corrected_by_Ensemble = len(called_by_at_least_two_no_nan_had_false[called_by_at_least_two_no_nan_had_false['gbs_MI'] == 
                                                                True]) / len(called_by_at_least_two_no_nan_had_false)
print(f"{corrected_by_Ensemble}% of calls with at least one caller with False MI were corrected by EnsembleTR")

corrected_by_hipstr = len(called_by_at_least_two_no_nan_had_false[(called_by_at_least_two_no_nan_had_false['raw_mend'].apply(lambda x: (x.get('hipstr') == True)))])/len(called_by_at_least_two_no_nan_had_false[called_by_at_least_two_no_nan_had_false['raw_mend'].apply(lambda x: (x.get('hipstr') in [True, False]))])

print(f"{corrected_by_hipstr}% of calls with at least one caller with False MI were corrected by HipSTR")

0.7210205042328083% of calls with at least one caller with False MI were corrected by EnsembleTR


In [47]:
hipstr_gangstr = called_by_at_least_two[called_by_at_least_two.apply(lambda x: check(x['raw_mend'], 'hipstr', 'gangstr'), axis = 1)]
print(len(hipstr_gangstr[hipstr_gangstr['raw_mend'].apply(lambda x: (x.get('hipstr') == True))])/len(hipstr_gangstr))
print(len(hipstr_gangstr[hipstr_gangstr['raw_mend'].apply(lambda x: (x.get('gangstr') == True))])/len(hipstr_gangstr))

0.9915409503991793
0.9813119159461452


In [42]:
hipstr_eh = called_by_at_least_two[called_by_at_least_two.apply(lambda x: check(x['raw_mend'], 'hipstr', 'eh'), axis = 1)]
print(len(hipstr_eh[hipstr_eh['raw_mend'].apply(lambda x: (x.get('hipstr') == True))])/len(hipstr_eh))
print(len(hipstr_eh[hipstr_eh['raw_mend'].apply(lambda x: (x.get('eh') == True))])/len(hipstr_eh))

0.9959899983399735
0.9894847692563081


In [43]:
gangstr_eh = called_by_at_least_two[called_by_at_least_two.apply(lambda x: check(x['raw_mend'], 'gangstr', 'eh'), axis = 1)]
print(len(gangstr_eh[gangstr_eh['raw_mend'].apply(lambda x: (x.get('gangstr') == True))])/len(gangstr_eh))
print(len(gangstr_eh[gangstr_eh['raw_mend'].apply(lambda x: (x.get('eh') == True))])/len(gangstr_eh))

0.9465559599038322
0.9859821064939501


In [44]:
gangstr_eh_hipstr = called_by_at_least_two[called_by_at_least_two.apply(lambda x: check(x['raw_mend'], 'gangstr', 'eh','hipstr'), axis = 1)]
print(len(gangstr_eh_hipstr[gangstr_eh_hipstr['raw_mend'].apply(lambda x: (x.get('gangstr') == True))])/len(gangstr_eh_hipstr))
print(len(gangstr_eh_hipstr[gangstr_eh_hipstr['raw_mend'].apply(lambda x: (x.get('eh') == True))])/len(gangstr_eh_hipstr))
print(len(gangstr_eh_hipstr[gangstr_eh_hipstr['raw_mend'].apply(lambda x: (x.get('hipstr') == True))])/len(gangstr_eh_hipstr))


0.9818055415257914
0.993241945099717
0.9959584375662287


In [46]:
hipstr_only = data[data['method'] == "0|0|1|0"]
hipstr_only['raw_mend'] = hipstr_only.apply(lambda x: analyse_raw_calls_mendelian(x['raw_calls']), axis = 1)
print(len(hipstr_only[hipstr_only['raw_mend'].apply(lambda x: (x.get('hipstr') == True))])/len(hipstr_only))


gangstr_only = data[data['method'] == "0|0|0|1"]
gangstr_only['raw_mend'] = gangstr_only.apply(lambda x: analyse_raw_calls_mendelian(x['raw_calls']), axis = 1)
print(len(gangstr_only[gangstr_only['raw_mend'].apply(lambda x: (x.get('gangstr') == True))])/len(gangstr_only))

eh_only = data[data['method'] == "0|1|0|0"]
eh_only['raw_mend'] = eh_only.apply(lambda x: analyse_raw_calls_mendelian(x['raw_calls']), axis = 1)
print(len(eh_only[eh_only['raw_mend'].apply(lambda x: (x.get('eh') == True))])/len(eh_only))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hipstr_only['raw_mend'] = hipstr_only.apply(lambda x: analyse_raw_calls_mendelian(x['raw_calls']), axis = 1)


0.9918981945369209


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gangstr_only['raw_mend'] = gangstr_only.apply(lambda x: analyse_raw_calls_mendelian(x['raw_calls']), axis = 1)


0.8802109552733808
0.9825446930866358


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  eh_only['raw_mend'] = eh_only.apply(lambda x: analyse_raw_calls_mendelian(x['raw_calls']), axis = 1)
