In [79]:
%pylab inline
import pandas as pd

# Load the WGS-vs-capillary comparison table from CSV
merged = pd.read_csv("TableS3-WGSvsCapillary.csv")

# Remove FXN, since no WGS calls.
# .copy() makes the filtered result an independent DataFrame, so adding
# columns to it afterwards does not trigger pandas' SettingWithCopyWarning.
merged = merged[merged["PrimerID"]!="FXN"].copy()

# Get dosages and check matches
def GetDosage(x):
    """Return the sum of the allele values in a genotype string.

    Parameters
    ----------
    x : str, float or None
        Comma-separated allele values such as "-5,-2". The no-call markers
        "." and "./.", blank strings, None and NaN (how pandas represents
        an empty CSV cell) are all treated as missing.

    Returns
    -------
    int or float
        Sum of the allele values, each converted with ``int(float(item))``
        (so "2.0" counts as 2); ``np.nan`` when the genotype is missing.

    Raises
    ------
    ValueError
        If a non-missing item cannot be parsed as a number.
    """
    # Empty CSV cells arrive as float NaN rather than str; NaN has no
    # .split(), so it must be caught before any string handling.
    if x is None or (isinstance(x, float) and np.isnan(x)):
        return np.nan
    genotype = str(x).strip()
    if genotype in ("", ".", "./."):
        return np.nan
    return sum(int(float(item)) for item in genotype.split(","))
    
# Derive one dosage column per genotype column by applying GetDosage
# element-wise to the "Ensemble" and "Cap.Binned" columns.
merged["dosage.ensemble"] = merged["Ensemble"].map(GetDosage)
merged["dosage.cap"] = merged["Cap.Binned"].map(GetDosage)

Populating the interactive namespace from numpy and matplotlib


In [70]:
######## Output summary info ########
# Distinct PrimerID / SampleID values and the total number of rows in the table
n_loci = len(set(merged["PrimerID"]))
n_samples = len(set(merged["SampleID"]))
n_calls = merged.shape[0]

print("Number of loci: %s" % n_loci)
print("Number of samples: %s" % n_samples)
print("Number of calls: %s" % n_calls)

Number of loci: 49
Number of samples: 31
Number of calls: 1519


In [94]:
# Per-locus summary over rows whose "Ensemble" value is not the "./." no-call:
# sum of the "match.ensemble" flag and number of rows for each PrimerID.
# The string alias "sum" replaces np.sum: passing NumPy callables to
# groupby().agg() is deprecated in recent pandas and emits a FutureWarning.
locdata = merged[merged["Ensemble"]!="./."].groupby(["PrimerID"], as_index=False).agg({"match.ensemble": "sum", "SampleID": len})
# Vectorized ratio instead of a row-wise apply; the cast keeps the result
# float64 even if the summed flag column comes back with object dtype.
locdata["acc"] = locdata["match.ensemble"].astype(float) / locdata["SampleID"]
locdata

Unnamed: 0,PrimerID,match.ensemble,SampleID,acc
0,ATN1,24,31,0.774194
1,ATXN10,31,31,1.0
2,C9orf72,0,31,0.0
3,CACNA1A,31,31,1.0
4,DMPK,30,31,0.967742
5,HTT,28,31,0.903226
6,JPH3,29,31,0.935484
7,PPP2R2B,31,31,1.0
8,SCA1,30,31,0.967742
9,SCA2,21,31,0.677419


In [None]:
######## Per-locus accuracy ########
# For every locus, record how many rows have an ensemble dosage and the mean
# of the "match.ensemble" flag over those rows, then show it as a bar chart.
loci = []
ens_call = []
ens_acc = []

# .unique() keeps first-appearance order, so iteration is deterministic
# (the iteration order of a set of strings can change between sessions).
for locus in merged["PrimerID"].unique():
    xx = merged[merged["PrimerID"]==locus]
    called = xx[xx["dosage.ensemble"].notna()]
    loci.append(locus)
    ens_call.append(called.shape[0])
    # The match flag column is named "match.ensemble" (as used by the
    # groupby summary earlier in this notebook); "ensemble_match" is not
    # created anywhere and would raise KeyError.
    # Cast to float so a mix of True/False/NaN averages numerically.
    ens_acc.append(called["match.ensemble"].astype(float).mean())

locdata = pd.DataFrame({"PrimerID": loci, "ens.call": ens_call, "ens.acc": ens_acc})
# Stable sort keeps tied loci in a reproducible order
locdata = locdata.sort_values("ens.acc", ascending=False, kind="stable")

# rcParams only influence artists created afterwards, so set the spine width
# before the figure/axes are built.
plt.rcParams['axes.linewidth']=0.8
fig = plt.figure()
fig.set_size_inches((15, 5))
ax = fig.add_subplot(111)
ax.bar(range(locdata.shape[0]), locdata["ens.acc"], edgecolor="white", color='lightblue', width=0.5)
ax.set_xticks(range(locdata.shape[0]))
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.spines['left'].set_position(('outward', 8))
ax.spines['bottom'].set_position(('outward', 5))
ax.set_xlabel("PrimerID")
ax.set_ylabel("Ensemble accuracy")
ax.set_xticklabels(locdata["PrimerID"], rotation=90);

In [73]:
######## Sanity checks ########
# Disabled check: uncomment to list the rows where "Cap" and "Cap.Binned" differ.
#merged[merged["Cap"] != merged["Cap.Binned"]]

Unnamed: 0,PrimerID,SampleID,RefProductSize,period,offset,offset_hipstr,offset_gangstr,Prd,Cap,Cap.Binned,HipSTR,GangSTR,Ensemble,match.hipstr,match.gangstr,match.ensemble
1,ATN1,NA12891,158,3,8.0,6.0,6.0,136.8/151.42,-40,-50,-50,-50,-50,True,True,True
3,ATN1,NA12890,158,3,8.0,6.0,6.0,136.88/154.33,-41,-51,-50,-51,-50,False,True,False
5,ATN1,NA12877,158,3,8.0,6.0,6.0,136.84/160.28,-43,-53,-53,-53,-53,True,True,True
12,C9orf72,NA12892,161,6,12.0,11.0,11.0,147.22/164.87,03,.,22,22,22,,,
13,C9orf72,NA12891,161,6,12.0,11.0,11.0,146.86/164.48,03,.,-12,-12,-12,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1520,chr9_36061394_AC,NA19239,266,2,0.0,-3.0,-3.0,259.36/267.07,-31,-22,-22,-22,-22,True,True,True
1521,chr9_36061394_AC,HG03736,266,2,0.0,-3.0,-3.0,263.21/270.95,-12,04,04,04,04,True,True,True
1522,chr9_36061394_AC,NA18939,266,2,0.0,-3.0,-3.0,263.29/265.22,-10,01,01,11,01,True,False,True
1523,chr9_36061394_AC,HG00766,266,2,0.0,-3.0,-3.0,253.6/259.36,"-6,-3","-5,-2","-5,-2","-5,-2","-5,-2",True,True,True
