In [1]:
import pandas as pd
import numpy as np
from cyvcf2 import VCF

In [2]:
# Build the per-locus reference table by joining the Pemberton marker file
# with the STRP marker info on the marker name.
# `sep=r"\s+"` is the documented equivalent of the deprecated
# `delim_whitespace=True` option.
pemb = pd.read_csv("Pemberton_AdditionalFile1_11242009.txt", sep=r"\s+")
onekg_snp_info = pd.read_csv("STRPinfo.txt", sep=r"\s+")

# Keep only the columns used downstream, then give them short names
# (positionally): id, reflen1 (lengthRefSeq), reflen2 (expected PCR fragment
# size), chrom, pos, type (repeat type).
locus_ref = pd.merge(
    pemb,
    onekg_snp_info,
    how="inner",
    left_on="alternateName",
    right_on="MarkerName",
)[[
    "alternateName",
    "lengthRefSeq(bp)",
    "expectedPCRfragmentSize_forRefSeq(bp)",
    "Chrom",
    "Bpposition",
    "Repeattype",
]]
locus_ref.columns = ['id', 'reflen1', 'reflen2', 'chrom', 'pos', 'type']

# Load the genotype data.
# `sep=r"\s+"` is the documented equivalent of the deprecated
# `delim_whitespace=True` option.
gtData = pd.read_csv("STRPgenotypes.txt", sep=r"\s+")

# Sample columns are recognised by an "NA" substring in the column name.
# This list is taken before the merge, so none of the locus_ref columns
# added below can end up in it.
samples = [col for col in gtData.columns.values if "NA" in col]

# Attach the reference length, chromosome, position and repeat type of each
# locus; loci without a match in locus_ref are dropped (inner join).
gtData = gtData.merge(locus_ref, how="inner", left_on="Name", right_on="id")

# Re-express every call as offsets from the reference length, rounded to the
# nearest whole repeat unit.
type_to_length = {'Di': 2.0, 'Tri': 3.0, 'Tetra': 4.0, 'Penta': 5.0}


def _snap_call_to_repeat_unit(call, ref_len, unit_len):
    """Return `call` ("a/b" allele lengths) as rounded offsets from `ref_len`.

    "0/0" is treated as a missing call and becomes "./.". Otherwise each
    allele is converted to `int(allele) - ref_len` and rounded to the nearest
    multiple of `unit_len`. The call is kept only when both alleles needed
    rounding or neither did; if exactly one allele was changed by rounding,
    the call is filtered to "./.".
    """
    if call == "0/0":
        return "./."
    offsets = [int(allele) - ref_len for allele in call.split("/")]
    snapped = [int(np.round(offset / unit_len) * unit_len) for offset in offsets]
    first_changed = offsets[0] != snapped[0]
    second_changed = offsets[1] != snapped[1]
    if first_changed != second_changed:
        # Only one of the two alleles was rounded: filter the call.
        return "./."
    return "/".join(str(value) for value in snapped)


for row in range(gtData.shape[0]):
    unit_len = type_to_length[gtData.at[row, 'type']]
    ref_len = gtData.at[row, "reflen1"]
    for sample in samples:
        gtData.at[row, sample] = _snap_call_to_repeat_unit(
            gtData.at[row, sample], ref_len, unit_len
        )

FileNotFoundError: File b'Pemberton_AdditionalFile1_11242009.txt' does not exist

In [None]:
# Load imputation data for the samples available in the Marshfield set.
#
# For every locus in gtData: find its start coordinate in the per-chromosome
# HipSTR region table, query the panel VCF at exactly that position, and store
# each sample's genotype as "a/b", where a and b are allele lengths minus the
# REF allele length. Loci missing from the region table or from the VCF are
# skipped.
STR_REGIONS_TEMPLATE = "/storage/s1saini/str-imputation/hipstr_template/str_regions_bed/HipSTR.chr{}.txt"
PANEL_VCF_TEMPLATE = "/storage/s1saini/manuscript_strsnp/fig3/1kg.panel.anno/1kg.snp.str.chr{}.vcf.gz"

# Rows are collected in a list and turned into a DataFrame once at the end;
# growing a frame with DataFrame.append per row is quadratic and the method
# no longer exists in pandas >= 2.0.
imputed_rows = []
chrom_bak = None   # chromosome whose region table is currently loaded
strReg = None
vcf = None         # VCF handle, reused for all loci on the same chromosome
vcf_chrom = None   # chromosome the open VCF handle belongs to
try:
    for i in range(gtData.shape[0]):
        chrom = gtData.at[i, 'chrom']
        ID = gtData.at[i, 'Name']

        if chrom != chrom_bak:
            strReg = pd.read_csv(
                STR_REGIONS_TEMPLATE.format(chrom),
                sep=r"\s+",
                names=['chrom', 'start', 'end', 'type', 'reflen', 'ID'],
            )
            # Record the chromosome right away so a skipped locus below does
            # not trigger a needless re-read of the same table.
            chrom_bak = chrom

        strRegPos = strReg.loc[strReg['ID'] == ID, 'start']
        if strRegPos.empty:
            continue
        # Take the first match explicitly; int(Series) is deprecated and
        # fails outright when an ID occurs more than once.
        position = int(strRegPos.iloc[0])

        # Open the VCF lazily (only once a locus actually needs it) and keep
        # it for later loci on the same chromosome, closing the previous one.
        if vcf is None or vcf_chrom != chrom:
            if vcf is not None:
                vcf.close()
            vcf = VCF(PANEL_VCF_TEMPLATE.format(chrom), samples=samples)
            vcf_chrom = chrom

        for variant in vcf("{0}:{1}-{1}".format(chrom, position)):
            if variant.ID == ID:
                ref_len = len(variant.REF)
                # NOTE(review): genotypes are split on "|" only, i.e. the
                # calls are presumed phased; confirm no "/"-separated calls
                # occur in the panel VCF.
                offsets = [
                    "/".join(str(len(allele) - ref_len) for allele in bases.split("|"))
                    for bases in variant.gt_bases
                ]
                imputed_rows.append(dict(zip(["Name"] + vcf.samples, [variant.ID] + offsets)))
                break
finally:
    if vcf is not None:
        vcf.close()

myDF = pd.DataFrame(imputed_rows)

In [None]:
# Samples present in both tables: columns whose name contains "NA" in the
# capillary genotypes (gtData) and in the imputed genotypes (myDF).
capillary_samples = {column for column in gtData.columns.values if "NA" in column}
imputed_samples = {column for column in myDF.columns.values if "NA" in column}
common_samples = capillary_samples & imputed_samples

In [None]:
#mergedData = pd.DataFrame()
#for i in range(gtData.shape[0]):
#    for sample in common_samples:
#        gral1, gral2 = gtData.at[i, sample].split("/")
#        if myDF[myDF['Name']==gtData.at[i,"Name"]].shape[0] != 0:
#            imal1, imal2 = myDF[myDF['Name']==gtData.at[i,"Name"]][sample].values[0].split("/")
#            mergedData = mergedData.append(pd.DataFrame([{'name':gtData.at[i,"Name"], 'sample':sample, 'gral1':gral1, 'gral2':gral2, 'imal1':imal1, 'imal2':imal2}]))
#            
#mergedData.to_csv("capillary_vs_imputed_calls.csv", index=False, columns=['sample', 'name', 'gral1', 'gral2', 'imal1', 'imal2'])



In [None]:
# Read the saved per-sample, per-locus table of capillary vs imputed calls
# from disk and preview its first rows.
mergedData = pd.read_csv(
    "capillary_vs_imputed_calls.csv",
)
mergedData.head()

In [None]:
#concord = list()
#droppedNa = mergedData[mergedData!="."].dropna(axis=0)
#for i in droppedNa.values:
#    listA = set(i[0:2].astype(int))
#    listB = set(i[2:4].astype(int))
#    concord.append( (2-(max(len(listA-listB) , len(listB-listA))))/2.0 )
#
#concordance = pd.DataFrame({'str':droppedNa['name'], 'concord':concord})
#concordance = concordance.groupby('str').mean().reset_index()
#concordance.to_csv("capillary_vs_imputed_concordance.csv", index=False)

In [None]:
# Read the saved capillary-vs-imputed concordance table from disk and preview
# its first rows.
concordance = pd.read_csv(
    "capillary_vs_imputed_concordance.csv",
)
concordance.head()

In [None]:
# Combine the per-population WGS concordance figures with the capillary
# concordance, keeping only STRs present in both tables (inner join on 'str').
wgs_columns = ['str', 'wgs_eur_concordance', 'wgs_afr_concordance', 'wgs_eas_concordance']
impute_performace = (
    pd.read_csv("/storage/s1saini/manuscript_strsnp/fig3/final_numbers/ALL.results.csv")[wgs_columns]
    .merge(concordance, on="str", how="inner")
)
impute_performace.head()

In [None]:
# Print the mean concordance over all STRs in impute_performace.
# Each line is a single-argument print(...) call, which emits the same text
# whether `print` is the Python 2 statement or the Python 3 function; the
# original bare print statements are a SyntaxError on Python 3.
# The WGS columns use nanmean so missing values are ignored; 'concord' uses a
# plain mean, as before.
print("Marfield Concordance {}".format(np.mean(impute_performace.concord.values)))
print("hipSTR vs impute (EUR) Concordance {}".format(np.nanmean(impute_performace.wgs_eur_concordance.values)))
print("hipSTR vs impute (EAS) Concordance {}".format(np.nanmean(impute_performace.wgs_eas_concordance.values)))
print("hipSTR vs impute (AFR) Concordance {}".format(np.nanmean(impute_performace.wgs_afr_concordance.values)))

In [None]:
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns

# Scatter plot with a fitted regression line: capillary-based concordance (x)
# against the EUR WGS-based concordance (y), one point per STR.
sns.lmplot(
    data=impute_performace,
    x='concord',
    y='wgs_eur_concordance',
)