In [1]:
# Plot color per tissue. The palette is meant to follow the GTEx colors
# (e.g. http://science.sciencemag.org/content/348/6235/648.full).
COLORS = {
    "Artery-Aorta": "salmon",
    "Artery-Tibial": "red",
    "Adipose-Subcutaneous": "darkorange",
    "Adipose-Visceral": "orange",
    "Brain-Caudate": "lemonchiffon",
    "Brain-Cerebellum": "yellow",
    "Cells-Transformedfibroblasts": "skyblue",
    "Esophagus-Mucosa": "sienna",
    "Esophagus-Muscularis": "burlywood",
    "Heart-LeftVentricle": "darkviolet",
    "Lung": "greenyellow",
    "Muscle-Skeletal": "mediumslateblue",
    "Nerve-Tibial": "gold",
    "Skin-NotSunExposed": "blue",
    "Skin-SunExposed": "cornflowerblue",
    "Thyroid": "green",
    "WholeBlood": "m",
    "permuted": "gray",
}

# Abbreviated name per tissue. Note that "LCL" has an abbreviation here but
# no entry in COLORS.
SHORTEN = {
    "Artery-Aorta": "Artery.A",
    "Artery-Tibial": "Artery.T",
    "Adipose-Subcutaneous": "Adipose.S",
    "Adipose-Visceral": "Adipose.V",
    "Brain-Caudate": "Caudate",
    "Brain-Cerebellum": "Cerebellum",
    "Cells-Transformedfibroblasts": "Fibroblast",
    "Esophagus-Mucosa": "Mucosa",
    "Esophagus-Muscularis": "Muscularis",
    "Heart-LeftVentricle": "Heart",
    "Lung": "Lung",
    "Muscle-Skeletal": "Muscle",
    "Nerve-Tibial": "Nerve",
    "Skin-NotSunExposed": "SkinUnexposed",
    "Skin-SunExposed": "SkinLeg",
    "Thyroid": "Thyroid",
    "WholeBlood": "Blood",
    "permuted": "Permuted",
    "LCL": "LCL",
}

# Every key of COLORS, in definition order, except the "permuted" entry.
TISSUES = [name for name in COLORS if name != "permuted"]

def ExtractData(gene, chrom, start, tissue):
    """Write the STR genotype and expression tables for one STR/gene pair.

    Runs shell pipelines (bcftools, grep, sed, awk, cut) that overwrite three
    files in the current working directory:

    * ``str_genotypes.tab`` -- sample and the sum of its two GB values.
    * ``str_genotypes_alleles.tab`` -- sample and its two GB values joined
      by a comma.
    * ``expr.tab`` -- the first column of the tissue's ``Corr_Expr.csv``
      plus the column selected for ``gene``, without the ENSG header row.

    Args:
        gene: Gene identifier as it appears (inside double quotes) in the
            header line of ``Corr_Expr.csv``.
        chrom: Chromosome name passed to ``bcftools query -r``.
        start: Position used as both start and end of the queried region.
        tissue: Tissue directory name; anything from the first "(" onward
            is discarded.

    Raises:
        OSError: If the expression file cannot be opened.
        ValueError: If ``gene`` is not found in the expression header.
    """
    tissue = tissue.split("(")[0]
    vcf = "/storage/szfeupe/Runs/650GTEx_estr/Merged_STRs_all_samples.vcf.gz"

    # Shared front of both genotype pipelines: one "sample<TAB>GB" line per
    # sample, lines containing "." dropped, first "|" turned into a tab.
    # Raw strings keep the backslashes for the shell tools to interpret.
    genotype_pipeline = (
        r"""bcftools query -r %s:%s-%s -f"[%%SAMPLE\t%%GB\n]" %s"""
        r""" | grep -v "\." | sed 's/|/\t/'"""
    ) % (chrom, start, start, vcf)

    # Pull out STR genotypes (sum of the two GB values)
    os.system(
        genotype_pipeline
        + r""" | awk '{print $1 "\t" $2+$3}' > str_genotypes.tab"""
    )
    # Pull out STR genotypes - alleles (both GB values, comma separated)
    os.system(
        genotype_pipeline
        + r""" | awk '{print $1 "\t" $2","$3}' > str_genotypes_alleles.tab"""
    )

    # Pull out gene expression
    expr = "/storage/szfeupe/Runs/650GTEx_estr/Analysis_by_Tissue/%s/Corr_Expr.csv" % tissue
    with open(expr, "r") as handle:
        # Strip the line ending so a gene in the last header column still
        # matches its quoted name.
        header = handle.readline().rstrip("\r\n").split(",")
    # NOTE(review): the +2 presumably accounts for cut's 1-based fields plus
    # a leading sample column that the header line does not name -- confirm
    # against the layout of Corr_Expr.csv.
    colnum = header.index('"' + gene + '"') + 2
    cmd = """cat %s | cut -d',' -f 1,%s | grep -v ENSG > expr.tab""" % (expr, colnum)
    os.system(cmd)

def PlotESTR(gene, tissue, chrom, start, COLORS):
    """Draw a swarm plot of expression against STR genotype and save it.

    Reads ``str_genotypes.tab`` and ``expr.tab`` from the current working
    directory, drops genotype values seen in fewer than 3 samples, joins the
    two tables, shows the figure and writes it to ``<gene>_estr.pdf``.

    Args:
        gene: Gene name, used in the plot title and the output file name.
        tissue: Key into ``COLORS``; also shown in the y-axis label.
        chrom: Chromosome, used in the plot title only.
        start: STR position, used in the plot title only.
        COLORS: Mapping from tissue name to a plot color.
    """
    def short_id(name):
        # Keep only the first two dash-separated fields of the sample name.
        return "-".join(name.split("-")[0:2])

    genotypes = pd.read_csv("str_genotypes.tab", sep="\t", names=["sample","str"])
    genotypes["sample"] = genotypes["sample"].apply(short_id)

    # Genotype values carried by fewer than 3 samples are excluded.
    rare = [
        value
        for value in sorted(set(genotypes["str"]))
        if (genotypes["str"] == value).sum() < 3
    ]
    keep = genotypes["str"].apply(lambda value: value not in rare)
    genotypes = genotypes[keep]

    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    expression = pd.read_csv("expr.tab", names=["sample","expr"])
    data = pd.merge(genotypes, expression)
    sns.swarmplot(x="str", y="expr", ax=ax, data=data, color=COLORS[tissue])

    # Axis labels and tick labels
    ax.set_xlabel("bp (rel. to hg19)", size=15)
    ax.set_ylabel("Expression - %s"%tissue, size=15)
    xlabels = [int(value) for value in sorted(set(data["str"]))]
    ax.set_xticklabels(xlabels, size=12)
    ylabels = ["%.2f"%(tick) for tick in ax.get_yticks()]
    ax.set_yticklabels(ylabels, size=12)

    # Only keep the left and bottom axis lines, with ticks on those sides.
    for side in ("right", "top"):
        ax.spines[side].set_visible(False)
    ax.yaxis.set_ticks_position('left')
    ax.xaxis.set_ticks_position('bottom')

    ax.set_title("chr%s:%s - %s"%(chrom, start, gene))
    plt.suptitle("")
    plt.show()
    fig.savefig("%s_estr.pdf"%gene)

def PlotHeatmap():
    """Plot mean expression per allele pair as a heatmap and as line plots.

    Reads ``str_genotypes_alleles.tab`` and ``expr.tab`` from the current
    working directory, drops genotype sums seen in fewer than 3 samples and
    joins the two tables. It then draws:

    1. a heatmap (on the current axes) of the mean expression for every
       unordered pair of alleles, and
    2. on a new figure, one line per allele showing that allele's row of the
       matrix, for alleles whose row of counts sums to at least 25.

    Cells with no observations are NaN, so they appear as gaps rather than
    as an expression of 0. Earlier revisions divided by ``counts + 1`` and
    used 0 as the "no data" marker; that shrank every mean and hid genuine
    means of exactly 0.

    Raises:
        ValueError: If a GB entry does not consist of exactly two
            comma-separated integers.
    """
    strgt = pd.read_csv("str_genotypes_alleles.tab", sep="\t", names=["sample", "GB"])
    # Keep only the first two dash-separated fields of the sample name.
    strgt["sample"] = strgt["sample"].apply(lambda x: "-".join(x.split("-")[0:2]))
    strgt["str"] = strgt["GB"].apply(lambda x: sum(int(item) for item in x.split(",")))

    # Drop genotype sums carried by fewer than 3 samples.
    rmgts = [
        gt for gt in sorted(set(strgt["str"]))
        if (strgt["str"] == gt).sum() < 3
    ]
    strgt = strgt[strgt["str"].apply(lambda x: x not in rmgts)]

    expr = pd.read_csv("expr.tab", names=["sample", "expr"])
    correxpr = pd.merge(strgt, expr)

    # Parse every genotype once; unpacking below enforces exactly two alleles.
    pairs = [tuple(int(item) for item in gb.split(",")) for gb in correxpr["GB"].values]
    alleles = sorted({allele for pair in pairs for allele in pair})
    position = {allele: ind for ind, allele in enumerate(alleles)}

    # Heatmap: accumulate expression sums and counts symmetrically. A
    # homozygous genotype hits its diagonal cell twice in both matrices, so
    # the mean is unaffected and the row total counts two copies.
    exprsum = np.zeros((len(alleles), len(alleles)))
    counts = np.zeros((len(alleles), len(alleles)))
    for (a1, a2), value in zip(pairs, correxpr["expr"].values):
        a1ind = position[a1]
        a2ind = position[a2]
        exprsum[a1ind, a2ind] += value
        exprsum[a2ind, a1ind] += value
        counts[a1ind, a2ind] += 1
        counts[a2ind, a1ind] += 1

    # True mean where there is data; NaN (not 0) where there is none.
    exprmatrix = np.full(exprsum.shape, np.nan)
    np.divide(exprsum, counts, out=exprmatrix, where=counts > 0)

    sns.heatmap(exprmatrix, xticklabels=alleles, yticklabels=alleles)

    # Plot each row of the matrix
    fig = plt.figure()
    ax = fig.add_subplot(111)
    for i, allele in enumerate(alleles):
        if counts[i, :].sum() < 25:
            continue
        # NaN entries leave gaps in the line for unobserved partner alleles.
        ax.plot(alleles, exprmatrix[i, :], label=allele)
    ax.legend()

# Save the three lookup tables with IPython's %store magic.
# NOTE(review): presumably so other notebooks can load them with
# `%store -r` -- confirm which notebooks rely on this.
%store COLORS
%store SHORTEN
%store TISSUES

Stored 'COLORS' (dict)
Stored 'SHORTEN' (dict)
Stored 'TISSUES' (list)
