In [19]:
%pylab inline

import matplotlib
matplotlib.rcParams['pdf.fonttype'] = 42
matplotlib.rcParams['ps.fonttype'] = 42
from matplotlib.colors import LogNorm
import os
import seaborn as sns
import scipy.stats

import pandas as pd

# Load the per-chromosome statistics tables for chromosomes 1-22 and stack
# them into a single DataFrame.
dfs = []
for chrom in range(1, 23):
    stats_path = "chr%s.stats.tab"%chrom
    # Original guard: a chromosome is only used when its period file exists.
    # NOTE(review): the period file is never read here - confirm the guard is
    # intentional rather than a leftover from an earlier version.
    if not os.path.exists("chr%s.period.tab"%chrom): continue
    # Also require the file that is actually read, so a missing stats table is
    # skipped like any other unavailable chromosome instead of raising.
    if not os.path.exists(stats_path): continue
    dfs.append(pd.read_csv(stats_path, sep="\t"))
data = pd.concat(dfs)

# Attach the period and motif of each locus (joined on chrom + start), then
# derive the reference length in repeat units and in base pairs.
motifs = pd.read_csv("motif_info.tab", sep="\t", names=["chrom", "start","period","motif"])
data = pd.merge(data, motifs, on=["chrom","start"])
# Column arithmetic instead of a row-wise apply: same values, but computed in
# one vectorized pass rather than one Python call per row.
span_bp = data["end"] - data["start"] + 1   # coordinates are treated as inclusive
data["reflen"] = span_bp / data["period"]   # length in copies of the repeat unit
data["reflen-bp"] = span_bp                 # length in base pairs

Populating the interactive namespace from numpy and matplotlib


In [20]:
# Keep only the rows whose motif value is a str; any other type is dropped.
motif_is_str = data["motif"].apply(type) == str
data = data.loc[motif_is_str]

In [24]:
# Restrict the analysis to chr1 so the plots below run faster; the trends are
# expected to be the same as for the full set.
on_chr1 = data["chrom"] == "chr1"
data = data.loc[on_chr1]

In [None]:
# Plot reflen vs. het by period, overall and for each population

def GetMatchesMotif(motif, motiflist):
    """
    Return True if motif matches at least one selector in motiflist.

    Each selector is interpreted as follows:
      "ALL"     matches every motif
      a number  matches motifs whose length equals that number
      "vntr"    matches motifs longer than 6 characters
      otherwise matches only a motif equal to the selector

    Numeric selectors are detected through numbers.Real, so NumPy scalars
    (for example an np.int64 taken from a DataFrame column) are handled
    like plain int/float instead of silently never matching. bool is
    excluded so that True/False are not interpreted as lengths.

    Returns False when no selector matches, including for an empty list.
    """
    import numbers  # function-scope import keeps this cell self-contained
    for m in motiflist:
        if isinstance(m, numbers.Real) and not isinstance(m, bool):
            # Numeric selector: compare against the motif length.
            if len(motif) == m: return True
        elif m == "ALL":
            return True
        elif m == "vntr":
            if len(motif) > 6: return True
        else:
            # Anything else is treated as an exact motif to look for.
            if m == motif: return True
    return False
    
def PlotLenVsHet(data, lencol="reflen", pop=None, motifs=None, colors=None, figname=None):
    """
    Plot mean heterozygosity against binned length, one line per motif selector.

    Parameters
    ----------
    data : DataFrame
        Must contain the columns named by lencol, "motif" and the
        heterozygosity column selected through pop.
    lencol : str
        Column holding the length to bin on. "reflen" is binned over
        10..30; any other column is binned over 10..100.
    pop : str or None
        If None the "het-ALL" column is plotted, otherwise "het-<pop>".
    motifs : list or None
        Selectors understood by GetMatchesMotif: a motif string, a number
        (all motifs of that length), "vntr" (7+), or "ALL". None plots a
        single black line over all motifs and ignores colors.
    colors : list or None
        One color per entry of motifs. None (with motifs given) lets
        matplotlib choose its default colors.
    figname : str or None
        If given, the figure is saved to this path.

    Bins are one unit wide, [lb, lb+1); a bin contributes a point at x=lb
    only when it holds more than 20 rows.
    """
    MIN_ROWS_PER_BIN = 20  # bins with this many rows or fewer are not plotted
    if lencol == "reflen":
        MINLEN = 10
        MAXLEN = 30
    else:
        MINLEN = 10
        MAXLEN = 100
    col = "het-ALL" if pop is None else "het-%s"%pop
    fig = plt.figure()
    ax = fig.add_subplot(111)
    if motifs is None:
        motifs = ["ALL"]
        colors = ["black"]
    elif colors is None:
        # Previously this failed on len(None); fall back to default colors.
        colors = [None]*len(motifs)
    assert len(motifs)==len(colors), "motifs and colors must have the same length"
    lengths = data[lencol]
    for m, color in zip(motifs, colors):
        # Short-motif selectors on a non-"reflen" column get bins that reach
        # MAXLEN; every other case stops one bin earlier.
        short_motif = (m in [1,2,3] or (type(m)==str and len(m) <3)) and m != "vntr" and (m != "ALL")
        if lencol != "reflen" and short_motif:
            lenbins = range(MINLEN, MAXLEN+1)
        else:
            lenbins = np.arange(MINLEN, MAXLEN, 1)  # could go lower resolution here if we want
        # The motif mask does not depend on the length bin, so build it once
        # per selector instead of re-running the per-row apply for every bin.
        motif_mask = data["motif"].apply(lambda x: GetMatchesMotif(x, [m]))
        xvals = []
        yvals = []
        for lb, ub in zip(lenbins[:-1], lenbins[1:]):
            xx = data[(lengths>=lb) & (lengths<ub) & motif_mask]
            if xx.shape[0] > MIN_ROWS_PER_BIN:
                xvals.append(lb)
                yvals.append(np.mean(xx[col]))
        ax.plot(xvals, yvals, color=color)
    if figname is not None: fig.savefig(figname)
# Base-pair length vs. heterozygosity: one curve per repeat-unit length 1-6,
# plus one for longer ("vntr") motifs, saved as a PDF.
PlotLenVsHet(
    data,
    lencol="reflen-bp",
    motifs=[1, 2, 3, 4, 5, 6, "vntr"],
    colors=["gray", "red", "gold", "blue", "green", "purple", "brown"],
    figname="LengthVsHet-bp-ALL.pdf",
);

In [None]:
for pop in ["EUR","AMR","SAS","EAS","AFR","H3AFR"]