In [11]:
# Input mutation results
# All four paths are tab-separated tables read with pd.read_csv(..., sep="\t")
# further down in this cell.
# MUTFILE: main mutation calls; the set of families in this file (`usefams`)
# is used to restrict every other table that is loaded.
MUTFILE = "/storage/ileena/ssc-gangstr-denovos/denovos_GW_priors_Jan20/combined/SSC_allphases_011720_denovos_GW_priors_Jan20.final_qc_mutations.tab"
# NAIVEMUTFILE: calls from the naive method; only used as a source of
# expansion events (rows with posterior == -1 and mutsize >= 5).
NAIVEMUTFILE = "/storage/ileena/ssc-gangstr-denovos/denovos_naive_Jan20/SSC_allphases_011720_denovos_naive_Jan20.final_qc_mutations.tab"

# Presumably the chromosome X counterparts of the two files above (going by
# the "_X" in the file names) -- TODO confirm. They are concatenated onto the
# corresponding tables above.
MUTFILEX = "/storage/ileena/ssc-gangstr-denovos/denovos_GW_priors_Jan20/SSC_allphases_072820_denovos_model_X.denovos.final_qc_mutations.tab"
NAIVEMUTFILEX = "/storage/ileena/ssc-gangstr-denovos/denovos_naive_Jan20/SSC_allphases_072820_denovos_naive_X.denovos.final_qc_mutations.tab"

# Make editable in Illustrator: fonttype 42 embeds fonts as TrueType
# (Type 42) in both PDF and PostScript output.
import matplotlib
matplotlib.rcParams.update({'pdf.fonttype': 42, 'ps.fonttype': 42})

# Imports
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import scipy.stats

# Other input data
# AGEFILE: CSV file (going by the extension); not read in this part of the notebook.
AGEFILE = "/storage/ileena/denovos4/metadata/ssc_4phases_ages.csv"
# REFBED: header-less, tab-separated reference table loaded below into `ref`
# with columns chr, pos, end, period, motif; every row is labelled chrom "X".
REFBED = "/storage/resources/dbase/human/hg38_ver16_chrX.bed"

# Output path info
# OUTDIR: presumably where figure PDFs are written -- not used in this part
# of the notebook, verify against later cells.
OUTDIR = "/storage/ileena/pdfs/"


# Load main mutations data.
# The families present in the main file define the cohort (`usefams`); the
# second table (MUTFILEX) is appended and then restricted to those families.
alldata = pd.read_csv(MUTFILE, sep="\t")
usefams = set(alldata["family"])
alldata = (
    pd.concat([alldata, pd.read_csv(MUTFILEX, sep="\t")], sort=False)
    .loc[lambda calls: calls["family"].isin(usefams)]
)


# Load expansions identified using naive method (both naive tables stacked)
ndata = pd.concat(
    [pd.read_csv(naive_path, sep="\t") for naive_path in (NAIVEMUTFILE, NAIVEMUTFILEX)],
    sort=False,
)
# Keep only moderately big expansion events: rows with posterior == -1
# and a mutation size of at least 5
ndata = ndata.loc[ndata["posterior"].eq(-1) & ndata["mutsize"].ge(5)].copy()

# Filter ones that occur too many times: count rows per (chrom, pos) locus
# and keep only the loci that appear in at most 3 rows
expcounts = ndata.groupby(["chrom", "pos"], as_index=False).agg({"child": len})
ndata = ndata.merge(
    expcounts.loc[expcounts["child"].le(3), ["chrom", "pos"]],
    on=["chrom", "pos"],
)


# Concatenate expansions
# Restrict the naive-method expansions to families present in the main
# mutation file, then append them to the main table and drop rows that are
# exact duplicates across all columns.
ndata = ndata[ndata.family.isin(usefams)]
# sort=False matches the other pd.concat calls in this notebook: columns keep
# their original order even if the two tables do not have identical columns
# (older pandas would otherwise warn and sort the columns alphabetically).
alldata = pd.concat([alldata, ndata], sort=False).drop_duplicates()

# Manually remove remaining problematic families.
# 14151 proband had 130 mutations
# other families have <20 in both proband and sibling
rmfams = [14151, 12434, 12281, 13673, 13351, 13355, 13143]
alldata = alldata[~alldata.family.isin(rmfams)]



# Motif data: the reference table has no header row, so column names are
# supplied here; every row is then tagged with chrom "X" so the table can be
# joined to the mutation calls on (chrom, pos, period).
ref = pd.read_csv(
    REFBED,
    sep="\t",
    header=None,
    names=["chr", "pos", "end", "period", "motif"],
).assign(chrom="X")



In [12]:
### Autosomal
# (rows, columns) of all calls that are not on chromosome X
alldata.loc[alldata["chrom"] != "X"].shape



(171578, 26)

In [13]:
### Chr X
# Expansion events on chromosome X (posterior == -1, mutation size >= 5),
# annotated with the repeat motif from the reference table and ordered by
# phenotype, then position.
x = (
    alldata[
        (alldata["chrom"] == "X")
        & (alldata["posterior"] == -1)
        & (alldata["mutsize"] >= 5)
    ]
    .merge(ref, how="left", on=["chrom", "pos", "period"])
    .loc[:, ["phenotype", "chrom", "pos", "mutsize", "motif"]]
    .sort_values(["phenotype", "chrom", "pos"])
)
# x.to_csv("/storage/ileena/denovos5/expansions_x_073020.csv", index=False, header=True)
x

Unnamed: 0,phenotype,chrom,pos,mutsize,motif
5,1,X,22249209,7,GT
8,1,X,31284558,8,TTC
3,1,X,32199781,8,TG
1,1,X,32948114,9,AC
2,1,X,71072907,9,AC
6,1,X,135306289,5,AC
0,1,X,143068091,16,GA
7,2,X,18509092,7,AC
9,2,X,21728215,10,TC
4,2,X,96775354,10,TG
