In [37]:
# Input mutation results (tab-separated tables, read with pandas further down).
# MUTFILE is loaded into `alldata` (reported as autosomes in the summary cell);
# NAIVEMUTFILE holds the naive-method calls loaded into `ndata`.
MUTFILE = "/storage/ileena/ssc-gangstr-denovos/denovos_GW_priors_Jan20/combined/SSC_allphases_011720_denovos_GW_priors_Jan20.final_qc_mutations.tab"
NAIVEMUTFILE = "/storage/ileena/ssc-gangstr-denovos/denovos_naive_Jan20/SSC_allphases_011720_denovos_naive_Jan20.final_qc_mutations.tab"

# X counterparts of the two files above, loaded into `alldata_x` and `ndata_x`
# (reported as chrX in the summary cell).
MUTFILEX = "/storage/ileena/ssc-gangstr-denovos/denovos_GW_priors_Jan20/SSC_allphases_072820_denovos_model_X.denovos.final_qc_mutations.tab"
NAIVEMUTFILEX = "/storage/ileena/ssc-gangstr-denovos/denovos_naive_Jan20/SSC_allphases_072820_denovos_naive_X.denovos.final_qc_mutations.tab"


# Imports
import numpy as np
import os
import pandas as pd
import scipy.stats

# Other input data
# Sample metadata CSV; this notebook reads its `fam_id`, `sample_id` and `sex` columns.
AGEFILE = "/storage/ileena/denovos4/metadata/ssc_4phases_ages.csv"

# Main mutation calls; the families present here define the set of
# families (`usefams`) that every other table is restricted to.
alldata = pd.read_csv(MUTFILE, sep="\t")
usefams = set(alldata["family"])

# X calls, kept only for families that also appear in the main table.
alldata_x = pd.read_csv(MUTFILEX, sep="\t").loc[
    lambda calls: calls["family"].isin(usefams)
]

def load_naive_expansions(path, min_mutsize=5, max_recurrence=3):
    """Load naive-method calls and keep moderately big, non-recurrent expansions.

    Parameters
    ----------
    path : str
        Tab-separated mutations table containing at least the columns
        ``posterior``, ``mutsize``, ``chrom``, ``pos`` and ``child``.
    min_mutsize : int, default 5
        Smallest ``mutsize`` that is retained.
    max_recurrence : int, default 3
        Largest number of retained calls allowed at one ``(chrom, pos)``;
        loci with more calls than this are dropped entirely.

    Returns
    -------
    pandas.DataFrame
        The retained rows, with the same columns as the input table.
    """
    calls = pd.read_csv(path, sep="\t")
    # NOTE(review): posterior == -1 presumably marks calls made by the naive
    # expansion method rather than the model - confirm against the caller.
    expansions = calls[(calls["posterior"] == -1) & (calls["mutsize"] >= min_mutsize)]

    # Filter loci that occur too many times: count retained calls per locus,
    # then inner-join back on the loci that are within the limit.
    per_locus = expansions.groupby(["chrom", "pos"], as_index=False).agg({"child": len})
    rare_loci = per_locus[per_locus["child"] <= max_recurrence][["chrom", "pos"]]
    return pd.merge(expansions, rare_loci, on=["chrom", "pos"])


# Load expansions identified using naive method
ndata = load_naive_expansions(NAIVEMUTFILE)

# Load expansions - X
ndata_x = load_naive_expansions(NAIVEMUTFILEX)

# Restrict the naive expansion calls to the families present in the main table.
ndata = ndata.loc[ndata["family"].isin(usefams)]
ndata_x = ndata_x.loc[ndata_x["family"].isin(usefams)]

# Append the expansions to the main calls and drop rows that are exact duplicates.
# NOTE(review): de-duplication compares every column, so the same child/locus
# reported by both methods with different values would be kept twice - confirm
# this is intended.
alldata = pd.concat([alldata, ndata]).drop_duplicates()
alldata_x = pd.concat([alldata_x, ndata_x]).drop_duplicates()

# Remaining problematic families, excluded by hand.
#   - 14151: proband had 130 mutations
#   - other families have <20 in both proband and sibling
rmfams = [14151, 12434, 12281, 13673, 13351, 13355, 13143]

# Drop every call belonging to one of the excluded families.
alldata = alldata.loc[lambda calls: ~calls["family"].isin(rmfams)]
alldata_x = alldata_x.loc[lambda calls: ~calls["family"].isin(rmfams)]

# Sample metadata, with `fam_id` / `sample_id` aliased to the column names
# used by the mutation tables (`family` / `child`).
ages = pd.read_csv(AGEFILE).assign(
    family=lambda meta: meta["fam_id"],
    child=lambda meta: meta["sample_id"],
)

# Annotate each X call with the child's sex. The join key is implicit
# (the columns shared by both frames); a left join keeps every call.
# NOTE(review): assumes one metadata row per sample_id - duplicates would
# repeat calls; confirm.
alldata_x = alldata_x.merge(ages[["child", "sex"]], how="left")


In [38]:
# Gather the headline counts first, then report them.
n_families = len(set(alldata["family"]))
n_mut_autosomes = len(alldata)
n_mut_x = len(alldata_x)
n_loci_autosomes = len(alldata[["chrom","pos"]].drop_duplicates())
n_loci_x = len(alldata_x[["chrom","pos"]].drop_duplicates())

# Families are counted from the autosomal table.
print("Number of unique families: %s"%n_families)

# Mutation totals: autosomes alone, then autosomes plus chrX.
print("Number of total mutations (autosomes): %s"%n_mut_autosomes)
print("Number of total mutations (autosomes+chrX): %s"%(n_mut_autosomes+n_mut_x))

# Distinct (chrom, pos) loci: autosomes alone, then autosomes plus chrX.
print("Number of distinct autosomal loci: %s"%n_loci_autosomes)
print("Number of distinct loci (autosomes+chrX): %s"%(n_loci_autosomes+n_loci_x))


Number of unique families: 1593
Number of total mutations (autosomes): 171578
Number of total mutations (autosomes+chrX): 174330
Number of distinct autosomal loci: 91925
Number of distinct loci (autosomes+chrX): 93880


In [47]:
# Destination for the combined de novo call table.
denovosfn="/storage/ileena/SFARI_SSC_DENOVOS/Mitra_etal_SFARI_SSC_denovo_TRs_Nov2020.csv"

# Stack autosomal and X calls into one frame with a fresh 0..n-1 index;
# sort=False keeps the column order as encountered.
out = pd.concat([alldata, alldata_x], axis=0, ignore_index=True, sort=False)

# Columns written to the CSV, in output order.
outcols = [
    'chrom', 'pos', 'period', 'prior', 'family', 'child', 'phenotype',
    'posterior', 'newallele', 'mutsize', 'poocase', 'child_gt', 'mat_gt', 'pat_gt',
    'encl_child', 'encl_mother', 'encl_father', 'encl_parent',
    'long_mother', 'long_father', 'phase',
]

# Write only the selected columns, without the index.
out.to_csv(denovosfn, columns=outcols, index=False, quoting=None)


In [68]:
# Pedigree table (family, sampleid, phase; the file has no header row),
# restricted to the families present in the combined call table `out`.
ped = pd.read_csv("/storage/ileena/denovos4/metadata/ssc_family_phases.txt", sep="\t", names=["family", "sampleid", "phase"])
ped = ped[ped.family.isin(out.family.unique())]

# Alias `child` as `sampleid` on `out` so both frames share the join keys.
out["sampleid"] = out.child

# `out` has one row per mutation, so its (family, sampleid, phase) keys repeat
# once per mutation of a child. Joining on the raw keys would therefore emit
# each child's pedigree row once per mutation; de-duplicate the keys first so
# every pedigree row is written exactly once.
child_keys = out[["family", "sampleid", "phase"]].drop_duplicates()
sampleids = pd.merge(ped, child_keys,
                     how="left",
                     on=["family", "sampleid", "phase"])

# Flag the samples that appear as a child in the call table.
sampleids["child"] = sampleids.sampleid.isin(out.child.unique())
sampleids.to_csv("/storage/ileena/SFARI_SSC_DENOVOS/Mitra_etal_SFARI_SSC_SampleIDs_Nov2020.csv", header=True, index=False, quoting=None)

array([nan], dtype=object)