## Load in Data

In [1]:
# Paths of the input files read by the cells below.
relations_filename <- '../data/family_summaries.tsv'
pheno_filename <- '../data/pheno_partial.tsv'
raw_filename <- '../data/16p12_lcl_gene_reads_underscores.gct'
mapping_filename <- 'gene_map.tsv'


In [2]:
# Phenotype table (tab-separated, with header), keyed by its `sample` column.
pheno <- read.table(
    file = pheno_filename,
    header = TRUE,
    sep = '\t',
    stringsAsFactors = FALSE
)
row.names(pheno) <- pheno$sample

In [3]:
# Family relationship table (tab-separated, with header), keyed by its
# `subject` column so rows can be looked up by subject ID.
relations <- read.table(
    file = relations_filename,
    header = TRUE,
    sep = '\t',
    stringsAsFactors = FALSE
)
row.names(relations) <- relations$subject

In [4]:
# Gene annotation table (tab-separated, with header), keyed by its `ensembl`
# column.
map <- read.table(
    file = mapping_filename,
    header = TRUE,
    sep = '\t',
    stringsAsFactors = FALSE
)
row.names(map) <- map$ensembl

## Filter out uninteresting genes

In [5]:
# Raw read-count table (tab-separated, with header).
rawdf <- read.table(
    file = raw_filename,
    header = TRUE,
    sep = '\t',
    stringsAsFactors = FALSE
)

In [6]:
# Return the part of an identifier that precedes its first literal '.'
# (e.g. 'ENSG00000123456.7' -> 'ENSG00000123456'). Intended for one ID at a
# time; identifiers without a '.' are returned unchanged.
gencode2ensembl <- function(s) {
    pieces <- unlist(strsplit(s, split = '.', fixed = TRUE))
    pieces[1]
}
# Key the count table by the shortened form of each `Name` entry, then drop
# the `Name` column since it is no longer needed as data.
row.names(rawdf) <- unlist(lapply(rawdf$Name, function(id) gencode2ensembl(id)))
rawdf[['Name']] <- NULL

In [7]:
# Gene ID lists used as exclusion filters below: two are read from
# one-ID-per-line files, two are taken from the annotation map by chromosome.
low_expressed_genes <- scan(file = 'genes.low_expression', what = "", sep = "\n")
sex_genes <- scan(file = 'gtex_filter_sex_diff2.list', what = "", sep = "\n")
xgenes <- as.character(map$ensembl[map$chromosome == 'X'])
ygenes <- as.character(map$ensembl[map$chromosome == 'Y'])
# Display the size of each list.
length(low_expressed_genes)
length(xgenes)
length(ygenes)
length(sex_genes)

In [8]:
# Drop the excluded genes one list at a time, showing the table dimensions
# before the first filter and after every step.
dim(rawdf)
# (disabled) restrict to goseq genes:
# rawdf=rawdf[(rownames(rawdf) %in% goseq_genes),]
# dim(rawdf)
rawdf <- rawdf[setdiff(rownames(rawdf), low_expressed_genes), ]
dim(rawdf)
rawdf <- rawdf[setdiff(rownames(rawdf), xgenes), ]
dim(rawdf)
rawdf <- rawdf[setdiff(rownames(rawdf), ygenes), ]
dim(rawdf)
rawdf <- rawdf[setdiff(rownames(rawdf), sex_genes), ]
dim(rawdf)
# Matrix form of the filtered counts, as passed to edgeR below.
rawmat <- as.matrix(rawdf)


## EdgeR

In [9]:
library(edgeR)

Loading required package: limma



In [11]:
# One group per subject; the design has no intercept, so each column
# ("group<subject>") is that subject's own coefficient.
# NOTE(review): groups are matched to the columns of `rawmat` by position --
# confirm that the rows of `pheno` are in the same order as the count columns.
group <- pheno$subject
y <- calcNormFactors(DGEList(counts = rawmat, group = group))
design <- model.matrix(~ 0 + group)

In [None]:
# Estimate dispersions and fit the quasi-likelihood GLM, both with the
# robust option enabled.
y <- estimateDisp(y, design = design, robust = TRUE)
fit <- glmQLFit(y, design = design, robust = TRUE)


In [None]:
# Position(s) of the design-matrix column named "group<subject>".
# Returns an empty integer vector when no column carries that name.
subject_map <- function(design, subject) {
    column_label <- paste0('group', subject)
    which(colnames(design) == column_label)
}
# Look up the `Description` entry of the annotation table `map` for the row(s)
# whose `ensembl` value equals `s`.
common_name <- function(s) {
    hits <- map$ensembl == s
    map[hits, 'Description']
}
# Sign of a single numeric value as a one-character label:
# '-' for negative, '+' for positive, '0' for exactly zero.
# The zero case previously fell through and returned NULL, which callers
# wrapping the result in as.character(lapply(...)) turned into the literal
# string "NULL".
delta <- function(d) {
    if (d < 0) {
        return('-')
    }
    if (d > 0) {
        return('+')
    }
    '0'
}
# Look up, as character, the `chromosome` entry of the annotation table `map`
# for the row(s) whose `ensembl` value equals `s`.
which_chrom <- function(s) {
    hits <- map$ensembl == s
    as.character(map[hits, 'chromosome'])
}

In [None]:
# Pairwise differential expression: every subject in `relations` is tested
# against its mother and its father (contrast = subject - parent), and the
# genes with FDR <= 0.05 are written to
# output/diff_expression/<subject>_<parent>.tsv.
de_out_dir <- 'output/diff_expression/'
# write.table does not create missing directories.
dir.create(de_out_dir, recursive = TRUE, showWarnings = FALSE)

for (subject1 in rownames(relations)) {

    # SG011 is skipped on purpose (the reason is not recorded in this file).
    if (subject1 == 'SG011') next
    idx1 <- subject_map(design, subject1)

    for (parent in c('mother', 'father')) {
        subject2 <- as.character(relations[subject1, parent])
        print(paste0(subject1, ' ', subject2))

        # A subject without a design column would silently leave a one-sided
        # contrast (testing a single coefficient against zero), so such pairs
        # are reported and skipped instead of producing a misleading table.
        idx2 <- subject_map(design, subject2)
        if (length(idx1) != 1 || length(idx2) != 1) {
            warning('Skipping ', subject1, ' vs ', subject2,
                    ': subject not found in the design matrix', call. = FALSE)
            next
        }

        contrast <- numeric(ncol(design))
        contrast[idx1] <- 1
        contrast[idx2] <- -1

        qlf <- glmQLFTest(fit, contrast = contrast)
        # Take the full table (n = Inf rather than a hard-coded gene count) and
        # apply the FDR cut-off here, so that a comparison with no significant
        # gene still yields a zero-row table with all columns.
        # NOTE(review): some edgeR versions are believed to return an empty
        # object from topTags(p.value = ...) when nothing passes -- confirm.
        full_table <- topTags(qlf, n = Inf)$table
        cdf <- full_table[which(full_table$FDR <= 0.05), , drop = FALSE]
        print(dim(cdf))

        # Annotate each gene with its description, sign of change, chromosome,
        # and an "<ensembl><sign>" key used later to compare directions.
        cdf$common <- as.character(lapply(rownames(cdf), common_name))
        cdf$direction <- as.character(lapply(cdf$logFC, delta))
        cdf$chromosome <- as.character(lapply(rownames(cdf), which_chrom))
        cdf$gene_del <- paste0(rownames(cdf), cdf$direction)

        # Row names become an explicit first column named "ensembl".
        diff_save <- cbind(rownames(cdf), cdf)
        colnames(diff_save)[1] <- "ensembl"
        outfile <- paste0(de_out_dir, subject1, '_', subject2, '.tsv')
        write.table(diff_save, outfile, sep = '\t', row.names = FALSE, col.names = TRUE)
    }
}

## Make Three Lists

In [None]:
# Directory holding the per-pair differential expression tables.
diff_dir <- 'output/diff_expression/'

# Path of the table for one child/parent pair: <de_dir><child>_<parent>.tsv
# (`de_dir` is used as given, so it must already end with a separator).
de_filename <- function(de_dir, child, parent) {
    pair_stem <- paste(child, parent, sep = '_')
    paste0(de_dir, pair_stem, '.tsv')
}

In [None]:
# Keep only the first 15 characters of `s`.
# NOTE(review): presumably strips the trailing direction sign from an
# "<ensembl><sign>" key, relying on 15-character gene IDs -- confirm.
remove_del <- function(s) {
    substring(s, first = 1, last = 15)
}
# Sign of a single numeric value as a one-character label:
# '+' for positive, '-' for negative, '0' for exactly zero.
# The zero case previously fell through and returned NULL, which silently
# vanished when the result was pasted into a direction string.
pos_or_gen <- function(s) {
    if (s > 0) {
        return('+')
    }
    if (s < 0) {
        return('-')
    }
    '0'
}
# Direction label for gene `s`.
#   s                      : gene ID, matched against the row names of both tables
#   genes_de_novo_same_dir : IDs for which a single sign is reported
#   cdf, ncdf              : tables with a `logFC` column, keyed by gene ID
# Returns the sign of the `cdf` logFC when `s` is in `genes_de_novo_same_dir`,
# otherwise "<sign in cdf>/<sign in ncdf>".
get_direction <- function(s, genes_de_novo_same_dir, cdf, ncdf) {
    fc_c <- cdf[rownames(cdf) == s, 'logFC']
    if (s %in% genes_de_novo_same_dir) {
        return(pos_or_gen(fc_c))
    }
    sign_c <- pos_or_gen(fc_c)
    fc_nc <- ncdf[rownames(ncdf) == s, 'logFC']
    sign_nc <- pos_or_gen(fc_nc)
    paste0(sign_c, "/", sign_nc)
}
# Write `df` to `outfile` as a tab-separated table with a header line, with the
# row names emitted as a leading column called "ensembl".
save_table <- function(df, outfile) {
    with_ids <- cbind(rownames(df), df)
    names(with_ids)[1] <- "ensembl"
    write.table(
        with_ids,
        file = outfile,
        sep = '\t',
        row.names = FALSE,
        col.names = TRUE
    )
}


In [None]:
# For every child, compare its two differential expression tables (child vs
# carrier parent, child vs non-carrier parent) and write three lists to
# output/three_lists/:
#   <child>.de_novo.tsv                    genes changed against both parents
#   <child>.inherited_from_non_carrier.tsv genes changed against the carrier
#                                          parent only
#   <child>.inherited_from_carrier.tsv     genes changed against the
#                                          non-carrier parent only
# Only genes with |logFC| > 0.5 are considered.
out_dir <- 'output/three_lists/'
# write.table does not create missing directories.
dir.create(out_dir, recursive = TRUE, showWarnings = FALSE)

for (child in rownames(relations)) {
    # SG011 is skipped on purpose (the reason is not recorded in this file).
    if (child == 'SG011') next

    cp <- as.character(relations[rownames(relations) == child, 'carrier_parent'])
    ncp <- as.character(relations[rownames(relations) == child, 'non_carrier_parent'])
    print(paste0(child, ' ', cp, ' ', ncp))

    cp_filename <- de_filename(diff_dir, child, cp)
    ncp_filename <- de_filename(diff_dir, child, ncp)

    # A missing table for one child should not abort the remaining children.
    if (!file.exists(cp_filename) || !file.exists(ncp_filename)) {
        warning('Skipping ', child,
                ': differential expression table not found for one or both parents',
                call. = FALSE)
        next
    }

    cdf <- read.table(cp_filename, sep = '\t', header = TRUE, row.names = 'ensembl')
    ncdf <- read.table(ncp_filename, sep = '\t', header = TRUE, row.names = 'ensembl')

    # Effect-size filter.
    cdf <- cdf[abs(cdf$logFC) > 0.5, ]
    ncdf <- ncdf[abs(ncdf$logFC) > 0.5, ]

    # Genes changed against both parents.
    genes_de_novo <- intersect(rownames(cdf), rownames(ncdf))

    # Subset of those that changed in the same direction against both parents:
    # the "<ensembl><sign>" keys coincide; the sign is then stripped again.
    genes_de_novo_same_dir <- intersect(cdf$gene_del, ncdf$gene_del)
    genes_de_novo_same_dir <- as.character(lapply(genes_de_novo_same_dir, remove_del))

    dfi <- data.frame(row.names = genes_de_novo)

    dfi$common <- map$Description[match(rownames(dfi), map$ensembl)]
    dfi$direction_c_nc <- as.character(lapply(genes_de_novo, get_direction, genes_de_novo_same_dir, cdf, ncdf))
    dfi$logFC_carrier_parent <- cdf$logFC[match(rownames(dfi), rownames(cdf))]
    dfi$FDR_carrier_parent <- cdf$FDR[match(rownames(dfi), rownames(cdf))]
    dfi$logFC_non_carrier_parent <- ncdf$logFC[match(rownames(dfi), rownames(ncdf))]
    dfi$FDR_non_carrier_parent <- ncdf$FDR[match(rownames(dfi), rownames(ncdf))]
    dfi$chromosome <- map$chromosome[match(rownames(dfi), map$ensembl)]

    # Genes changed against exactly one parent.
    df_inh_from_nc <- cdf[!(rownames(cdf) %in% genes_de_novo), ]
    df_inh_from_c <- ncdf[!(rownames(ncdf) %in% genes_de_novo), ]

    # The direction key is only a helper column; do not export it.
    df_inh_from_nc$gene_del <- NULL
    df_inh_from_c$gene_del <- NULL

    print(c(dim(dfi)[1], dim(df_inh_from_c)[1], dim(df_inh_from_nc)[1]))

    outfile <- paste0(out_dir, child, '.de_novo.tsv')
    save_table(dfi, outfile)
    outfile <- paste0(out_dir, child, '.inherited_from_non_carrier.tsv')
    save_table(df_inh_from_nc, outfile)
    outfile <- paste0(out_dir, child, '.inherited_from_carrier.tsv')
    save_table(df_inh_from_c, outfile)

}