# Make final table of regulatory genes 

In [360]:
# Set the working directory; the relative file names passed to the read/write
# calls in this notebook resolve against it.
setwd("/share/crsp/lab/seyedam/share/enc4_mouse/snrna/ref")

## Subset miRNA-host table
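Keep miRNA-host links with a Spearman correlation >= 0.3, and also keep any annotated miRNA host genes (host gene names matching `Mir...hg`) regardless of their correlation. Gene model (Gm) miRNAs that pass the >= 0.3 filter are removed in a later step.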

In [361]:
# Load the miRNA / host-gene correlation table (first column holds row names).
mirna_host <- read.csv("mirna_hostGene_correlation.csv", row.names = 1)

# A link is kept when its correlation is at least 0.3 OR its host gene name
# matches the "Mir...hg" pattern. Rows containing any NA are then dropped.
mirna_host_corr <- na.omit(
  mirna_host[
    mirna_host$spearman_correlation >= 0.3 |
      grepl("Mir.+hg$", mirna_host$host_gene_name),
  ]
)

length(unique(mirna_host_corr$id)) # 220 miRNA-host links pass filter
length(unique(mirna_host_corr$host_gene_name)) # 186 unique host genes


## Remove duplicate miRNA-host links
For host genes linked to more than one miRNA, keep only the link with the highest Spearman correlation.

In [362]:
# Collapse the table to one miRNA link per host gene, keeping the link with
# the highest Spearman correlation, then reshape it to the display-table
# columns (gene_name, display_name, biotype).

# duplicated() alone flags only the 2nd and later occurrences of a value, so
# it is combined with fromLast = TRUE to flag EVERY row of a repeated host
# gene, including the first one. Otherwise the first row would never be
# considered when picking the best-correlated link.
has_multiple_links <- duplicated(mirna_host_corr$host_gene_name) |
  duplicated(mirna_host_corr$host_gene_name, fromLast = TRUE)
mirna_host_corr_uniq <- mirna_host_corr[!has_multiple_links, ]
mirna_host_corr_dup <- mirna_host_corr[has_multiple_links, ]

# Sort by host gene, then by descending correlation, so the first row of each
# host gene is its best-correlated link; keep only that first row.
mirna_host_corr_dup <- mirna_host_corr_dup[
  order(
    mirna_host_corr_dup$host_gene_name,
    -mirna_host_corr_dup$spearman_correlation
  ),
]
mirna_host_corr_dup <- mirna_host_corr_dup[
  !duplicated(mirna_host_corr_dup$host_gene_name),
]
mirna_host_corr <- rbind(mirna_host_corr_uniq, mirna_host_corr_dup)

# Every host gene must now appear exactly once.
stopifnot(anyDuplicated(mirna_host_corr$host_gene_name) == 0)

length(unique(mirna_host_corr$id)) # miRNA-host links left after de-duplication
length(unique(mirna_host_corr$host_gene_name)) # unique host genes (unchanged by de-duplication)

# Rename to the shared display-table layout: the host gene is the gene, the
# miRNA name is what gets displayed.
mirna_host_corr$biotype <- "MiRNA_HG"
mirna_host_corr$display_name <- mirna_host_corr$mirna_name
mirna_host_corr$gene_name <- mirna_host_corr$host_gene_name
mirna_host_corr <- mirna_host_corr[, c("gene_name", "display_name", "biotype")]

### Also remove any Gm miRNAs.

In [363]:
# Drop rows whose displayed miRNA name starts with "Gm", then report the
# remaining table dimensions.
mirna_host_corr <- mirna_host_corr[
  !grepl("^Gm", mirna_host_corr$display_name),
]
dim(mirna_host_corr)

## Load GO term table

In [364]:
# Read the tab-delimited regulator table and reshape it to the display-table
# layout: the Type column becomes biotype and the gene name doubles as the
# display name.
histone_regulators <- read.delim("histone_regulators.tsv")
histone_regulators$biotype <- histone_regulators$Type
histone_regulators$display_name <- histone_regulators$gene_name
histone_regulators <- histone_regulators[
  ,
  c("gene_name", "display_name", "biotype")
]


## Load TF table

In [365]:
# Read the TF list and reshape it to the display-table layout. Gene.name
# supplies both gene_name and display_name; every row is labelled "TF".
tfs <- read.delim("TF_mouse_GRCm39.txt")
tfs$gene_name <- tfs$Gene.name
tfs$display_name <- tfs$Gene.name
tfs$biotype <- "TF"
# Keep only the three display columns and drop repeated rows.
tfs <- unique(tfs[, c("gene_name", "display_name", "biotype")])

# Merge tables

In [367]:
# TFs take precedence: remove from the other two tables any gene that is
# already listed as a TF.
mirna_host_corr <- mirna_host_corr[
  !mirna_host_corr$gene_name %in% tfs$gene_name,
]
histone_regulators <- histone_regulators[
  !histone_regulators$gene_name %in% tfs$gene_name,
]

# Stack the three tables and drop rows that are identical in every column.
display_table <- unique(rbind(tfs, mirna_host_corr, histone_regulators))

In [369]:
# Save display_table without row names. FALSE is spelled out because T/F are
# ordinary variables that can be reassigned, whereas TRUE/FALSE are reserved.
write.csv(display_table, file = "new_gene_display_table.csv", row.names = FALSE)

In [371]:
# Load the existing gene display table for comparison with display_table.
old <- read.csv(file = "gene_display_table.csv")

In [372]:
# Number of distinct gene names in the old table.
length(unique(old[["gene_name"]]))

In [373]:
# Old genes: present in the new table (TRUE) vs. missing from it (FALSE).
table(unique(old$gene_name) %in% display_table$gene_name)

# New-table genes: already in the old table (TRUE) vs. newly added (FALSE).
table(unique(display_table$gene_name) %in% old$gene_name)


FALSE  TRUE 
   23  2273 


FALSE  TRUE 
  516  2273 

In [374]:
# Rows of the new table whose gene is absent from the old table.
display_table_new <- display_table[
  !display_table$gene_name %in% old$gene_name,
]

In [375]:
# Row counts per biotype in display_table_new.
table(display_table_new[["biotype"]])


      Chromatin_binding     Chromatin_component  Chromatin_organization 
                     21                      51                      68 
                    HAT                MiRNA_HG                     TAF 
                      1                      23                       2 
                     TF Transcription_regulator 
                     84                     305 

In [376]:
# Save display_table_new without row names. FALSE is spelled out because T/F
# are ordinary variables that can be reassigned, whereas TRUE/FALSE are reserved.
write.csv(
  display_table_new,
  file = "not_in_old_gene_display_table.csv",
  row.names = FALSE
)

In [377]:
# Spot check: rows whose gene name contains "Id", first in the old table and
# then in the new one.
old[grepl("Id", old$gene_name), ]
display_table[grepl("Id", display_table$gene_name), ]

gene_name,display_name,biotype
<chr>,<chr>,<chr>


Unnamed: 0_level_0,gene_name,display_name,biotype
Unnamed: 0_level_1,<chr>,<chr>,<chr>
35751,Id1,Id1,Transcription_regulator
3576,Id2,Id2,Transcription_regulator
3577,Id3,Id3,Transcription_regulator
3578,Id4,Id4,Transcription_regulator


In [380]:
# Read new_gene_display_table.csv back from disk.
# NOTE: this rebinds display_table_new to the contents of that file.
display_table_new <- read.csv(file = "new_gene_display_table.csv")

In [381]:
# Row counts per biotype in display_table_new.
table(display_table_new[["biotype"]])


      Chromatin_binding     Chromatin_component  Chromatin_organization 
                    465                      59                     551 
                    HAT                    HDAC                     HDM 
                     28                      21                      26 
                    HMT                Mediator                MiRNA_HG 
                     53                      41                     174 
                    TAF                      TF Transcription_regulator 
                    106                    1357                     522 