### The purpose of this notebook is to unify gene symbols. The reasons are: 
- Our CRISPR library uses gene symbols that are mostly consistent with HGNC gene symbols, but some of those symbols have since been renamed by HGNC
- All of our ATAC-seq and RNA-seq data are aligned to GENCODE v31 (hg38), so their gene symbols are mostly Ensembl names, which are not fully consistent with HGNC
- JASPAR 2020 gene symbols are consistent with HGNC approved gene symbols

### Thus, all gene symbols (where possible) are unified to HGNC approved symbols.

### Note: not all gene symbols can be mapped to HGNC. Notably, about 1000 GENCODE genes are not mappable and keep their original GENCODE symbols.

- downloaded **HGNC symbols**
`/c/groups/sanjana_lab/cdai/TFscreen/gene_names_lookup.txt`

- symbol updated **Hit** list genes: 
`/c/groups/sanjana_lab/cdai/TFscreen/Hitlist_20191230.csv`

- symbol updated **TF** list genes:
`/c/groups/sanjana_lab/cdai/TFscreen/TFlist_20191230.csv`

- symbol updated **gene annotation**:
`/c/groups/sanjana_lab/cdai/TFscreen/Protein_coding_genes_Up_2k_20191230.bed`

In [4]:
library(tidyverse)

── [1mAttaching packages[22m ─────────────────────────────────────── tidyverse 1.3.0 ──

[32m✔[39m [34mggplot2[39m 3.2.1     [32m✔[39m [34mpurrr  [39m 0.3.3
[32m✔[39m [34mtibble [39m 2.1.3     [32m✔[39m [34mdplyr  [39m 0.8.3
[32m✔[39m [34mtidyr  [39m 1.0.0     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 1.3.1     [32m✔[39m [34mforcats[39m 0.4.0

── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()



In [4]:
# Work from the TFscreen project directory. Relative paths used in later cells
# (e.g. 'TFlist_20191118.csv' and "atac/Protein_coding_genes_Up_2k.bed")
# resolve against the current working directory.
setwd("/c/groups/sanjana_lab/cdai/TFscreen/")

In [5]:
# List the text files in the working directory.
# `pattern` is a regular expression, not a glob: the dot is escaped and the
# extension is anchored so that only names ending in ".txt" are returned.
list.files(".", pattern = "\\.txt$")

`gene_names_lookup.txt` was downloaded from HGNC genenames.org

In [6]:
# Load the HGNC gene symbol table (downloaded from genenames.org) as a
# tab-delimited file, silencing the messages emitted while reading it.
gene.name.lookup <- suppressMessages(
  read_delim(
    "/c/groups/sanjana_lab/cdai/TFscreen/gene_names_lookup.txt",
    delim = "\t"
  )
)

# Swap every space in the column headers for an underscore so the columns can
# be referenced as plain names (e.g. Approved_symbol, Previous_symbols).
names(gene.name.lookup) <- str_replace_all(names(gene.name.lookup), " ", "_")

---

## Fix Hitlist

In [417]:
# Read the hit list (header-less CSV), take its last column as a plain vector,
# and append NEUROG1/2 to the hits.
hitlist <- c(
  pull(
    read.csv(
      '/c/groups//sanjana_lab/cdai/TFscreen/HS_td-VStd+_td-VSdox120.csv',
      header = FALSE,
      stringsAsFactors = FALSE
    )
  ),
  'NEUROG1',
  'NEUROG2'
)

In [418]:
# Split the hit list by whether each symbol is a current HGNC approved symbol:
# hitlist_in holds the symbols that are, hitlist_out the ones that are not.
hitlist_in <- hitlist[is.element(hitlist, gene.name.lookup[["Approved_symbol"]])]
hitlist_out <- hitlist[!is.element(hitlist, gene.name.lookup[["Approved_symbol"]])]

In [427]:
# Re-map hit-list symbols that are not current HGNC approved symbols by looking
# them up among each HGNC record's previous symbols.
#
# Previous_symbols is split on commas/spaces into exact tokens instead of
# pasting each symbol into a regular expression, so that:
#   - regex metacharacters in a symbol cannot change what is matched,
#   - all symbols are resolved by one vectorised match() call, and
#   - an empty hitlist_out yields an empty character vector (not NULL), so the
#     data.frame() call below still gets two columns.
# The first HGNC record (in table order) listing the symbol wins; symbols with
# no match become NA.
hitlist_out_matched <- local({
  prev_tokens <- strsplit(as.character(gene.name.lookup$Previous_symbols), "[, ]")
  prev_long <- data.frame(
    previous = as.character(unlist(prev_tokens)),
    approved = rep(as.character(gene.name.lookup$Approved_symbol), lengths(prev_tokens)),
    stringsAsFactors = FALSE
  )
  # Drop NA tokens (records with no previous symbols) and empty tokens
  # (e.g. from a ", " separator).
  prev_long <- prev_long[!is.na(prev_long$previous) & nzchar(prev_long$previous), ]
  prev_long$approved[match(hitlist_out, prev_long$previous)]
})

# Build the gene_name -> hgnc_symbol table for the hit list and drop symbols
# that could not be mapped to any HGNC approved symbol.
hitlist <- rbind(
  data.frame(gene_name = hitlist_in, hgnc_symbol = hitlist_in, stringsAsFactors = FALSE),
  data.frame(gene_name = hitlist_out, hgnc_symbol = hitlist_out_matched, stringsAsFactors = FALSE)
) %>%
  filter(!is.na(hgnc_symbol))

---

## Fix TFlist gene names, these are the genes in CRISPR screen data

In [405]:
# Read the TF list as a character vector of gene names.
# NOTE(review): read.csv() defaults to header = TRUE, so the first line of the
# file is consumed as a header (then renamed via col.names) — confirm the file
# really has a header line, otherwise the first gene is dropped.
tflist <- read.csv('TFlist_20191118.csv', stringsAsFactors = FALSE, col.names = 'gene_name') %>%
  pull(gene_name)

# Split by whether each symbol is a current HGNC approved symbol.
tflist_in <- tflist[tflist %in% gene.name.lookup$Approved_symbol]
tflist_out <- tflist[!tflist %in% gene.name.lookup$Approved_symbol]

# Re-map the remaining symbols via each HGNC record's previous symbols.
#
# Previous_symbols is split on commas/spaces into exact tokens instead of
# pasting each symbol into a regular expression, so that:
#   - regex metacharacters in a symbol cannot change what is matched,
#   - all symbols are resolved by one vectorised match() call, and
#   - an empty tflist_out yields an empty character vector (not NULL), so the
#     data.frame() call below still gets two columns.
# The first HGNC record (in table order) listing the symbol wins; symbols with
# no match become NA.
tflist_out_matched <- local({
  prev_tokens <- strsplit(as.character(gene.name.lookup$Previous_symbols), "[, ]")
  prev_long <- data.frame(
    previous = as.character(unlist(prev_tokens)),
    approved = rep(as.character(gene.name.lookup$Approved_symbol), lengths(prev_tokens)),
    stringsAsFactors = FALSE
  )
  # Drop NA tokens (records with no previous symbols) and empty tokens
  # (e.g. from a ", " separator).
  prev_long <- prev_long[!is.na(prev_long$previous) & nzchar(prev_long$previous), ]
  prev_long$approved[match(tflist_out, prev_long$previous)]
})

# Build the gene_name -> hgnc_symbol table for the TF list and drop symbols
# that could not be mapped to any HGNC approved symbol.
tflist <- rbind(
  data.frame(gene_name = tflist_in, hgnc_symbol = tflist_in, stringsAsFactors = FALSE),
  data.frame(gene_name = tflist_out, hgnc_symbol = tflist_out_matched, stringsAsFactors = FALSE)
) %>%
  filter(!is.na(hgnc_symbol))

---

## Fix protein coding gene annotation bed files

In [439]:
# Read the six-column gene-region BED file (first line is data, not a header),
# then reorder the columns so that strand precedes the gene identifiers.
gene_region <- read.table(
  "atac/Protein_coding_genes_Up_2k.bed",
  header = FALSE,
  stringsAsFactors = FALSE,
  col.names = c("seqname", "start", "end", "gene_id", "gene_name", "strand")
)
gene_region <- gene_region[, c("seqname", "start", "end", "strand", "gene_id", "gene_name")]

In [440]:
# Derive gene_id2 by stripping the first ".<digits>" run (plus any trailing
# uppercase/underscore suffix) from gene_id,
# e.g. "ENSG00000186092.6" -> "ENSG00000186092".
gene_region$gene_id2 <- str_remove(gene_region$gene_id, "[\\.]+[0-9]+[_A-Z]*")

In [441]:
# Keep only the rows whose gene_id does not contain "_PAR".
gene_region <- gene_region %>%
  filter(str_detect(gene_id, "_PAR", negate = TRUE))

In [442]:
# Attach HGNC approved symbols by unversioned Ensembl gene ID (gene_id2).
# The two lookup columns are selected by name rather than by position, so the
# join does not depend on the column order of the HGNC download.
# hgnc_symbol is the HGNC approved symbol where the Ensembl ID matched and the
# original GENCODE gene_name otherwise.
# NOTE(review): if an Ensembl_gene_ID occurs more than once in the lookup, this
# left join duplicates the corresponding rows — confirm the IDs are unique.
gene_region <- gene_region %>%
  left_join(
    gene.name.lookup[, c("Approved_symbol", "Ensembl_gene_ID")],
    by = c("gene_id2" = "Ensembl_gene_ID")
  ) %>%
  mutate(hgnc_symbol = if_else(is.na(Approved_symbol), gene_name, Approved_symbol))

In [443]:
# Print the structure of the annotated table as a sanity check.
str(gene_region)

'data.frame':	19944 obs. of  9 variables:
 $ seqname        : chr  "chr1" "chr1" "chr1" "chr1" ...
 $ start          : int  63419 450703 685679 921928 944203 958584 964497 975204 998962 999138 ...
 $ end            : int  71585 453697 688673 944581 961309 965719 975865 984093 1002172 1014540 ...
 $ strand         : chr  "+" "-" "-" "+" ...
 $ gene_id        : chr  "ENSG00000186092.6" "ENSG00000284733.1" "ENSG00000284662.1" "ENSG00000187634.12" ...
 $ gene_name      : chr  "OR4F5" "OR4F29" "OR4F16" "SAMD11" ...
 $ gene_id2       : chr  "ENSG00000186092" "ENSG00000284733" "ENSG00000284662" "ENSG00000187634" ...
 $ Approved_symbol: chr  "OR4F5" "OR4F29" "OR4F16" "SAMD11" ...
 $ hgnc_symbol    : chr  "OR4F5" "OR4F29" "OR4F16" "SAMD11" ...


---

## Fix Gencode GTF data frame annotation

In [5]:
# Switch the working directory to the ATAC annotations folder; relative paths
# in cells run after this one resolve against it.
setwd("/c/groups/sanjana_lab/cdai/TFscreen/atac/annotations")

In [16]:
# Load the GENCODE v31 annotation table (tab-separated, with a header row)
# into a data frame, keeping strings as character vectors.
gencode_gtf <- read.table(
  "/c/groups/sanjana_lab/cdai/ref_genome/gencode.v31.primary_assembly.annotation.pandas.df.txt",
  header = TRUE,
  sep = "\t",
  stringsAsFactors = FALSE
)

In [7]:
# Preview the first six rows of the annotation table.
head(gencode_gtf, n = 6)

Unnamed: 0_level_0,seqname,source,feature,start,end,score,strand,frame,gene_id,gene_type,⋯,transcript_type,transcript_name,transcript_support_level,tag,havana_transcript,exon_number,exon_id,ont,protein_id,ccdsid
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<int>,<int>,<lgl>,<chr>,<int>,<chr>,<chr>,⋯,<chr>,<chr>,<int>,<chr>,<chr>,<int>,<chr>,<chr>,<chr>,<chr>
1,chr1,HAVANA,gene,11869,14409,,+,0,ENSG00000223972.5,transcribed_unprocessed_pseudogene,⋯,,,,,,,,,,
2,chr1,HAVANA,transcript,11869,14409,,+,0,ENSG00000223972.5,transcribed_unprocessed_pseudogene,⋯,lncRNA,DDX11L1-202,1.0,basic,OTTHUMT00000362751.1,,,,,
3,chr1,HAVANA,exon,11869,12227,,+,0,ENSG00000223972.5,transcribed_unprocessed_pseudogene,⋯,lncRNA,DDX11L1-202,1.0,basic,OTTHUMT00000362751.1,1.0,ENSE00002234944.1,,,
4,chr1,HAVANA,exon,12613,12721,,+,0,ENSG00000223972.5,transcribed_unprocessed_pseudogene,⋯,lncRNA,DDX11L1-202,1.0,basic,OTTHUMT00000362751.1,2.0,ENSE00003582793.1,,,
5,chr1,HAVANA,exon,13221,14409,,+,0,ENSG00000223972.5,transcribed_unprocessed_pseudogene,⋯,lncRNA,DDX11L1-202,1.0,basic,OTTHUMT00000362751.1,3.0,ENSE00002312635.1,,,
6,chr1,HAVANA,transcript,12010,13670,,+,0,ENSG00000223972.5,transcribed_unprocessed_pseudogene,⋯,transcribed_unprocessed_pseudogene,DDX11L1-201,,basic,OTTHUMT00000002844.2,,,"PGO:0000005,PGO:0000019",,


In [17]:
# Unify gene symbols in the GENCODE annotation table.
#
# The result of this step is written back to the same file it was read from,
# so on a re-run the input already carries gene_id2 / Approved_symbol /
# hgnc_symbol. Drop any such derived columns first; otherwise the join would
# produce Approved_symbol.x / Approved_symbol.y and the mutate() below would
# fail. On a first run this is a no-op.
gencode_gtf <- gencode_gtf[, setdiff(names(gencode_gtf), c("gene_id2", "Approved_symbol", "hgnc_symbol")), drop = FALSE]

# Drop rows whose gene_id contains "_PAR".
gencode_gtf <- filter(gencode_gtf, !str_detect(gene_id, "_PAR"))

# Unversioned Ensembl ID: strip the first ".<digits>" run (plus any trailing
# uppercase/underscore suffix) from gene_id.
gencode_gtf <- mutate(gencode_gtf, gene_id2 = str_remove(gene_id, "[\\.]+[0-9]+[_A-Z]*"))

# Attach HGNC approved symbols by Ensembl ID (lookup columns selected by name,
# not position). hgnc_symbol falls back to the GENCODE gene_name when there is
# no HGNC match.
gencode_gtf <- gencode_gtf %>%
  left_join(
    gene.name.lookup[, c("Approved_symbol", "Ensembl_gene_ID")],
    by = c("gene_id2" = "Ensembl_gene_ID")
  ) %>%
  mutate(hgnc_symbol = if_else(is.na(Approved_symbol), gene_name, Approved_symbol))

In [19]:
# Save the annotated table as tab-separated text with a header row, without
# quoting and without row names.
# NOTE: the target path is the same file that gencode_gtf was read from, so
# this overwrites the input annotation in place.
write.table(
  gencode_gtf,
  "/c/groups/sanjana_lab/cdai/ref_genome/gencode.v31.primary_assembly.annotation.pandas.df.txt",
  quote = FALSE,
  sep = "\t",
  row.names = FALSE,
  col.names = TRUE
)

---

## Fix gene names used for RNA seq analysis

In [478]:
# NOTE(review): gtf_lookup is not created anywhere in the visible notebook; it
# must already exist in the session with gene_id and gene_name columns —
# confirm where it is loaded.

# Unversioned Ensembl ID: strip the first ".<digits>" run (plus any trailing
# uppercase/underscore suffix) from gene_id.
gtf_lookup[["gene_id2"]] <- str_remove(gtf_lookup$gene_id, "[\\.]+[0-9]+[_A-Z]*")

# Drop rows whose gene_id contains "_PAR".
gtf_lookup <- filter(gtf_lookup, !str_detect(gene_id, "_PAR"))

# Attach HGNC approved symbols by Ensembl ID (lookup columns selected by name,
# not position). hgnc_symbol is the HGNC approved symbol where the ID matched
# and the original GENCODE gene_name otherwise.
gtf_lookup <- gtf_lookup %>%
  left_join(
    gene.name.lookup[, c("Approved_symbol", "Ensembl_gene_ID")],
    by = c("gene_id2" = "Ensembl_gene_ID")
  ) %>%
  mutate(hgnc_symbol = if_else(is.na(Approved_symbol), gene_name, Approved_symbol))

# Keep only the unified symbol and the original gene_id, exposing the symbol
# under the column name gene_name. The verbs are namespace-qualified (as in
# the BED cell's dplyr::select) so they still resolve to dplyr if another
# attached package masks select()/rename().
gtf_lookup <- gtf_lookup %>%
  dplyr::select(hgnc_symbol, gene_id) %>%
  dplyr::rename("gene_name" = "hgnc_symbol")

In [485]:
# Save the gene_name / gene_id lookup as a comma-separated file with a header
# row, without quoting and without row names. The path is relative, so the file
# lands in the current working directory.
write.table(
  gtf_lookup,
  "gencode_refseq_partial_modified_geneNames_ID_20191230.csv",
  sep = ",",
  quote = FALSE,
  row.names = FALSE,
  col.names = TRUE
)

---

*The End*