# Fetch genes by gene name or by keyword (disease)

# Libraries

In [None]:
# Load the biomaRt package
library(biomaRt)
library(dplyr)
library(stringr)




Attaching package: ‘dplyr’


The following object is masked from ‘package:biomaRt’:

    select


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union




# Get genes from Ensembl - search by Ensembl gene ID

Connect to the Ensembl BioMart and list the available filters and attributes to find the ones related to phenotypes or disease descriptions:



In [None]:
# Open a connection to the Ensembl BioMart and select the human gene dataset
ensembl <- useMart(
  biomart = "ensembl",
  dataset = "hsapiens_gene_ensembl"
)

In [3]:
# Retrieve the table of filters offered by the selected dataset
filters <- listFilters(mart = ensembl)

# Keep the names of the filters that contain the word "phenotype"
description_filters <- grep(
  pattern = "phenotype",
  x = filters$name,
  value = TRUE
)

# Retrieve the table of attributes offered by the selected dataset
attributes_all <- listAttributes(mart = ensembl)


Search Ensembl for a particular gene by its Ensembl gene ID

In [4]:
# Ensembl gene ID to look up (the variable is named *_keyword for consistency
# with the other queries in this notebook, although the value is a gene ID)
arthritis_keyword <- "ENSG00000236346"

# Fetch the annotation and phenotype descriptions recorded for that gene ID.
# Alternative attribute selection: attributes = attributes_all[30:100, 1]
genes_arthritis <- getBM(
  mart = ensembl,
  filters = "ensembl_gene_id",
  values = arthritis_keyword,
  attributes = c(
    "hgnc_symbol", "ensembl_gene_id", "description",
    "chromosome_name", "start_position", "end_position",
    "strand", "phenotype_description"
  )
)


In [5]:
# Display the query result (the whole data frame is shown, not only its first rows)
genes_arthritis

hgnc_symbol,ensembl_gene_id,description,chromosome_name,start_position,end_position,strand,phenotype_description
<chr>,<chr>,<chr>,<chr>,<int>,<int>,<int>,<chr>
NFKBIL1,ENSG00000236346,NFKB inhibitor like 1 [Source:HGNC Symbol;Acc:HGNC:7800],HSCHR6_MHC_SSTO_CTG1,2846154,2858110,1,Rheumatoid arthritis


# Get genes from Ensembl - search by keyword

The keyword can be a disease name. There are two options: A. use the built-in Ensembl filter; B. fetch the whole human gene set and explore it with `dplyr` and `stringr`.

### A. Standard filter option

In [6]:
# Phenotype descriptions passed as values to the phenotype_description filter
arthritis_keyword <- c("Rheumatoid arthritis", "Osteoarthritis")

# Fetch the genes whose phenotype description matches one of those values.
# Alternative attribute selection: attributes = attributes_all[30:100, 1]
genes_arthritis <- getBM(
  mart = ensembl,
  filters = "phenotype_description",
  values = arthritis_keyword,
  attributes = c(
    "hgnc_symbol", "ensembl_gene_id", "description",
    "chromosome_name", "start_position", "end_position",
    "strand", "phenotype_description"
  )
)


In [7]:
# Display the genes returned by the phenotype_description filter query
genes_arthritis

hgnc_symbol,ensembl_gene_id,description,chromosome_name,start_position,end_position,strand,phenotype_description
<chr>,<chr>,<chr>,<chr>,<int>,<int>,<int>,<chr>
NFKBIL1,ENSG00000236346,NFKB inhibitor like 1 [Source:HGNC Symbol;Acc:HGNC:7800],HSCHR6_MHC_SSTO_CTG1,2846154,2858110,1,Rheumatoid arthritis
NFKBIL1,ENSG00000227565,NFKB inhibitor like 1 [Source:HGNC Symbol;Acc:HGNC:7800],HSCHR6_MHC_MANN_CTG1,2851919,2863876,1,Rheumatoid arthritis
NFKBIL1,ENSG00000234530,NFKB inhibitor like 1 [Source:HGNC Symbol;Acc:HGNC:7800],HSCHR6_MHC_COX_CTG1,3024196,3036142,1,Rheumatoid arthritis
NFKBIL1,ENSG00000235125,NFKB inhibitor like 1 [Source:HGNC Symbol;Acc:HGNC:7800],HSCHR6_MHC_MCF_CTG1,2888805,2900754,1,Rheumatoid arthritis
NFKBIL1,ENSG00000206440,NFKB inhibitor like 1 [Source:HGNC Symbol;Acc:HGNC:7800],HSCHR6_MHC_QBL_CTG1,2802722,2814673,1,Rheumatoid arthritis
NFKBIL1,ENSG00000236196,NFKB inhibitor like 1 [Source:HGNC Symbol;Acc:HGNC:7800],HSCHR6_MHC_DBB_CTG1,2794636,2806586,1,Rheumatoid arthritis
NFKBIL1,ENSG00000204498,NFKB inhibitor like 1 [Source:HGNC Symbol;Acc:HGNC:7800],6,31546870,31558829,1,Rheumatoid arthritis
CIITA,ENSG00000179583,class II major histocompatibility complex transactivator [Source:HGNC Symbol;Acc:HGNC:7067],16,10866222,10943021,1,Rheumatoid arthritis
SLC22A4,ENSG00000197208,solute carrier family 22 member 4 [Source:HGNC Symbol;Acc:HGNC:10968],5,132294394,132344190,1,Rheumatoid arthritis
IL10,ENSG00000136634,interleukin 10 [Source:HGNC Symbol;Acc:HGNC:5962],1,206767602,206774541,-1,Rheumatoid arthritis


### B. Get whole gene set and filter with R dplyr methods

In [8]:
# Download the annotation and phenotype descriptions without restricting the
# query: no `filters`/`values` are passed to getBM(), so the result is not
# limited to arthritis and the request can take a while.
# NOTE: `genes_arthritis` therefore holds the unfiltered table at this point;
# the name is kept so that any other cell referring to it keeps working.
genes_arthritis <- getBM(
  attributes = c(
    "hgnc_symbol",
    "ensembl_gene_id",
    "description",
    "chromosome_name",
    "start_position",
    "end_position",
    "strand",
    "phenotype_description"
  ),
  mart = ensembl
)

# Keyword to look for inside the phenotype descriptions
arthritis_keyword <- "arthritis"

# Keep the rows whose phenotype description contains the keyword.
# The match ignores case so that descriptions written as "Arthritis ..." or
# "ARTHRITIS ..." are not silently skipped. Rows whose description is NA are
# dropped, because filter() only keeps rows where the condition is TRUE.
genes_arthritis_filtered <- genes_arthritis %>%
  filter(
    str_detect(
      phenotype_description,
      regex(arthritis_keyword, ignore_case = TRUE)
    )
  )

# Print the filtered results
print(genes_arthritis_filtered)


   hgnc_symbol ensembl_gene_id
1        PTPN2 ENSG00000175354
2        PTPN2 ENSG00000175354
3        LACC1 ENSG00000179630
4       IL10RB ENSG00000243646
5          MIF ENSG00000240972
6        IL2RB ENSG00000100385
7        IL2RB ENSG00000100385
8         ACAN ENSG00000157766
9         ACAN ENSG00000157766
10       RIPK1 ENSG00000137275
11     NFKBIL1 ENSG00000236346
12     NFKBIL1 ENSG00000227565
13       IL2RA ENSG00000134460
14       IL2RA ENSG00000134460
15     NFKBIL1 ENSG00000234530
16      COL2A1 ENSG00000139219
17     NFKBIL1 ENSG00000235125
18     NFKBIL1 ENSG00000206440
19     NFKBIL1 ENSG00000236196
20     ANKRD55 ENSG00000164512
21     ANKRD55 ENSG00000164512
22       HLA-B ENSG00000234745
23     NFKBIL1 ENSG00000204498
24       CIITA ENSG00000179583
25         TNF ENSG00000232810
26         IL6 ENSG00000136244
27    HLA-DRB1 ENSG00000196126
28    HLA-DRB1 ENSG00000196126
29       SMAD3 ENSG00000166949
30     SLC22A4 ENSG00000197208
31      IL10RA ENSG00000110324
32      