# Download Enrichr database

Enrichr (https://maayanlab.cloud/Enrichr/help#api) does not offer a single-file download of its database, so we make API requests, with delays between them, to download the database.

# Libraries

In [22]:
# Libraries

# JSON
import json

# API requests
import requests

# Delay
import time

# API calls

Set the API URL and the query parameters.

In [23]:
import zipfile
import os

# Archive stored under HPA_downloads and the directory it is unpacked into
zip_file_path = '/home/mad1188/Enrichr_database/HPA_downloads/rna_single_cell_type.tsv.zip'
extract_path = '/home/mad1188/Enrichr_database/'

# Unpack every member of the archive into the extraction directory
# (ZipFile opens in read mode by default)
with zipfile.ZipFile(zip_file_path) as zip_ref:
    zip_ref.extractall(path=extract_path)

# Path at which the extracted single-cell-type table is expected
tsv_file_path = os.path.join(extract_path, 'rna_single_cell_type.tsv')


In [24]:
import pandas as pd

# Parse the extracted tab-separated single-cell-type table into a DataFrame
data = pd.read_csv(tsv_file_path, delimiter='\t')


In [25]:
# Genes of interest
genes_to_check = [
    'SPP1', 'CFTR', 'TSPAN8', 'CXCL1', 'TPM1',
    'SERPINA1', 'KRT19', 'TFPI2', 'GPX2',
]

# Keep only the rows whose 'Gene name' is one of the genes of interest
is_gene_of_interest = data['Gene name'].isin(genes_to_check)
gene_expression_data = data[is_gene_of_interest]


In [26]:
# Show the filtered expression table in the cell output
print(gene_expression_data)

# Write the same table to CSV, leaving out the index column
gene_expression_data.to_csv(
    '/home/mad1188/Enrichr_database/gene_expression_data.csv',
    index=False,
)


                    Gene Gene name                  Cell type   nTPM
1215     ENSG00000001626      CFTR                 Adipocytes    0.0
1216     ENSG00000001626      CFTR      Alveolar cells type 1    2.2
1217     ENSG00000001626      CFTR      Alveolar cells type 2   37.9
1218     ENSG00000001626      CFTR                 Astrocytes    0.0
1219     ENSG00000001626      CFTR                    B-cells    0.1
...                  ...       ...                        ...    ...
1330258  ENSG00000197249  SERPINA1  Squamous epithelial cells    0.2
1330259  ENSG00000197249  SERPINA1   Suprabasal keratinocytes    0.5
1330260  ENSG00000197249  SERPINA1       Syncytiotrophoblasts    5.1
1330261  ENSG00000197249  SERPINA1                    T-cells   49.6
1330262  ENSG00000197249  SERPINA1     Undifferentiated cells  101.6

[729 rows x 4 columns]


In [27]:
# Cluster-description archive and the directory it is unpacked into
zip_file_path1 = '/home/mad1188/Enrichr_database/HPA_downloads/rna_single_cell_cluster_description.tsv.zip'
extract_path1 = '/home/mad1188/Enrichr_database/'

# Unpack every member of the archive into the extraction directory
# (ZipFile opens in read mode by default)
with zipfile.ZipFile(zip_file_path1) as zip_ref:
    zip_ref.extractall(path=extract_path1)

# Path at which the extracted cluster-description table is expected
tsv_file_path1 = os.path.join(extract_path1, 'rna_single_cell_cluster_description.tsv')

In [28]:
import pandas as pd

# Parse the extracted 'rna_single_cell_cluster_description.tsv' (tab-separated)
# into a DataFrame
data1 = pd.read_csv(tsv_file_path1, delimiter='\t')

In [29]:
# Print the cluster description table (pandas abbreviates long frames with "..")
print(data1)

             Tissue Cluster            Cell type       Cell type group  \
0    Adipose tissue     c-0          Fibroblasts     Mesenchymal cells   
1    Adipose tissue     c-1              T-cells  Blood & immune cells   
2    Adipose tissue     c-2              T-cells  Blood & immune cells   
3    Adipose tissue     c-3              T-cells  Blood & immune cells   
4    Adipose tissue     c-4             Nk-cells  Blood & immune cells   
..              ...     ...                  ...                   ...   
552        Vascular    c-15    Endothelial cells     Endothelial cells   
553        Vascular    c-16    Endothelial cells     Endothelial cells   
554        Vascular    c-17  Smooth muscle cells          Muscle cells   
555        Vascular    c-18        Schwann cells           Glial cells   
556        Vascular    c-19    Endothelial cells     Endothelial cells   

     Cell count  
0         14945  
1         10011  
2          7279  
3          7200  
4          5823  
.. 

In [30]:
# Requires gene_expression_data (filtered expression rows) and data1 (cluster
# descriptions); both carry a 'Cell type' column.

# Attach the cluster metadata (Tissue, Cluster, Cell type group, Cell count) to
# each expression row by joining on 'Cell type'.
# how='left' keeps every row of gene_expression_data, even when data1 has no
# entry for its cell type; switch to how='inner' to keep only cell types that
# are present in both DataFrames.
# NOTE(review): a cell type that appears under several clusters in data1 yields
# one output row per cluster, each repeating the same nTPM value — confirm this
# fan-out is intended before aggregating.
combined_data = gene_expression_data.merge(data1, how='left', on='Cell type')

# Preview the first rows of the merged table
print(combined_data.head())

# Save the merged table to CSV, leaving out the index column
combined_data.to_csv(
    '/home/mad1188/Enrichr_database/combined_gene_expression_data.csv',
    index=False,
)


              Gene Gene name   Cell type  nTPM          Tissue Cluster  \
0  ENSG00000001626      CFTR  Adipocytes   0.0  Adipose tissue    c-12   
1  ENSG00000001626      CFTR  Adipocytes   0.0          Breast     c-6   
2  ENSG00000001626      CFTR  Adipocytes   0.0          Breast     c-7   
3  ENSG00000001626      CFTR  Adipocytes   0.0          Breast     c-8   
4  ENSG00000001626      CFTR  Adipocytes   0.0          Breast    c-23   

  Cell type group  Cell count  
0      Adipocytes      1746.0  
1      Adipocytes      2658.0  
2      Adipocytes      2380.0  
3      Adipocytes      2368.0  
4      Adipocytes       175.0  


In [31]:
# Keep rows that belong to the 'Pancreas' tissue and to one of the genes of interest
in_pancreas = combined_data['Tissue'] == 'Pancreas'
in_gene_list = combined_data['Gene name'].isin(genes_to_check)
filtered_data = combined_data[in_pancreas & in_gene_list]

In [32]:
# No further analysis is performed here; the Pancreas subset is only displayed
print(filtered_data)

# Write the Pancreas subset to CSV (index column omitted) for later analysis
filtered_data.to_csv(
    '/home/mad1188/Enrichr_database/pancreas_expression_data.csv',
    index=False,
)

                 Gene Gene name                   Cell type    nTPM    Tissue  \
99    ENSG00000001626      CFTR                Ductal cells   590.0  Pancreas   
100   ENSG00000001626      CFTR                Ductal cells   590.0  Pancreas   
101   ENSG00000001626      CFTR                Ductal cells   590.0  Pancreas   
102   ENSG00000001626      CFTR                Ductal cells   590.0  Pancreas   
103   ENSG00000001626      CFTR                Ductal cells   590.0  Pancreas   
...               ...       ...                         ...     ...       ...   
4441  ENSG00000197249  SERPINA1    Exocrine glandular cells   253.4  Pancreas   
4442  ENSG00000197249  SERPINA1    Exocrine glandular cells   253.4  Pancreas   
4465  ENSG00000197249  SERPINA1                 Fibroblasts    62.9  Pancreas   
4577  ENSG00000197249  SERPINA1                 Macrophages   377.3  Pancreas   
4616  ENSG00000197249  SERPINA1  Pancreatic endocrine cells  1107.1  Pancreas   

     Cluster               

In [35]:
import pandas as pd
import zipfile
import os

# Tissue-consensus archive and the directory it is unpacked into
zip_file_path_consensus = '/home/mad1188/Enrichr_database/HPA_downloads/rna_tissue_consensus.tsv.zip'
extract_path_consensus = '/home/mad1188/Enrichr_database/'

# Unpack every member of the archive into the extraction directory
# (ZipFile opens in read mode by default)
with zipfile.ZipFile(zip_file_path_consensus) as zip_ref:
    zip_ref.extractall(path=extract_path_consensus)

# Path at which the extracted tissue-consensus table is expected
tsv_file_path_consensus = os.path.join(extract_path_consensus, 'rna_tissue_consensus.tsv')

# Parse the tab-separated table into a DataFrame
consensus_data = pd.read_csv(tsv_file_path_consensus, delimiter='\t')



In [36]:
# Print the tissue consensus expression table (pandas abbreviates long frames with "...")
print(consensus_data)

                    Gene Gene name           Tissue  nTPM
0        ENSG00000000003    TSPAN6   adipose tissue  28.6
1        ENSG00000000003    TSPAN6    adrenal gland  17.6
2        ENSG00000000003    TSPAN6         amygdala  13.0
3        ENSG00000000003    TSPAN6         appendix   4.7
4        ENSG00000000003    TSPAN6    basal ganglia  12.4
...                  ...       ...              ...   ...
1000031  ENSG00000291317   TMEM276           thymus   6.0
1000032  ENSG00000291317   TMEM276    thyroid gland  11.3
1000033  ENSG00000291317   TMEM276           tongue   6.3
1000034  ENSG00000291317   TMEM276           tonsil   4.8
1000035  ENSG00000291317   TMEM276  urinary bladder   6.5

[1000036 rows x 4 columns]


In [54]:
import pandas as pd
import zipfile
import os


def _extract_and_load(zip_path, extract_dir, tsv_name):
    """Unpack a zip archive and load one tab-separated table from it.

    Parameters
    ----------
    zip_path : str
        Path of the ``.zip`` archive to unpack.
    extract_dir : str
        Directory that receives every member of the archive.
    tsv_name : str
        File name, relative to ``extract_dir``, of the table to read.

    Returns
    -------
    tuple
        ``(tsv_path, frame)``: the joined path of the table and the
        DataFrame parsed from it.

    Errors raised by ``zipfile.ZipFile`` or ``pandas.read_csv`` (for example
    for a missing archive or table) propagate unchanged.
    """
    with zipfile.ZipFile(zip_path, 'r') as archive:
        archive.extractall(extract_dir)
    tsv_path = os.path.join(extract_dir, tsv_name)
    return tsv_path, pd.read_csv(tsv_path, sep='\t')


# Single-cell-type archive and the directory every archive is unpacked into
zip_file_path = '/home/mad1188/Enrichr_database/HPA_downloads/rna_single_cell_type.tsv.zip'
extract_path = '/home/mad1188/Enrichr_database/'

# Unpack and load the single cell type expression table
tsv_file_path, single_cell_data = _extract_and_load(
    zip_file_path, extract_path, 'rna_single_cell_type.tsv'
)

# Genes of interest
genes_to_check = ['SPP1', 'CFTR', 'TSPAN8', 'CXCL1', 'TPM1', 'SERPINA1', 'KRT19', 'TFPI2', 'GPX2']

# Keep only the expression rows of the genes of interest
gene_expression_data = single_cell_data[single_cell_data['Gene name'].isin(genes_to_check)]

# Export the gene expression data to a CSV file
gene_expression_data.to_csv('/home/mad1188/Enrichr_database/gene_expression_data.csv', index=False)

# Single-cell cluster description archive (unpacked into the same directory)
zip_file_path1 = '/home/mad1188/Enrichr_database/HPA_downloads/rna_single_cell_cluster_description.tsv.zip'

# Unpack and load the cluster description table
tsv_file_path1, cluster_description_data = _extract_and_load(
    zip_file_path1, extract_path, 'rna_single_cell_cluster_description.tsv'
)

# Left-join the cluster descriptions onto the expression rows by 'Cell type':
# every expression row is kept, and a cell type listed under several clusters
# produces one output row per cluster.
combined_data = pd.merge(gene_expression_data, cluster_description_data, on='Cell type', how='left')

# Export the combined data to a CSV file
combined_data.to_csv('/home/mad1188/Enrichr_database/combined_gene_expression_data.csv', index=False)

# Keep only the rows whose 'Tissue' is 'Pancreas'
pancreas_data = combined_data[combined_data['Tissue'] == 'Pancreas']

# Export the filtered data for Pancreas tissue to a CSV file
pancreas_data.to_csv('/home/mad1188/Enrichr_database/pancreas_expression_data.csv', index=False)


# Display the Pancreas subset
print(pancreas_data)


                 Gene Gene name                   Cell type    nTPM    Tissue  \
99    ENSG00000001626      CFTR                Ductal cells   590.0  Pancreas   
100   ENSG00000001626      CFTR                Ductal cells   590.0  Pancreas   
101   ENSG00000001626      CFTR                Ductal cells   590.0  Pancreas   
102   ENSG00000001626      CFTR                Ductal cells   590.0  Pancreas   
103   ENSG00000001626      CFTR                Ductal cells   590.0  Pancreas   
...               ...       ...                         ...     ...       ...   
4441  ENSG00000197249  SERPINA1    Exocrine glandular cells   253.4  Pancreas   
4442  ENSG00000197249  SERPINA1    Exocrine glandular cells   253.4  Pancreas   
4465  ENSG00000197249  SERPINA1                 Fibroblasts    62.9  Pancreas   
4577  ENSG00000197249  SERPINA1                 Macrophages   377.3  Pancreas   
4616  ENSG00000197249  SERPINA1  Pancreatic endocrine cells  1107.1  Pancreas   

     Cluster               