# Description

This notebook preprocesses data about understudied genes from this article: https://doi.org/10.1371/journal.pbio.2006643

# Modules

In [1]:
import pandas as pd

from ccc import conf

# Settings

# Paths

In [2]:
# Output folder for the preprocessed tables, taken from the project configuration.
OUTPUT_DIR = conf.UNDERSTUDIED_GENES_ARTICLE["DATA_DIR"]
# Create the folder (and any missing parents); no error if it already exists.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
display(OUTPUT_DIR)

PosixPath('/opt/data/data/understudied_genes')

# Load data

## S1 Data

In [3]:
# Read sheet "1A" of the article's S1 Data spreadsheet into a DataFrame.
s1_data_1a = pd.read_excel(
    io=conf.UNDERSTUDIED_GENES_ARTICLE["S1_DATA_FILE"],
    sheet_name="1A",
)

In [4]:
# (rows, columns) of the sheet as loaded.
s1_data_1a.shape

(12948, 3)

In [5]:
# Preview the first rows before any transformation.
s1_data_1a.head()

Unnamed: 0,gene_ncbi,target,predicted
0,1,1.447158,1.852724
1,10,2.786041,1.30082
2,13,1.30103,1.638264
3,14,1.462398,1.663359
4,15,1.50515,0.837965


In [6]:
# Add a "diff" column: predicted minus target (positive when the prediction
# is higher than the observed target).
s1_data_1a = s1_data_1a.assign(diff=lambda df: df["predicted"] - df["target"])

In [7]:
# Index the table by Entrez gene ID (gene_ncbi) so genes can be looked up with .loc.
# NOTE: this cell is not re-runnable on its own; after it runs, "gene_ncbi" is the
# index and no longer a column, so a second run raises KeyError.
s1_data_1a = s1_data_1a.set_index("gene_ncbi")

In [8]:
# Preview again: gene_ncbi is now the index and the diff column is present.
s1_data_1a.head()

Unnamed: 0_level_0,target,predicted,diff
gene_ncbi,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1.447158,1.852724,0.405566
10,2.786041,1.30082,-1.485221
13,1.30103,1.638264,0.337234
14,1.462398,1.663359,0.200961
15,1.50515,0.837965,-0.667185


In [9]:
# Each gene ID must appear at most once, otherwise .loc lookups are ambiguous.
assert not s1_data_1a.index.has_duplicates

## S3 Table

In [10]:
# Read the article's S3 Table spreadsheet (default sheet) into a DataFrame.
s3_table = pd.read_excel(
    io=conf.UNDERSTUDIED_GENES_ARTICLE["S3_TABLE_FILE"],
)

In [11]:
# (rows, columns) of the table as loaded.
s3_table.shape

(15056, 6)

In [12]:
# Preview the first rows before any transformation.
s3_table.head()

Unnamed: 0,gene_ncbi,x,y,papers,symbol_ncbi,gene_ensembl
0,1,31.806663,-14.351444,28,A1BG,ENSG00000121410
1,9,19.067383,-60.613003,211,NAT1,ENSG00000171428
2,10,35.621129,26.070237,611,NAT2,ENSG00000156006
3,13,55.748198,-33.739022,20,AADAC,ENSG00000114771
4,14,-62.698633,-49.267867,29,AAMP,ENSG00000127837


In [13]:
# Index the table by Entrez gene ID (gene_ncbi), matching s1_data_1a.
# NOTE: not re-runnable on its own; "gene_ncbi" stops being a column after this
# cell, so a second run raises KeyError.
s3_table = s3_table.set_index("gene_ncbi")

In [14]:
# Preview again: gene_ncbi is now the index.
s3_table.head()

Unnamed: 0_level_0,x,y,papers,symbol_ncbi,gene_ensembl
gene_ncbi,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,31.806663,-14.351444,28,A1BG,ENSG00000121410
9,19.067383,-60.613003,211,NAT1,ENSG00000171428
10,35.621129,26.070237,611,NAT2,ENSG00000156006
13,55.748198,-33.739022,20,AADAC,ENSG00000114771
14,-62.698633,-49.267867,29,AAMP,ENSG00000127837


In [15]:
# Each gene ID must appear at most once, otherwise .loc lookups are ambiguous.
assert not s3_table.index.has_duplicates

## Get gene symbol to Entrez ID map

In [16]:
# Map each gene symbol (symbol_ncbi) to its Entrez gene ID (gene_ncbi).
# The column is selected by name rather than via .squeeze(): squeeze() would
# collapse a single-row table to a scalar, and the following .to_dict() would fail.
# NOTE(review): if a symbol appears more than once in the table, only the ID of
# its last occurrence is kept -- confirm symbols are unique if that matters.
gene_id_map = (
    s3_table.reset_index()
    .set_index("symbol_ncbi")["gene_ncbi"]
    .to_dict()
)

In [17]:
# Sanity check of the symbol -> ID map against one known gene.
assert gene_id_map["SDS"] == 10993

# Save

In [18]:
# Persist the S1 Data table (indexed by gene_ncbi, with the diff column).
s1_data_1a.to_pickle(OUTPUT_DIR.joinpath("s1_data_1a.pkl"))

In [19]:
# Persist the S3 Table (indexed by gene_ncbi).
s3_table.to_pickle(OUTPUT_DIR.joinpath("s3_table.pkl"))

# Test with some specific genes

## SDS

In [20]:
# Resolve the gene symbol to its Entrez gene ID; the cells below look it up by ID.
_gene_symbol = "SDS"
_gene_id = gene_id_map[_gene_symbol]
display(_gene_id)

10993

In [21]:
# S1 Data row for this gene: target, predicted and diff (= predicted - target).
s1_data_1a.loc[_gene_id]

target       1.176091
predicted    1.631851
diff         0.455760
Name: 10993, dtype: float64

Predicted is higher than observed.

In [22]:
# S3 Table row for this gene, including its "papers" count.
s3_table.loc[_gene_id]

x                     -8.312354
y                    -32.875472
papers                       15
symbol_ncbi                 SDS
gene_ensembl    ENSG00000135094
Name: 10993, dtype: object

Only 15 papers. It is hard to verify this count with a PubMed search, since this gene's symbol overlaps with other concepts.
But [another paper](https://doi.org/10.7554/eLife.93429.1) also identifies this gene with only 8 publications (see their data in [this GitHub repo](https://github.com/amarallab/fmug_analysis) and specifically [here](https://github.com/amarallab/fmug_analysis/blob/main/data/main_table_with_subject_counts_221116.csv)).

## ZDHHC12

In [23]:
# Resolve the gene symbol to its Entrez gene ID; the cells below look it up by ID.
_gene_symbol = "ZDHHC12"
_gene_id = gene_id_map[_gene_symbol]
display(_gene_id)

84885

In [24]:
# S1 Data row for this gene: target, predicted and diff (= predicted - target).
s1_data_1a.loc[_gene_id]

target       1.041393
predicted    1.229045
diff         0.187653
Name: 84885, dtype: float64

Predicted is higher than observed.

In [25]:
# S3 Table row for this gene, including its "papers" count.
s3_table.loc[_gene_id]

x                      4.050989
y                     -32.66591
papers                       11
symbol_ncbi             ZDHHC12
gene_ensembl    ENSG00000160446
Name: 84885, dtype: object

Only 11 papers.

## PRSS36

In [26]:
# Resolve the gene symbol to its Entrez gene ID; the cells below look it up by ID.
_gene_symbol = "PRSS36"
_gene_id = gene_id_map[_gene_symbol]
display(_gene_id)

146547

In [27]:
# S1 Data row for this gene: target, predicted and diff (= predicted - target).
s1_data_1a.loc[_gene_id]

target       0.698970
predicted    1.038806
diff         0.339836
Name: 146547, dtype: float64

Predicted is higher than observed.

In [28]:
# S3 Table row for this gene, including its "papers" count.
s3_table.loc[_gene_id]

x                     39.420575
y                    -35.103885
papers                        5
symbol_ncbi              PRSS36
gene_ensembl    ENSG00000178226
Name: 146547, dtype: object

Only 5 papers.

## CYTIP

In [29]:
# Resolve the gene symbol to its Entrez gene ID; the cells below look it up by ID.
_gene_symbol = "CYTIP"
_gene_id = gene_id_map[_gene_symbol]
display(_gene_id)

9595

In [30]:
# S1 Data row for this gene; a positive diff means predicted is higher than
# target, since diff = predicted - target.
s1_data_1a.loc[_gene_id]

target       1.301030
predicted    1.631478
diff         0.330448
Name: 9595, dtype: float64

In [31]:
# S3 Table row for this gene, including its "papers" count.
s3_table.loc[_gene_id]

x                    -16.219896
y                    -18.785196
papers                       20
symbol_ncbi               CYTIP
gene_ensembl    ENSG00000115165
Name: 9595, dtype: object