<a href="https://colab.research.google.com/github/chunribu/biotable/blob/main/src/omim_db_full_data_sqlite3_to_tsv.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Install the `omim` package, which provides the OMIM data (the bundled database may need to be updated manually)

In [1]:
!pip install --quiet omim
!omim -h

[K     |████████████████████████████████| 3.3 MB 4.1 MB/s 
[K     |████████████████████████████████| 46 kB 2.8 MB/s 
[K     |████████████████████████████████| 86 kB 4.5 MB/s 
[?25hUsage: omim [OPTIONS] COMMAND [ARGS]...

  [32m[1mOMIM - Online Mendelian Inheritance
  in Man[0m

Options:
  -d, --dbfile TEXT  the path of database file
                     [default: /usr/local/lib/pyth
                     on3.7/dist-packages/omim/data
                     /omim.sqlite3]

  -u, --url TEXT     the url of omim  [default:
                     https://mirror.omim.org]

  --version          Show the version and exit.
  -?, -h, --help     Show this message and exit.

Commands:
  faq     [32mexplains of some faq[0m
  query   [32mquery something from database[0m
  stats   [32mstatistics of the database[0m
  update  [32mupdate the database[0m


## Load the OMIM DB data provided by the `omim` package

In [2]:
import importlib.util
import json
import sqlite3
from pathlib import Path

import pandas as pd

In [3]:
# Locate the SQLite file shipped inside the installed `omim` package
# (<package dir>/data/omim.sqlite3, the default shown by `omim -h`) instead of
# hard-coding a site-packages path that only exists for one Python version.
# find_spec only locates the package; it does not import it, so the DataFrame
# named `omim` below does not clash with a module of the same name.
omim_spec = importlib.util.find_spec("omim")
if omim_spec is None or omim_spec.origin is None:
    raise ModuleNotFoundError("the `omim` package is not installed; run the install cell first")
db_path = Path(omim_spec.origin).parent / "data" / "omim.sqlite3"

# sqlite3.connect() would silently create an empty database for a missing path
# and the query would then fail with a confusing "no such table" error.
if not db_path.is_file():
    raise FileNotFoundError(f"OMIM database not found at {db_path}")

con = sqlite3.connect(str(db_path))
try:
    # Read the whole `omim` table into a DataFrame.
    omim = pd.read_sql('select * from omim', con)
finally:
    # Release the connection even if the query fails.
    con.close()

# Keep an unmodified snapshot of the package table as a gzip-compressed TSV.
omim.to_csv('omim_package_db.tsv.gz',  sep='\t')

In [4]:
# Display the loaded table to inspect its columns and a sample of its rows.
omim

Unnamed: 0,mim_number,prefix,title,references,geneMap,phenotypeMap,mim_type,entrez_gene_id,ensembl_gene_id,hgnc_gene_symbol,generated
0,100640,*,"ALDEHYDE DEHYDROGENASE 1 FAMILY, MEMBER A1; AL...","3943866, 6723659, 2591967, 2987944, 3013004, 2...",,,gene,216,ENSG00000165092,ALDH1A1,2021-04-14 00:00:00.000000
1,100650,+,ALDEHYDE DEHYDROGENASE 2 FAMILY; ALDH2,"7013538, 1244489, 3017845, 15863807, 10441588,...","[{""Location"": ""12q24.12"", ""Phenotype"": ""{Esoph...",,gene/phenotype,217,ENSG00000111275,ALDH2,2021-04-14 00:00:00.000000
2,100660,*,"ALDEHYDE DEHYDROGENASE, FAMILY 3, SUBFAMILY A,...","7774944, 1737758, 9391071, 9027499, 4073832, 7...",,,gene,218,ENSG00000108602,ALDH3A1,2021-04-14 00:00:00.000000
3,100670,*,"ALDEHYDE DEHYDROGENASE 1 FAMILY, MEMBER B1; AL...","6985464, 7774944, 2061311",,,gene,219,ENSG00000137124,ALDH1B1,2021-04-14 00:00:00.000000
4,100678,*,ACETYL-CoA ACETYLTRANSFERASE 2; ACAT2,"7904580, 20597, 1850510, 8812443, 2475872, 791...","[{""Location"": ""6q25.3"", ""Phenotype"": ""?ACAT2 d...",,gene,39,ENSG00000120437,ACAT2,2021-04-14 00:00:00.000000
...,...,...,...,...,...,...,...,...,...,...,...
27137,619278,#,"MICROCEPHALY, EPILEPSY, AND DIABETES SYNDROME ...",33164986,,"[{""Location"": ""5q31.3"", ""Phenotype"": ""Microcep...",phenotype,,,,2021-04-19 00:00:00.000000
27138,619280,*,COILED-COIL DOMAIN-CONTAINING PROTEIN 59; CCDC59,16630564,,,gene,29080,ENSG00000133773,CCDC59,2021-04-19 00:00:00.000000
27139,619268,#,ALZAHRANI-KUWAHARA SYNDROME; ALKUS,33242396,,"[{""Location"": ""17q22"", ""Phenotype"": ""Alzahrani...",phenotype,,,,2021-04-20 00:00:00.000000
27140,619279,#,PARKINSONISM WITH POLYNEUROPATHY; PKNPY,33141179,,"[{""Location"": ""3p21.31"", ""Phenotype"": ""Parkins...",phenotype,,,,2021-04-20 00:00:00.000000


## Merge geneMap and phenotypeMap information together

In [5]:
# Start from the gene-side map and report how many records have one.
_map = omim["geneMap"].copy()
n_with_gene_map = _map.isna().value_counts()[False]
print(n_with_gene_map)

# Series.update works in place on the copy: wherever phenotypeMap is non-null
# its value replaces the one taken from geneMap.
_map.update(omim["phenotypeMap"])

# Number of records that have a map after combining both columns.
_map.isna().value_counts()[False]

5650


11486

## Parse `_map` with the `json` module

In [6]:
# Decode each JSON document into its Python value (a list of map entries in the
# data shown above). Only str/bytes/bytearray — the types json.loads accepts —
# are decoded; missing values, whether None or NaN, become a single empty
# entry `[{}]` so that the record still yields one row when the lists are
# flattened later.
_map_list = _map.apply(
    lambda raw: json.loads(raw) if isinstance(raw, (str, bytes, bytearray)) else [{}]
)

## Duplicate each row according to the length of its entry in `_map_list`

In [7]:
# Number of map entries held by each OMIM record.
lens = _map_list.map(len)
# Repeat every row label of `omim` once per map entry of that record.
idx = omim.index.repeat(repeats=lens)

In [8]:
# Total number of map entries over all records.
lens.sum()

30500

In [9]:
# Count the entries by walking the lists once. Summing the Series itself would
# concatenate the lists one after another (quadratic work) only to take a length.
sum(len(entries) for entries in _map_list)

30500

In [10]:
# Flatten the per-record lists into one list of entry dicts in a single pass
# (same order as list concatenation, without the quadratic copying of
# `Series.sum()` on lists), then build one row per entry. Empty placeholder
# dicts become rows with all values missing.
df_map = pd.DataFrame([entry for entries in _map_list for entry in entries])

In [11]:
# Turn empty strings into missing values. The mapping form is used on purpose:
# with a scalar `to_replace` and `value=None`, older pandas releases fall back
# to `method='pad'` and forward-fill from the previous row instead of inserting
# None, which would silently copy values between unrelated entries.
df_map = df_map.replace({'': None})

## Merge the original and newly generated DataFrames

In [12]:
# Repeat each OMIM record once per map entry; reset_index gives the repeated
# rows a fresh 0..n-1 index and keeps the original label in an `index` column.
omim_repeated = omim.loc[idx].reset_index()
# Pair repeated rows and map entries by index.
omim_merged = pd.merge(omim_repeated, df_map, left_index=True, right_index=True)

In [13]:
# Preview the first rows of the merged table.
omim_merged.head()

Unnamed: 0,index,mim_number,prefix,title,references,geneMap,phenotypeMap,mim_type,entrez_gene_id,ensembl_gene_id,hgnc_gene_symbol,generated,Location,Phenotype,Phenotype MIM number,Inheritance,Phenotype mapping key,Gene/Locus,Gene/Locus MIM number
0,0,100640,*,"ALDEHYDE DEHYDROGENASE 1 FAMILY, MEMBER A1; AL...","3943866, 6723659, 2591967, 2987944, 3013004, 2...",,,gene,216,ENSG00000165092,ALDH1A1,2021-04-14 00:00:00.000000,,,,,,,
1,1,100650,+,ALDEHYDE DEHYDROGENASE 2 FAMILY; ALDH2,"7013538, 1244489, 3017845, 15863807, 10441588,...","[{""Location"": ""12q24.12"", ""Phenotype"": ""{Esoph...",,gene/phenotype,217,ENSG00000111275,ALDH2,2021-04-14 00:00:00.000000,12q24.12,"{Esophageal cancer, alcohol-related, susceptib...",,,3.0,,
2,1,100650,+,ALDEHYDE DEHYDROGENASE 2 FAMILY; ALDH2,"7013538, 1244489, 3017845, 15863807, 10441588,...","[{""Location"": ""12q24.12"", ""Phenotype"": ""{Esoph...",,gene/phenotype,217,ENSG00000111275,ALDH2,2021-04-14 00:00:00.000000,12q24.12,"{Hangover, susceptibility to}",610251.0,AD,3.0,,
3,1,100650,+,ALDEHYDE DEHYDROGENASE 2 FAMILY; ALDH2,"7013538, 1244489, 3017845, 15863807, 10441588,...","[{""Location"": ""12q24.12"", ""Phenotype"": ""{Esoph...",,gene/phenotype,217,ENSG00000111275,ALDH2,2021-04-14 00:00:00.000000,12q24.12,"{Sublingual nitroglycerin, susceptibility to p...",610251.0,AD,3.0,,
4,1,100650,+,ALDEHYDE DEHYDROGENASE 2 FAMILY; ALDH2,"7013538, 1244489, 3017845, 15863807, 10441588,...","[{""Location"": ""12q24.12"", ""Phenotype"": ""{Esoph...",,gene/phenotype,217,ENSG00000111275,ALDH2,2021-04-14 00:00:00.000000,12q24.12,"Alcohol sensitivity, acute",610251.0,AD,3.0,,


## Select fields to save locally

In [14]:
# Columns written to the flat export, in output order. Anything not listed
# (for example the raw geneMap / phenotypeMap JSON) is dropped.
export_columns = [
    'mim_number',
    'mim_type',
    'prefix',
    'title',
    'references',
    'entrez_gene_id',
    'ensembl_gene_id',
    'hgnc_gene_symbol',
    'Phenotype',
    'Phenotype MIM number',
    'Inheritance',
    'Location',
    'Gene/Locus',
    'Gene/Locus MIM number',
    'generated',
]
omim_merged = omim_merged[export_columns]

In [15]:
# Write the flattened table as a gzip-compressed, tab-separated file without
# the DataFrame index.
flat_tsv_path = 'OMIM.flat.tsv.gz'
omim_merged.to_csv(flat_tsv_path, index=False, sep='\t')