## Import EBI's gene2phenotype

### Read Data from CSV

In [51]:
import pandas as pd
import numpy as np
# Load the CancerG2P CSV export; comma is already read_csv's default separator.
df = pd.read_csv('CancerG2P_29_8_2018.csv')
df.head()

Unnamed: 0,gene symbol,gene mim,disease name,disease mim,DDD category,allelic requirement,mutation consequence,phenotypes,organ specificity list,pmids,panel,prev symbols,hgnc id,gene disease pair entry date
0,APC,No gene mim,"DESMOID DISEASE, HEREDITARY",No disease mim,confirmed,monoallelic,,,,8940264;8968744;10077730;10782927,Cancer,,583,2016-11-29 13:19:26
1,APC,No gene mim,ADENOMATOUS POLYPOSIS COLI,No disease mim,confirmed,monoallelic,loss of function,,,1651174;8252630;10494086;8019566;7661930;92881...,Cancer,,583,2016-11-29 13:19:26
2,ATM,607585,ATAXIA-TELANGIECTASIA,208900,confirmed,biallelic,loss of function,,,9443866;11889466;22345219;9450874;8808599;9600...,Cancer,ATD;ATA;ATDC;ATC,795,2016-11-29 13:19:26
3,BAP1,No gene mim,TUMOR PREDISPOSITION SYNDROME,No disease mim,confirmed,monoallelic,loss of function,,,21874003;21941004;21874000;23684012,Cancer,,950,2016-11-29 13:19:26
4,BMPR1A,No gene mim,"JUVENILE POLYPOSIS SYNDROME, INFANTILE FORM",No disease mim,confirmed,monoallelic,loss of function,,,9811934;9582123;10398437,Cancer,ACVRLK3,1076,2016-11-29 13:19:26


### Separate hgnc_id

In [52]:
# Pull the HGNC id out as a one-column DataFrame (the list selector keeps it 2-D).
hgnc_id = df[["hgnc id"]]
hgnc_id.head()

Unnamed: 0,hgnc id
0,583
1,583
2,795
3,950
4,1076


In [53]:
# Convert the one-column frame to a 2-D ndarray. np.asarray accepts both a
# DataFrame and an already-converted ndarray, so this cell can be re-run
# safely; `.values` exists only on the DataFrame and would raise
# AttributeError on a second run.
hgnc_id = np.asarray(hgnc_id)
hgnc_id[:5]

array([[ 583],
       [ 583],
       [ 795],
       [ 950],
       [1076]], dtype=int64)

In [55]:
# Confirm the id array is 2-D: one row per CSV record, a single id column.
hgnc_id.shape

(128, 1)

In [54]:
# Remove the id column from df; the ids are kept separately in hgnc_id.
# drop(..., errors='ignore') returns a new frame and is a no-op when the
# column is already gone, so the cell can be re-run safely
# (`del df['hgnc id']` raises KeyError the second time).
df = df.drop(columns=['hgnc id'], errors='ignore')
df.head()

Unnamed: 0,gene symbol,gene mim,disease name,disease mim,DDD category,allelic requirement,mutation consequence,phenotypes,organ specificity list,pmids,panel,prev symbols,gene disease pair entry date
0,APC,No gene mim,"DESMOID DISEASE, HEREDITARY",No disease mim,confirmed,monoallelic,,,,8940264;8968744;10077730;10782927,Cancer,,2016-11-29 13:19:26
1,APC,No gene mim,ADENOMATOUS POLYPOSIS COLI,No disease mim,confirmed,monoallelic,loss of function,,,1651174;8252630;10494086;8019566;7661930;92881...,Cancer,,2016-11-29 13:19:26
2,ATM,607585,ATAXIA-TELANGIECTASIA,208900,confirmed,biallelic,loss of function,,,9443866;11889466;22345219;9450874;8808599;9600...,Cancer,ATD;ATA;ATDC;ATC,2016-11-29 13:19:26
3,BAP1,No gene mim,TUMOR PREDISPOSITION SYNDROME,No disease mim,confirmed,monoallelic,loss of function,,,21874003;21941004;21874000;23684012,Cancer,,2016-11-29 13:19:26
4,BMPR1A,No gene mim,"JUVENILE POLYPOSIS SYNDROME, INFANTILE FORM",No disease mim,confirmed,monoallelic,loss of function,,,9811934;9582123;10398437,Cancer,ACVRLK3,2016-11-29 13:19:26


In [47]:
# Must run after the 'hgnc id' column has been removed from df: the column
# positions in dt are later matched by index against props_names = list(df).
dt = df.values # convert to numpy ndarray
dt.shape

(128, 13)

### Check Data Type

In [58]:
# Peek at the first id; hgnc_id is 2-D, so it is indexed as [row, column].
hgnc_id[0,0]

583

In [56]:
# This value is used as the "_id" of each result record, so check its type here.
# NOTE(review): numpy.int64 is not accepted by the stdlib json encoder; wrap in
# int() if the records are ever serialized that way — confirm against the consumer.
type(hgnc_id[0,0])  # expect a number instead of a string (no conversion needed)

numpy.int64

### Construct Result

In [66]:
# Column labels of df, in order; they become the keys of each record's
# "gene2phenotype" dictionary.
props_names = df.columns.tolist()
props_names

['gene symbol',
 'gene mim',
 'disease name',
 'disease mim',
 'DDD category',
 'allelic requirement',
 'mutation consequence',
 'phenotypes',
 'organ specificity list',
 'pmids',
 'panel',
 'prev symbols',
 'gene disease pair entry date']

In [None]:
# Build one record per row of dt: the HGNC id from the same row of hgnc_id
# becomes "_id", and every column value is nested under "gene2phenotype",
# keyed by the column name at the same position in props_names.
result_list = []

for row_idx, row in enumerate(dt):
    gene_props = {
        props_names[col_idx]: value for col_idx, value in enumerate(row)
    }
    result_list.append({
        "_id": hgnc_id[row_idx, 0],
        "gene2phenotype": gene_props,
    })

### Validate Result

In [73]:
# Spot-check the first two records instead of dumping the whole list.
result_list[:2]

[{'_id': 583,
  'gene2phenotype': {'DDD category': 'confirmed',
   'allelic requirement': 'monoallelic',
   'disease mim': 'No disease mim',
   'disease name': 'DESMOID DISEASE, HEREDITARY',
   'gene disease pair entry date': '2016-11-29 13:19:26',
   'gene mim': 'No gene mim',
   'gene symbol': 'APC',
   'mutation consequence': nan,
   'organ specificity list': nan,
   'panel': 'Cancer',
   'phenotypes': nan,
   'pmids': '8940264;8968744;10077730;10782927',
   'prev symbols': nan}},
 {'_id': 583,
  'gene2phenotype': {'DDD category': 'confirmed',
   'allelic requirement': 'monoallelic',
   'disease mim': 'No disease mim',
   'disease name': 'ADENOMATOUS POLYPOSIS COLI',
   'gene disease pair entry date': '2016-11-29 13:19:26',
   'gene mim': 'No gene mim',
   'gene symbol': 'APC',
   'mutation consequence': 'loss of function',
   'organ specificity list': nan,
   'panel': 'Cancer',
   'phenotypes': nan,
   'pmids': '1651174;8252630;10494086;8019566;7661930;9288102;9950370;2164769;15771

In [74]:
# Reload the CSV so df has the 'hgnc id' column again, then select every row
# for one id to compare against the records built from it.
df = pd.read_csv('CancerG2P_29_8_2018.csv')
df[df['hgnc id'] == 583] # query by id

Unnamed: 0,gene symbol,gene mim,disease name,disease mim,DDD category,allelic requirement,mutation consequence,phenotypes,organ specificity list,pmids,panel,prev symbols,hgnc id,gene disease pair entry date
0,APC,No gene mim,"DESMOID DISEASE, HEREDITARY",No disease mim,confirmed,monoallelic,,,,8940264;8968744;10077730;10782927,Cancer,,583,2016-11-29 13:19:26
1,APC,No gene mim,ADENOMATOUS POLYPOSIS COLI,No disease mim,confirmed,monoallelic,loss of function,,,1651174;8252630;10494086;8019566;7661930;92881...,Cancer,,583,2016-11-29 13:19:26
