## Import EBI's gene2phenotype
### 1. Using Python CSV (Alternative Method)
#### Read CSVs

In [41]:
import csv
def _read_csv_rows(path):
    """Return every row of the CSV file at `path` as a list of lists of strings.

    The header row, if any, is returned as the first element; nothing is
    skipped or converted.
    """
    # newline='' is what the csv module documentation asks for, so that
    # newlines embedded in quoted fields are parsed correctly. The `with`
    # block guarantees the handle is closed once the rows have been read.
    with open(path, newline='') as handle:
        return list(csv.reader(handle))


dt1 = _read_csv_rows('CancerG2P_29_8_2018.csv')
dt2 = _read_csv_rows('DDG2P_29_8_2018.csv')
# Row counts include the header row of each file.
print(len(dt1),len(dt2))

129 2332


In [42]:
# Peek at the first data row of dt1 (index 0 is the header row, so index 1
# is the first record) to see what the raw field values look like.
dt1[1]

['APC',
 'No gene mim',
 'DESMOID DISEASE, HEREDITARY',
 'No disease mim',
 'confirmed',
 'monoallelic',
 '',
 '',
 '',
 '8940264;8968744;10077730;10782927',
 'Cancer',
 '',
 '583',
 '2016-11-29 13:19:26']

In [43]:
# Sanity check: the header rows of the two CSVs have the same number of columns.
# Note this compares only the header widths, not every row and not the
# header names themselves.
print(len(dt1[0]),len(dt2[0]))

14 14


#### Clean Data

In [44]:
# The first row of dt1 is the CSV header; keep it as the list of property
# (column) names. The parsing step later looks names up in this list by
# column position.
props_names = dt1[0]
props_names

['gene symbol',
 'gene mim',
 'disease name',
 'disease mim',
 'DDD category',
 'allelic requirement',
 'mutation consequence',
 'phenotypes',
 'organ specificity list',
 'pmids',
 'panel',
 'prev symbols',
 'hgnc id',
 'gene disease pair entry date']

In [45]:
# Drop the header row (index 0) from each table so only data rows remain.
dt1_header_removed, dt2_header_removed = dt1[1:], dt2[1:]

In [46]:
def cleanCSV(ls):
    """Blank out the 'missing value' placeholders in a list of CSV rows.

    For every row in `ls`, the field at index 1 is replaced by '' when it
    equals 'No gene mim', and the field at index 3 is replaced by '' when it
    equals 'No disease mim'. Rows are modified in place; nothing is returned.
    """
    # column index -> placeholder text that stands for "no value"
    placeholders = {1: 'No gene mim', 3: 'No disease mim'}
    for record in ls:
        for column, marker in placeholders.items():
            if record[column] == marker:
                record[column] = ''
        
# Blank out the placeholder values in both tables. cleanCSV edits the rows
# in place; since the *_header_removed lists are slices of dt1 / dt2, they
# hold the same row objects, so the rows seen through dt1 / dt2 change too.
cleanCSV(dt1_header_removed)
cleanCSV(dt2_header_removed)

# Show the first cleaned row to confirm the placeholders are now empty strings.
dt1_header_removed[0]

['APC',
 '',
 'DESMOID DISEASE, HEREDITARY',
 '',
 'confirmed',
 'monoallelic',
 '',
 '',
 '',
 '8940264;8968744;10077730;10782927',
 'Cancer',
 '',
 '583',
 '2016-11-29 13:19:26']

#### Combine Data Frames

In [47]:
# Number of data rows in each table now that the header rows are gone.
print(len(dt1_header_removed), len(dt2_header_removed))

128 2331


In [49]:
# Concatenate the two tables into one list of rows: rows read from the
# CancerG2P file come first, followed by the rows from the DDG2P file.
dt = dt1_header_removed + dt2_header_removed
len(dt)

2459

In [54]:
# Verify that every combined row has exactly 14 fields, reporting each
# offending row with one 'Mismatch!' line.
for row in dt:
    if len(row) == 14:
        continue
    print('Mismatch!')
print('Checking Done')

Checking Done


### 2. Parse

In [61]:
from datetime import datetime

# Column positions (indices into props_names) that need special handling.
DISEASE_COLS = (2, 3)    # 'disease name', 'disease mim' -> nested under 'disease'
LIST_COLS = (7, 8, 11)   # 'phenotypes', 'organ specificity list', 'prev symbols'
PMIDS_COL = 9            # 'pmids' -> list of integers
HGNC_ID_COL = 12         # 'hgnc id' -> becomes the document _id, not stored per record
ENTRY_DATE_COL = 13      # 'gene disease pair entry date' -> datetime

# Intermediate construct keyed by hgnc id:
#   {hgnc_id: {"_id": hgnc_id, "gene2phenotype": [record, ...]}}
result_dict = {}

for row in dt:  # every observation
    # This observation's record; it is appended to the 'gene2phenotype' list
    # of the document that shares its hgnc id.
    dict_gene = {}

    # Pair each value with its property name by position. Rows are expected
    # to have the same width as the header.
    for col, (name, value) in enumerate(zip(props_names, row)):
        if value == '':
            continue  # empty properties are left out of the record

        if col in DISEASE_COLS:
            # disease name and disease mim are grouped in a 'disease' sub-dict
            dict_gene.setdefault('disease', {})[name] = value
        elif col in LIST_COLS:
            # ';'-separated text fields become lists of strings
            dict_gene[name] = value.split(';')
        elif col == PMIDS_COL:
            # ';'-separated pmids become a list of integers
            dict_gene[name] = [int(pmid) for pmid in value.split(';')]
        elif col == HGNC_ID_COL:
            continue  # used as the grouping key below instead
        elif col == ENTRY_DATE_COL:
            dict_gene[name] = datetime.strptime(value, '%Y-%m-%d %H:%M:%S')
        else:
            dict_gene[name] = value

    # Convert the id once; int() raises ValueError if the field is empty or
    # not numeric, so a row without a usable hgnc id stops the run.
    hgnc_id = int(row[HGNC_ID_COL])
    document = result_dict.setdefault(
        hgnc_id, {"_id": hgnc_id, "gene2phenotype": []}
    )
    document["gene2phenotype"].append(dict_gene)

result_list = list(result_dict.values())

# Number of distinct hgnc ids, i.e. output documents.
len(result_list)

1805

### 3. Output

In [62]:
import io
import json
# Write the documents as pretty-printed JSON. The `with` block closes the
# file even if serialization or the write raises.
#   sort_keys=True - keys are emitted in sorted order
#   default=str    - values json cannot encode natively (the datetime
#                    objects produced while parsing) are written as str(value)
with io.open("result_from_pythoncsv.txt", "w", encoding='utf8') as file:
    file.write(json.dumps(result_list, indent=4, sort_keys=True, default=str))