In [None]:
import os
import re
import sys
import json
import requests
import datetime
import numpy as np
import pandas as pd

In [None]:
# Demo targets. CLTA (the control) is not listed here; it is appended
# separately in a later cell.
target_names = [
    'ATL2', 'ATL3', 'LAMP1', 'LMAN1', 'MTOR', 'OSBPL8', 'RAB14',
    'SEC13', 'SEC24D', 'SPTLC1', 'TAF12', 'VAPA', 'ATP2B1', 'LMNA',
    'POLR1D', 'LMNB1', 'POLR1A', 'POLR1C', 'CSNK2A2', 'CSNK2A1', 'TOP2A',
]

len(target_names)

In [None]:
# Download the polyclonal-line data from the pipeline database API
# (this includes metadata, FACS data, and some sequencing results).
# The timeout keeps the cell from hanging indefinitely if the local API server
# is not responding, and raise_for_status() surfaces HTTP errors here instead
# of letting an error response reach result.json().
result = requests.get('http://localhost:5000/polyclonallines', timeout=60)
result.raise_for_status()
all_data = result.json()

In [None]:
# keep only the rows whose target is one of the demo targets
data = [
    entry
    for entry in all_data
    if entry['target_name'] in target_names
]

In [None]:
# CLTA is a control; pick the one from plate1 (plate_design_id 'P0001').
# next() stops at the first match, and a missing control is reported with an
# explicit message instead of a bare "list index out of range".
clta_row = next(
    (row for row in all_data
     if row['target_name'] == 'CLTA' and row['plate_design_id'] == 'P0001'),
    None,
)
if clta_row is None:
    raise IndexError("no polyclonal line found for target 'CLTA' on plate 'P0001'")

# guard the append so that re-running this cell does not add CLTA twice
if clta_row not in data:
    data.append(clta_row)

In [None]:
# sanity check: number of selected rows (the indexing also raises KeyError
# if any row lacks a 'target_name' key)
len([row['target_name'] for row in data])

In [None]:
# Index the selected rows by target_name; if a target_name occurs more than
# once, the last row with that name is the one that is kept.
ddata = {entry['target_name']: entry for entry in data}

In [None]:
# serialize the per-target metadata to the demo's data directory
with open('../src/demo/data/20190816_pipeline-metadata.json', 'w') as fh:
    fh.write(json.dumps(ddata))

In [None]:
# Build the points for the tpm-vs-GFP scatterplot, one per row of all_data:
# 'tpm' is log10 of hek_tpm (None when hek_tpm is missing or falsy, e.g. 0)
# and 'gfp' is the rel_median_log entry of the row's FACS results.
expression_data = [
    {
        'target_name': entry['target_name'],
        'tpm': np.log10(entry['hek_tpm']) if entry.get('hek_tpm') else None,
        'gfp': entry['facs_results'].get('rel_median_log'),
    }
    for entry in all_data
]

In [None]:
# serialize the scatterplot data to the demo's data directory
with open('../src/demo/data/20190819_expression-data.json', 'w') as fh:
    fh.write(json.dumps(expression_data))

### Parse metadata downloaded from UniProtKB

In [None]:
# This tab-delimited file was acquired by manually selecting each gene in the
# UniProt web UI and then using the download link in the 'basket' popup.
# NOTE(review): the next cell reassigns `d` from a different file, so this
# table is replaced when both cells are run in order.
d = pd.read_csv(
    '/Users/keith.cheveralls/Downloads/uniprot-yourlist_M201910086746803381A1F0E0DB47453E0216320D00360D1.tab',
    sep='\t',
)

In [None]:
# This CSV was generated by downloading the top UniProtKB hit for each target_name in the database.
# NOTE(review): this reassigns `d`, replacing the table loaded in the previous cell.
d = pd.read_csv('/Users/keith.cheveralls/Downloads/2019-12-16_top-uniprotKB-hit-for-all-targets.csv')

In [None]:
# give the UniProt columns used below short, attribute-friendly names
d = d.rename(
    columns={
        'Entry': 'uniprot_id',
        'Function [CC]': 'function',
        'Domain [CC]': 'domain',
    }
)

In [None]:
# lowercase every column name and replace its spaces with underscores
d.columns = [name.replace(' ', '_').lower() for name in d.columns]

In [None]:
# The first name in the space-separated list of gene names corresponds to our
# gene names. The vectorized .str accessor propagates missing values as NaN
# instead of raising AttributeError on rows that have no gene names.
d['gene_name'] = d.gene_names.str.split(' ').str[0]

In [None]:
# order the rows alphabetically by gene name (the printing cells below iterate in this order)
d = d.sort_values(by='gene_name')

In [None]:
# preview the first rows of the renamed, sorted table
d.head()

In [None]:
def clean_function(s):
    """
    Clean a UniProtKB function annotation for display.

    Removes the 'FUNCTION: ' prefixes (joining concatenated annotations with a
    single space), the parenthetical PubMed citations, and the curly-brace
    citation blocks that follow each annotation.

    Parameters
    ----------
    s : str or missing value
        The raw annotation text; any value for which pd.isna() is true is
        treated as missing.

    Returns
    -------
    str or None
        The cleaned text, or None when the input is missing.
    """
    if pd.isna(s):
        return None

    # sometimes there are two annotations concatenated
    s = s.replace('; FUNCTION: ', ' ')
    s = s.replace('FUNCTION: ', '')

    # remove all parenthetical pubmed citations
    s = re.sub(r' \(((PubMed:[0-9]+)(, )?)+\)', '', s)

    # Remove each curly-brace citation block together with the period after it.
    # The match is limited to a single {...} block: a greedy ' {.*}' would span
    # from the first '{' to the last '}' and delete the text of any annotation
    # lying between two blocks. The period is escaped and optional so that a
    # block at the very end of the string is removed as well.
    s = re.sub(r' \{[^{}]*\}\.?', '', s)
    return s

In [None]:
# print each gene name followed by its protein names
for gene_name, protein_names in zip(d.gene_name, d.protein_names):
    print('\n%s\n%s' % (gene_name, protein_names))

In [None]:
# print each gene name followed by its cleaned-up function annotation
for gene_name, raw_function in zip(d.gene_name, d['function']):
    print('\n%s\n%s' % (gene_name, clean_function(raw_function)))

In [None]:
# Collect the cleaned function annotation, UniProt id, and protein names for
# each row, keyed by target_name (a later row with the same target_name
# overwrites an earlier one).
# NOTE(review): the printing cells use gene_name, but the key here is
# target_name - confirm the loaded table actually has a target_name column.
uniprot_metadata = {
    row.target_name: {
        'uniprot_function': clean_function(row.function),
        'uniprot_id': row.uniprot_id,
        'protein_name': row.protein_names,
    }
    for _, row in d.iterrows()
}

In [None]:
# Written alongside the other demo data files, using the same notebook-relative
# directory as the earlier JSON exports rather than a user-specific absolute
# path, so the cell also works from other checkouts of the repository.
with open('../src/demo/data/uniprot_metadata.json', 'w') as file:
    json.dump(uniprot_metadata, file)