In [None]:
import pandas, re, collections, importlib, scipy, os
pd = pandas
import matplotlib.pyplot as plt
import scipy.stats
import numpy as np

In [None]:
# All data paths below are relative to the project directory. The location can
# be overridden with the PERTURB_DIR environment variable so the notebook runs
# on machines other than the original author's; the original path is the default.
os.chdir(os.environ.get('PERTURB_DIR', '/Users/dfporter/smaller_projects/perturb/'))

In [None]:
# Get GGR RNA expression.

# Translation table: Ensembl gene ID -> associated gene name.
ensg_id = pandas.read_csv('translations/ensg_enst_refseq_name.txt', sep='\t')
print(ensg_id.shape)
# Keep only the first row for each Ensembl gene ID so the mapping is unambiguous.
ensg_id = ensg_id.drop_duplicates(subset=['Ensembl Gene ID'])
ensg_to_id = dict(zip(ensg_id['Ensembl Gene ID'], ensg_id['Associated Gene Name']))

# GGR data.
ggr_tpm = pandas.read_csv('ggr/ggr.rna.tpm.pc.mat.txt', sep='\t')
# Row-wise maximum over the numeric columns only. The frame also carries the
# string `gene_id` column; without numeric_only=True, pandas >= 2.0 raises a
# TypeError when it tries to compare strings with floats.
ggr_tpm['max'] = ggr_tpm.max(axis=1, numeric_only=True)
# Drop rows whose maximum is not positive (log10 would be undefined for them).
# .copy() makes the filtered frame independent, so the column assignments below
# do not trigger SettingWithCopyWarning.
ggr_tpm = ggr_tpm.loc[ggr_tpm['max'] > 0, :].copy()
ggr_tpm['log_max'] = np.log10(ggr_tpm['max'])
# IDs missing from the translation table get the placeholder string 'None'.
ggr_tpm['Gene name'] = [ensg_to_id.get(_id, 'None') for _id in ggr_tpm.gene_id]

#ggr_tpm['Expressed'] = [
#    {True: 'Yes', False: 'No'}[bool(x>0.5)] for x in ggr_tpm['log_max']
#]

# Gene name -> log10(max). If several rows share a gene name (including the
# 'None' placeholder), the last row wins.
name_to_expression = dict(zip(ggr_tpm['Gene name'], ggr_tpm['log_max']))

In [None]:
# Histogram of log10(max) for every gene kept in ggr_tpm.
exp_vals = ggr_tpm['log_max']
#exp_vals = exp_vals[exp_vals<3]
fig, ax = plt.subplots()
ax.hist(exp_vals, bins=100)
# Label the figure so it can be read on its own.
ax.set_xlabel('log10(max TPM)')
ax.set_ylabel('Number of genes')
ax.set_title('GGR RNA expression (per-gene maximum)')
plt.show()
# Close this specific figure to release its memory.
plt.close(fig)

In [None]:
# Get protein expression in skin
nt = pandas.read_csv('./proteinatlas/normal_tissue.tsv', sep='\t')
# Case-insensitive match on the Tissue column. na=False treats a missing tissue
# as "not skin" instead of crashing the way a per-row re.search would.
skin = nt.loc[nt['Tissue'].str.contains('Skin', case=False, na=False), :]

print(skin['Cell type'].value_counts())

# Restrict to the two cell-type labels of interest.
kc_prot = skin.loc[skin['Cell type'].isin(['keratinocytes', 'epidermal cells']), :]
# Gene name -> Level. If a gene has several matching rows, the last row wins.
name_to_protein_level = dict(zip(kc_prot['Gene name'], kc_prot.Level))

print(kc_prot['Level'].value_counts())

In [None]:
# Load the G-protein gene library; the file is read without a header row and
# its single column is named 'Gene name'.
df = pandas.read_csv('./G-protein_library.txt', header=None)
df.columns = ['Gene name']
# Keep one row per gene name (first occurrence); the original index is retained.
df = df.drop_duplicates(subset='Gene name').copy()
# Every gene name that appears as a value in the translation table.
gene_names_translatable = {gene_name for gene_name in ensg_to_id.values()}
def lookup(symbol, expression_by_name=None, translatable_names=None):
    """Report where a gene symbol was (or was not) found.

    Parameters
    ----------
    symbol
        Gene name to look up.
    expression_by_name : mapping, optional
        Gene name -> expression value. Defaults to the notebook-level
        ``name_to_expression`` when omitted.
    translatable_names : container, optional
        Gene names present in the translation table. Defaults to the
        notebook-level ``gene_names_translatable`` when omitted.

    Returns
    -------
    str
        'Found in GGR', 'In Translation table, not in GGR data.', or
        'Not in translation table.'
    """
    # Fall back to the notebook globals so existing one-argument calls still work.
    if expression_by_name is None:
        expression_by_name = name_to_expression
    if translatable_names is None:
        translatable_names = gene_names_translatable

    if symbol in expression_by_name:
        return 'Found in GGR'
    if symbol in translatable_names:
        return 'In Translation table, not in GGR data.'
    return 'Not in translation table.'

# Annotate each library gene with its GGR log10(max) value; genes absent from
# the GGR mapping get the sentinel -3.
df['log_max_ggr_rna_tpm'] = df['Gene name'].map(lambda gene: name_to_expression.get(gene, -3))
# Protein level from the skin subset of the protein atlas table, or 'Not found'.
df['Protein_atlas_protein_exp_in_skin'] = df['Gene name'].map(
    lambda gene: name_to_protein_level.get(gene, 'Not found')
)

# Record where each gene name was found.
df['lookup'] = df['Gene name'].map(lookup)
# A gene counts as expressed when its log10(max) is above zero.
df['Expressed?'] = df['log_max_ggr_rna_tpm'].astype(float) > 0
exp_in_skin = df.loc[df['Expressed?']]

# Summary counts, then the full table.
print(df['lookup'].value_counts())
print(f'All in G-protein library: {df.shape[0]}')
print(f'Expressed in skin: {exp_in_skin.shape[0]}')
#print(f'{df}')
print(df)
# Export the annotated library to an Excel workbook.
df.to_excel('G-protein_library.xlsx')