## Within-cluster biophysical properties
__Keith Cheveralls__<br>
__October 2021__

This notebook retrieves and calculates various protein biophysical properties and then calculates and plots their mean values within either the localization-based or interactome-based clusters.

In [None]:
import IPython
import numpy as np
import pandas as pd
import pathlib
import seaborn as sns
import sys

from scipy import stats
from matplotlib import pyplot as plt
from matplotlib import rcParams

# enable the autoreload extension; in mode 1 only the modules registered
# with %aimport are reloaded before each cell is executed
%load_ext autoreload
%autoreload 1

# add the parent directory to the import path so that the `scripts` package
# can be imported from this notebook
sys.path.append('../')
%aimport scripts.biophysical_properties.utils
from scripts.biophysical_properties import utils

# render inline figures as 2x-resolution PNGs
IPython.display.set_matplotlib_formats('png2x')

# base font size shared by all text elements (the legend is two points smaller)
fontsize = 14

# global matplotlib defaults used by every figure in this notebook
rcParams.update({
    'figure.dpi': 80,
    'savefig.dpi': 150,
    'pdf.fonttype': 42,
    'axes.grid': False,
    'figure.figsize': (6, 4),
    'font.family': 'sans-serif',
    'font.size': fontsize,
    'axes.labelsize': fontsize,
    'axes.titlesize': fontsize,
    'xtick.labelsize': fontsize,
    'ytick.labelsize': fontsize,
    'legend.fontsize': fontsize - 2,
})

In [None]:
# directory of small datasets that live in the repo
data_dir = pathlib.Path('../data/biophysical-properties/')

# local directory for figures and exports; it doubles as the cache directory
output_dir = pathlib.Path('/Users/keith.cheveralls/Box/KC-opencell-paper/disorder-scores')
cache_dir = output_dir

# date prefix shared by all of the filenames below
timestamp = '2021-08-28'

# large datasets that are retrieved once and cached locally
seqs_filepath = cache_dir.joinpath(f'{timestamp}-all-uniprot-sequences.csv')
iupred_scores_filepath = cache_dir.joinpath(f'{timestamp}-all-uniprot-iupred-scores.csv')
metapredict_scores_filepath = cache_dir.joinpath(
    f'{timestamp}-all-uniprot-metapredict-scores.csv'
)

# small output datasets included in the github repo (in ../data)
props_filepath = data_dir.joinpath(f'{timestamp}-all-uniprot-protein-properties.csv')
iupred_mean_scores_filepath = data_dir.joinpath(
    f'{timestamp}-all-uniprot-iupred-scores-means.csv'
)
metapredict_mean_scores_filepath = data_dir.joinpath(
    f'{timestamp}-all-uniprot-metapredict-scores-means.csv'
)

### Retrieve protein sequences from UniProt

In [None]:
# table of human UniProt entries (a locally cached TSV export); the unique
# values of its `Entry` column are the uniprot_ids processed below
df = pd.read_csv(
    cache_dir / 'all-human-uniprot-homo+sapiens-filtered-reviewed yes.tsv',
    sep='\t',
)
all_uniprot_ids = df['Entry'].unique()
len(all_uniprot_ids)

In [None]:
# retrieve the sequence for every uniprot_id; the second return value
# presumably lists the ids for which no sequence was found (see utils) - TODO confirm
seqs, missing_uniprot_ids = utils.get_sequences_from_uniprot(all_uniprot_ids)

In [None]:
# compare the retrieved sequences to the number of requested and missing ids
seqs.shape, len(all_uniprot_ids), len(missing_uniprot_ids)

In [None]:
# cache the retrieved sequences; seqs_filepath already includes cache_dir,
# so it is used directly (there is no separate destination directory)
seqs.to_csv(seqs_filepath, index=False)

### Calculate metapredict scores from sequences

In [None]:
# load the cached sequences so that this section can be run on its own
seqs = pd.read_csv(seqs_filepath)
seqs.shape

In [None]:
# calculate the metapredict disorder scores from the sequences
scores = utils.calc_metapredict_scores(seqs)

In [None]:
# number of distinct uniprot_ids with scores, and the number of missing score values
scores.uniprot_id.unique().shape, scores.score.isna().sum()

In [None]:
# cache the metapredict scores locally
scores.to_csv(metapredict_scores_filepath, index=False)

### Retrieve IUPred2A disorder scores using the IUPred2A API

Note: this takes roughly 7 minutes per 1000 uniprot_ids.

In [None]:
# retrieve the IUPred2A scores for all uniprot_ids (slow; see the note above)
scores = utils.get_iupred_scores(all_uniprot_ids)

In [None]:
# number of distinct uniprot_ids with scores versus the number requested
scores.uniprot_id.unique().shape, len(all_uniprot_ids)

In [None]:
# cache the IUPred2A scores retrieved above; iupred_scores_filepath already
# includes cache_dir, so it is used directly
scores.to_csv(iupred_scores_filepath, index=False)

### Calculate biophysical properties from sequences

In [None]:
# load the cached sequences so that this section can be run on its own
seqs = pd.read_csv(seqs_filepath)
seqs.shape

In [None]:
# calculate the sequence-derived biophysical properties
props = utils.calc_biophysical_properties(seqs)

In [None]:
# count the number of errors: rows with a missing molecular_weight are
# presumably sequences for which the calculation failed - TODO confirm in utils
props.shape, props.molecular_weight.isna().sum()

In [None]:
# save the properties to the repo's data directory
props.to_csv(props_filepath, index=False)

### Calculate mean disorder score

In [None]:
# which disorder predictor to summarize: 'iupred' selects the IUPred files,
# and any other value selects the metapredict files
score_kind = 'iupred'

# to calculate whole-sequence means only
window_sizes = []

# input (per-position scores) and output (mean scores) files for the chosen predictor
if score_kind == 'iupred':
    scores_filepath = iupred_scores_filepath
    mean_scores_filepath = iupred_mean_scores_filepath
else:
    scores_filepath = metapredict_scores_filepath
    mean_scores_filepath = metapredict_mean_scores_filepath

In [None]:
# load the per-position scores, ordered by protein and then by position
scores = pd.read_csv(scores_filepath).sort_values(by=['uniprot_id', 'position'])
scores.shape, scores.uniprot_id.unique().shape

In [None]:
# sanity check a single uniprot_id: the raw scores and their 50-position rolling mean
score = scores.loc[scores.uniprot_id == 'P51114']
plt.plot(score.score)

# select the score column before rolling, so that only the numeric scores are
# averaged (rolling over the whole frame would also try to aggregate the
# non-numeric uniprot_id column)
plt.plot(score.score.rolling(window=50).mean())

In [None]:
# calculate the mean disorder scores; window_sizes is empty here, which is
# meant to yield whole-sequence means only (see the cell that sets it)
mean_scores = utils.calc_windowed_disorder_scores(scores, window_sizes=window_sizes)
# shape of the result and the number of missing mean scores
mean_scores.shape, mean_scores.mean_score.isna().sum()

In [None]:
# save the mean scores for the selected score_kind to the repo's data directory
mean_scores.to_csv(mean_scores_filepath, index=False)

### Merge protein properties, disorder scores, and the leiden cluster_ids

In [None]:
# load both sets of clusters: the mass-spec (interactome) clusters
# and the imaging (leiden) clusters
ms_clusters = pd.read_csv(data_dir / 'interactome-clusters.csv')
im_clusters = pd.read_csv(data_dir / '2021-08-23-leiden-clusters-res30-seed18.csv')

In [None]:
# load the generic biophysical properties and the disorder scores
props = pd.read_csv(props_filepath)
# NOTE(review): the IUPred means are always loaded here, regardless of the
# score_kind chosen in the previous section
mean_scores = pd.read_csv(iupred_mean_scores_filepath)
mean_scores.shape

In [None]:
# inner merge: keep only the uniprot_ids that have both a mean score and properties
all_props = pd.merge(mean_scores, props, on='uniprot_id', how='inner')

In [None]:
# RNA-binding annotations come from a separate, external dataset
rna_props = pd.read_csv(data_dir / 'external' / 'rna-binding-annotations.csv')

# convert the annotation to a boolean flag that is True only for 'RNA_BP'
rna_binding_flag = rna_props[['uniprot_id', 'is_rna_bp']].assign(
    is_rna_bp=lambda annotations: annotations.is_rna_bp == 'RNA_BP'
)

# inner merge: proteins without an RNA-binding annotation are dropped
all_props = all_props.merge(rna_binding_flag, on='uniprot_id', how='inner')

In [None]:
# manually choose either mass spec or spatial clusters
clusters = im_clusters.copy()
# inner merge: only proteins present in both the clusters and the properties are kept
merged = pd.merge(clusters, all_props, on='uniprot_id', how='inner')
# compare the shapes to see how many clustered proteins were retained
clusters.shape, merged.shape

### Plot the distribution of within-cluster means

In [None]:
# column of cluster labels: 'leiden' for the imaging clusters,
# or 'mcl_community_id' for the interactome clusters
cluster_id = 'leiden'
# cluster_id = 'mcl_community_id'

# property to summarize, its histogram bins, and its axis label
col = 'mean_score'
bins = np.arange(0, 0.85, .05)
label = 'disorder score'

# col = 'gravy'
# bins = np.arange(-1.5, 0.6, 0.1)
# label = 'hydrophobicity score'

# col = 'is_rna_bp'
# bins = np.arange(0, 1.1, 0.1)
# label = 'RNA-binding label'

# seeded generator so that the shuffled (null) distribution is reproducible
rng = np.random.default_rng(0)

tmp = merged.copy()

# null distribution: permute the property values across proteins, which breaks
# any association with the clusters, and collect the within-cluster means
# (only the column of interest is aggregated, never the non-numeric columns)
shuffled_means = []
for _ in range(100):
    tmp['shuffled'] = rng.permutation(tmp[col].values)
    shuffled_means.extend(tmp.groupby(cluster_id)['shuffled'].mean().values)

# observed within-cluster means
true_means = tmp.groupby(cluster_id)[col].mean().values

sns.histplot(shuffled_means, element="step", stat='probability', color='#aaa', bins=bins)
sns.histplot(true_means, element="step", stat='probability', color='#204eba', bins=bins, alpha=0.5)

# label the axes before (optionally) saving, so the saved figure includes the labels
plt.xlabel('Within-cluster means for %s' % (label or col))
plt.ylabel('Probability')

# change to True to export the figure
if False:
    plt.savefig(
        output_dir / ('distribution-of-within-cluster-means--%s--%s.pdf' % (cluster_id, col,)),
        bbox_inches='tight',
        pad_inches=0.2
    )

### Export a CSV of within-cluster mean disorder, hydrophobicity, and RNA binding

In [None]:
# table of within-cluster means; the three property columns are selected
# before aggregating so that non-numeric columns are never averaged.
# Note: is_rna_bp is a boolean flag, so its mean is the fraction (0-1) of
# RNA-binding proteins in each cluster, despite the 'percent_rna_bp' name
(
    merged.groupby('leiden')[['mean_score', 'gravy', 'is_rna_bp']].mean()
    .reset_index()
    .rename(
        columns={
            'mean_score': 'mean_iupred_score',
            'leiden': 'leiden_cluster_id',
            'gravy': 'mean_hydrophobicity_score',
            'is_rna_bp': 'percent_rna_bp'
        },
    )
    # .to_csv(output_dir / '2021-09-30-all-within-cluster-means.csv', index=False)
)

### Export the lists of true and shuffled within-cluster means

In [None]:
# export the observed within-cluster means as a single-column CSV;
# true_means, shuffled_means and col are those set by the histogram cell above
pd.Series(true_means, name='true_clusters').to_csv(
    output_dir / ('%s-within-cluster-means--%s--true-clusters.csv' % (timestamp, col)), 
    index=False
)
# export the within-cluster means pooled over all of the shuffles
pd.Series(shuffled_means, name='shuffled_clusters').to_csv(
    output_dir / ('%s-within-cluster-means--%s--shuffled-clusters.csv' % (timestamp, col)), 
    index=False
)

### Scatterplot of within-cluster means of disorder score and gravy index

In [None]:
# column of cluster labels: 'leiden' for the imaging clusters,
# or 'mcl_community_id' for the interactome clusters
cluster_id = 'leiden'
# cluster_id = 'mcl_community_id'

# properties whose within-cluster means are calculated
cols = ['gravy', 'mean_score', 'is_rna_bp']

# seeded generator so that the shuffled clusters are reproducible
rng = np.random.default_rng(0)

tmp = merged.copy()

# observed within-cluster means: one row per cluster, one column per property
# (only the columns of interest are aggregated, never the non-numeric columns)
true_means = tmp.groupby(cluster_id)[cols].mean().reset_index(drop=True)

# null distribution: permute the cluster labels across proteins, which keeps
# the cluster sizes but breaks any association with the properties.
# permutation() returns a new array, so no existing array is modified in place
shuffled_means = []
for _ in range(100):
    tmp[cluster_id] = rng.permutation(tmp[cluster_id].values)
    shuffled_means.append(tmp.groupby(cluster_id)[cols].mean())

# pool the within-cluster means from all of the shuffles
shuffled_means = pd.concat(shuffled_means, ignore_index=True)

plt.figure(figsize=(6,6))
sns.scatterplot(
    x=shuffled_means.gravy, y=shuffled_means.mean_score, alpha=0.1, color='#999', edgecolor=None, s=3
)
sns.scatterplot(x=true_means.gravy, y=true_means.mean_score, alpha=1, color='red', s=7)

plt.xlabel('Hydrophobicity')
plt.ylabel('IUPRED2A score')

### Aside: calculate the Gini index for the true and shuffled distributions of within-cluster means

In [None]:
def mean_abs_diff(vals):
    '''
    Estimate the Gini index of a sample from its mean absolute difference.

    The absolute differences |x_i - x_j| are averaged over all ordered pairs
    of values (self-pairs included), and the average is divided by twice the
    sample mean.

    vals : array-like of numeric values
    Returns the estimated Gini index as a float.
    '''
    arr = np.array(vals)
    n_vals = len(arr)

    # sum the absolute differences one value at a time, which avoids
    # building the full matrix of pairwise differences
    total_abs_diff = sum(np.abs(x - arr).sum() for x in arr)
    n_pairs = n_vals * n_vals

    return total_abs_diff / n_pairs / (2 * arr.mean())

In [None]:
# sanity check: for a uniform distribution on [a, b], the Gini index is
# (b - a) / (3 * (a + b)), so the estimate from 10000 random samples
# should be close to this value
a_vals = [0, 1, 7]
b_vals = [1, 3, 11]
for a in a_vals:
    for b in b_vals:
        # only intervals with a <= b are valid
        if a <= b:
            theory_gini = (b - a) / (3 * (a + b))
            estimated_gini = mean_abs_diff(a + (b - a) * np.random.rand(10000))
            print((a, b, theory_gini, estimated_gini))

In [None]:
# true_means and shuffled_means are the dataframes from the scatterplot cell
# above (one column per property); the Gini index describes a single
# distribution, so it is calculated for one property at a time
# NOTE(review): 'mean_score' (the disorder score) is assumed to be the property of interest
gini_col = 'mean_score'
mean_abs_diff(true_means[gini_col]), mean_abs_diff(shuffled_means[gini_col])