In [1]:
from utils import gcs_utils as gcs
from utils import model_and_evaluate_cluster as ev

import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import euclidean_distances
import matplotlib.pyplot as plt

import importlib
import hdbscan
from sklearn.cluster import DBSCAN
import io
import os
import pickle
import datetime


from sklearn.neighbors import NearestNeighbors
import seaborn as sns
sns.set()

from sklearn.cluster import KMeans
from kneed import KneeLocator
import copy
from sklearn.manifold import TSNE 

import matplotlib.colors as mcolors
from matplotlib import cm 


In [2]:
# GCS folder (the `withMask=false` variant) that holds the DeepFold embedding files.
prefix = 'embeddings/DeepFold/withMask=false/'
listed_paths = gcs.list_file_paths(prefix)
# Everything after the first listed path is treated as an embedding file.
# NOTE(review): presumably entry 0 is the folder placeholder itself — confirm.
keys = listed_paths[1:]

In [3]:
# Read every listed embedding file through the project helper, which returns
# three values: the embedding matrix, the missing entries and the protein ids.
(
    X_deepfold,
    missing_deepfold,
    protein_id_deepfold,
) = ev.import_deepfold_embeddings(keys)

embeddings/DeepFold/withMask=false/embeddings_00.csv
embeddings/DeepFold/withMask=false/embeddings_01.csv
embeddings/DeepFold/withMask=false/embeddings_02.csv
embeddings/DeepFold/withMask=false/embeddings_03.csv
embeddings/DeepFold/withMask=false/embeddings_04.csv
embeddings/DeepFold/withMask=false/embeddings_05.csv
embeddings/DeepFold/withMask=false/embeddings_06.csv
embeddings/DeepFold/withMask=false/embeddings_07.csv
embeddings/DeepFold/withMask=false/embeddings_08.csv
embeddings/DeepFold/withMask=false/embeddings_09.csv
embeddings/DeepFold/withMask=false/embeddings_10.csv
embeddings/DeepFold/withMask=false/embeddings_11.csv
embeddings/DeepFold/withMask=false/embeddings_12.csv
embeddings/DeepFold/withMask=false/embeddings_13.csv
embeddings/DeepFold/withMask=false/embeddings_14.csv
embeddings/DeepFold/withMask=false/embeddings_15.csv
embeddings/DeepFold/withMask=false/embeddings_16.csv
embeddings/DeepFold/withMask=false/embeddings_17.csv
embeddings/DeepFold/withMask=false/embeddings_

In [4]:
# Sanity check: the embedding matrix and the id vector should have the same
# number of rows. One shape per line.
print(X_deepfold.shape, protein_id_deepfold.shape, sep='\n')

(20301, 398)
(20301,)


In [5]:
# Table of proteins with a `confidence` label per `protein_id`.
protein_conf = gcs.download_parquet('structure_files/proteins_and_confidences.parquet')

# Ids of all proteins whose confidence label is anything other than 'D'.
kept_protein_ids = np.array(protein_conf[protein_conf.confidence != 'D'].protein_id)

# Boolean row selector: True where the embedded protein is one of the kept ids.
mask = np.isin(protein_id_deepfold, kept_protein_ids)

# "_ep" variants = the same arrays restricted to the kept proteins.
X_deepfold_ep = X_deepfold[mask]
protein_id_deepfold_ep = protein_id_deepfold[mask]

# Shapes before and after filtering, one per line.
print(
    mask.shape,
    protein_id_deepfold.shape,
    protein_id_deepfold_ep.shape,
    X_deepfold.shape,
    X_deepfold_ep.shape,
    sep='\n',
)

(20301,)
(20301,)
(19325,)
(20301, 398)
(19325, 398)


In [16]:
# DBSCAN over the filtered DeepFold embeddings using cosine distance
# (neighbourhood radius 0.05, at least 2 samples per core neighbourhood).
a1ep_model = DBSCAN(
    metric='cosine',
    eps=0.05,
    min_samples=2,
)
# Trailing semicolon keeps the cell silent, as the original assignment was.
a1ep_model.fit(X_deepfold_ep);

  return f(*args, **kwargs)


In [7]:
# HDBSCAN over the filtered DeepFold embeddings with cosine distance:
# min_cluster_size=5, min_samples=1.
# NOTE(review): algorithm='generic' is presumably needed for the cosine metric — confirm.
b1ep_model = hdbscan.HDBSCAN(
    metric='cosine',
    algorithm='generic',
    min_cluster_size=5,
    min_samples=1,
    alpha=1.0,
    leaf_size=40,
    p=None,
    approx_min_span_tree=True,
    gen_min_span_tree=False,
)
# Kept as the final expression so the notebook displays the fitted model.
b1ep_model.fit(X_deepfold_ep)

  return f(*args, **kwargs)


HDBSCAN(algorithm='generic', metric='cosine', min_samples=1)

In [8]:
# HDBSCAN over the filtered DeepFold embeddings with cosine distance:
# min_cluster_size=5, min_samples=5.
# NOTE(review): algorithm='generic' is presumably needed for the cosine metric — confirm.
b1epa_model = hdbscan.HDBSCAN(
    metric='cosine',
    algorithm='generic',
    min_cluster_size=5,
    min_samples=5,
    alpha=1.0,
    leaf_size=40,
    p=None,
    approx_min_span_tree=True,
    gen_min_span_tree=False,
)
# Kept as the final expression so the notebook displays the fitted model.
b1epa_model.fit(X_deepfold_ep)

  return f(*args, **kwargs)


HDBSCAN(algorithm='generic', metric='cosine', min_samples=5)

In [9]:
# HDBSCAN over the filtered DeepFold embeddings with cosine distance:
# min_cluster_size=10, min_samples=1.
# NOTE(review): algorithm='generic' is presumably needed for the cosine metric — confirm.
b1epb_model = hdbscan.HDBSCAN(
    metric='cosine',
    algorithm='generic',
    min_cluster_size=10,
    min_samples=1,
    alpha=1.0,
    leaf_size=40,
    p=None,
    approx_min_span_tree=True,
    gen_min_span_tree=False,
)
# Kept as the final expression so the notebook displays the fitted model.
b1epb_model.fit(X_deepfold_ep)

  return f(*args, **kwargs)


HDBSCAN(algorithm='generic', metric='cosine', min_cluster_size=10,
        min_samples=1)

In [10]:
# HDBSCAN over the filtered DeepFold embeddings with cosine distance:
# min_cluster_size=10, min_samples=5.
# NOTE(review): algorithm='generic' is presumably needed for the cosine metric — confirm.
b1epc_model = hdbscan.HDBSCAN(
    metric='cosine',
    algorithm='generic',
    min_cluster_size=10,
    min_samples=5,
    alpha=1.0,
    leaf_size=40,
    p=None,
    approx_min_span_tree=True,
    gen_min_span_tree=False,
)
# Kept as the final expression so the notebook displays the fitted model.
b1epc_model.fit(X_deepfold_ep)

  return f(*args, **kwargs)


HDBSCAN(algorithm='generic', metric='cosine', min_cluster_size=10,
        min_samples=5)

In [11]:
# (file stem, fitted model) pairs, in the order they are pickled by the next cell.
models = list({
    'a1ep_model': a1ep_model,
    'b1ep_model': b1ep_model,
    'b1epa_model': b1epa_model,
    'b1epb_model': b1epb_model,
    'b1epc_model': b1epc_model,
}.items())

In [12]:
# Persist every fitted model locally as <folder>/<model_name>.pkl.
model_output_dir = 'model_outputs/no_cluster_size_limit/deepfold_redo/'
# Create the folder first: open(..., 'wb') does not create missing parent
# directories, so this cell would otherwise fail on a fresh checkout.
os.makedirs(model_output_dir, exist_ok=True)

for model_name, model in models:
    file_path = os.path.join(model_output_dir, model_name + '.pkl')
    with open(file_path, 'wb') as file:
        pickle.dump(model, file)

In [19]:
# Fetch the `asp` table through the project helper; it is later passed to
# ev.protein_confidence_agg together with the cluster assignments.
# NOTE(review): contents/schema of `asp` are not visible here — confirm in `ev`.
asp = ev.download_asp()

In [21]:
# Sequence table from GCS, extended with a `seq_len` column holding the
# character length of each one-letter-code sequence string.
sequences = gcs.download_parquet("structure_files/sequences/sequences.parquet")
sequence_lengths = sequences["pdbx_seq_one_letter_code"].str.len()
sequences["seq_len"] = sequence_lengths

In [22]:
# Evaluate each selected clustering model and write all of its artefacts
# (overview stats, model, cluster assignments, protein pair combos, per-cluster
# stats) into the local `prefix` folder, which the upload cell reads afterwards.
prefix = 'model_outputs/no_cluster_size_limit/deepfold_redo/'
# Make sure the folder exists; open(..., 'wb') and to_parquet() do not create it.
os.makedirs(prefix, exist_ok=True)

timestamp_format = "%Y-%m-%d %H:%M:%S"

# model code -> (algorithm label, embedding label, fitted model, protein ids, embedding matrix)
# Deliberately NOT called `models`: that name is the list of (name, model)
# pairs used by the pickling cell, and rebinding it to a dict here would break
# a re-run of that cell.
models_to_evaluate = {
    'A1EP': ('DBSCAN', 'DeepFold_EP', a1ep_model, protein_id_deepfold_ep, X_deepfold_ep),
    'B1EPA': ('HDBSCAN', 'DeepFold_EP', b1epa_model, protein_id_deepfold_ep, X_deepfold_ep),
    'B1EPC': ('HDBSCAN', 'DeepFold_EP', b1epc_model, protein_id_deepfold_ep, X_deepfold_ep),
}

for model_code, vals in models_to_evaluate.items():
    algo, embed, model, protein_id, X = vals
    # Shared path stem "<prefix><code>-<algo>-<embedding>" for the per-model files.
    out_stem = prefix + model_code + '-' + algo + '-' + embed

    print (model, "started processing: ", datetime.datetime.now().strftime(timestamp_format))
    # Model stats including standard cluster similarity eval
    model_overview_stats = ev.model_overview(model, X)
    with open(out_stem + '-model_overview.pkl', 'wb') as file:
        pickle.dump(model_overview_stats, file)
    print(model_overview_stats)

    # One row per protein with the cluster label assigned by the model.
    clusters = pd.DataFrame({'protein':       protein_id,
                             'cluster_label': model.labels_})

    with open(prefix + model_code + '_model.pkl', 'wb') as file:
        pickle.dump(model, file)

    with open(prefix + model_code + '_clusters.pkl', 'wb') as file:
        pickle.dump(clusters, file)

    started_combos = datetime.datetime.now()
    print (model, "- started combo generation: ", datetime.datetime.now().strftime(timestamp_format))

    # Output all protein combos for TM Align calc
    all_protein_combos_per_cluster = ev.find_all_protein_combos_per_cluster(clusters,
                                                                            exclude_unclustered=True,
                                                                            max_clus_size=99999,
                                                                            rand_seed=1710)

    print("Combo duration : ", datetime.datetime.now() - started_combos)
    all_protein_combos_per_cluster.to_parquet(out_stem + "-all_protein_combos_per_cluster.parquet")

    print (model, "- started confidence level generation & seq stats: ", datetime.datetime.now().strftime(timestamp_format))
    # Confidence level per protein
    cluster_conf = ev.protein_confidence_agg(clusters, asp)

    # Sequence stats
    sequence_stats = ev.sequence_stats(clusters, sequences).set_index("cluster_label")

    cluster_stats = ev.merge_cluster_stats(cluster_conf, sequence_stats)

    # Tag the rows with the model/embedding labels and move the identifying
    # columns to the front; all remaining columns keep their relative order.
    cluster_stats['model'] = algo
    cluster_stats['embedding'] = embed
    first_cols = ['model', 'embedding', 'num_proteins']
    ordered_cols = first_cols + list(cluster_stats.columns[~cluster_stats.columns.isin(first_cols)])
    cluster_stats = cluster_stats.reindex(columns=ordered_cols)

    cluster_stats.to_parquet(out_stem + '-cluster_stats.parquet')

    print("Completed processing "+ model_code + '-' + algo + '-' + embed)
    print (model, "- completed processing: ", datetime.datetime.now().strftime(timestamp_format))
    print()


now = datetime.datetime.now()
print ("Current date and time : ")
print (now.strftime(timestamp_format))

DBSCAN(eps=0.05, metric='cosine', min_samples=2) started processing:  2021-12-03 07:52:30


  return f(*args, **kwargs)
  return f(*args, **kwargs)


{'Model': "DBSCAN(eps=0.05, metric='cosine', min_samples=2)", 'Number of clusters categories (incl. noise)': 1222, 'Number of clusters (excl. noise)': 1221, 'Noise': 9923, 'Largest non-noise cluster': 4571, 'Noise as % of total': 0.5134799482535576, 'Noise and largest cluster as % of total': 0.750012936610608, 'Silhouette score': -0.0003605732425635734, 'DB score': 0.9327115232261285}
DBSCAN(eps=0.05, metric='cosine', min_samples=2) - started combo generation:  2021-12-03 07:52:36
Combo duration :  0:13:16.110444
DBSCAN(eps=0.05, metric='cosine', min_samples=2) - started confidence level generation & seq stats:  2021-12-03 08:05:57
Completed processing A1EP-DBSCAN-DeepFold_EP
DBSCAN(eps=0.05, metric='cosine', min_samples=2) - completed processing:  2021-12-03 08:06:03

HDBSCAN(algorithm='generic', metric='cosine', min_samples=5) started processing:  2021-12-03 08:06:03


  return f(*args, **kwargs)
  return f(*args, **kwargs)


{'Model': "HDBSCAN(algorithm='generic', metric='cosine', min_samples=5)", 'Number of clusters categories (incl. noise)': 234, 'Number of clusters (excl. noise)': 233, 'Noise': 15414, 'Largest non-noise cluster': 308, 'Noise as % of total': 0.7976196636481242, 'Noise and largest cluster as % of total': 0.8135575679172057, 'Silhouette score': 0.42410288643883143, 'DB score': 0.9138225279596784, 'Length of embedding': 398}
HDBSCAN(algorithm='generic', metric='cosine', min_samples=5) - started combo generation:  2021-12-03 08:06:05
Combo duration :  0:00:00.857618
HDBSCAN(algorithm='generic', metric='cosine', min_samples=5) - started confidence level generation & seq stats:  2021-12-03 08:06:06
Completed processing B1EPA-HDBSCAN-DeepFold_EP
HDBSCAN(algorithm='generic', metric='cosine', min_samples=5) - completed processing:  2021-12-03 08:06:10

HDBSCAN(algorithm='generic', metric='cosine', min_cluster_size=10,
        min_samples=5) started processing:  2021-12-03 08:06:10


  return f(*args, **kwargs)
  return f(*args, **kwargs)


{'Model': "HDBSCAN(algorithm='generic', metric='cosine', min_cluster_size=10,\n        min_samples=5)", 'Number of clusters categories (incl. noise)': 121, 'Number of clusters (excl. noise)': 120, 'Noise': 15191, 'Largest non-noise cluster': 754, 'Noise as % of total': 0.7860802069857697, 'Noise and largest cluster as % of total': 0.8250970245795601, 'Silhouette score': 0.40334519037732136, 'DB score': 0.9634986755259322, 'Length of embedding': 398}
HDBSCAN(algorithm='generic', metric='cosine', min_cluster_size=10,
        min_samples=5) - started combo generation:  2021-12-03 08:06:12
Combo duration :  0:00:00.711368
HDBSCAN(algorithm='generic', metric='cosine', min_cluster_size=10,
        min_samples=5) - started confidence level generation & seq stats:  2021-12-03 08:06:13
Completed processing B1EPC-HDBSCAN-DeepFold_EP
HDBSCAN(algorithm='generic', metric='cosine', min_cluster_size=10,
        min_samples=5) - completed processing:  2021-12-03 08:06:17

Current date and time : 
2021

In [24]:
# Current working directory; the upload cell joins it with `prefix` to find
# the locally written model outputs.
home_path = os.getcwd()

In [27]:
# Upload every locally written output file to the same relative path on GCS.
local_output_dir = os.path.join(home_path, prefix)
for file in os.listdir(local_output_dir):
    local_path = os.path.join(local_output_dir, file)
    # os.listdir also returns sub-directories (e.g. checkpoint folders);
    # only regular files can be uploaded as blobs, so skip anything else.
    if not os.path.isfile(local_path):
        continue
    print("Upload: "+prefix + file)
    gcs.upload_blob(local_path, prefix + file)

Upload: model_outputs/no_cluster_size_limit/deepfold_redo/B1EPC-HDBSCAN-DeepFold_EP-model_overview.pkl
Upload: model_outputs/no_cluster_size_limit/deepfold_redo/A1EP-DBSCAN-DeepFold_EP-cluster_stats.parquet
Upload: model_outputs/no_cluster_size_limit/deepfold_redo/B1EPC_clusters.pkl
Upload: model_outputs/no_cluster_size_limit/deepfold_redo/B1EPC-HDBSCAN-DeepFold_EP-cluster_stats.parquet
Upload: model_outputs/no_cluster_size_limit/deepfold_redo/A1EP_clusters.pkl
Upload: model_outputs/no_cluster_size_limit/deepfold_redo/a1ep_model.pkl
Upload: model_outputs/no_cluster_size_limit/deepfold_redo/b1epa_model.pkl
Upload: model_outputs/no_cluster_size_limit/deepfold_redo/A1EP_noise_stats.pkl
Upload: model_outputs/no_cluster_size_limit/deepfold_redo/b1epb_model.pkl
Upload: model_outputs/no_cluster_size_limit/deepfold_redo/A1EP-DBSCAN-DeepFold_EP-model_overview.pkl
Upload: model_outputs/no_cluster_size_limit/deepfold_redo/B1EPA-HDBSCAN-DeepFold_EP-all_protein_combos_per_cluster.parquet
Upload: mo

In [29]:
# Display (rows, columns) of the filtered embedding matrix used for clustering.
X_deepfold_ep.shape

(19325, 398)

In [32]:
# Print one tab-separated summary row per model (handy for pasting into a
# spreadsheet). Column order: noise count, largest cluster size, total
# proteins, noise share, noise + largest cluster share, cluster count,
# silhouette score, DB score.

# Derived from the data instead of hard-coding 19325, so the rows stay correct
# if the confidence filter (and therefore the row count) ever changes.
num_proteins = X_deepfold_ep.shape[0]

for model in [a1ep_model, b1epa_model, b1epc_model]:
    overview = ev.model_overview(model, X_deepfold_ep)

    row_values = [
        overview['Noise'],
        overview['Largest non-noise cluster'],
        num_proteins,
        overview['Noise as % of total'],
        overview['Noise and largest cluster as % of total'],
        overview['Number of clusters (excl. noise)'],
        overview['Silhouette score'],
        overview['DB score'],
    ]
    print('\t'.join(str(value) for value in row_values))

  return f(*args, **kwargs)
  return f(*args, **kwargs)


9923	4571	19325	0.5134799482535576	0.750012936610608	1221	-0.0003605732425635734	0.9327115232261285


  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)


15414	308	19325	0.7976196636481242	0.8135575679172057	233	0.42410288643883143	0.9138225279596784
15191	754	19325	0.7860802069857697	0.8250970245795601	120	0.40334519037732136	0.9634986755259322


  return f(*args, **kwargs)


In [None]:
# Pasted copy of the tab-separated summary rows printed by the overview cell
# above. Kept as comments so that "Run All" does not stop on a SyntaxError here.
# Columns: Noise, Largest non-noise cluster, num_proteins, Noise as % of total,
#          Noise and largest cluster as % of total, Number of clusters (excl. noise),
#          Silhouette score, DB score
# Rows, in order: a1ep_model, b1epa_model, b1epc_model
# 9923	4571	19325	0.5134799482535576	0.750012936610608	1221	-0.0003605732425635734	0.9327115232261285
# 15414	308	19325	0.7976196636481242	0.8135575679172057	233	0.42410288643883143	0.9138225279596784
# 15191	754	19325	0.7860802069857697	0.8250970245795601	120	0.40334519037732136	0.9634986755259322