In [10]:
from sklearn.cluster import OPTICS
from sklearn.manifold import Isomap
from sklearn.manifold import MDS
import numpy as np
from google.cloud import storage
from google.cloud.storage import Blob
from numpy import genfromtxt
import os
import pandas as pd
from utils import model_and_evaluate_cluster as ev
import itertools
import time
from os.path import exists

import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import seaborn as sns; sns.set()
import plotly
import plotly.express as px

In [82]:
def download_file(path):
    """Download one object from the project GCS bucket into the working directory.

    Parameters
    ----------
    path : str
        Object name inside the ``capstone-fall21-protein`` bucket, e.g.
        ``'structure_files/sequences/sequences.parquet'``.

    Returns
    -------
    str
        Local filename the object was written to, i.e. the last
        ``/``-separated component of ``path``.

    Raises
    ------
    FileNotFoundError
        If the bucket contains no object named ``path``.
    """
    # Point the storage client at the service-account key file.
    os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = "PSS GCS Storage Key.json"
    storage_client = storage.Client()
    blob = storage_client.get_bucket('capstone-fall21-protein').get_blob(path)
    if blob is None:
        # get_blob() yields None for a missing object; fail with a clear
        # message instead of an AttributeError on the download call below.
        raise FileNotFoundError(
            f"No object named {path!r} in bucket 'capstone-fall21-protein'"
        )
    local_name = path.split('/')[-1]
    blob.download_to_filename(local_name)
    return local_name
        
        
def load_model(df):
    """Attach pairwise evaluation metrics to a table of protein pairs.

    The ``query_protein`` and ``target_protein`` columns of ``df`` are
    rewritten to the ``AF-<id>-F1-model_v1`` form, then left-joined against
    ``pairwise_evaluation_metrics.parquet`` (read from the working directory)
    on those two columns.

    The input frame is not modified, so calling this repeatedly with the same
    ``df`` yields the same result instead of stacking prefixes.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``query_protein`` and ``target_protein`` columns.

    Returns
    -------
    pandas.DataFrame
        One row per matched pair, with the two key columns first, followed by
        the remaining columns of ``df`` and the metric columns. Pairs without
        metrics keep NaN in the metric columns.
    """
    key_cols = ['query_protein', 'target_protein']

    # assign() returns a new frame, leaving the caller's df untouched.
    model = df.assign(
        query_protein='AF-' + df.query_protein.astype(str) + '-F1-model_v1',
        target_protein='AF-' + df.target_protein.astype(str) + '-F1-model_v1',
    ).set_index(key_cols)

    batch_stats = pd.read_parquet('pairwise_evaluation_metrics.parquet').set_index(key_cols)

    model_stats = model.join(batch_stats, on=key_cols, how='left')

    return model_stats.reset_index()


# Widen pandas' column display so long object paths are not truncated.
pd.set_option('display.max_colwidth', 100)

# Bucket object names of the unmasked DeepFold embedding shards,
# embeddings_00.csv through embeddings_20.csv.
deepfold_nomask = [
    f'embeddings/DeepFold/withMask=false/embeddings_{shard:02d}.csv'
    for shard in range(21)
]

# Bucket object name of the MDS-100 DeepFold embeddings file.
deepfold_mds100 = 'embeddings/DeepFold/MDS100/deepfold_mds100_embeddings.csv'

In [None]:
# DEEPFOLD EMBEDDINGS
# Run ev.deepfold_file_processor on each embedding file and accumulate its
# three returned sequences into X, y and p.

X, y, p = [], [], []

# deepfold_mds100 is a single path string. Iterating it directly would walk it
# character by character and pass one-letter "paths" to the processor, so wrap
# a lone string in a list. Swap in `deepfold_nomask` to load the raw shards.
if isinstance(deepfold_mds100, str):
    embedding_files = [deepfold_mds100]
else:
    embedding_files = list(deepfold_mds100)

for file in embedding_files:
    tX, ty, tp = ev.deepfold_file_processor(file)
    X.extend(tX)
    y.extend(ty)
    p.extend(tp)

print(len(X), len(y), len(p))

In [5]:
# OPTIONAL - Eliminate Low AlphaFold Prediction Confidence Proteins
# Drops every (X, y) pair whose protein id has confidence_pLDDT below 50.
download_file('structure_files/sequences/sequences.parquet')
low_bois = pd.read_parquet('sequences.parquet')
low_bois = low_bois[low_bois.confidence_pLDDT < 50.]['protein_id'].values

# A set gives constant-time membership tests; `in` on the NumPy array scans
# the whole array for every protein.
low_conf_ids = set(low_bois)

tmpX = []
tmpy = []
# The loop variable is deliberately not called `p`: that name holds the list
# built by the embedding-loading cell and must not be overwritten here.
# NOTE(review): `p` itself is not filtered, so it may no longer line up with
# X and y after this cell - confirm whether it should be filtered as well.
for i, embedding_row in enumerate(X):
    if y[i] not in low_conf_ids:
        tmpX.append(embedding_row)
        tmpy.append(y[i])
X = tmpX
y = tmpy
print(len(X), len(y))

19320 19320


In [None]:
# Project the embeddings in X down to 100 components with MDS and report how
# long the fit took, in whole minutes.
embedding = MDS(n_components=100, random_state=42)
start = time.time()
X_transformed = embedding.fit_transform(X)
duration = int((time.time() - start) / 60)
# Fixed: the f-string referenced the misspelled name `duation`, which raised
# NameError only after the fit had already completed.
print(f'Took {duration} mins. with shape {X_transformed.shape}')

In [28]:
# Sanity check: display the dimensions of the MDS output.
X_transformed.shape

(19320, 100)

In [79]:
# Pair each MDS-reduced row with the protein id at the same position in y,
# then keep only the entries whose protein id is not the empty string.
reduced_rows = pd.DataFrame(X_transformed).values
resl = [[vector, y[pos]] for pos, vector in enumerate(reduced_rows)]

res = pd.DataFrame(resl, columns=['deepfold', 'protein_id'])
has_protein_id = res.protein_id != ''
res = res[has_protein_id]

In [81]:
# Write the embedding / protein-id table to a CSV in the working directory.
# The DataFrame index is written too, since to_csv keeps it by default.
res.to_csv('deepfold_mds100_embeddings.csv')

In [None]:
# Import the unmasked DeepFold embeddings, then keep only proteins whose
# confidence label is not 'D'.
# NOTE(review): `gcs` is not imported in the visible part of this notebook -
# confirm it is defined before this cell runs on a fresh kernel.
prefix = 'embeddings/DeepFold/withMask=false/'
# The first listed path is skipped - presumably the folder placeholder itself; TODO confirm.
keys = gcs.list_file_paths(prefix)[1:]
X_deepfold, missing_deepfold, protein_id_deepfold = ev.import_deepfold_embeddings(keys)

protein_conf = gcs.download_parquet('structure_files/proteins_and_confidences.parquet')
confident_ids = np.array(protein_conf[protein_conf.confidence != 'D'].protein_id)
mask = np.isin(protein_id_deepfold, confident_ids)

# Apply the same boolean mask to ids and embeddings so they stay aligned.
protein_id_deepfold_ep = protein_id_deepfold[mask]
X_deepfold_ep = X_deepfold[mask]