# Dataset Analysis

In [None]:
import pickle
import numpy as np
import pandas as pd
from Bio.PDB import PDBParser
import matplotlib.pyplot as plt

In [None]:
# Read the tab-separated protein table and preview its first five rows.
dataProtein = pd.read_csv(
    filepath_or_buffer='../dataset/database.tsv',
    sep='\t',
)
dataProtein.head(n=5)

In [None]:
# Count the distinct identifiers in each column. Series.nunique() ignores
# missing values (NaN), whereas len(set(...)) would count a missing ID as if
# it were an identified protein (and may even count every NaN separately).
print(f'Protein Identified with UniProt ID: {dataProtein["uniprot_id"].nunique()}')
print(f'Protein Identified with PDB ID: {dataProtein["pdb_id"].nunique()}')

In [None]:
# Keep only the rows whose UniProt ID appears at least twice in the table.
multipleProtein = dataProtein.groupby('uniprot_id').filter(lambda x: len(x) >= 2)
multipleID = multipleProtein['uniprot_id'].unique()

print(f'MultipleID: {len(multipleID)}')

# Collect the distinct PDB IDs of every repeated UniProt ID in one pass.
# sort=False keeps the groups in order of first appearance, i.e. the same
# order as `multipleID`, and avoids re-masking the whole frame once per ID.
# The loop variable is deliberately not called `id`, which would shadow the
# builtin for every later cell of the notebook.
pdb_ids_by_uniprot = multipleProtein.groupby('uniprot_id', sort=False)['pdb_id'].unique()
for uniprot_id, pdb_ids in pdb_ids_by_uniprot.items():
    print(f"UniProt ID: {uniprot_id}, PDB ID: {', '.join(pdb_ids)}")

## Dimensionality Analysis

In [None]:
# Load the pickled embeddings of 1a9n_C and inspect the shape of the entry
# stored under the key 'P09661'.
data = pd.read_pickle('../embedding/fastaEmb_wt/1a9n_C.embeddings.pkl')
wt_embedding = data['P09661']
wt_embedding.shape

In [None]:
# Load the saved distance map of 1a9n_C and report its dimensions.
dist = np.load('../embedding/distmap_wt/1a9n_C.distmap.npy')
np.shape(dist)

In [None]:
# Read the whole FASTA file of 1a9n_C.
with open('../dataset/fasta/1a9n_C.fasta', 'r') as fasta:
    fasta_wt = fasta.read()

# Build the sequence of the first record: skip the header line, then join
# every following line until the next '>' header (if any). FASTA sequences
# are commonly wrapped over several lines, so taking only the second line of
# the file would truncate them; strip() also drops a trailing '\r' from
# files with Windows line endings.
sequence_lines = []
for fasta_line in fasta_wt.splitlines()[1:]:
    if fasta_line.startswith('>'):
        break
    sequence_lines.append(fasta_line.strip())
sequence = ''.join(sequence_lines)

In [None]:
# Number of characters in the sequence string extracted from the FASTA file.
len(sequence)

In [None]:
# Parse the PDB file of 1a9n_C and take the first model of the structure.
p = PDBParser(PERMISSIVE=1)
structure = p.get_structure('', '../dataset/pdb/1a9n_C.pdb')
model = structure[0]

# Flatten every chain of the model into a single list of residue objects.
# NOTE(review): this keeps every residue object the parser yields; if the
# count is meant to be compared with the FASTA length, confirm whether
# hetero residues / waters should be excluded.
residues = [res for chain_entity in model for res in chain_entity]
count = len(residues)

count

## Similarity Analysis - UniRep

In [None]:
# Deserialize the UniRep features from disk.
# NOTE: pickle executes code on load; only open files from a trusted source.
with open('../embedding/additional_features/unirep.pkl', mode='rb') as handle:
    unirep = pickle.load(handle)

In [None]:
# Plot the first element stored for '1unp_A' against the index of its
# second axis (the array is transposed so that axis lines up with x).
unirep_1unp = unirep['1unp_A'][0]
x = range(unirep_1unp.shape[1])

plt.figure(figsize=(10, 10))
plt.plot(x, unirep_1unp.T)
plt.title('UniRep Embedding - 1UNP:A')
plt.xlabel('Index')
plt.ylabel('Embedding')
plt.grid(True)
plt.show()

## Similarity Analysis - Part 2: ELMo Embeddings and Cosine Similarity

In [None]:
# Load the saved similarity data. allow_pickle=True lets np.load unpickle
# Python objects stored in the file, so only load files from a trusted source.
similarity = np.load('../embedding/additional_features/similarity.npy', allow_pickle=True)

In [None]:
# Unwrap the object held by the loaded array and list the keys available
# under its '1unp_A' entry.
similarity.item()['1unp_A'].keys()

In [None]:

# For every entry stored under '1unp_A', draw one figure showing the mean
# over axis 0 of the absolute values of the entry's second element.
entries_1unp = similarity.item()['1unp_A']
for key, entry in entries_1unp.items():
    mean_abs_profile = np.abs(entry[1]).mean(0)
    x = range(mean_abs_profile.shape[0])

    plt.figure(figsize=(30, 10))
    plt.plot(x, mean_abs_profile)
    plt.title(f'Embedding ELMo - {key}')
    plt.xlabel('Index')
    plt.ylabel('Embedding')
    # Tick positions are fixed at every 14th index in [0, 1024).
    plt.xticks(np.arange(0, 1024, 14))
    plt.grid(True)
    plt.show()

In [None]:
# Take the first element of the '1unp_A_A' entry under '1unp_A' and render
# it as a heat map with a colour bar.
entries_for_1unp = similarity.item()['1unp_A']
matrix_similarity = entries_for_1unp['1unp_A_A'][0]

plt.figure(figsize=(10, 10))
plt.imshow(matrix_similarity, cmap='hot', interpolation='nearest')
plt.colorbar()
plt.xlabel('Index')
plt.ylabel('Index')
plt.title('Cosine Similarity Matrix - 1UNP:A')
plt.show()