### Imports

In [10]:
from hw2skeleton import cluster as cl
from hw2skeleton import io
import os
import pandas as pd
import numpy as np
# Three-letter amino-acid codes; used as the column labels of the one-hot
# frames built further down in this notebook.
aa3 = [
    "ALA", "CYS", "ASP", "GLU", "PHE",
    "GLY", "HIS", "ILE", "LYS", "LEU",
    "MET", "ASN", "PRO", "GLN", "ARG",
    "SER", "THR", "VAL", "TRP", "TYR",
]

### Functions

In [11]:
def leven_dist(site_a, site_b):
    """
    Levenshtein (edit) distance between the residue sequences of two sites.

    Input: two one-hot pandas DataFrames, one row per residue position and one
           column per amino acid, with a 1 marking the residue at that position.
           Rows are read in positional order; index labels are ignored.
    Output: int, the minimum number of residue insertions, deletions and
            substitutions turning one sequence into the other
            (0 means the sequences are identical)
    Raises: IndexError if a row contains no 1
    """
    seq_a = _residue_sequence(site_a)
    seq_b = _residue_sequence(site_b)

    # Keep the shorter sequence as the DP row so the working lists stay small.
    # The distance is symmetric, so swapping does not change the result.
    if len(seq_a) > len(seq_b):
        seq_a, seq_b = seq_b, seq_a

    # dists[j] = distance between the first j residues of seq_a and the
    # prefix of seq_b consumed so far (initially the empty prefix).
    dists = list(range(len(seq_a) + 1))

    for i, aa_b in enumerate(seq_b):
        nDists = [i + 1]
        for j, aa_a in enumerate(seq_a):
            if aa_a == aa_b:
                # Match: carry the diagonal value over unchanged.
                nDists.append(dists[j])
            else:
                # Cheapest of substitution, deletion, insertion, plus one edit.
                nDists.append(1 + min(dists[j], dists[j + 1], nDists[-1]))
        dists = nDists
    return dists[-1]


def _residue_sequence(site):
    """Decode a one-hot site frame into the list of its residue column labels."""
    # Decoding each row once here avoids repeating the pandas lookups for
    # every cell of the DP table.
    return [
        site.columns[(site.iloc[pos] == 1).to_numpy()][0]
        for pos in range(len(site))
    ]

In [12]:
def calc_similarity_matrix(sites):
    """
    Calculate a complete matrix of pairwise distances of all active sites to
        all others, so clustering can look values up instead of recomputing them
    Input: a list of ActiveSite instances (anything exposing a `.onehot`
           DataFrame); bare one-hot DataFrames are accepted as well
    Output: complete all by all matrix of Levenshtein distances between active
            sites, formatted as a pandas DataFrame
            rows, columns = [0, 1, 2, ... n]
            (entry [i, j] is the distance between sites[i] and sites[j];
            smaller means more similar, 0 on the diagonal)

    """
    # leven_dist reads the DataFrame API (.columns / .iloc), so unwrap each
    # site's one-hot frame once here; objects without `.onehot` pass through.
    encodings = [getattr(site, "onehot", site) for site in sites]
    simMat = [
        [leven_dist(row_site, col_site) for col_site in encodings]
        for row_site in encodings
    ]
    return pd.DataFrame(simMat)


def calc_avg_site_length(sites):
    """
    Summarise how many residues the given sites contain
    Input: a sized collection of objects with a `.residues` sequence
    Output: list [mean residue count, largest count, smallest count]
    """
    residue_counts = [len(site.residues) for site in sites]
    mean_count = sum(residue_counts) / len(sites)
    return [mean_count, max(residue_counts), min(residue_counts)]


def generate_random_site(sites):
    """
    Build a random one-hot site whose length lies within the range of the
        lengths observed in `sites`
    Input: a list of ActiveSite instances (only their residue counts are used)
    Output: pandas DataFrame with one row per residue position and aa3 as
            columns; each row holds a single 1 in a uniformly drawn column
    Note: draws from numpy's global random state, so seed it with
          np.random.seed for reproducible output
    """
    _, longest, shortest = calc_avg_site_length(sites)

    # np.random.randint excludes `high`; the +1 lets the longest observed
    # length be drawn and keeps the call valid when all sites share one length.
    num_res = np.random.randint(shortest, longest + 1)

    # Draw over every column of aa3 (the old upper bound of 19 could never
    # pick the last amino acid).
    picks = np.random.randint(0, len(aa3), size=num_res)

    onehot = np.zeros((num_res, len(aa3)), dtype="int64")
    onehot[np.arange(num_res), picks] = 1
    return pd.DataFrame(onehot, index=range(num_res), columns=aa3)

In [13]:
def compute_cluster_center_dumb(cluster_list, sites):
    """
    Average the one-hot encodings of a cluster's members, position by position
    Input: cluster_list - sized collection with one entry per cluster member
                          (presumably indices into `sites` - TODO confirm)
           sites - collection the members are looked up in; each looked-up
                   item must expose a `.onehot` DataFrame
    Output: DataFrame with aa3 columns holding the per-position sums divided
            by len(cluster_list)
    """
    # Running per-position sum; starts with no rows and grows as sites are added.
    total = pd.DataFrame(columns = aa3)
    for j in cluster_list:
        # NOTE(review): if `sites` is a plain Python list, `sites==j` is a single
        # bool, so this selects sites[0] or sites[1] rather than member j -
        # confirm the intended lookup (sites[j]?).
        site = sites[sites==j].onehot
        for row in site.index:
            # `row` is an index label used as a position - assumes a default
            # 0..n-1 index on the one-hot frame.
            if len(total) <= row:
                # Position not accumulated yet: start it from this site's row.
                # NOTE(review): DataFrame.append was removed in pandas 2.0;
                # on current pandas this line needs pd.concat instead.
                total = total.append(site.iloc[row])
            else:
                total.iloc[row] += site.iloc[row]
    # Every position is divided by the full cluster size, including positions
    # that only the longer members contributed to.
    return total / len(cluster_list)

### Testing

In [14]:
# Load the active sites from 'data' (a relative path, so it is resolved against
# the notebook's working directory).
sites = io.read_active_sites('data')

Read in 136 active sites


In [15]:
# Sanity check: show the residues of the first loaded site.
sites[0].residues

[ASP 165, ASP 167, SER 211, ARG 213, ASP 254, LYS 258, ASP 278]

In [16]:
# Short alias for the first active site.
s = sites[0]

In [17]:
# Show the first site's one-hot encoding: one row per residue, one column per amino acid.
sites[0].onehot

Unnamed: 0,ALA,CYS,ASP,GLU,PHE,GLY,HIS,ILE,LYS,LEU,MET,ASN,PRO,GLN,ARG,SER,THR,VAL,TRP,TYR
0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
4,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
6,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
