In [1]:
import json
import os
from glob import glob as glob

import natsort
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook

In [2]:
# Every file matching ../SASA/P* holds the per-residue accessibility table of one model.
files = glob('../SASA/P*')

# model name (file basename) -> {position id -> accessibility record}
model_map = {}

for file in tqdm_notebook(files):

    pos_map = {}
    with open(file, 'r') as f:
        # The first line is skipped (presumably a column header -- TODO confirm).
        next(f, None)
        for line in f:
            els = line.split()
            if not els:
                # Blank lines (e.g. a trailing newline at the end of the file)
                # carry no record and would otherwise fail on els[1].
                continue
            # The key joins fields 1 and 2; it is later looked up against the
            # numbering-scheme ids (e.g. 'H1'), so presumably chain + residue number.
            pos_map[els[1] + els[2]] = dict(AA=els[3],
                                            ACCESS=float(els[4]),
                                            RELACC=float(els[5]),
                                            SCACC=float(els[6]),
                                            SCRELACC=float(els[7]))
    # basename instead of split('/') so the key is correct with any path separator
    model_map[os.path.basename(file)] = pos_map




In [3]:
# Read the 'chothia' section of the AbPyTools numbering-scheme file.
# NOTE(review): absolute, machine-specific path -- the notebook only runs where
# this file exists at exactly this location.
with open('/home/gil/AbPyTools/abpytools/data/NumberingSchemes.json') as f:
    data = json.load(f)['chothia']

heavy_data = data['heavy']
light_data = data['light']

# All position ids in a fixed order: heavy-chain entries first, then light-chain.
numbering_data = list(heavy_data) + list(light_data)

In [4]:
def vector_representation(data, column='RELACC', fillna=0, dtype='float'):
    """
    Function to represent per-position accessibility data in matrix form

    :type data: dict
    :type column: str
    :type fillna: float/int
    :type dtype: str

    :param data: maps a model name to a dict which maps a position id
                 (an entry of the module-level ``numbering_data``) to a
                 record dict
    :param column: key of the record from where to extract data
    :param fillna: what value to use if position is missing
    :param dtype: 'float' for numeric columns, 'str' for text columns
                  (text is stored with at most 3 characters)

    :rtype: pandas.DataFrame
    :return: table with one row per key of data (in natural sort order)
             and one column per entry of ``numbering_data``

    :raises ValueError: if dtype is neither 'float' nor 'str'
    :raises KeyError: if a position is present but its record has no
                      ``column`` entry

    Explanation:
    ------------

    Each row has the following format (e.g. for column='ACCESS'):

    [ACCESS H1, ACCESS H2, ..., ACCESS H114, ACCESS L1, ACCESS L2, ..., ACCESS L111]

    Each element holds the ``column`` value recorded for that position, or
    ``fillna`` when the model has no record for the position. With
    dtype='str' the fill value is stored as text (0 becomes '0').

    """

    # Fail early and clearly instead of hitting an unbound ``values`` later on.
    if dtype not in ('float', 'str'):
        raise ValueError(
            "dtype must be 'float' or 'str', got {!r}".format(dtype))

    shape = (len(data), len(numbering_data))
    if dtype == 'float':
        values = np.zeros(shape)
    else:
        values = np.empty(shape, dtype='U3')

    keys = natsort.natsorted(data.keys())

    for i, key in enumerate(tqdm_notebook(keys)):

        records = data[key]

        for j, pos in enumerate(numbering_data):

            if pos in records:
                # A record without ``column`` raises KeyError on purpose: a
                # mistyped column name must not silently yield a table that
                # contains nothing but ``fillna``.
                values[i, j] = records[pos][column]
            else:
                values[i, j] = fillna

    return pd.DataFrame(data=values, index=keys, columns=numbering_data)

In [5]:
# Relative accessibility (RELACC) of every model at every numbering position;
# positions a model does not have are filled with the default of 0.
access_matrix = vector_representation(data=model_map, column='RELACC', dtype='float')




In [6]:
# Preview the first five models of the accessibility table.
access_matrix.head(n=5)

Unnamed: 0,H1,H2,H3,H4,H5,H6,H6A,H6B,H6C,H6D,...,L103,L104,L105,L106,L106A,L107,L108,L109,L110,L111
P1,101.956,39.551,37.539,4.537,58.441,2.681,0.0,0.0,0.0,0.0,...,61.742,2.028,54.978,20.001,0.0,61.857,74.667,117.715,0.0,0.0
P2,0.0,0.0,56.114,8.061,64.292,7.405,0.0,0.0,0.0,0.0,...,48.628,1.261,41.686,33.701,0.0,57.162,80.68,118.702,0.0,0.0
P3,0.0,22.0,18.131,46.082,23.055,57.408,0.0,0.0,0.0,0.0,...,25.178,13.521,30.22,39.92,0.0,22.593,30.108,16.105,0.0,0.0
P4,95.4,15.133,55.006,6.976,49.314,5.608,0.0,0.0,0.0,0.0,...,55.647,1.458,39.562,22.78,80.991,0.0,0.0,0.0,0.0,0.0
P5,112.23,32.542,43.849,3.425,42.332,3.417,0.0,0.0,0.0,0.0,...,61.497,2.011,53.95,20.04,0.0,62.315,74.019,118.391,0.0,0.0


In [7]:
# Residue name (AA field) of every model at every numbering position;
# positions a model does not have end up as the string '0'.
amino_acids = vector_representation(data=model_map, column='AA', dtype='str')




In [8]:
# Preview the first five models of the residue-name table.
amino_acids.head(n=5)

Unnamed: 0,H1,H2,H3,H4,H5,H6,H6A,H6B,H6C,H6D,...,L103,L104,L105,L106,L106A,L107,L108,L109,L110,L111
P1,GLU,VAL,LEU,LEU,LEU,GLU,0,0,0,0,...,LYS,LEU,GLU,ILE,0,LYS,ARG,THR,0,0
P2,0,0,GLN,LEU,VAL,GLN,0,0,0,0,...,LYS,VAL,GLU,ILE,0,LYS,ARG,THR,0,0
P3,0,VAL,GLN,LEU,VAL,GLU,0,0,0,0,...,LYS,VAL,GLU,ILE,0,LYS,ARG,THR,0,0
P4,GLN,VAL,GLN,LEU,GLN,GLN,0,0,0,0,...,LYS,LEU,THR,VAL,LEU,0,0,0,0,0
P5,GLU,VAL,GLN,LEU,VAL,GLU,0,0,0,0,...,LYS,LEU,GLU,ILE,0,LYS,ARG,THR,0,0


In [9]:
# -1 wherever the residue is ASP or GLU, 0 everywhere else.
neg_charges = amino_acids.isin(['ASP', 'GLU']) * -1

In [10]:
from sklearn.decomposition import PCA, TruncatedSVD
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns
import matplotlib
# Global plot styling for every figure drawn after this cell.
# NOTE(review): newer matplotlib releases renamed the 'seaborn-*' styles to
# 'seaborn-v0_8-*' -- confirm these names against the installed version.
plt.style.use(['seaborn-white', 'seaborn-paper'])
# NOTE(review): presumably requires the "Times New Roman" font to be installed
# on the machine rendering the figures -- confirm.
matplotlib.rc("font", family="Times New Roman")

In [12]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from IPython.html.widgets import interactive, IntSlider
from IPython.display import display
   
def svd_demo(threshold=10):
    """
    Scatter-plot a 2-component truncated SVD of the low-accessibility
    negative charges.

    Reads the module-level ``neg_charges`` and ``access_matrix`` tables.

    :type threshold: float/int

    :param threshold: a position only keeps its charge when its value in
                      access_matrix is strictly below this threshold

    :return: None, the figure is shown as a side effect
    """
    # Keep the -1 charge only where accessibility is below the threshold;
    # every other cell becomes 0.
    access_charge = neg_charges * (access_matrix < threshold)

    # n_components=2 is the scikit-learn default, spelled out because both
    # columns are plotted below. The default solver is randomized, so the seed
    # is pinned to make the plot reproducible between runs.
    svd = TruncatedSVD(n_components=2, random_state=0)
    # DataFrame.as_matrix() was removed in pandas 1.0; .values returns the same
    # ndarray and is available in old and new pandas versions alike.
    reduced = svd.fit_transform(access_charge.values)

    with sns.plotting_context(rc={"axes.titlesize":25,"axes.labelsize":20, 
                                  'xtick.labelsize': 10, 'ytick.labelsize': 15, 
                                  'legend.fontsize': 15}):
        
        plt.figure(figsize=(10,10))
        plt.xlabel('PC1')
        plt.ylabel('PC2')
        plt.scatter(reduced[:,0], reduced[:,1])
        plt.grid()
        plt.show()

# Slider for the accessibility threshold: 5 to 100 in steps of 5, starting at 50.
threshold_slider = IntSlider(min=5, max=100, step=5, value=50)
# Re-run svd_demo with the slider value every time the slider is moved.
w = interactive(svd_demo, threshold=threshold_slider)
display(w)