# Get structural properties from RaptorX

## Define functions to calculate the structural properties:

In [None]:
import numpy as np
import pandas as pd

Functions to get the average of different properties over a window

In [None]:
def get_avg(df:pd.DataFrame, pos:int, wlen:int) -> tuple:
    """
    Calculate the average of each structural property over a residue window
    of size `wlen` placed around `pos`.

    For odd-sized windows, `pos` is the central element.
    For even-sized windows, `pos` is the element in the `wlen`/2 position.

    E.g. in a window of 9 elements around position 5, `pos` is the 5th element:
    1 2 3 4 *5* 6 7 8 9

    And in a window of 10 elements around position 5, `pos` is also the 5th element:
    1 2 3 4 *5* 6 7 8 9 10

    If the window would extend beyond either end of the sequence, it is
    shifted so that the first (or last) `wlen` rows are used instead. If
    `wlen` is larger than the number of rows, all rows are averaged.

    Args:
        df (DataFrame): DataFrame with the structural properties to average.
            Must contain the columns `diso_prob`, `tm2_prob`, `H_prob_ss3`,
            `E_prob_ss3`, `C_prob_ss3`, `B_prob_acc`, `M_prob_acc` and
            `E_prob_acc`. Rows are addressed by position, so row 0 is
            treated as residue 1.
        pos (int): 1-based position around which the window is placed
        wlen (int): Window length to calculate the average

    Returns:
        tuple: Window averages in the order
            (diso_prob, tm2_prob, H_prob_ss3, E_prob_ss3, C_prob_ss3,
            B_prob_acc, M_prob_acc, E_prob_acc)
    """
    property_names = (
        'diso_prob',
        'tm2_prob',
        'H_prob_ss3',
        'E_prob_ss3',
        'C_prob_ss3',
        'B_prob_acc',
        'M_prob_acc',
        'E_prob_acc',
    )

    # ceil(wlen / 2) residues up to and including `pos` (this also folds in
    # the 1-based -> 0-based conversion), and floor(wlen / 2) residues after.
    start = int(pos - (wlen + 1) // 2)
    stop = int(pos + wlen // 2)

    # If the window limits go beyond the sequence limits,
    # use the first or the last `wlen` amino acids.
    seq_length = len(df)
    if start < 0:
        start, stop = 0, wlen
    elif stop > seq_length:
        # Clamp at 0: a negative start would be interpreted by `iloc` as an
        # offset from the end and silently drop rows when wlen > seq_length.
        start, stop = max(seq_length - wlen, 0), seq_length

    # Slice the window once and average every property over it.
    window = df.iloc[start:stop]

    return tuple(window[name].mean() for name in property_names)


# Column labels for the window-averaged properties, listed in the same
# order as the values returned by `get_avg`.
columns = [
    'diso_prob',
    'tm2_prob',
    'H_prob_ss3',
    'E_prob_ss3',
    'C_prob_ss3',
    'B_prob_acc',
    'M_prob_acc',
    'E_prob_acc',
]


def calculate_window(gene:str, positions:list, wlen:int,
                     data_dir:str='/ibex/scratch/projects/c2102/results/raptorx') -> pd.DataFrame:
    """
    Calculate the window-averaged structural properties at each of the given
    residue positions, for a single gene.

    The per-residue properties are read from `<data_dir>/<gene>.pkl` and each
    position is averaged with `get_avg`.

    Args:
        gene (str): Gene to calculate properties for; used as the pickle
            file name (without the `.pkl` extension)
        positions (list): List of integers with the residue positions of the
            variants
        wlen (int): Window length
        data_dir (str): Directory containing the per-gene pickle files.
            Defaults to the original project location.

    Returns:
        DataFrame: One row per entry in `positions` (in the same order) and
            one column per property named in `columns`
    """
    # NOTE: unpickling can execute arbitrary code; only point `data_dir` at
    # a trusted location.
    properties = pd.read_pickle(f'{data_dir}/{gene}.pkl')

    values = [get_avg(properties, pos, wlen) for pos in positions]

    return pd.DataFrame(values, columns=columns)

Calculate the window-averaged values for a single gene (UniProt entry name, e.g. `SKA3_HUMAN`), giving a list of residue positions and the window length as inputs.

In [None]:
# Empty placeholders; presumably meant to be filled in with the gene names
# and residue positions to analyse before running the cells below.
genes, positions = [], []

In [None]:
# Window-averaged properties for SKA3_HUMAN at the residue positions in
# `positions` (defined in the cell above), using a window of 5 residues.
calculate_window('SKA3_HUMAN', positions, 5)