# Looking for ortholog ID families with conserved compactness across length variations

In [1]:
# Imports
from Bio import SeqIO
import json
import os
import sys
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import pearsonr, spearmanr
from scipy.stats import f as f_dist
from scipy.optimize import curve_fit
import requests

# Make the modules under ../../src importable (path is relative to the
# notebook's working directory); data_utils and analyse_utils are presumably
# project-local modules living there.
sys.path.append('../../src')
import data_utils
import analyse_utils
import mdtraj as md

# Global matplotlib defaults applied to every figure created afterwards.
plt.rcParams["font.family"] = "Liberation Sans"
plt.rcParams["font.weight"] = "normal"



In [80]:
import itertools
from localcider.sequenceParameters import SequenceParameters
from residues import residues

# Load the persisted SVR model from 'svr_model.joblib' (path relative to the
# working directory).
# NOTE(security): joblib.load unpickles the file and can execute arbitrary
# code while doing so -- only load model files that come from a trusted source.
import joblib
model = joblib.load('svr_model.joblib')

def svr_features(seq,Nc=0,Cc=0,Hc=0.5,residues=residues.set_index('one')):
    """Compute the five sequence features ``[scd, shd, kappa, fcr, mean_lambda]``.

    Parameters
    ----------
    seq : str or sequence of str
        One-letter residue codes. Every letter must be a row label of
        ``residues``.
    Nc : int
        If 1, the first residue is treated as carrying an extra +1 charge.
    Cc : int
        If 1, the last residue is treated as carrying an extra -1 charge.
    Hc : float
        Histidine charge switch: the charge of 'H' is set to 0 when
        ``Hc < 0.5`` and to 1 when ``Hc >= 0.5``.
    residues : pandas.DataFrame
        Per-residue parameters indexed by one-letter code, with at least the
        columns ``AH_lambda`` and ``q``. The frame is copied, never modified.

    Returns
    -------
    numpy.ndarray
        ``[scd, shd, kappa, fcr, mean_lambda]`` where ``scd`` is
        sum(q_i*q_j*sqrt(|i-j|))/N, ``shd`` is
        sum((lambda_i+lambda_j)/|i-j|)/N, ``kappa`` is localcider's kappa of
        the charge-adjusted sequence, ``fcr`` is the mean absolute charge and
        ``mean_lambda`` is the mean ``AH_lambda``.

    Raises
    ------
    ValueError
        If ``seq`` has fewer than two residues (the pairwise terms are
        undefined).
    KeyError
        If a residue letter is missing from ``residues``.
    """
    seq = list(seq)
    N = len(seq)
    if N < 2:
        # Every pairwise term below needs at least one residue pair.
        raise ValueError(
            'svr_features needs at least two residues, got {}'.format(N))

    # Copy of the sequence used only for kappa: modified charges are mapped
    # onto standard letters (K/D/A) that localcider understands.
    fasta_kappa = np.array(seq)
    r = residues.copy()

    # calculate properties that do not depend on charges
    lambdas = r.AH_lambda.loc[seq]
    mean_lambda = np.mean(lambdas)

    # Index pairs (i < j) in the same order as itertools.combinations(range(N), 2)
    first, second = np.triu_indices(N, k=1)
    # calculate sequence separations
    ij_dist = (second - first).astype(float)
    # calculate lambda sums
    lambda_values = lambdas.values
    ll = lambda_values[first] + lambda_values[second]
    # calculate SHD
    beta = -1
    shd = np.sum(ll*np.power(ij_dist,beta))/N

    # fix charges
    if Nc == 1:
        # 'X' is a pseudo-residue: the first residue with its charge raised by 1
        r.loc['X'] = r.loc[seq[0]]
        r.loc['X','q'] = r.loc[seq[0],'q'] + 1.
        seq[0] = 'X'
        if r.loc['X','q'] > 0:
            fasta_kappa[0] = 'K'
        else:
            fasta_kappa[0] = 'A'
    if Cc == 1:
        # 'Z' is a pseudo-residue: the last residue with its charge lowered by 1
        r.loc['Z'] = r.loc[seq[-1]]
        r.loc['Z','q'] = r.loc[seq[-1],'q'] - 1.
        seq[-1] = 'Z'
        if r.loc['Z','q'] < 0:
            fasta_kappa[-1] = 'D'
        else:
            fasta_kappa[-1] = 'A'
    # A terminal histidine already replaced by 'X'/'Z' is not matched here,
    # so its terminal substitution in fasta_kappa is kept.
    if Hc < 0.5:
        r.loc['H', 'q'] = 0
        fasta_kappa[np.array(seq) == 'H'] = 'A'
    elif Hc >= 0.5:
        r.loc['H', 'q'] = 1
        fasta_kappa[np.array(seq) == 'H'] = 'K'

    # calculate properties that depend on charges
    charges = r.q.loc[seq]
    charge_values = charges.values
    # calculate charge products
    qq = charge_values[first]*charge_values[second]
    # calculate SCD
    scd = np.sum(qq*np.sqrt(ij_dist))/N
    kappa = SequenceParameters(''.join(fasta_kappa)).get_kappa()
    fcr = charges.abs().mean()

    return np.array([scd, shd, kappa, fcr, mean_lambda])

def predict_nu(seq):
    """Predict nu for a sequence with the loaded SVR ``model``.

    Parameters
    ----------
    seq : str or sequence of str
        One-letter residue codes, passed to ``svr_features`` with its default
        charge settings.

    Returns
    -------
    float
        The model's prediction for this single sequence.
    """
    # svr_features returns a 1-D feature vector; reshape it to a single-sample
    # 2-D array (1, n_features), the layout scikit-learn style predict() expects.
    features = svr_features(seq).reshape(1, -1)
    # predict() returns one value per sample; take the only one explicitly
    # instead of converting a 1-element array to float.
    nu = float(model.predict(features)[0])
    return nu

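See the scikit-learn notes on the security and maintainability limitations of persisted models (relevant to loading `svr_model.joblib`): https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations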


## Data preparation

## Analysis