<a href="https://colab.research.google.com/github/fpesceKU/EnsembleLab/blob/main/nuSVR.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#@title <b>Preliminary operations</b>
import subprocess
# Install the notebook's dependencies. scikit-learn is removed and then
# reinstalled at a fixed version (presumably the one the downloaded model was
# saved with -- TODO confirm).
for pip_args in (
    ['pip', 'install', 'wget', 'localcider==0.1.18'],
    ['pip', 'uninstall', 'scikit-learn', '-y'],
    ['pip', 'install', 'scikit-learn==1.0.2'],
):
    subprocess.run(pip_args)
import numpy as np
import itertools
from localcider.sequenceParameters import SequenceParameters
import wget
import sys
import os
from joblib import dump, load
import pandas as pd
from google.colab import files

def calc_seq_prop(seq,residues,Nc,Cc,Hc):
    """Compute the sequence descriptors for one amino-acid sequence.

    Parameters
    ----------
    seq : str or sequence of str
        One-letter residue codes; every code must be in the index of
        ``residues``.
    residues : pandas.DataFrame
        Per-residue parameters indexed by one-letter code, with at least the
        columns ``lambdas`` and ``q``. A copy is used; the argument itself is
        not modified.
    Nc : int
        If equal to 1, the first residue gets an extra +1 charge.
    Cc : int
        If equal to 1, the last residue gets an extra -1 charge.
    Hc : float
        Histidine charge switch: below 0.5 histidine is given charge 0,
        at or above 0.5 it is given charge 1.

    Returns
    -------
    numpy.ndarray
        Twelve values rounded to 3 decimals, in this order:
        fK, fR, fE, fD, faro, mean_lambda, shd, omega, scd, kappa, fcr, ncpr.
    """
    seq = list(seq)
    # Separate copy of the sequence for the kappa calculation: charged
    # positions are re-labelled there with K / D / A further down.
    fasta_kappa = np.array(seq)
    N = len(seq)
    r = residues.copy()

    # ---- descriptors that do not depend on the charge state ----
    fK = seq.count('K')/N
    fR = seq.count('R')/N
    fE = seq.count('E')/N
    fD = seq.count('D')/N
    faro = sum(seq.count(a) for a in ('W','Y','F'))/N
    mean_lambda = np.mean(r.lambdas.loc[seq])

    # all residue pairs (i < j), their letters and their sequence separation
    residue_pairs = np.array(list(itertools.combinations(seq,2)))
    index_pairs = np.array(list(itertools.combinations(range(N),2)))
    ij_dist = np.diff(index_pairs,axis=1).ravel().astype(float)

    # SHD: pairwise lambda sums weighted by separation**beta
    lambdas = r.lambdas
    ll = lambdas.loc[residue_pairs[:,0]].values+lambdas.loc[residue_pairs[:,1]].values
    beta = -1
    shd = np.sum(ll*np.power(np.abs(ij_dist),beta))/N

    # omega: localcider patterning parameter for the aromatic residues
    omega = SequenceParameters(''.join(seq)).get_kappa_X(grp1=['F','Y','W'])

    # ---- apply the requested charge state ----
    # The terminal residues become pseudo-residues 'X' / 'Z' that copy the
    # original residue's parameters with the charge shifted by +1 / -1.
    if Nc == 1:
        first = seq[0]
        r.loc['X'] = r.loc[first]
        r.loc['X','q'] = r.loc[first,'q'] + 1.
        seq[0] = 'X'
        fasta_kappa[0] = 'K' if r.loc['X','q'] > 0 else 'A'
    if Cc == 1:
        last = seq[-1]
        r.loc['Z'] = r.loc[last]
        r.loc['Z','q'] = r.loc[last,'q'] - 1.
        seq[-1] = 'Z'
        fasta_kappa[-1] = 'D' if r.loc['Z','q'] < 0 else 'A'

    # Histidines still labelled 'H' (a terminal one may already be 'X'/'Z')
    is_his = np.array(seq) == 'H'
    if Hc < 0.5:
        r.loc['H', 'q'] = 0
        fasta_kappa[is_his] = 'A'
    elif Hc >= 0.5:
        r.loc['H', 'q'] = 1
        fasta_kappa[is_his] = 'K'

    # ---- descriptors that depend on the charge state ----
    # pairs are rebuilt because seq may now contain 'X' / 'Z'
    residue_pairs = np.array(list(itertools.combinations(seq,2)))
    charge = r.q
    qq = charge.loc[residue_pairs[:,0]].values*charge.loc[residue_pairs[:,1]].values
    # SCD: pairwise charge products weighted by sqrt(separation)
    scd = np.sum(qq*np.sqrt(ij_dist))/N
    kappa = SequenceParameters(''.join(fasta_kappa)).get_kappa()
    seq_charges = charge.loc[seq]
    fcr = seq_charges.abs().mean()
    ncpr = seq_charges.mean()

    return np.around([fK, fR, fE, fD, faro, mean_lambda, shd, omega, scd, kappa, fcr, ncpr],3)

# One-letter codes accepted as valid residues by the input cells.
aa = ['A','C','D','E','F','G','H','I','K','L','M','N','P','Q','R','S','T','V','W','Y']

# Fetch the trained model and the residue parameter table only when they are
# not already present. `out` pins the local filename so it always matches the
# path loaded below, whatever name the URL or the server response suggests.
if not os.path.exists('svr_model.joblib'):
    wget.download('https://github.com/KULL-Centre/_2023_Tesei_IDRome/blob/main/svr_model/svr_model.joblib?raw=true', out='svr_model.joblib')
if not os.path.exists('residues.csv'):
    wget.download('https://raw.githubusercontent.com/KULL-Centre/CALVADOS/main/residues.csv', out='residues.csv')
# NOTE(review): joblib.load unpickles the file and can therefore execute
# arbitrary code; this is only safe while the download source is trusted.
model = load('svr_model.joblib')
residues = pd.read_csv('residues.csv',index_col='one')

# Registered sequences, keyed by name; filled by the input cells.
fasta_dict = {}
# Results table: one row per sequence, predicted nu followed by the features.
df = pd.DataFrame(columns=['nu_SVR','fK','fR','fE','fD','fARO','Mean_lambda','SHD','Omega_ARO','SCD','kappa','FCR','NCPR'])

In [None]:
#@title <b>Upload sequence(s)</b>
#@markdown Upload a FASTA file. Files containing multiple sequences and uploads of multiple files are supported.
# Names registered by this run of the cell; only these are validated below.
current_upload = []
fasta_file = files.upload()
for fn in fasta_file:
    # No record is open at the start of a file, so lines that precede the
    # first '>' header cannot be appended to a record from an earlier file.
    name = None
    orphan_lines = False
    with open(fn) as handle:
        for l in handle:
            l = l.strip()
            if not l:
                # blank lines carry no sequence data
                continue
            if l.startswith('>'):
                name = l[1:].strip()
                fasta_dict[name] = ''
                # a repeated header overwrites the record; list it only once
                if name not in current_upload:
                    current_upload.append(name)
            elif name is not None:
                fasta_dict[name] += l
            else:
                orphan_lines = True
    if orphan_lines:
        print('WARNING: {} contains sequence lines before the first header. These lines will be ignored.'.format(fn))

#check sequence
for x in current_upload:
    if not fasta_dict[x]:
        # a header without residues cannot be processed downstream
        print('WARNING: {} has no sequence. This entry will be ignored.'.format(x))
        del fasta_dict[x]
        continue
    for a in fasta_dict[x]:
        if a not in aa:
            print('WARNING: {} sequence contains a character ({}) not recognized as an aminoacid. This sequence will be ignored.'.format(x,a))
            del fasta_dict[x]
            break

In [None]:
#@title <b>Input sequence(s)</b>
#@markdown Or paste a sequence and provide a name. This cell can be executed multiple times to register more sequences.
NAME = "" #@param {type:"string"}
SEQUENCE = "" #@param {type:"string"}

if NAME != "" and SEQUENCE != "":
    # drop any whitespace pasted along with the sequence
    SEQUENCE = ''.join(SEQUENCE.split())

    #check sequence
    if any(a not in aa for a in SEQUENCE):
        print('WARNING: {} sequence contains a character not recognized as an aminoacid. This sequence will be ignored'.format(NAME))
        # make sure no entry is left under this name
        fasta_dict.pop(NAME, None)
    else:
        fasta_dict[NAME] = SEQUENCE
else:
    print('No NAME and/or SEQUENCE provided. Upload fasta files with the cell above or paste a sequence at the time here.')

In [None]:
#@title <b>Predict $\nu$</b>
#@markdown Use this cell to calculate sequence features and predict the scaling exponent $\nu$. Results will be downloaded as a csv file.

# For every registered sequence: compute its descriptors (charged termini,
# neutral histidine), predict nu, and store one row in the results table.
for seq_name, sequence in fasta_dict.items():
    props = calc_seq_prop(sequence, residues, 1, 1, 0)
    # model input, in this order: SCD, SHD, kappa, FCR, mean lambda
    model_input = props[[8, 6, 9, 10, 5]]
    nu_pred = np.around(model.predict(model_input.reshape(1, -1)), 3)
    df.loc[seq_name] = np.concatenate((nu_pred, props))

# write the table to disk and hand it to the browser as a download
df.to_csv('nupred.csv')
files.download('nupred.csv')

df