In [1]:
import pandas as pd
import matplotlib as plt
import seaborn as sns
import numpy as np
from Bio.SeqUtils.ProtParam import ProteinAnalysis
from sklearn import svm

In [2]:
def MinMaxScaler(scale:dict, x:float):
    '''this function performs the min max rescaling, so to rescale the input value between 0 and 1. It requires as input a scale in the form of a dict'''
    
    min_x=min(scale.values())
    max_x=max(scale.values())
    
    return (x-min_x)/(max_x-min_x)

In [21]:
def ProteinScaler(seq:str,scale:dict,sw:int, k:int):
    '''This function takes as input a sequence, a scale and the length of the sliding window to calculate the feature (depending on the scale)
    of each residue in the sequence considering for the computation the surrounding residues'''
    
    seq=seq[0:k]
    #compute the half of the sliding window length to define later the sliding window
    if sw%2==0:
        d=int(sw/2)
    else:
        d=int((sw-1)/2)
    #define a list to hold the score of each residue
    score_list=[]
    #iterate trough the sequence so to calculate the score of each residue in the sequence considering the nearby residues with a sliding window
    for i in range(len(seq)):
        score=0 
    #if the residue is at the beginning or at the end shrink the sliding window so to unbyas the claculation
        if i - d < 0 :
            sliding_window=seq[0:i+d+1]
        elif i+d> len(seq):
            sliding_window=seq[i-d:len(seq)]
        else:
            sliding_window=seq[i-d:i+d+1]
    #sum the individual score of each residue in the sliding window and divide it by the length of the sliding window    
        for residue in sliding_window:
            score+=scale[residue]  
        score=score/len(sliding_window)
    #store the result    
        score_list.append(score)      
    
    return score_list

In [4]:
def AddScale1(scale):
    '''This function complets the scale by adding values for undefined chracters'''
    
    scale["X"]=0.0
    scale["B"]=(float(scale["D"])+float(scale["N"]))/2
    scale["Z"]=(float(scale["E"])+float(scale["Q"]))/2
    scale["J"]=(float(scale["I"])+float(scale["L"]))/2
    scale["U"]=float(scale["C"])
    scale["O"]=float(scale["L"])
    
    return scale

def AddScale2(aa_composition):
    '''This function complets the scale by adding values for undefined chracters'''
    
    aa_composition["X"]=0.0
    aa_composition["B"]=0.0
    aa_composition["Z"]=0.0
    aa_composition["J"]=0.0
    aa_composition["U"]=0.0
    aa_composition["O"]=0.0
    
    return aa_composition

In [5]:
hh = {"A": 1.8, "R": -4.5, "N": -3.5, "D": -3.5, "C": 2.5,
      "Q": -3.5, "E": -3.5, "G": -0.4, "H": -3.2, "I": 4.5,
      "L": 3.8, "K": -3.9, "M": 1.9, "F": 2.8, "P": -1.6,
      "S": -0.8, "T": -0.7, "W": -0.9, "Y": -1.3, "V": 4.2}

charge = {"A": 0, "R": 1, "N": 0, "D": 0, "C": 0,
          "Q": 0, "E": 0, "G": 0, "H": 1, "I": 0,
          "L": 0, "K": 1, "M": 0, "F": 0, "P": 0,
          "S": 0, "T": 0, "W": 0, "Y": 0, "V": 0}

alpha_helix={"A":  1.290,  "R":  0.960,  "N":  0.900,  "D":  1.040,  
            "C":  1.110,  "Q":  1.270,  "E":  1.440,  "G":  0.560,  
            "H":  1.220,  "I":  0.970,  "L":  1.300,  "K":  1.230,  
            "M":  1.470,  "F":  1.070,  "P":  0.520,  "S":  0.820,  
            "T":  0.820,  "W":  0.990,   "Y":  0.720, "V":  0.910  }

swiss_composition = {"A": 8.25, "R": 5.53, "N": 4.06, "D": 5.46, "C": 1.38,
                    "Q": 3.93, "E": 6.72, "G": 7.07, "H": 2.27, "I": 5.91,
                    "L": 9.65, "K": 5.80, "M": 2.41, "F": 3.86, "P": 4.74,
                    "S": 6.65, "T": 5.36, "W": 1.10, "Y": 2.92, "V": 6.85}

In [23]:
sequence = "MRLHRLRARLSAVACGLLLLLVRGQGQDSASPIRTTHTGQVLGSLVHVKGANAGVQTFLG"
#pa=ProteinAnalysis(sequence)
k=20

hp = ProteinScaler(sequence,hh,5,k)
av_hp=MinMaxScaler(hh, np.mean(hp))
max_hp=MinMaxScaler(hh, max(hp))
argmax_hp=(np.argmax(hp)+1)/k
print(len(hp))
print(hp)
print(av_hp,max_hp,argmax_hp)

ch = ProteinScaler(sequence,charge,5)
av_ch=MinMaxScaler(charge, np.mean(hp))
max_ch=MinMaxScaler(charge, max(hp))
print(ch)

ah = ProteinScaler(sequence,alpha_helix,5)
print(ah)

sc = ProteinScaler(sequence,swiss_composition,5)
print(sc)

20
[0.3999999999999999, -0.5000000000000001, -1.3, -0.9200000000000002, -0.9200000000000002, -1.32, -1.58, 0.07999999999999989, -0.8400000000000001, 0.41999999999999993, 0.9, 2.16, 1.9, 1.98, 2.38, 2.3, 2.7, 2.96, 3.8, 3.7999999999999994]
0.6022222222222222 0.9222222222222223 0.95


TypeError: ProteinScaler() missing 1 required positional argument: 'k'

In [7]:
scale_list=[hh,charge,alpha_helix,swiss_composition]
df=pd.DataFrame(scale_list)
df=df.T
df.columns=["Hydropathicity","charge","alpha helix tendency","aa composition (SW)"]
df

Unnamed: 0,Hydropathicity,charge,alpha helix tendency,aa composition (SW)
A,1.8,0.0,1.29,8.25
R,-4.5,1.0,0.96,5.53
N,-3.5,0.0,0.9,4.06
D,-3.5,0.0,1.04,5.46
C,2.5,0.0,1.11,1.38
Q,-3.5,0.0,1.27,3.93
E,-3.5,0.0,1.44,6.72
G,-0.4,0.0,0.56,7.07
H,-3.2,1.0,1.22,2.27
I,4.5,0.0,0.97,5.91


In [None]:
# Create a SVC with RBF kernel with gamma=0.3 and C=8
mySVC = svm.SVC(C=8.0, kernel='rbf', gamma=0.3)
# Train (fit) the model on training data
mySVC.fit(X, y)
# Predict classes on testing data
y_pred = mySVC.predict(X_test)
# Getting support vectors
sv = mySVC.support_vectors_
# Print the number of support vectors for each class
print mySVC.n_support_
# Getting decision function values on testdata
DF = mySVC.decision_function(X_test)
# Input data as list of lists
# len(X) == len(y)
X = [[0, 0], [0, 1], [1, 0], [1, 1]]
y = [0, 1, 1, 0]
# Build and train the model on data (X, y)

