# Construction de H

## Ancien code

In [1]:
import pandas as pd
import numpy as np

class CarboneAlpha:
    """Classe pour représenter un carbone alpha d'une protéine."""
    
    def __init__(self, number, x, y, z):
        self.number = number
        self.x = x
        self.y = y
        self.z = z
    
    def compute_distance(self, other):
        dist = ((other.x - self.x) ** 2 + (other.y - self.y) ** 2 + (other.z - self.z) ** 2) ** 0.5
        return dist

class Template:
    """Classe pour représenter le template utilisé."""
    
    def __init__(self, file):
        self.structure = self.build_template_from_pdb(file)
        self.length = len(self.structure)

    def build_template_from_pdb(self, filename):
        list_calpha = []
        with open(filename, "r") as pdb :
            for ligne in pdb:
                if ligne.startswith("ATOM") and (ligne[12:16].strip() == "CA"):
                    number = ligne[6:11].strip()
                    x = float(ligne[30:38].strip())
                    y = float(ligne[38:46].strip())
                    z = float(ligne[46:54].strip())
                                       
                    list_calpha.append(CarboneAlpha(number, x, y, z))
        return list_calpha
        
    def build_dist_matrix(self):
        dist_list = []
        
        for i, atom in enumerate(self.structure):
            dist_ligne = []
            for other in (self.structure):
                dist_ligne.append(atom.compute_distance(other))
            dist_list.append(dist_ligne)
            
        dist_matrix = np.array(dist_list)
        return dist_matrix
    
    def __str__(self):
        string = ""
        for i, ca in enumerate(self.structure):
            string += f"position {i}-{ca.number}, coor( {ca.x}, {ca.y}, {ca.z})\n"
        return string

def clean_DOPE_data(filename):
    ca_matrix = []
    
    with open(filename, "r") as dope :
        for ligne in dope:
            if ligne[3:7].strip() == "CA" and ligne[11:14].strip() == "CA":
                ca_matrix.append(ligne.split())
    
    columns = ['res1', 'temp1', 'res2', 'temp2'] + list(np.arange(0.25, 15, 0.5))
    dope_score = pd.DataFrame(ca_matrix, columns = columns) 
    dope_score = dope_score.drop(['temp1', 'temp2'], axis=1)
    
    return dope_score

class DynamicMatrix:    
    def __init__(self, lines, columns, gap):
        self.matrix = np.zeros((lines, columns))
        self.lines = lines
        self.columns = columns
        self.gap = gap

    def initialize_matrix(self, first_val, start, end, get_score):
        if (start[0] < 0) or (start[1] < 0):
            raise ValueError("Start of initialization out of matrix.")
        if (end[0] >= self.lines) or (end[1] >= self.columns):
            raise ValueError("End of initialization out of matrix.")
        
        # Première case
        self.matrix[start[0], start[1]] = first_val
        
        # Remplissage de la première colonne jusqu'à la limite
        for i in range(start[0] + 1, end[0] + 1):
            self.matrix[i, start[1]] = self.matrix[i - 1, start[1]] + self.gap + get_score(i, start[1])

        # Remplissage de la première ligne jusqu'à la limite
        for j in range(start[1] + 1, end[1] + 1):
            self.matrix[start[0], j] = self.matrix[start[0], j - 1] + self.gap + get_score(start[0], j)

class LowLevelMatrix(DynamicMatrix):
    aa_codes = {
    'A': 'ALA', 'R': 'ARG', 'N': 'ASN', 'D': 'ASP', 'C': 'CYS',
    'Q': 'GLN', 'E': 'GLU', 'G': 'GLY', 'H': 'HIS', 'I': 'ILE',
    'L': 'LEU', 'K': 'LYS', 'M': 'MET', 'F': 'PHE', 'P': 'PRO',
    'S': 'SER', 'T': 'THR', 'W': 'TRP', 'Y': 'TYR', 'V': 'VAL'
    }
    
    def __init__(self, gap, frozen, distance, dope, sequence):
        lines = len(sequence)
        columns = len(distance)
        
        DynamicMatrix.__init__(self, lines, columns, gap)

        # Vérification du blocage de la case
        if (frozen['seq_id'] >= lines) or (frozen['seq_id'] < 0):
            raise ValueError("Frozen line index out of matrix.")
        if (frozen['pos_id'] >= columns) or (frozen['pos_id'] < 0):
            raise ValueError("Frozen column index out of matrix")

        # Récupération du résidu fixé
        frozen['seq_res'] = sequence[frozen['seq_id']]
        
        self.frozen = frozen
        self.distance = distance
        self.dope = dope
        self.sequence = sequence

    def round_distance(self, dist):
        # arrondi au quart le plus proche
        rounded_value = round(dist * 4) / 4
        
        # ne garde que 0.25 ou 0.75
        decimal = rounded_value % 1
        if decimal == 0.0:
            return rounded_value + 0.25
        elif decimal == 0.5:
            return rounded_value + 0.25
        else:
            return rounded_value
    
    def get_score(self, i, j):
        dist = self.distance[self.frozen["pos_id"], j]
        closest_dist = self.round_distance(dist)

        score = self.dope.loc[(self.dope['res1'] == self.aa_codes[self.frozen['seq_res']]) & 
                              (self.dope['res2'] == self.aa_codes[self.sequence[i]]), 
                              closest_dist]
        
        return float(score.values[0])
    
    def fill_matrix(self):
        # Partie supérieure gauche
        self.initialize_matrix(self.get_score(0, 0), [0, 0], 
                               [self.frozen['seq_id'] - 1, self.frozen['pos_id'] - 1],
                               self.get_score)

        for i in range(1, self.frozen['seq_id']):
            for j in range(1, self.frozen['pos_id']):
                score = self.get_score(i, j)
                self.matrix[i, j] = score + min(self.matrix[i - 1, j - 1],
                                                self.matrix[i - 1, j] + self.gap,
                                                self.matrix[i, j - 1] + self.gap
                                               )

        # Case fixée
        if (self.frozen['seq_id'] == 0):
            self.matrix[self.frozen['seq_id'], self.frozen['pos_id']] = self.matrix[self.frozen['seq_id'], self.frozen['pos_id'] - 1]
        elif (self.frozen['pos_id'] == 0):
            self.matrix[self.frozen['seq_id'], self.frozen['pos_id']] = self.matrix[self.frozen['seq_id'] - 1, self.frozen['pos_id']]
        else :
            self.matrix[self.frozen['seq_id'], self.frozen['pos_id']] = self.matrix[self.frozen['seq_id'] - 1, self.frozen['pos_id'] - 1]

        # Partie inférieure droite (si elle existe)
        if (self.frozen['seq_id'] != self.lines - 1) and (self.frozen['pos_id'] != self.columns - 1):
            self.initialize_matrix(self.matrix[self.frozen['seq_id'], self.frozen['pos_id']] +
                                   self.get_score(self.frozen['seq_id'] + 1, self.frozen['pos_id'] + 1),
                                   [self.frozen['seq_id'] + 1, self.frozen['pos_id'] + 1],
                                   [self.lines - 1, self.columns - 1], self.get_score
                                  )

            if (self.frozen['seq_id'] != self.lines - 2) and (self.frozen['pos_id'] != self.columns - 2):
                for i in range(self.frozen['seq_id'] + 2, self.lines):
                    for j in range(self.frozen['pos_id'] + 2, self.columns):
                        score = self.get_score(i, j)
                        self.matrix[i, j] = score + min(self.matrix[i - 1, j - 1],
                                                        self.matrix[i - 1, j] + self.gap,
                                                        self.matrix[i, j - 1] + self.gap
                                                       )
                        max_score = self.matrix[i, j]
            else :
                max_score = self.matrix[self.lines - 1, self.columns - 1]

        else :
            max_score = self.matrix[self.frozen['seq_id'], self.frozen['pos_id']]
        
        return max_score

## Nouveau code

In [2]:
# Matrice de distance
PDB_FILE = "../data/5awl.pdb"
TEMPLATE = Template(PDB_FILE)
DIST_MATRIX = TEMPLATE.build_dist_matrix()
    
# Matrice DOPE
DOPE_FILE = "../data/dope.par"
DOPE_MATRIX = clean_DOPE_data(DOPE_FILE)
    
FROZEN = {'seq_id': 5, 'pos_id': 5}
SEQUENCE = "YYDPETGTWY"
GAP = 0
    
LOW_TEST = LowLevelMatrix(GAP, FROZEN, DIST_MATRIX, DOPE_MATRIX, SEQUENCE)
MAX_SCORE = LOW_TEST.fill_matrix()
print(MAX_SCORE)

-15.620000000000001


In [90]:
class HighLevelMatrix(DynamicMatrix):
    def __init__(self, gap, distance, dope, sequence):
        lines = len(sequence)
        columns = len(distance)

        DynamicMatrix.__init__(self, lines, columns, gap)

        self.distance = distance
        self.dope = dope
        self.sequence = sequence

        self.get_score_matrix()

    def get_score_matrix(self):
        self.score_matrix = np.zeros((self.lines, self.columns))
        for i in range(self.lines):
            for j in range(self.columns):
                frozen = {'seq_id': i, 'pos_id': j}
                low_level = LowLevelMatrix(self.gap, frozen, self.distance, self.dope, self.sequence)
                self.score_matrix[i, j] =  low_level.fill_matrix()

    def get_score(self, i, j):
        score = self.score_matrix[i, j]
        return score

    def fill_matrix(self):
        # Initialisation
        self.initialize_matrix(self.get_score(0, 0), [0, 0], 
                               [self.lines - 1, self.columns - 1],
                               self.get_score)

        # Remplissage
        for i in range(1, self.lines):
            for j in range(1, self.columns):
                score = self.get_score(i, j)
                self.matrix[i, j] = score + min(self.matrix[i - 1, j - 1],
                                                self.matrix[i - 1, j] + self.gap,
                                                self.matrix[i, j - 1] + self.gap
                                               )
        max_score = self.matrix[self.lines - 1, self.columns - 1]

        return max_score

    def get_alignment(self):
        structure_align = []
        sequence_align = []
        
        i = self.lines - 1
        j = self.columns - 1
        while not ((i == 0) and (j == 0)):
            print(i, j)
            square = self.matrix[i, j]
            score = self.score_matrix[i, j]
            # Match
            if (square == self.matrix[i - 1, j - 1] + score):
                print("match")
                structure_align.insert(0, j + 1)
                sequence_align.insert(0, self.sequence[i])
                i = i - 1
                j = j - 1
            # Gap
            else:
                if (square == self.matrix[i - 1, j] + score + self.gap):
                    print("gap structure")
                    structure_align.insert(0, '-')
                    sequence_align.insert(0, self.sequence[i])
                    i = i - 1
                elif (square == self.matrix[i, j - 1] + score + self.gap):
                    print("gap sequence")
                    structure_align.insert(0, j + 1)
                    sequence_align.insert(0, '-')
                    j = j - 1

        return ''.join(sequence_align), ''.join(str(x) for x in structure_align)

In [91]:
# Matrice de distance
PDB_FILE = "../data/5awl.pdb"
TEMPLATE = Template(PDB_FILE)
DIST_MATRIX = TEMPLATE.build_dist_matrix()

# Matrice DOPE
DOPE_FILE = "../data/dope.par"
DOPE_MATRIX = clean_DOPE_data(DOPE_FILE)
    
FROZEN = {'seq_id': 5, 'pos_id': 5}
SEQUENCE = "YYDPETGTWY"
GAP = 0

HIGH_TEST = HighLevelMatrix(GAP, DIST_MATRIX, DOPE_MATRIX, SEQUENCE)
print(HIGH_TEST.score_matrix)

[[-14.84 -16.55 -17.36 -16.48 -15.89 -16.71 -17.42 -16.63 -16.08  -2.54]
 [ -3.43 -15.14 -15.95 -15.07 -14.48 -15.3  -16.01 -15.22 -14.67  -2.54]
 [  7.45 -15.41 -16.61 -16.05 -16.16 -16.4  -16.92 -16.29 -15.    -3.74]
 [ 18.69 -16.11 -16.96 -16.74 -16.39 -16.69 -17.04 -16.4  -15.84  -6.61]
 [ 31.46 -15.76 -16.4  -15.63 -15.87 -16.1  -16.59 -16.04 -14.94  -6.93]
 [ 42.89 -15.24 -16.1  -15.54 -15.29 -15.62 -16.33 -15.49 -14.82  -9.05]
 [ 54.67 -15.45 -16.06 -15.27 -15.   -15.47 -16.12 -15.57 -14.87 -10.19]
 [ 66.11 -15.24 -16.1  -15.54 -15.29 -15.62 -16.33 -15.49 -14.82 -12.27]
 [ 77.49 -14.6  -15.31 -14.72 -14.58 -14.8  -15.47 -14.75 -14.17 -12.97]
 [ 90.   -13.81 -13.85 -14.07 -14.29 -14.5  -15.74 -15.18 -14.67 -14.94]]


In [92]:
print(HIGH_TEST.matrix)
max_score = HIGH_TEST.fill_matrix()
print(HIGH_TEST.matrix)
print(max_score)

[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
[[ -14.84  -31.39  -48.75  -65.23  -81.12  -97.83 -115.25 -131.88 -147.96
  -150.5 ]
 [ -18.27  -46.53  -64.7   -80.3   -95.6  -113.13 -131.26 -147.1  -162.63
  -165.17]
 [ -10.82  -61.94  -81.31  -97.36 -113.52 -129.92 -148.18 -164.47 -179.47
  -183.21]
 [   7.87  -78.05  -98.27 -115.01 -131.4  -148.09 -165.22 -181.62 -197.46
  -204.07]
 [  39.33  -93.81 -114.67 -130.64 -147.27 -164.19 -181.81 -197.85 -212.79
  -219.72]
 [  82.22 -109.05 -130.77 -146.31 -162.56 -179.81 -198.14 -213.63 -228.45
  -237.5 ]
 [ 136.89 -124.5  -146.83 -162.1  -177.56 -195.28 -214.26 -229.83 -244.7
  -254.89]
 [ 203.   -139.74 -162.93 -178.47 -193.76 -210.9  -230.59 -246.08 -260.9
  

In [93]:
print(HIGH_TEST.matrix[9, 9])
print(HIGH_TEST.matrix[8, 8])
print(HIGH_TEST.score_matrix[9, 9])
print(HIGH_TEST.matrix[9, 8] + HIGH_TEST.score_matrix[9, 9])

-306.59000000000003
-275.07000000000005
-14.940000000000001
-306.59000000000003


In [94]:
seq, struct = HIGH_TEST.get_alignment()
print(seq, struct)

9 9
gap sequence
9 8
gap sequence
9 7
gap sequence
9 6
gap structure
8 6
gap structure
7 6
gap structure
6 6
gap structure
5 6
gap structure
4 6
gap structure
3 6
gap structure
2 6
gap structure
1 6
gap structure
0 6
gap sequence
0 5
gap sequence
0 4
gap sequence
0 3
gap sequence
0 2
gap sequence
0 1
gap sequence
Y------YDPETGTWY--- 1234567---------8910


In [95]:
print(seq)
print(struct)

Y------YDPETGTWY---
1234567---------8910


In [96]:
print(HIGH_TEST.sequence)

YYDPETGTWY
