In [None]:
import numpy as np
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

from torchvision import datasets
from torchvision.transforms import ToTensor
import torch.nn.functional as F
from torch.utils.data import TensorDataset

In [None]:
def parse_secondary_structure(lines):
    """Split the raw dataset lines into per-protein residue and label lists.

    Lines beginning with "<" or "e" are treated as record delimiters. Every
    other line inside a record is expected to hold two whitespace-separated
    tokens: the residue symbol followed by its structure label.

    Args:
        lines: iterable of text lines (for example an open file handle).

    Returns:
        A ``(data, target)`` pair of parallel lists. ``data[k]`` is the list of
        residue symbols of record ``k`` and ``target[k]`` the matching labels.
    """
    data, target = [], []
    residues, labels = [], []
    in_record = False

    for raw_line in lines:
        line = raw_line.strip()
        closes_record = line.startswith("e") or line.startswith("<end")
        opens_record = line.startswith("<") and not closes_record

        if opens_record or closes_record:
            # Save whatever was collected before resetting the buffers, so a
            # delimiter met while a record is still open never discards it.
            if in_record and residues:
                data.append(residues)
                target.append(labels)
            residues, labels = [], []
            in_record = opens_record
            continue

        if not in_record:
            # Header or comment lines before the first record are ignored.
            continue

        parts = line.split()
        if len(parts) < 2:
            # Blank or malformed line: skip it instead of raising IndexError.
            continue
        residues.append(parts[0])
        labels.append(parts[1])

    # Keep the last record when the file ends without a closing delimiter.
    if in_record and residues:
        data.append(residues)
        target.append(labels)

    return data, target


with open("./protein-secondary-structure.train", "r") as f:
    data, target = parse_secondary_structure(f)

# Print a short summary rather than the full dataset.
print(f"{len(data)} sequences, {sum(len(seq) for seq in data)} residues")


[['G', 'V', 'G', 'T', 'V', 'P', 'M', 'T', 'D', 'Y', 'G', 'N', 'D', 'V', 'E', 'Y', 'Y', 'G', 'Q', 'V', 'T', 'I', 'G', 'T', 'P', 'G', 'K', 'S', 'F', 'N', 'L', 'N', 'F', 'D', 'T', 'G', 'S', 'S', 'N', 'L', 'W', 'V', 'G', 'S', 'V', 'Q', 'C', 'Q', 'A', 'S', 'G', 'C', 'K', 'G', 'G', 'R', 'D', 'K', 'F', 'N', 'P', 'S', 'D', 'G', 'S', 'T', 'F', 'K', 'A', 'T', 'G', 'Y', 'D', 'A', 'S', 'I', 'G', 'Y', 'G', 'D', 'G', 'S', 'A', 'S', 'G', 'V', 'L', 'G', 'Y', 'D', 'T', 'V', 'Q', 'V', 'G', 'G', 'I', 'D', 'V', 'T', 'G', 'G', 'P', 'Q', 'I', 'Q', 'L', 'A', 'Q', 'R', 'L', 'G', 'G', 'G', 'G', 'F', 'P', 'G', 'D', 'N', 'D', 'G', 'L', 'L', 'G', 'L', 'G', 'F', 'D', 'T', 'L', 'S', 'I', 'T', 'P', 'Q', 'S', 'S', 'T', 'N', 'A', 'F', 'D', 'Q', 'V', 'S', 'A', 'Q', 'G', 'K', 'V', 'I', 'Q', 'P', 'V', 'F', 'V', 'V', 'Y', 'L', 'A', 'A', 'S', 'N', 'I', 'S', 'D', 'G', 'D', 'F', 'T', 'M', 'P', 'G', 'W', 'I', 'D', 'N', 'K', 'Y', 'G', 'G', 'T', 'L', 'L', 'N', 'T', 'N', 'I', 'D', 'A', 'G', 'E', 'G', 'Y', 'W', 'A', 'L', 'N', 'V'

In [None]:
# Build one fixed-width context window per residue.
window_size = 3  # residues of context kept on each side of the centre residue
window_span = 2 * window_size + 1
edge_padding = [''] * window_size  # '' fills positions beyond either end of a sequence

data_with_window = []
for seq in data:
    padded_seq = edge_padding + seq + edge_padding
    # Window number `start` is centred on residue `start` of the unpadded sequence.
    data_with_window.extend(
        padded_seq[start:start + window_span] for start in range(len(seq))
    )

print(len(data_with_window))

# Flatten the per-sequence label lists into one list with one label per residue.
targets = [label for seq_labels in target for label in seq_labels]
len(targets)

15104


15104

In [None]:
# Collect the distinct residue symbols, in order of first appearance, so they can be encoded.
# dict.fromkeys keeps insertion order and silently ignores repeats.
uniq_labels = list(dict.fromkeys(residue for seq in data for residue in seq))
# Label encoding
def get_target_dict(labels=None):
  """Map each residue symbol to an integer code.

  Code 0 is reserved for the empty string '' (the padding symbol); the given
  labels are numbered from 1 in the order they are supplied.

  Args:
    labels: iterable of symbols to encode. When omitted, the notebook-level
      ``uniq_labels`` list is used, which keeps existing calls working.

  Returns:
    dict mapping symbol -> integer code.
  """
  if labels is None:
    labels = uniq_labels
  target_dict = {'': 0}
  for code, label in enumerate(labels, start=1):
    target_dict[label] = code
  return target_dict
# Build the symbol -> integer code lookup table and print it for inspection.
acides = get_target_dict()
print(acides)

{'': 0, 'G': 1, 'V': 2, 'T': 3, 'P': 4, 'M': 5, 'D': 6, 'Y': 7, 'N': 8, 'E': 9, 'Q': 10, 'I': 11, 'K': 12, 'S': 13, 'F': 14, 'L': 15, 'W': 16, 'C': 17, 'A': 18, 'R': 19, 'H': 20}


In [None]:
# Preview the first few windows only; displaying the whole list floods the notebook.
data_with_window[:10]

[['', '', '', 'G', 'V', 'G', 'T'],
 ['', '', 'G', 'V', 'G', 'T', 'V'],
 ['', 'G', 'V', 'G', 'T', 'V', 'P'],
 ['G', 'V', 'G', 'T', 'V', 'P', 'M'],
 ['V', 'G', 'T', 'V', 'P', 'M', 'T'],
 ['G', 'T', 'V', 'P', 'M', 'T', 'D'],
 ['T', 'V', 'P', 'M', 'T', 'D', 'Y'],
 ['V', 'P', 'M', 'T', 'D', 'Y', 'G'],
 ['P', 'M', 'T', 'D', 'Y', 'G', 'N'],
 ['M', 'T', 'D', 'Y', 'G', 'N', 'D'],
 ['T', 'D', 'Y', 'G', 'N', 'D', 'V'],
 ['D', 'Y', 'G', 'N', 'D', 'V', 'E'],
 ['Y', 'G', 'N', 'D', 'V', 'E', 'Y'],
 ['G', 'N', 'D', 'V', 'E', 'Y', 'Y'],
 ['N', 'D', 'V', 'E', 'Y', 'Y', 'G'],
 ['D', 'V', 'E', 'Y', 'Y', 'G', 'Q'],
 ['V', 'E', 'Y', 'Y', 'G', 'Q', 'V'],
 ['E', 'Y', 'Y', 'G', 'Q', 'V', 'T'],
 ['Y', 'Y', 'G', 'Q', 'V', 'T', 'I'],
 ['Y', 'G', 'Q', 'V', 'T', 'I', 'G'],
 ['G', 'Q', 'V', 'T', 'I', 'G', 'T'],
 ['Q', 'V', 'T', 'I', 'G', 'T', 'P'],
 ['V', 'T', 'I', 'G', 'T', 'P', 'G'],
 ['T', 'I', 'G', 'T', 'P', 'G', 'K'],
 ['I', 'G', 'T', 'P', 'G', 'K', 'S'],
 ['G', 'T', 'P', 'G', 'K', 'S', 'F'],
 ['T', 'P', 'G', '

In [None]:
# One-hot encode every window into a (windows, positions, symbols) tensor of zeros and ones.
n_windows = len(data_with_window)
window_len = len(data_with_window[0])
train_one_hot = torch.zeros(n_windows, window_len, len(acides))
train_without_ON = []

for row, window in enumerate(data_with_window):
    for col, residue in enumerate(window):
        # Symbols missing from the lookup table leave their position all-zero.
        if residue in acides:
            train_one_hot[row, col, acides[residue]] = 1


def labelencoder(y):
  """Convert structure labels into integer class indices.

  '_' maps to 0, 'e' maps to 1 and every other label maps to 2.

  Args:
    y: iterable of label strings.

  Returns:
    1-D ``torch.long`` tensor with one class index per input label. The dtype
    is forced so that an empty input still yields an integer tensor instead of
    the float32 default.
  """
  class_index = {'_': 0, 'e': 1}
  encoded_targets = [class_index.get(label, 2) for label in y]
  return torch.tensor(encoded_targets, dtype=torch.long)

# Show only a short prefix of the raw labels; the full list has one entry per residue.
print(targets[:20])

# Encode once and reuse the result for both display and training.
y_train = labelencoder(targets)
print(y_train)
print(train_one_hot.shape)

# Number of distinct structure classes present in the labels.
num_classes = len(set(targets))

# The labels stay as integer class indices rather than one-hot vectors:
# nn.CrossEntropyLoss accepts class-index targets directly.
print(y_train.shape)

['_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', 'e', 'e', '_', '_', '_', '_', '_', '_', 'e', 'e', '_', '_', 'e', 'e', 'e', '_', '_', '_', '_', '_', '_', 'e', 'e', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', 'e', 'e', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', 'e', 'e', 'e', 'e', 'e', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', 'e', 'e', 'e', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', 'h', 'h', 'h', 'h', 'h', 'h', 'h', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', 'e', 'e', 'e', '_', '_', '_', '_', '_', 'e', 'e', 'e', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', 'e', 'e', '_',

In [None]:
def getDataset():
  """Pair the one-hot encoded windows with their integer class labels.

  Reads the notebook-level ``train_one_hot`` and ``y_train`` tensors and
  returns them wrapped in a single ``TensorDataset``.
  """
  features, labels = train_one_hot, y_train
  return TensorDataset(features, labels)
  

In [None]:
# Wrap the encoded windows and their labels in one dataset object and display it.
train_dataset = getDataset()
train_dataset

<torch.utils.data.dataset.TensorDataset at 0x7f944cf463a0>

In [None]:
from torch.utils.data import Subset, random_split

# Split a freshly built full dataset rather than `train_dataset` itself.
# Rebinding `train_dataset` to its own subset made this cell non-idempotent:
# each re-run shrank the training set further.
full_dataset = getDataset()
dataset_size = len(full_dataset)
train_size = int(0.8 * dataset_size)
val_size = dataset_size - train_size

# A seeded generator gives the same 80/20 partition on every run.
split_generator = torch.Generator().manual_seed(42)

# NOTE(review): windows from the same protein overlap, so a per-window random
# split can place near-duplicate inputs on both sides. Consider splitting per
# sequence instead.
train_dataset, val_dataset = random_split(
    full_dataset, [train_size, val_size], generator=split_generator
)


In [None]:
# Batches of 64 samples; only the training loader is shuffled.
train_dataloader = DataLoader(dataset=train_dataset, batch_size=64, shuffle=True)
val_dataloader = DataLoader(dataset=val_dataset, batch_size=64)

# Inspect a single batch to confirm tensor shapes and dtypes.
for X, y in train_dataloader:
    # X is 3-D (batch, window position, one-hot symbol), not an image batch,
    # so the label no longer reads [N, C, H, W].
    print(f"Shape of X [N, window, symbols]: {X.shape} {X.dtype}")
    print(f"Shape of y: {y.shape} {y.dtype}")
    break

Shape of X [N, C, H, W]: torch.Size([64, 7, 21]) torch.float32
Shape of y: torch.Size([64]) torch.int64


# MLP

In [None]:
# Choose the compute device: CUDA first, then Apple MPS, otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(f"Using {device} device")

# Define model
class NeuralNetwork(nn.Module):
    """Fully connected classifier over a flattened window of one-hot symbols.

    Args:
        window_len: number of positions in each input window (default 7).
        n_symbols: size of the one-hot alphabet per position (default 21).
        n_classes: number of output classes (default 3).

    With the defaults the architecture is unchanged: 147 -> 128 -> 128 -> 256 -> 3.
    """

    def __init__(self, window_len=7, n_symbols=21, n_classes=3):
        super().__init__()
        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(window_len * n_symbols, 128),
            nn.ReLU(),
            # Dropout is only active in training mode (model.train()).
            nn.Dropout(p=0.5),
            nn.Linear(128, 128),
            nn.ReLU(),
            nn.Linear(128, 256),
            nn.ReLU(),
            nn.Linear(256, n_classes)
        )

    def forward(self, x):
        """Return raw class logits of shape (batch, n_classes) for input ``x``."""
        x = self.flatten(x)
        logits = self.linear_relu_stack(x)
        return logits

# Instantiate the network, move its parameters to the selected device and print its layout.
model = NeuralNetwork().to(device)
print(model)

Using cuda device
NeuralNetwork(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=147, out_features=128, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.5, inplace=False)
    (3): Linear(in_features=128, out_features=128, bias=True)
    (4): ReLU()
    (5): Linear(in_features=128, out_features=256, bias=True)
    (6): ReLU()
    (7): Linear(in_features=256, out_features=3, bias=True)
  )
)


In [None]:
import torch.optim as optim

# Cross-entropy on raw logits, optimised with plain SGD (no momentum).
# NOTE(review): the recorded validation accuracy stays constant across epochs,
# so this learning rate / optimiser choice may be worth revisiting.
LEARNING_RATE = 0.01
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.SGD(params=model.parameters(), lr=LEARNING_RATE)

In [None]:
num_epochs = 150
loss_history = []      # mean validation loss, one entry per completed epoch
accuracy_history = []  # validation accuracy in percent, one entry per completed epoch
for epoch in range(num_epochs):
    # Training mode enables dropout for the optimisation steps.
    model.train()
    for i, (X, y) in enumerate(train_dataloader):

        X, y = X.to(device), y.to(device)

        # Forward pass
        outputs = model(X)
        loss = loss_fn(outputs, y)

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if i % 100 == 0:
           print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                  epoch, i * len(X), len(train_dataloader.dataset),
                  100. * i / len(train_dataloader), loss.item()))

    # Evaluation mode disables dropout, so the validation metrics are
    # deterministic and reflect the full network.
    model.eval()
    correct = 0
    total = 0
    val_loss_sum = 0.0
    with torch.no_grad():
        for X_val, y_val in val_dataloader:
            # Evaluate the model on the validation data
            X_val, y_val = X_val.to(device), y_val.to(device)
            val_outputs = model(X_val)
            # loss_fn returns the batch mean; weight it by the batch size so
            # the epoch average stays exact when the last batch is smaller.
            val_loss_sum += loss_fn(val_outputs, y_val).item() * y_val.size(0)
            predicted = val_outputs.argmax(dim=1)
            total += y_val.size(0)
            correct += (predicted == y_val).sum().item()
    loss_history.append(val_loss_sum / total)
    accuracy = 100 * correct / total
    accuracy_history.append(accuracy)
    print('Epoch [{}/{}], Accuracy: {:.2f}%'
          .format(epoch+1, num_epochs, accuracy))

Epoch [1/150], Accuracy: 55.58%
Epoch [2/150], Accuracy: 55.58%
Epoch [3/150], Accuracy: 55.58%
Epoch [4/150], Accuracy: 55.58%
Epoch [5/150], Accuracy: 55.58%
Epoch [6/150], Accuracy: 55.58%
Epoch [7/150], Accuracy: 55.58%
Epoch [8/150], Accuracy: 55.58%
Epoch [9/150], Accuracy: 55.58%
Epoch [10/150], Accuracy: 55.58%
Epoch [11/150], Accuracy: 55.58%
Epoch [12/150], Accuracy: 55.58%
Epoch [13/150], Accuracy: 55.58%
Epoch [14/150], Accuracy: 55.58%
Epoch [15/150], Accuracy: 55.58%
Epoch [16/150], Accuracy: 55.58%
Epoch [17/150], Accuracy: 55.58%
Epoch [18/150], Accuracy: 55.58%
Epoch [19/150], Accuracy: 55.58%
Epoch [20/150], Accuracy: 55.58%
Epoch [21/150], Accuracy: 55.58%
Epoch [22/150], Accuracy: 55.58%
Epoch [23/150], Accuracy: 55.58%
Epoch [24/150], Accuracy: 55.58%
Epoch [25/150], Accuracy: 55.58%
Epoch [26/150], Accuracy: 55.58%
Epoch [27/150], Accuracy: 55.58%
Epoch [28/150], Accuracy: 55.58%
Epoch [29/150], Accuracy: 55.58%
Epoch [30/150], Accuracy: 55.58%
Epoch [31/150], Acc

KeyboardInterrupt: ignored

In [None]:
import matplotlib.pyplot as plt

# Build the x axis from the recorded history, not from the loop variable `epoch`.
# If training was interrupted mid-epoch, `epoch + 1` is one larger than the
# number of recorded accuracies and plt.plot would raise a shape mismatch.
xlabel = list(range(len(accuracy_history)))

plt.plot(xlabel, accuracy_history, c='b')
plt.xlabel("Epochs")
plt.ylabel("Accuracy")
plt.title("accuracy du modèle en fonction du nombre d'epoch ( Couche 5 )")
plt.show()