# Dataset
1. E3-substrates-interactions file is downloaded from : http://ubibrowser.bio-it.cn/ubibrowser_v3/home/download
2. Collected_degrons file is downloaded from: http://degron.phasep.pro/download/ 

In [1]:
import pandas as pd

In [4]:
'''Convert E3-substrates-interactions text file to Excel file'''
# read in the text file using pandas
df = pd.read_csv('../datasets/E3-substrates-interactions.txt', delimiter='\t')

# write the dataframe to an Excel file
df.to_excel('../datasets/E3-substrates-interactions.xlsx', index=False)

# Preprocessing
1. Read the datasets
2. Remove the rows with nan or invalid IDs on Entry column in degrons, and unreviewd on their Status column
3. Remove the rows with nan or invalid IDs on both SwissProt AC (Substrate) and SwissProt AC (E3) columns
4. Merge them together on Entry to a final dataset
5. Keep needed columns: SwissProt AC (Substrate), SwissProt AC (E3), protein seq, start, end, deg_seq, E3_sequence
6. Explode the rows containing # in their E3
7. Add E3 sequences to the dataframe
8. Groupby the final dataset to merge rows with the same substrate 

In [5]:
# Read in the xlsx files using pandas
degrons = pd.read_excel('../datasets/collected_degrons.xlsx')
interactions = pd.read_excel('../datasets/E3-substrates-interactions.xlsx')
E3_sequences = pd.read_excel('../datasets/uniprot-E3.xlsx')

# Remove rows from both dataframes that have NaN or invalid ids
degrons = degrons[(degrons['Entry'].notnull()) & (degrons['Entry'] != '-')]
degrons = degrons[degrons['Status'] != 'unreviewd']

# Remove the rows with nan or invalid IDs on both SwissProt AC (Substrate) and SwissProt AC (E3) columns
interactions = interactions[(interactions['SwissProt AC (Substrate)'].notnull()) & (interactions['SwissProt AC (Substrate)'] != '-')]
interactions = interactions[(interactions['SwissProt AC (E3)'].notnull()) & (interactions['SwissProt AC (E3)'] != '-')]

# Merge them together on Entry to a final dataset
final = pd.merge(degrons, interactions, left_on='Entry', right_on='SwissProt AC (Substrate)')
final = pd.merge(final, E3_sequences, left_on='SwissProt AC (E3)', right_on='Entry')

# Keep needed columns: SwissProt AC (Substrate), SwissProt AC (E3), protein seq, start, end, deg_seq
final = final[['SwissProt AC (Substrate)', 'SwissProt AC (E3)', 'protein seq', 'start', 'end', 'deg_seq', 'E3_sequence']]
final.columns = ['substrate', 'E3', 'substrate protein seq', 'start', 'end', 'degron_seq', 'E3_sequence']
final = final.assign(E3=final['E3'].str.split('#')).explode('E3')
final = final.drop_duplicates(subset=['substrate', 'E3'], keep='first')

# Creat a map to store degrons and their corresponding substrates
degron_substrate = {}
for index, row in final.iterrows():
    if row['degron_seq'] not in degron_substrate:
        degron_substrate[row['degron_seq']] = row['substrate']
        
#Save the degron_substrate dictionary to a csv file
with open('../datasets/degron_substrate.csv', 'w') as f:
    for key in degron_substrate.keys():
        f.write("%s,%s\n"%(key,degron_substrate[key]))

# Creat a dictionary to store the E3s and their corresponding E3 sequences
E3_sequence = {}
for index, row in final.iterrows():
    if row['E3'] not in E3_sequence:
        E3_sequence[row['E3']] = row['E3_sequence']

# Save the E3_sequence dictionary to a csv file
with open('../datasets/E3_sequence.csv', 'w') as f:
    for key in E3_sequence.keys():
        f.write("%s,%s\n"%(key,E3_sequence[key]))

# Groupby the final dataset to merge rows with the same substrate 
final = final.groupby(['substrate', 'substrate protein seq', 'start', 'end', 'degron_seq']).agg({'E3': lambda x: ';'.join(x), 'E3_sequence': lambda x: ';'.join(x)}).reset_index()

# Write the final dataset to an Excel file
final.to_excel('../datasets/final_dataset.xlsx', index=False)

In [9]:
# Test and train split
from sklearn.model_selection import train_test_split
train, test = train_test_split(final, test_size=0.2, random_state=42)

# Remove substrate protein seq, start, end columns from both train and test datasets
train = train.drop(columns=['substrate protein seq', 'start', 'end'])
test = test.drop(columns=['substrate protein seq', 'start', 'end'])

#Print the size of the train and test datasets
print(train.shape)
print(test.shape)

# Write the train and test datasets to Excel files
train.to_excel('../datasets/train.xlsx', index=False)
test.to_excel('../datasets/test.xlsx', index=False)

# Create a dictionary to store the degrons and their corresponding E3s in the train dataset
train_degron_E3 = {}
for index, row in train.iterrows():
    if row['degron_seq'] not in train_degron_E3:
        train_degron_E3[row['degron_seq']] = row['E3_sequence'].split(';')

# Save the train_degron_E3 dictionary to a csv file
with open('../datasets/train_degron_E3.csv', 'w') as f:
    for key in train_degron_E3.keys():
        f.write("%s,%s\n"%(key,train_degron_E3[key]))

# Create a dictionary to store the degrons and their corresponding E3s in the test dataset
test_degron_E3 = {}
for index, row in test.iterrows():
    if row['degron_seq'] not in test_degron_E3:
        test_degron_E3[row['degron_seq']] = row['E3_sequence'].split(';')

# Save the test_degron_E3 dictionary to a csv file
with open('../datasets/test_degron_E3.csv', 'w') as f:
    for key in test_degron_E3.keys():
        f.write("%s,%s\n"%(key,test_degron_E3[key]))



(112, 4)
(29, 4)


In [10]:
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import T5Tokenizer, T5ForConditionalGeneration, Adafactor, Seq2SeqTrainer, Seq2SeqTrainingArguments
import pandas as pd

# Load the training data
train_data = pd.read_csv("train_degron_E3.csv", names=["degron", "E3"])
train_degrons = train_data["degron"].tolist()
train_E3s = train_data["E3"].tolist()

# Load the test data
test_data = pd.read_csv("test_degron_E3.csv", names=["degron", "E3"])
test_degrons = test_data["degron"].tolist()
test_E3s = test_data["E3"].tolist()

# Initialize the tokenizer and the model
tokenizer = T5Tokenizer.from_pretrained('t5-base')
model = T5ForConditionalGeneration.from_pretrained('t5-base')

# Tokenize the training and test data
train_encodings = tokenizer(train_degrons, train_E3s, padding=True, truncation=True, return_tensors='pt')
test_encodings = tokenizer(test_degrons, test_E3s, padding=True, truncation=True, return_tensors='pt')

# Create the dataset objects for PyTorch
train_dataset = TensorDataset(train_encodings['input_ids'], train_encodings['attention_mask'], train_encodings['labels'])
test_dataset = TensorDataset(test_encodings['input_ids'], test_encodings['attention_mask'], test_encodings['labels'])

# Set training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    logging_dir='./logs',
    logging_steps=10,
    save_steps=100,
    evaluation_strategy="steps",
    eval_steps=100,
    learning_rate=1e-3,
    weight_decay=0.01,
)

# Initialize the trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    data_collator=None,
)

# Train the model
trainer.train()

# Save the model
model.save_pretrained('./trained_model')

  from .autonotebook import tqdm as notebook_tqdm


: 

: 