In [1]:
import pandas as pd

In [2]:
# Get the embeddings
embeddings = pd.read_csv("data/OsmoticStress_with_binary_positions_and_embeddings.csv", index_col=0)
embeddings = embeddings.drop(columns=["full_sequence"])

# Get the dataset
df = pd.read_csv('data/OsmoticStress_with_binary_positions.csv')
df.drop(columns=['full_sequence', 'Peptide_sequence'], inplace=True)
df.set_index('Uniprot_ID', inplace=True)

# Merge the two and get the final dataset
dataset = df.merge(embeddings, left_index=True, right_index=True)

In [12]:
dataset.head()

Unnamed: 0_level_0,Log2FC(LiP_norm),Binary_Positions,full_embedding
Uniprot_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A5Z2X5,-0.153604,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[8848178386688232, 7881519943475723, 428928285..."
A5Z2X5,-1.869623,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[8848178386688232, 7881519943475723, 428928285..."
D6VTK4,-0.008114,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[47565970569849014, 7273535244166851, 22035147..."
O13297,-0.374875,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[21353999618440866, 15769580379128456, 4766600..."
O13297,-0.207803,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[21353999618440866, 15769580379128456, 4766600..."


In [4]:
import re

def convert_to_list(number_string):
    cleaned_string = re.sub(r"[^\d,\s]", "", number_string)
    return list(map(int, cleaned_string.split(',')))

dataset['full_embedding'] = dataset['full_embedding'].apply(convert_to_list)
dataset['Binary_Positions'] = dataset['Binary_Positions'].apply(convert_to_list)

In [10]:
import torch
import torch.nn as nn
import numpy as np

embedding_array = np.array(dataset['full_embedding'].tolist(), dtype=np.float32)
full_embedding = torch.tensor(embedding_array)

position_embedding_dim = len(dataset.iloc[0, 2]) 
num_positions = 20
position_embedding = nn.Embedding(num_embeddings=num_positions, embedding_dim=position_embedding_dim)

position_indices = torch.tensor([x[0] for x in dataset['Binary_Positions']])

position_embed = position_embedding(position_indices)

combined_embeddings = position_embed + full_embedding

combined_embeddings_list = combined_embeddings.tolist()

In [11]:
# Create a new DataFrame using the original DataFrame's index
X = pd.DataFrame(combined_embeddings_list, index=dataset.index)

In [15]:
X.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,1270,1271,1272,1273,1274,1275,1276,1277,1278,1279
Uniprot_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A5Z2X5,8848179000000000.0,7881520000000000.0,4.289283e+16,1.710427e+16,7472561000000000.0,1246796000000000.0,2859221000000000.0,464781000000000.0,33868210000000.0,7752636000000000.0,...,5445937000000000.0,5.214862e+16,5950918000000000.0,2992804000000000.0,2.186712e+16,2.490647e+16,9536843000000000.0,2.424401e+16,3110352000000000.0,3304118000000000.0
A5Z2X5,8848179000000000.0,7881520000000000.0,4.289283e+16,1.710427e+16,7472561000000000.0,1246796000000000.0,2859221000000000.0,464781000000000.0,33868210000000.0,7752636000000000.0,...,5445937000000000.0,5.214862e+16,5950918000000000.0,2992804000000000.0,2.186712e+16,2.490647e+16,9536843000000000.0,2.424401e+16,3110352000000000.0,3304118000000000.0
D6VTK4,4.756597e+16,7273535000000000.0,220351500000000.0,660804900000000.0,3326045000000000.0,7376782000000000.0,1.223982e+16,1.203883e+16,1.552944e+16,5348460000000000.0,...,5309712000000000.0,6676825000000000.0,7294690000000000.0,5.301362e+16,2983132000000000.0,3740367000000000.0,1.510385e+16,1.461948e+16,4.779517e+16,697859400000000.0
O13297,2.1354e+16,1.576958e+16,4.7666e+16,1.534467e+16,2267701000000000.0,5980504000000000.0,3956593000000000.0,9986967000000000.0,2456207000000000.0,3.738843e+16,...,9076984000000000.0,8298277000000000.0,4864088000000000.0,2892880000000000.0,2022724000000000.0,3994240000000000.0,6644006000000000.0,1.113248e+16,3.134675e+16,725402900000000.0
O13297,2.1354e+16,1.576958e+16,4.7666e+16,1.534467e+16,2267701000000000.0,5980504000000000.0,3956593000000000.0,9986967000000000.0,2456207000000000.0,3.738843e+16,...,9076984000000000.0,8298277000000000.0,4864088000000000.0,2892880000000000.0,2022724000000000.0,3994240000000000.0,6644006000000000.0,1.113248e+16,3.134675e+16,725402900000000.0


In [14]:
y = dataset['Log2FC(LiP_norm)']

In [16]:
y.head()

Uniprot_ID
A5Z2X5   -0.153604
A5Z2X5   -1.869623
D6VTK4   -0.008114
O13297   -0.374875
O13297   -0.207803
Name: Log2FC(LiP_norm), dtype: float64