In [1]:
import pandas as pd
import h5py

def write_embeddings_to_csv(embeddings_file, output_csv):
    """Flatten the embedding vectors stored in an HDF5 file into a CSV table.

    Every top-level entry of ``embeddings_file`` becomes one row. The first
    column, ``Protein_ID``, holds the entry's ``original_id`` attribute; the
    remaining columns ``Dim_0`` ... ``Dim_{n-1}`` hold the vector components.

    Parameters
    ----------
    embeddings_file : str or path-like
        HDF5 file, opened read-only with ``h5py.File``.
    output_csv : str or path-like
        Destination handed to ``DataFrame.to_csv``; written without the index.

    Raises
    ------
    ValueError
        If the file contains no entries, if an entry is not a
        one-dimensional vector, or if the vectors differ in length.
    KeyError
        If an entry has no ``original_id`` attribute.
    """
    rows = []
    num_dimensions = None

    with h5py.File(embeddings_file, 'r') as f:
        for new_identifier in f.keys():
            dataset = f[new_identifier]

            original_id = dataset.attrs["original_id"]
            # h5py can hand string attributes back as bytes (e.g. fixed-length
            # strings); decode so the CSV holds "Peptide_1", not "b'Peptide_1'".
            if isinstance(original_id, bytes):
                original_id = original_id.decode("utf-8")

            embedding = dataset[:]
            # A 2-D array would be unpacked row by row below and produce cells
            # containing whole arrays, so reject anything but a flat vector.
            if embedding.ndim != 1:
                raise ValueError(
                    f"Embedding for {original_id!r} has shape {embedding.shape}; "
                    "expected a one-dimensional vector."
                )

            # Fail on the first mismatch instead of after loading every vector.
            if num_dimensions is None:
                num_dimensions = embedding.shape[0]
            elif embedding.shape[0] != num_dimensions:
                raise ValueError("Embeddings have varying dimensions.")

            rows.append((original_id, *embedding))

    # Without this guard an empty file would surface as an opaque IndexError.
    if num_dimensions is None:
        raise ValueError(f"No embeddings found in {embeddings_file!r}.")

    # One ID column followed by one column per vector component.
    column_names = ["Protein_ID"] + [f"Dim_{i}" for i in range(num_dimensions)]

    pd.DataFrame(rows, columns=column_names).to_csv(output_csv, index=False)

# Usage: export the reduced embeddings from HDF5 into a flat CSV table
write_embeddings_to_csv(
    embeddings_file="reduced_embeddings_file.h5",
    output_csv="reduced_embeddings_file.csv",
)


In [2]:
import pandas as pd
# Load the exported CSV back into a DataFrame
df = pd.read_csv(filepath_or_buffer="reduced_embeddings_file.csv")

# Preview the first five rows (head's default) to verify the data
df.head(n=5)


Unnamed: 0,Protein_ID,Dim_0,Dim_1,Dim_2,Dim_3,Dim_4,Dim_5,Dim_6,Dim_7,Dim_8,...,Dim_1014,Dim_1015,Dim_1016,Dim_1017,Dim_1018,Dim_1019,Dim_1020,Dim_1021,Dim_1022,Dim_1023
0,Peptide_814,0.023768,0.029138,-0.053399,-0.01443,-0.376964,-0.385243,0.074914,-0.019825,-0.217285,...,0.225057,-0.043053,-0.006347,-0.023087,-0.015154,0.031613,-0.061779,-0.199246,0.07611,0.104706
1,Peptide_887,-0.170121,-0.170663,-0.028813,-0.121715,-0.425803,-0.504735,0.099278,0.103303,-0.263579,...,0.175611,0.000171,-0.000324,-0.052463,0.020018,0.091868,0.011549,-0.34237,0.106737,0.052733
2,Peptide_732,0.037726,-0.093002,0.083019,-0.150918,-0.111729,-0.451204,0.12267,0.127014,-0.317709,...,0.082577,0.0495,0.037831,-0.009632,-0.113377,-0.018344,-0.08677,0.033339,-0.009066,-0.021657
3,Peptide_3829,-0.014491,-0.079954,-0.046206,-0.069027,-0.021293,-0.020502,0.146769,-0.016016,-0.102209,...,0.063845,0.073178,0.062495,-0.027046,-0.151535,-0.010029,-0.005464,0.006473,0.122164,-0.011399
4,Peptide_4728,0.017241,-0.008655,0.110107,-0.100334,-0.021852,-0.052222,0.190858,0.091145,-0.070164,...,0.035286,0.043913,0.04393,-0.077229,0.016655,-0.019973,-0.109175,-0.011466,0.00519,-0.157366


In [3]:
import pandas as pd

# Load the unsorted embeddings table from reduced_embeddings_file.csv
df = pd.read_csv(filepath_or_buffer='reduced_embeddings_file.csv')

# Custom sorting function
def custom_sort(pid):
    """Return a ``(prefix, number)`` sort key for an ID such as ``"Peptide_12"``.

    The ID is split at its *last* underscore, so prefixes that themselves
    contain underscores (``"sp_P12345_7"``) are handled. The trailing part is
    converted with ``int`` so IDs order numerically rather than lexically
    (``Peptide_2`` before ``Peptide_10``).

    Parameters
    ----------
    pid : str
        Identifier of the form ``<prefix>_<integer>``.

    Returns
    -------
    tuple
        ``(prefix, number)`` with ``prefix`` a ``str`` and ``number`` an ``int``.

    Raises
    ------
    ValueError
        If ``pid`` contains no underscore or its suffix is not an integer.
    """
    prefix, separator, number = pid.rpartition('_')
    if not separator:
        raise ValueError(f"Protein ID {pid!r} has no '_<number>' suffix.")
    try:
        return (prefix, int(number))
    except ValueError:
        # Re-raise with the offending ID so a bad row is easy to locate.
        raise ValueError(
            f"Protein ID {pid!r} does not end in an integer suffix."
        ) from None

# Sort rows by the (prefix, number) key that custom_sort derives from
# 'Protein_ID'. assign() attaches the temporary key to a copy, so `df` itself
# is left unmodified and no helper column lingers on it after this cell runs.
df_sorted = (
    df.assign(sort_key=df['Protein_ID'].apply(custom_sort))
    .sort_values(by='sort_key')
    .drop(columns='sort_key')
    .reset_index(drop=True)
)

# Write the reordered DataFrame to reduced_embeddings_file_ordered.csv
df_sorted.to_csv('reduced_embeddings_file_ordered.csv', index=False)


In [4]:
import pandas as pd
# Load the ID-sorted CSV back into a DataFrame
df = pd.read_csv(filepath_or_buffer="reduced_embeddings_file_ordered.csv")

# Preview the first five rows (head's default) to verify the ordering
df.head(n=5)


Unnamed: 0,Protein_ID,Dim_0,Dim_1,Dim_2,Dim_3,Dim_4,Dim_5,Dim_6,Dim_7,Dim_8,...,Dim_1014,Dim_1015,Dim_1016,Dim_1017,Dim_1018,Dim_1019,Dim_1020,Dim_1021,Dim_1022,Dim_1023
0,Peptide_1,0.094093,0.047706,0.045044,-0.041677,0.096842,0.037831,0.036249,0.088294,0.101851,...,0.056638,-0.050206,0.001237,-0.092129,0.106827,-0.074483,0.065306,-0.03997,-0.028538,0.038047
1,Peptide_2,-0.047868,-0.069771,0.036386,-0.128784,-0.020473,0.04653,0.164928,0.043062,-0.09485,...,0.103799,0.037622,0.073234,-0.094746,-0.155867,0.051478,-0.07695,-0.14821,-0.022429,0.057351
2,Peptide_3,0.024743,-0.092542,-0.005405,-0.057481,-0.036577,0.09422,0.124477,0.054345,-0.083779,...,0.054807,-0.02125,0.055869,-0.122675,-0.086572,0.052538,-0.220704,-0.071421,-0.029833,0.032151
3,Peptide_4,-0.071675,-0.014999,0.087363,-0.104275,0.021018,-0.138768,0.167985,0.01361,0.085935,...,0.182361,0.082692,0.109379,-0.177485,-0.181027,-0.056001,-0.178252,-0.229457,0.169394,-0.02022
4,Peptide_5,-0.102082,0.009033,0.110319,-0.112409,-0.035802,-0.144515,0.169197,0.009977,0.093193,...,0.12375,0.061757,0.057145,-0.153632,-0.066486,-0.047409,-0.145986,-0.125834,0.064867,0.017166
