# ECFP+ Feature Generation

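This notebook takes the ECFP fingerprint representation and generates the ECFP+ dataset by adding the experimental pH and temperature (`T`) columns to the fingerprints. It then splits the result into train, validation, and test sets stratified by host type.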

In [None]:
import pandas as pd
import os

# Build the unsplit ECFP+ table: the pH and T columns from the enriched raw
# data followed by the ECFP fingerprint columns.

# Input locations
ecfp_path = "../data/interim/OpenCycloDB/ECFP/fingerprints.csv"
temp_path = "../data/raw/OpenCycloDB/Data/CDEnrichedData.csv"

# Output location. `output_path` is derived from `output_dir` so that the path
# recorded here is the path that is actually written (previously the two
# pointed at different directories and `output_path` was never used).
output_dir = "../data/interim/OpenCycloDB/ECFP_plus"
output_path = f"{output_dir}/ecfp_plus.csv"

# Load the datasets
ecfp_df = pd.read_csv(ecfp_path)
temp_df = pd.read_csv(temp_path)

# Keep only the experimental-condition columns
temp_df = temp_df[["pH", "T"]]

# pd.concat(axis=1) aligns rows on the index; if the two files had different
# lengths it would silently pad the shorter one with NaN instead of failing.
if len(temp_df) != len(ecfp_df):
    raise ValueError(
        f"Row count mismatch: {temp_path} has {len(temp_df)} rows but "
        f"{ecfp_path} has {len(ecfp_df)} rows"
    )

# Concatenate the datasets column-wise (conditions first, then fingerprints)
combined_df = pd.concat([temp_df, ecfp_df], axis=1)

# Create the output directory if needed; exist_ok avoids the race between a
# separate existence check and the creation call.
os.makedirs(output_dir, exist_ok=True)

# Save the combined (unsplit) ECFP+ table
combined_df.to_csv(output_path, index=False)

In [8]:
import pandas as pd

# Rebuild the combined table, this time keeping the label column (DeltaG) and
# the host name (Host) next to pH and T so the data can be split by host type.

# Define the paths to the datasets
ecfp_path = "../data/interim/OpenCycloDB/ECFP/fingerprints.csv"
temp_path = "../data/raw/OpenCycloDB/Data/CDEnrichedData.csv"
# NOTE(review): `output_path` is not read by any cell visible here; it is kept
# only in case a later cell relies on it.
output_path = "../data/processed/OpenCycloDB/ECFPplus/ecfp_plus.csv"

# Load the datasets
ecfp_df = pd.read_csv(ecfp_path)
temp_df = pd.read_csv(temp_path)

# Keep only the label, host name, and experimental-condition columns
temp_df = temp_df[["DeltaG", "Host", "pH", "T"]]

# pd.concat(axis=1) aligns rows on the index; if the two files had different
# lengths it would silently pad the shorter one with NaN instead of failing.
if len(temp_df) != len(ecfp_df):
    raise ValueError(
        f"Row count mismatch: {temp_path} has {len(temp_df)} rows but "
        f"{ecfp_path} has {len(ecfp_df)} rows"
    )

# Concatenate the datasets column-wise (metadata first, then fingerprints)
combined_df = pd.concat([temp_df, ecfp_df], axis=1)

In [13]:
from sklearn.model_selection import train_test_split

def _host_type(host_name):
    """Classify a host name as "alpha" or "beta"; anything else is "gamma"."""
    if "alpha" in host_name:
        return "alpha"
    if "beta" in host_name:
        return "beta"
    return "gamma"


# Tag every row with its host type so the splits can be stratified on it
combined_df["Host_Type"] = combined_df["Host"].apply(_host_type)

# Labels are the DeltaG column; the features are every remaining column
y = combined_df["DeltaG"]
X = combined_df.drop(columns=["DeltaG"])

# First split: hold out 20% of the rows as a temporary pool (val + test),
# keeping the host-type proportions of the full dataset
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=X["Host_Type"],
)

# Second split: divide the temporary pool in half into validation and test,
# again stratified by host type
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    random_state=42,
    stratify=X_temp["Host_Type"],
)


# Function to print proportions with actual counts
def print_proportions(df, set_name):
    """Print each host type's share of ``df`` together with its row count.

    Args:
        df: DataFrame containing a "Host_Type" column.
        set_name: Label used in the header line (e.g. "Train").

    The output is a header line, then one "<type>: <share> (<count>)" line per
    host type in ``value_counts`` order, then a blank line.
    """
    host_types = df["Host_Type"]
    tallies = host_types.value_counts()
    shares = host_types.value_counts(normalize=True)

    report = [f"{set_name} set proportions:"]
    report.extend(
        f"{name}: {share:.2f} ({tallies[name]})" for name, share in shares.items()
    )
    print("\n".join(report))
    print()


# Report the host-type balance of the full dataset and of each split
for frame, label in (
    (combined_df, "Original"),
    (X_train, "Train"),
    (X_val, "Validation"),
    (X_test, "Test"),
):
    print_proportions(frame, label)

Original set proportions:
beta: 0.57 (1988)
alpha: 0.33 (1128)
gamma: 0.10 (343)

Train set proportions:
beta: 0.57 (1590)
alpha: 0.33 (902)
gamma: 0.10 (275)

Validation set proportions:
beta: 0.58 (199)
alpha: 0.33 (113)
gamma: 0.10 (34)

Test set proportions:
beta: 0.58 (199)
alpha: 0.33 (113)
gamma: 0.10 (34)



In [15]:
# Drop the raw host name and the derived host type from every split's features
helper_columns = ["Host_Type", "Host"]
train_df = X_train.drop(columns=helper_columns)
val_df = X_val.drop(columns=helper_columns)
test_df = X_test.drop(columns=helper_columns)

# Preview the first rows of each cleaned split to verify the columns are gone
for split_df in (train_df, val_df, test_df):
    print(split_df.head())

       pH      T  Guest_0  Guest_1  Guest_2  Guest_3  Guest_4  Guest_5  \
3171  7.0  298.0        0        0        0        0        0        0   
2945  7.0  298.0        0        0        0        0        0        0   
3203  5.5  298.0        0        0        0        0        0        0   
1437  7.0  298.0        0        0        0        0        0        0   
1993  6.9  298.0        0        2        0        0        0        0   

      Guest_6  Guest_7  ...  Host_1014  Host_1015  Host_1016  Host_1017  \
3171        0        0  ...          0          0          0          0   
2945        0        0  ...          0          0          0          0   
3203        0        0  ...          0          0          0          0   
1437        0        0  ...          0          0          0          0   
1993        0        0  ...          0          0          0          0   

      Host_1018  Host_1019  Host_1020  Host_1021  Host_1022  Host_1023  
3171          0         35     

In [17]:
import os

# Define the output directory
output_dir = "../data/processed/OpenCycloDB/ECFPplus"

# Create the directory (and any missing parents). exist_ok=True replaces the
# separate os.path.exists() check, which could race with another process
# creating the directory in between.
os.makedirs(output_dir, exist_ok=True)

# Write each split's features and labels to their own CSV files:
# <split>_features.csv and <split>_labels.csv
for split_name, features, labels in (
    ("train", train_df, y_train),
    ("val", val_df, y_val),
    ("test", test_df, y_test),
):
    features.to_csv(f"{output_dir}/{split_name}_features.csv", index=False)
    labels.to_csv(f"{output_dir}/{split_name}_labels.csv", index=False)