In [1]:
import pandas as pd

# Define the path to the fingerprints file
fingerprints_file = 'chembl_34_fps\chembl_34.fps'

# Function to parse each line correctly
def parse_fingerprint_line(line):
    parts = line.strip().split(maxsplit=1)  # Split only at the first space
    if len(parts) == 2:
        chembl_id = parts[0]
        fingerprint = parts[1]
    else:
        chembl_id = parts[0]
        fingerprint = None
    return chembl_id, fingerprint

# Read and parse the file manually
with open(fingerprints_file, 'r') as file:
    lines = file.readlines()

# Skip the metadata/comment lines (first 5 lines)
data_lines = lines[6:]

# Parse the data lines
data = [parse_fingerprint_line(line) for line in data_lines if line.strip()]  # Ignore empty lines
fingerprints_df = pd.DataFrame(data, columns=['Fingerprint', 'ChEMBL_ID'])

print(fingerprints_df.head())


                                         Fingerprint     ChEMBL_ID
0  0008040000000000000000000000000000100000000000...  CHEMBL153534
1  0208000000020000008003000000300000108100002000...  CHEMBL440060
2  0200000000800000008003002000280000108900800000...  CHEMBL440245
3  0200000800820000080001002000280000500000002000...  CHEMBL440249
4  0000000000000006000001000000000000000000000004...  CHEMBL405398


In [2]:
# Check for missing values
print(fingerprints_df.isnull().sum())

# Display a summary of the DataFrame
print(fingerprints_df.info())

# Display the first few rows of the DataFrame
fingerprints_df['Fingerprint'][0:5]

Fingerprint    0
ChEMBL_ID      0
dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2409270 entries, 0 to 2409269
Data columns (total 2 columns):
 #   Column       Dtype 
---  ------       ----- 
 0   Fingerprint  object
 1   ChEMBL_ID    object
dtypes: object(2)
memory usage: 36.8+ MB
None


0    0008040000000000000000000000000000100000000000...
1    0208000000020000008003000000300000108100002000...
2    0200000000800000008003002000280000108900800000...
3    0200000800820000080001002000280000500000002000...
4    0000000000000006000001000000000000000000000004...
Name: Fingerprint, dtype: object

In [3]:
# Function to convert hexadecimal fingerprint to binary
def hex_to_binary(hex_str):
    return bin(int(hex_str, 16))[2:].zfill(len(hex_str) * 4)

# Function to convert binary fingerprint to a list of integers
def binary_to_list(binary_str):
    return list(map(int, binary_str))

# Apply the conversion functions to the fingerprint column
fingerprints_df['Fingerprint'] = fingerprints_df['Fingerprint'].apply(lambda x: binary_to_list(hex_to_binary(x)))

# Verify the conversion
print(fingerprints_df.head())


In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Convert the list of integers into a 2D array (matrix)
fingerprints_matrix = fingerprints_df['Fingerprint'].apply(pd.Series)

# Prepare the feature matrix and labels (if you have labels, e.g., IC50 values)
# For this example, we'll use the ChEMBL_IDs as dummy labels
# Replace this with your actual labels for a real QSAR model
labels = LabelEncoder().fit_transform(fingerprints_df['ChEMBL_ID'])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(fingerprints_matrix, labels, test_size=0.2, random_state=42)


ValueError: invalid literal for int() with base 10: 'b'